io4it 0.0.0.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. io4it-0.0.0.9/License.txt +6 -0
  2. io4it-0.0.0.9/PKG-INFO +8 -0
  3. io4it-0.0.0.9/io4it.egg-info/PKG-INFO +8 -0
  4. io4it-0.0.0.9/io4it.egg-info/SOURCES.txt +33 -0
  5. io4it-0.0.0.9/io4it.egg-info/dependency_links.txt +1 -0
  6. io4it-0.0.0.9/io4it.egg-info/entry_points.txt +2 -0
  7. io4it-0.0.0.9/io4it.egg-info/namespace_packages.txt +1 -0
  8. io4it-0.0.0.9/io4it.egg-info/requires.txt +10 -0
  9. io4it-0.0.0.9/io4it.egg-info/top_level.txt +1 -0
  10. io4it-0.0.0.9/orangecontrib/IO4IT/__init__.py +0 -0
  11. io4it-0.0.0.9/orangecontrib/IO4IT/ocr_function/__init__.py +0 -0
  12. io4it-0.0.0.9/orangecontrib/IO4IT/ocr_function/word_converter.py +327 -0
  13. io4it-0.0.0.9/orangecontrib/IO4IT/widgets/OWMarkdownizer.py +202 -0
  14. io4it-0.0.0.9/orangecontrib/IO4IT/widgets/OWPathPropagator.py +123 -0
  15. io4it-0.0.0.9/orangecontrib/IO4IT/widgets/OWS3Uploader.py +92 -0
  16. io4it-0.0.0.9/orangecontrib/IO4IT/widgets/OWS3downloader.py +94 -0
  17. io4it-0.0.0.9/orangecontrib/IO4IT/widgets/OWS3list.py +107 -0
  18. io4it-0.0.0.9/orangecontrib/IO4IT/widgets/OWSpeechToText.py +362 -0
  19. io4it-0.0.0.9/orangecontrib/IO4IT/widgets/OWwordpdf2docx.py +129 -0
  20. io4it-0.0.0.9/orangecontrib/IO4IT/widgets/__init__.py +19 -0
  21. io4it-0.0.0.9/orangecontrib/IO4IT/widgets/designer/ow_in_or_out_path.ui +85 -0
  22. io4it-0.0.0.9/orangecontrib/IO4IT/widgets/designer/owspeechtotext.ui +104 -0
  23. io4it-0.0.0.9/orangecontrib/IO4IT/widgets/designer/wordpdf2docx.ui +57 -0
  24. io4it-0.0.0.9/orangecontrib/IO4IT/widgets/icons/category.svg +50 -0
  25. io4it-0.0.0.9/orangecontrib/IO4IT/widgets/icons/download.png +0 -0
  26. io4it-0.0.0.9/orangecontrib/IO4IT/widgets/icons/in_or_out.png +0 -0
  27. io4it-0.0.0.9/orangecontrib/IO4IT/widgets/icons/list_aws.png +0 -0
  28. io4it-0.0.0.9/orangecontrib/IO4IT/widgets/icons/md.png +0 -0
  29. io4it-0.0.0.9/orangecontrib/IO4IT/widgets/icons/speech_to_text.png +0 -0
  30. io4it-0.0.0.9/orangecontrib/IO4IT/widgets/icons/upload.png +0 -0
  31. io4it-0.0.0.9/orangecontrib/IO4IT/widgets/icons/wordpdf2docx.png +0 -0
  32. io4it-0.0.0.9/orangecontrib/__init__.py +1 -0
  33. io4it-0.0.0.9/setup.cfg +4 -0
  34. io4it-0.0.0.9/setup.py +59 -0
@@ -0,0 +1,6 @@
1
+ THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT ANY WARRANTY WHATSOEVER.
2
+
3
+ If you use or redistribute this software, you are permitted to do so
4
+ under the terms of GNU [GPL-3.0]+ license.
5
+
6
+ [GPL-3.0]: https://www.gnu.org/licenses/gpl-3.0.en.html
io4it-0.0.0.9/PKG-INFO ADDED
@@ -0,0 +1,8 @@
1
+ Metadata-Version: 2.1
2
+ Name: io4it
3
+ Version: 0.0.0.9
4
+ Home-page:
5
+ Author:
6
+ Author-email:
7
+ Keywords: orange3 add-on
8
+ License-File: License.txt
@@ -0,0 +1,8 @@
1
+ Metadata-Version: 2.1
2
+ Name: io4it
3
+ Version: 0.0.0.9
4
+ Home-page:
5
+ Author:
6
+ Author-email:
7
+ Keywords: orange3 add-on
8
+ License-File: License.txt
@@ -0,0 +1,33 @@
1
+ License.txt
2
+ setup.cfg
3
+ setup.py
4
+ io4it.egg-info/PKG-INFO
5
+ io4it.egg-info/SOURCES.txt
6
+ io4it.egg-info/dependency_links.txt
7
+ io4it.egg-info/entry_points.txt
8
+ io4it.egg-info/namespace_packages.txt
9
+ io4it.egg-info/requires.txt
10
+ io4it.egg-info/top_level.txt
11
+ orangecontrib/__init__.py
12
+ orangecontrib/IO4IT/__init__.py
13
+ orangecontrib/IO4IT/ocr_function/__init__.py
14
+ orangecontrib/IO4IT/ocr_function/word_converter.py
15
+ orangecontrib/IO4IT/widgets/OWMarkdownizer.py
16
+ orangecontrib/IO4IT/widgets/OWPathPropagator.py
17
+ orangecontrib/IO4IT/widgets/OWS3Uploader.py
18
+ orangecontrib/IO4IT/widgets/OWS3downloader.py
19
+ orangecontrib/IO4IT/widgets/OWS3list.py
20
+ orangecontrib/IO4IT/widgets/OWSpeechToText.py
21
+ orangecontrib/IO4IT/widgets/OWwordpdf2docx.py
22
+ orangecontrib/IO4IT/widgets/__init__.py
23
+ orangecontrib/IO4IT/widgets/designer/ow_in_or_out_path.ui
24
+ orangecontrib/IO4IT/widgets/designer/owspeechtotext.ui
25
+ orangecontrib/IO4IT/widgets/designer/wordpdf2docx.ui
26
+ orangecontrib/IO4IT/widgets/icons/category.svg
27
+ orangecontrib/IO4IT/widgets/icons/download.png
28
+ orangecontrib/IO4IT/widgets/icons/in_or_out.png
29
+ orangecontrib/IO4IT/widgets/icons/list_aws.png
30
+ orangecontrib/IO4IT/widgets/icons/md.png
31
+ orangecontrib/IO4IT/widgets/icons/speech_to_text.png
32
+ orangecontrib/IO4IT/widgets/icons/upload.png
33
+ orangecontrib/IO4IT/widgets/icons/wordpdf2docx.png
@@ -0,0 +1,2 @@
1
+ [orange.widgets]
2
+ Advanced Artificial Intelligence Tools = orangecontrib.IO4IT.widgets
@@ -0,0 +1 @@
1
+ orangecontrib
@@ -0,0 +1,10 @@
1
+ boto3
2
+ docling
3
+ docling-core
4
+ speechbrain
5
+ whisper
6
+ whisper-openai
7
+ pyannote.audio
8
+ pyannote.core
9
+ wave
10
+ scikit-learn
@@ -0,0 +1 @@
1
+ orangecontrib
File without changes
@@ -0,0 +1,327 @@
1
+ import os
2
+ import win32com.client
3
+ from pathlib import Path
4
+ import pathlib
5
+ import tempfile
6
+ import shutil
7
+ import time
8
+ import pythoncom
9
+ import fitz
10
+
11
+ if "site-packages/Orange/widgets" in os.path.dirname(os.path.abspath(__file__)).replace("\\", "/"):
12
+ from Orange.widgets.orangecontrib.AAIT.utils.MetManagement import get_local_store_path,reset_folder
13
+ else:
14
+ from orangecontrib.AAIT.utils.MetManagement import get_local_store_path,reset_folder
15
+
16
def enable_long_path(path):
    """Return *path* wrapped in a ``pathlib.Path`` with a long-path prefix.

    The argument is converted with ``str()`` and appended to a fixed prefix
    intended to ease long-path handling on Windows.
    """
    # The raw literal below is: backslash, backslash, '?', backslash, backslash.
    prefixed = "".join((r"\\?\\", str(path)))
    return pathlib.Path(prefixed)
19
+
20
def convert_pdf_structure(input_dir, output_dir, ignore_exsting_out_put=False,
                          eprogress_callback=None, progress_callback=None):
    """Convert every PDF found under the input directories to DOCX.

    Each input directory is searched recursively for ``*.pdf`` files; the
    folder structure is replicated under the matching output directory and
    each PDF is converted through ``convert_pdf_with_temp``.

    Parameters:
        input_dir: Sequence of input directories (a single str/PathLike is
            also accepted and treated as a one-element sequence).
        output_dir: Sequence of output directories, indexed in parallel with
            ``input_dir`` (a single str/PathLike is also accepted).
        ignore_exsting_out_put (bool): When true, PDFs whose target DOCX
            already exists are skipped.
        eprogress_callback: Historical (misspelled) name of the progress
            callback, kept so existing callers keep working.
        progress_callback: Callable receiving a float percentage after each
            successful conversion. Takes precedence over
            ``eprogress_callback`` when both are given.

    Returns:
        str: An empty string on success, otherwise an error message. The
        function stops at the first failed conversion.
    """
    if os.name != 'nt':
        return "version developped for windows computer "

    # Accept a single directory as well as a sequence of directories.
    if isinstance(input_dir, (str, os.PathLike)):
        input_dir = [input_dir]
    if isinstance(output_dir, (str, os.PathLike)):
        output_dir = [output_dir]

    # The body used to read an undefined name `progress_callback` while the
    # parameter was called `eprogress_callback`; honour both spellings.
    notify = progress_callback if progress_callback is not None else eprogress_callback

    input_roots = [Path(str(entry)) for entry in input_dir]

    # First pass: count the PDFs so progress can be reported as a percentage.
    nbre_file = sum(1 for root in input_roots for _ in root.rglob("*.pdf"))

    done = 1
    for index, input_path in enumerate(input_roots):
        output_path = Path(str(output_dir[index]))

        if not input_path.exists() or not input_path.is_dir():
            message = f"Error: The input directory '{input_path}' does not exist or is not a directory."
            print(message)
            return message + " "

        for pdf_file in input_path.rglob("*.pdf"):
            # Mirror the path relative to the input root, with a .docx suffix.
            relative_path = pdf_file.relative_to(input_path)
            new_file_path = output_path / relative_path.with_suffix(".docx")

            if ignore_exsting_out_put and os.path.exists(enable_long_path(str(new_file_path))):
                print("ignoring", pdf_file)
                continue

            if convert_pdf_with_temp(str(pdf_file), str(new_file_path)) != 0:
                # Stop at the first failure (the original author marked this
                # early return as temporary); the temp folder is left in
                # place for inspection.
                return "error -> " + str(pdf_file)

            if notify is not None:
                notify(float(100 * done / nbre_file))
                done += 1

    # Everything succeeded: purge the temporary conversion folder.
    reset_folder(get_local_store_path() + "temp_word_conversion/", attempts=10, delay=0.05, recreate=False)
    return ""
76
+
77
+
78
+
79
+
80
def convert_pdf_to_docx(pdf_path, docx_path):
    """Convert a PDF file to DOCX by driving Microsoft Word through COM.

    Args:
        pdf_path (str): Path of the source PDF file.
        docx_path (str): Path of the DOCX file to write.

    Returns:
        int: 0 if the conversion succeeded, 1 on failure.
    """
    if not os.path.exists(pdf_path):
        print(f"Erreur : Le fichier {pdf_path} n'existe pas.")
        return 1

    word = None
    com_initialized = False
    try:
        # Initialise COM for this thread.
        pythoncom.CoInitialize()
        com_initialized = True

        # Start Word.
        word = win32com.client.Dispatch("Word.Application")
        word.DisplayAlerts = 0  # Disable alerts
        word.Visible = True  # Word window is shown while converting
        print(f"Conversion de {pdf_path} en {docx_path}...")

        # Open the PDF read-only, without the conversion confirmation dialog.
        doc = word.Documents.Open(pdf_path, ReadOnly=True, ConfirmConversions=False)

        # Save as DOCX.
        doc.SaveAs(docx_path, FileFormat=16)  # 16 = wdFormatDocumentDefault
        doc.Close(False)

        print(f"Conversion réussie : {docx_path}")
        return 0

    except Exception as e:
        print(f"Erreur lors de la conversion : {e}")
        return 1

    finally:
        if word is not None:
            try:
                word.Quit()
            except Exception as e:
                # A failing Quit must neither skip CoUninitialize nor
                # replace the 0/1 result with an exception.
                print(f"Erreur lors de la fermeture de Word : {e}")

        # Release COM only if it was successfully initialised above.
        if com_initialized:
            pythoncom.CoUninitialize()
125
+
126
+
127
def wait_for_file_access(file_path, timeout=10, interval=0.5):
    """Poll until the file can be opened for appending, or give up.

    Args:
        file_path (str): Path of the file to check.
        timeout (int): Maximum number of seconds to keep trying.
        interval (float): Pause in seconds between two checks.

    Returns:
        bool: True once the file is accessible, False if the timeout elapsed.
    """
    started = time.time()

    while time.time() - started < timeout:
        usable = os.path.exists(file_path) and os.access(file_path, os.R_OK | os.W_OK)
        if not usable:
            time.sleep(interval)  # Not there yet: wait before retrying
            continue

        try:
            # Opening in append mode checks write access without truncating.
            with open(file_path, "a"):
                return True
        except IOError:
            time.sleep(interval)  # Still locked: wait before retrying

    print(f"Erreur : Le fichier {file_path} est verrouillé ou inaccessible.")
    return False
154
+
155
+
156
+
157
+
158
def is_pdf_a4(pdf_path: str) -> bool:
    """Check whether every page of the PDF is A4 (portrait or landscape).

    Returns True if all pages are A4 within the tolerance, False otherwise.
    False is also returned for an empty document or when the file cannot be
    opened or read.
    """
    A4_WIDTH_PTS = 595   # A4 width in points (approx. 210 mm)
    A4_HEIGHT_PTS = 842  # A4 height in points (approx. 297 mm)
    TOLERANCE = 5        # Allowed deviation in points

    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        print("is A4?", e)
        return False

    try:
        if len(doc) == 0:
            return False

        for page in doc:
            width, height = page.rect.width, page.rect.height
            portrait = (abs(width - A4_WIDTH_PTS) <= TOLERANCE
                        and abs(height - A4_HEIGHT_PTS) <= TOLERANCE)
            landscape = (abs(width - A4_HEIGHT_PTS) <= TOLERANCE
                         and abs(height - A4_WIDTH_PTS) <= TOLERANCE)
            if not (portrait or landscape):
                return False
        return True
    except Exception as e:
        print("is A4?", e)
        return False
    finally:
        # Always release the document so the file handle is not leaked.
        doc.close()
183
+
184
def convert_pdf_to_a4(input_pdf, output_pdf):
    """Write a copy of *input_pdf* whose pages are all A4 portrait.

    Pages already matching A4 portrait (within 1 point) are copied as-is.
    Any other page is rendered to a pixmap, scaled to fit (landscape pages
    are rotated by 90 degrees first) and centred on a new A4 page.

    Args:
        input_pdf: Path of the source PDF.
        output_pdf: Path of the PDF to write.

    Returns:
        int: 0 on success, 1 on any failure.
    """
    doc = None
    new_doc = None
    try:
        # A4 dimensions in points
        a4_width, a4_height = fitz.paper_size("a4")
        doc = fitz.open(input_pdf)
        new_doc = fitz.open()

        for page in doc:
            page_w, page_h = page.rect.width, page.rect.height

            # Page is already A4 (1 point tolerance): copy it unchanged.
            if abs(page_w - a4_width) < 1 and abs(page_h - a4_height) < 1:
                new_doc.insert_pdf(doc, from_page=page.number, to_page=page.number)
                continue

            # Pick the transformation according to the page orientation.
            if page_w > page_h:  # Landscape
                # After rotation, width and height are swapped.
                effective_scale = min(a4_width / page_h, a4_height / page_w)
                matrix = fitz.Matrix(effective_scale, effective_scale)
                # Rotate by 90 degrees and translate to reposition the content.
                matrix = matrix.prerotate(90).pretranslate(page_h * effective_scale, 0)
            else:  # Portrait
                effective_scale = min(a4_width / page_w, a4_height / page_h)
                matrix = fitz.Matrix(effective_scale, effective_scale)

            # Render the page with the chosen transformation.
            pix = page.get_pixmap(matrix=matrix)

            # Centre the rendered image on the A4 page.
            new_img_w, new_img_h = pix.width, pix.height
            x_offset = (a4_width - new_img_w) / 2
            y_offset = (a4_height - new_img_h) / 2

            # Create the new page and insert the image into it.
            new_page = new_doc.new_page(width=a4_width, height=a4_height)
            new_page.insert_image(
                fitz.Rect(x_offset, y_offset, x_offset + new_img_w, y_offset + new_img_h),
                pixmap=pix
            )
        new_doc.save(output_pdf)
        return 0
    except Exception as e:
        # Report the cause instead of swallowing it with a bare except.
        print(f"Erreur lors de la conversion en A4 : {e}")
        return 1
    finally:
        # Close whichever documents were opened, on success and on failure.
        for handle in (new_doc, doc):
            if handle is not None:
                try:
                    handle.close()
                except Exception:
                    # Best-effort cleanup: a close failure must not change
                    # the 0/1 result.
                    pass
230
+
231
+
232
def write_two_strings_to_file(file_path: str, string1: str, string2: str) -> int:
    """Write two strings to a UTF-8 text file, reporting failures as a status.

    ``string1`` is written followed by a newline, then ``string2`` without a
    trailing newline. An existing file is overwritten.

    :param file_path: The path where the file should be saved.
    :param string1: The first string to write.
    :param string2: The second string to write.
    :return: 0 on success, 1 if the file could not be opened or written.
    """
    try:
        # The context manager closes the file on every path. The previous
        # `finally: file.close()` raised UnboundLocalError when open() failed.
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(string1 + "\n")
            file.write(string2)
    except IOError as e:
        print(f"Error writing to file: {e}")
        return 1
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return 1

    print(f"Successfully written to {file_path}")
    return 0
254
+
255
+
256
def convert_pdf_with_temp(temp_pdf, output_path):
    """Convert a PDF to DOCX through a temporary working folder.

    The source PDF is copied into the ``temp_word_conversion/`` folder of the
    local store, resized to A4 when needed, converted to DOCX (up to four
    attempts), and the result is copied to ``output_path``. Both paths go
    through ``enable_long_path``.

    Returns:
        int: 0 on success, 1 on any failure.
    """
    source_pdf = enable_long_path(os.path.abspath(temp_pdf))
    final_docx = enable_long_path(os.path.abspath(output_path))
    target_dir = final_docx.parent

    if not source_pdf.exists():
        print(f"Le fichier {source_pdf} n'existe pas.")
        return 1

    # Create the output folder if it does not exist yet.
    if not target_dir.exists():
        target_dir.mkdir(parents=True, exist_ok=True)

    try:
        work_dir = get_local_store_path() + "temp_word_conversion/"
        if reset_folder(work_dir, attempts=10, delay=0.05) != 0:
            print("impossible to reset " + work_dir)
            return 1

        # Fixed file names inside the working folder.
        staged_pdf = os.path.join(work_dir, "input_toto.pdf")
        staged_docx = os.path.join(work_dir, "input_toto.docx")
        marker_file = work_dir + "conversion_en_cours.txt"

        print(marker_file)
        print("######################################")
        # Record which source/destination pair is being converted.
        if write_two_strings_to_file(marker_file, str(source_pdf), str(final_docx)) != 0:
            print("error writing ", marker_file)
            return 1

        # Copy the source file into the working folder.
        shutil.copy2(source_pdf, staged_pdf)
        wait_for_file_access(staged_pdf)

        if not is_pdf_a4(staged_pdf):
            resized_pdf = os.path.join(work_dir, "input_totoA4.pdf")
            if convert_pdf_to_a4(staged_pdf, resized_pdf) != 0:
                print("erreur au resize du pdf")
                return 1
            staged_pdf = resized_pdf
            wait_for_file_access(staged_pdf)

        time.sleep(1)

        # Convert the PDF to DOCX, pausing one second before each attempt.
        status = 1
        attempts_left = 4
        while attempts_left and status != 0:
            time.sleep(1)
            status = convert_pdf_to_docx(str(staged_pdf), str(staged_docx))
            attempts_left -= 1

        if status != 0:
            print("Erreur lors de la conversion.")
            return 1

        # Copy the converted file to its final destination; the temporary
        # files are left in the working folder.
        shutil.copy2(staged_docx, final_docx)
        print(f"recopie réussie : {final_docx}")
        return 0

    except Exception as e:
        print(f"Erreur : {e}")
        return 1
325
+
326
+
327
+
@@ -0,0 +1,202 @@
1
+ import os
2
+ import logging
3
+ import urllib.parse
4
+ from pathlib import Path
5
+
6
+ from PyQt5.QtCore import QThread, pyqtSignal
7
+ from AnyQt.QtWidgets import QApplication, QLabel, QPushButton, QProgressBar, QListWidget, QListWidgetItem
8
+
9
+ import Orange.data
10
+ from Orange.widgets import widget
11
+ from Orange.widgets.utils.signals import Input, Output
12
+ from Orange.data import Domain, StringVariable, Table
13
+
14
+ from docling.datamodel.base_models import InputFormat
15
+ from docling.datamodel.pipeline_options import PdfPipelineOptions
16
+ from docling.document_converter import DocumentConverter, PdfFormatOption
17
+ from docling_core.types.doc import ImageRefMode
18
+
19
# Module-level logger named after this module.
_log = logging.getLogger(__name__)

# Value assigned to PdfPipelineOptions.images_scale for PDF conversions.
IMAGE_RESOLUTION_SCALE = 2.0
22
+
23
class MarkdownConversionThread(QThread):
    """Background thread converting the PDF/DOCX files of a folder to Markdown.

    Only files directly inside ``input_dir`` matching ``*.pdf`` or ``*.docx``
    are processed. Each file is written as ``<stem>_md-with-image-refs.md``
    under ``output_dir``; files whose Markdown output already exists are not
    converted again, their existing content is reused.
    """

    # Emitted once when run() ends, with a list of (markdown file name, content).
    result_signal = pyqtSignal(list)
    # Emitted after each handled file: (markdown file name, percentage done).
    progress_signal = pyqtSignal(str, int)

    def __init__(self, input_dir, output_dir, parent=None):
        """Store the folders to work on.

        Args:
            input_dir: ``pathlib.Path`` of the folder to scan.
            output_dir: ``pathlib.Path`` under which Markdown files are written.
            parent: Optional Qt parent object.
        """
        super().__init__(parent)
        self.input_dir = input_dir
        self.output_dir = output_dir

    def run(self):
        """Convert all files and always emit ``result_signal`` at the end."""
        results = []
        try:
            self._convert_all(results)
        except Exception as e:
            # A failure outside the per-file handling (e.g. while building
            # the converter) must not leave the listener waiting forever.
            print(f"❌ Erreur lors de la conversion : {e}")
        finally:
            self.result_signal.emit(results)

    def _convert_all(self, results):
        """Convert every matching file, appending (name, content) to *results*."""
        files = list(self.input_dir.glob("*.pdf")) + list(self.input_dir.glob("*.docx"))
        total_files = len(files)
        processed = 0

        pipeline_options = PdfPipelineOptions()
        pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
        pipeline_options.generate_page_images = True
        pipeline_options.generate_picture_images = True

        doc_converter_pdf = DocumentConverter(
            format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
        )
        # Converter for non-PDF files, created on first use and then reused
        # instead of being rebuilt for every file.
        default_converter = None

        for file_path in files:
            try:
                # Path of the file's folder relative to its own parent,
                # re-rooted under the output directory.
                output_subdir = self.output_dir / file_path.parent.relative_to(file_path.parents[1])
                output_subdir.mkdir(parents=True, exist_ok=True)

                doc_filename = file_path.stem + "_md-with-image-refs.md"
                output_file_path = output_subdir / doc_filename

                if output_file_path.exists():
                    print(f"🔁 Fichier déjà traité, ignoré : {doc_filename}")
                    results.append((doc_filename, output_file_path.read_text(encoding='utf-8')))
                    processed += 1
                    self.progress_signal.emit(doc_filename, int(processed / total_files * 100))
                    continue

                # Conversion
                if file_path.suffix.lower() == ".pdf":
                    conv_res = doc_converter_pdf.convert(file_path)
                else:
                    if default_converter is None:
                        default_converter = DocumentConverter()
                    conv_res = default_converter.convert(file_path)

                conv_res.document.save_as_markdown(output_file_path, image_mode=ImageRefMode.REFERENCED)

                # Rewrite the file with percent-escapes decoded.
                with open(output_file_path, 'r', encoding='utf-8') as f:
                    content = urllib.parse.unquote(f.read())

                with open(output_file_path, 'w', encoding='utf-8') as f:
                    f.write(content)

                results.append((doc_filename, content))
                processed += 1
                self.progress_signal.emit(doc_filename, int(processed / total_files * 100))

            except Exception as e:
                # A failing file is reported and skipped; the rest continue.
                print(f"❌ Erreur lors du traitement de {file_path}: {e}")
                continue
85
+
86
+
87
class FileProcessorApp(widget.OWWidget):
    """Orange widget converting the documents of a folder to Markdown.

    The folder is read from the ``input_dir`` meta column (first row) of the
    input table. Conversion runs in a ``MarkdownConversionThread`` and the
    results are sent as a table with ``input_dir``, ``output_dir``, ``name``
    and ``content`` meta columns.
    """

    name = "Markdownizer"
    # NOTE(review): the worker thread only globs *.pdf and *.docx files.
    description = "Convert PDFs, DOCX, TXT, CSV to Markdown and store in an output folder"
    icon = "icons/md.png"
    # Use the alternate icon folder when the file lives under site-packages/Orange/widgets.
    if "site-packages/Orange/widgets" in os.path.dirname(os.path.abspath(__file__)).replace("\\", "/"):
        icon = "icons_dev/md.png"
    priority = 1001
    category = "Advanced Artificial Intelligence Tools"
    want_control_area = False

    class Inputs:
        data = Input("Input Directory", Orange.data.Table)

    class Outputs:
        data = Output("Markdown Data Table", Orange.data.Table)

    def __init__(self):
        super().__init__()
        self.data = None
        self.input_path = None
        self.output_dir = None
        # Reference to the running conversion thread. Not named `thread`,
        # which would shadow QObject.thread() on this widget.
        self._worker = None
        self.initUI()

    @Inputs.data
    def set_data(self, data):
        """Read ``input_dir`` from the first row of *data* and start processing."""
        self.data = data
        if self.data is None:
            return

        path_index = None
        for i, meta_var in enumerate(self.data.domain.metas):
            if meta_var.name.lower() == 'input_dir':
                path_index = i
                break

        if path_index is None:
            print("No 'input_dir' column found in input data. Available columns:", [m.name for m in self.data.domain.metas])
            return

        if len(self.data) == 0:
            # Without rows there is no value to read; metas[0] would raise.
            print("Input table is empty: no 'input_dir' value to read.")
            return

        self.input_path = self.data.metas[0][path_index]
        print("Extracted input_dir:", self.input_path)
        self.startProcessing()

    def initUI(self):
        """Build the main area: status label, progress bar, start button, file list."""
        self.setGeometry(200, 200, 600, 400)
        self.mainArea.layout().setSpacing(10)
        self.status_label = QLabel("Sélectionnez un dossier contenant des fichiers.")
        self.progress_bar = QProgressBar()
        self.progress_bar.setValue(0)

        self.start_button = QPushButton("Démarrer le traitement")
        self.start_button.clicked.connect(self.startProcessing)

        self.file_list = QListWidget()

        self.mainArea.layout().addWidget(self.status_label)
        self.mainArea.layout().addWidget(self.progress_bar)
        self.mainArea.layout().addWidget(self.start_button)
        self.mainArea.layout().addWidget(self.file_list)

    def startProcessing(self):
        """Start a conversion thread for ``self.input_path``.

        Output goes to a sibling folder named ``<input folder>_md``. The
        request is ignored while a previous conversion is still running.
        """
        if self._worker is not None and self._worker.isRunning():
            # Replacing the reference would drop the last handle on a
            # running QThread.
            print("A conversion is already running; request ignored.")
            return

        if not self.input_path:
            print("No valid input path found.")
            return

        input_dir = Path(self.input_path)
        if not input_dir.exists():
            print("Input directory does not exist:", input_dir)
            return

        self.output_dir = input_dir.parent / (input_dir.name + "_md")
        self.progress_bar.setValue(0)
        self.start_button.setEnabled(False)
        self.status_label.setText("Traitement en cours...")

        self._worker = MarkdownConversionThread(input_dir, self.output_dir)
        self._worker.result_signal.connect(self.handle_results)
        self._worker.progress_signal.connect(self.update_progress)
        self._worker.start()
        self.progressBarInit()

    def update_progress(self, filename, progress):
        """Add *filename* to the list and set the progress bar to *progress*."""
        print(f"File processed: {filename}")
        self.file_list.addItem(f"✅ {filename}")
        self.progress_bar.setValue(progress)

    def handle_results(self, results):
        """Slot for the thread's ``result_signal``."""
        self.processingComplete(results)
        self.progressBarFinished()

    def processingComplete(self, results):
        """Restore the UI to its idle state and send the output table."""
        self.status_label.setText("Traitement terminé.")
        self.start_button.setEnabled(True)
        self.send_output(results)

    def send_output(self, results):
        """Send *results* as a table of string metas.

        One row is produced per (name, content) pair; when *results* is empty
        a single row of empty strings is sent instead.
        """
        domain = Domain([], metas=[
            StringVariable('input_dir'),
            StringVariable('output_dir'),
            StringVariable('name'),
            StringVariable('content')
        ])
        metas = [[
            str(self.input_path),
            str(self.output_dir),
            name,
            content
        ] for name, content in results] if results else [["", "", "", ""]]

        # Create the rows first, then fill the meta columns row by row.
        table = Table(domain, [[] for _ in metas])
        for i, meta in enumerate(metas):
            table.metas[i] = meta
        self.Outputs.data.send(table)
196
+
197
# Running this module directly shows the widget in a standalone Qt application.
if __name__ == "__main__":
    import sys
    app = QApplication(sys.argv)
    window = FileProcessorApp()
    window.show()
    # Run the Qt event loop and exit with its return code.
    sys.exit(app.exec_())