io4it 2.1.2__tar.gz → 2.1.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- io4it-2.1.2.1/PKG-INFO +30 -0
- io4it-2.1.2.1/io4it.egg-info/PKG-INFO +30 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/io4it.egg-info/SOURCES.txt +3 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/widgets/OWChatGpt.py +44 -45
- io4it-2.1.2.1/orangecontrib/IO4IT/widgets/OWExtractTablesDocxToCSV.py +261 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/widgets/OWMarkdownizer.py +0 -1
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/widgets/OWOfficeNormalizer.py +103 -46
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/widgets/OWPdfType.py +23 -35
- io4it-2.1.2.1/orangecontrib/IO4IT/widgets/designer/OWmailSender.py +155 -0
- io4it-2.1.2/orangecontrib/IO4IT/widgets/designer/owofficenormalizer.ui → io4it-2.1.2.1/orangecontrib/IO4IT/widgets/designer/owdocxtocsv.ui +17 -4
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/widgets/designer/owmailsender.ui +96 -31
- io4it-2.1.2.1/orangecontrib/IO4IT/widgets/designer/owofficenormalizer.ui +124 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/setup.py +1 -1
- io4it-2.1.2/PKG-INFO +0 -7
- io4it-2.1.2/io4it.egg-info/PKG-INFO +0 -7
- {io4it-2.1.2 → io4it-2.1.2.1}/io4it.egg-info/dependency_links.txt +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/io4it.egg-info/entry_points.txt +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/io4it.egg-info/namespace_packages.txt +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/io4it.egg-info/requires.txt +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/io4it.egg-info/top_level.txt +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/__init__.py +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/ocr_function/__init__.py +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/ocr_function/word_converter.py +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/utils/__init__.py +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/utils/mail.py +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/utils/offuscation_basique.py +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/utils/pool_exec_utils.py +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/utils/utils_md.py +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/widgets/OWDeep_Search.py +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/widgets/OWDoclingMarkdownizerSimple.py +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/widgets/OWExportMarkdown.py +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/widgets/OWInboxMailMonitoring.py +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/widgets/OWMarkdownLoader.py +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/widgets/OWProcessPoolExecutor.py +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/widgets/OWS3Uploader.py +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/widgets/OWS3downloader.py +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/widgets/OWS3list.py +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/widgets/OWSpeechToText.py +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/widgets/OWmailLoader.py +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/widgets/OWmailSender.py +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/widgets/OWwordpdf2docx.py +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/widgets/__init__.py +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/widgets/designer/__init__.py +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/widgets/designer/nogui.ui +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/widgets/designer/ow_file_ext_selector.ui +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/widgets/designer/owchatgpt.ui +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/widgets/designer/owdeepsearch.ui +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/widgets/designer/owdoclingasr.ui +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/widgets/designer/owdoclingmarkdownizersimple.ui +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/widgets/designer/owexportmarkdown.ui +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/widgets/designer/owinboxmailmonitoring.ui +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/widgets/designer/owmailloader.ui +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/widgets/designer/owmarkdownizer.ui +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/widgets/designer/owmarkdownloader.ui +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/widgets/designer/owpdftype.ui +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/widgets/designer/owprocesspoolexecutor.ui +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/widgets/designer/owspeechtotext.ui +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/widgets/designer/owvisualizationer.ui +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/widgets/designer/wordpdf2docx.ui +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/widgets/icons/__init__.py +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/widgets/icons/chatgpt.png +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/widgets/icons/check_pdf.png +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/widgets/icons/deepsearch.svg +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/widgets/icons/dep_md_old.png +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/widgets/icons/download.png +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/widgets/icons/export_md.png +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/widgets/icons/extract_table.png +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/widgets/icons/file_extensor.png +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/widgets/icons/list_aws.png +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/widgets/icons/load_md.png +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/widgets/icons/mail_loader.png +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/widgets/icons/mail_writer.png +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/widgets/icons/md.png +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/widgets/icons/monitor-email.svg +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/widgets/icons/office_normalizer.png +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/widgets/icons/process_pool_executor.png +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/widgets/icons/speech_to_text.png +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/widgets/icons/upload.png +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/widgets/icons/visualizationer.png +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/widgets/icons/wordpdf2docx.png +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/IO4IT/widgets/icons_dev/__init__.py +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/orangecontrib/__init__.py +0 -0
- {io4it-2.1.2 → io4it-2.1.2.1}/setup.cfg +0 -0
io4it-2.1.2.1/PKG-INFO
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: io4it
|
|
3
|
+
Version: 2.1.2.1
|
|
4
|
+
Home-page:
|
|
5
|
+
Author:
|
|
6
|
+
Author-email:
|
|
7
|
+
Keywords: orange3 add-on
|
|
8
|
+
Requires-Dist: pylatexenc
|
|
9
|
+
Requires-Dist: docopt
|
|
10
|
+
Requires-Dist: boto3
|
|
11
|
+
Requires-Dist: opencv-python-headless==4.6.0.66
|
|
12
|
+
Requires-Dist: docling==2.30.0
|
|
13
|
+
Requires-Dist: docling-core==2.26.3
|
|
14
|
+
Requires-Dist: speechbrain
|
|
15
|
+
Requires-Dist: whisper
|
|
16
|
+
Requires-Dist: whisper-openai
|
|
17
|
+
Requires-Dist: pyannote.audio
|
|
18
|
+
Requires-Dist: pyannote-core
|
|
19
|
+
Requires-Dist: pypandoc
|
|
20
|
+
Requires-Dist: pypandoc-binary
|
|
21
|
+
Requires-Dist: scikit-learn
|
|
22
|
+
Requires-Dist: openai
|
|
23
|
+
Requires-Dist: pip-system-certs==5.0
|
|
24
|
+
Requires-Dist: docx2pdf
|
|
25
|
+
Requires-Dist: doc2docx
|
|
26
|
+
Requires-Dist: msal
|
|
27
|
+
Requires-Dist: exchangelib
|
|
28
|
+
Requires-Dist: CATEGORIT
|
|
29
|
+
Dynamic: keywords
|
|
30
|
+
Dynamic: requires-dist
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: io4it
|
|
3
|
+
Version: 2.1.2.1
|
|
4
|
+
Home-page:
|
|
5
|
+
Author:
|
|
6
|
+
Author-email:
|
|
7
|
+
Keywords: orange3 add-on
|
|
8
|
+
Requires-Dist: pylatexenc
|
|
9
|
+
Requires-Dist: docopt
|
|
10
|
+
Requires-Dist: boto3
|
|
11
|
+
Requires-Dist: opencv-python-headless==4.6.0.66
|
|
12
|
+
Requires-Dist: docling==2.30.0
|
|
13
|
+
Requires-Dist: docling-core==2.26.3
|
|
14
|
+
Requires-Dist: speechbrain
|
|
15
|
+
Requires-Dist: whisper
|
|
16
|
+
Requires-Dist: whisper-openai
|
|
17
|
+
Requires-Dist: pyannote.audio
|
|
18
|
+
Requires-Dist: pyannote-core
|
|
19
|
+
Requires-Dist: pypandoc
|
|
20
|
+
Requires-Dist: pypandoc-binary
|
|
21
|
+
Requires-Dist: scikit-learn
|
|
22
|
+
Requires-Dist: openai
|
|
23
|
+
Requires-Dist: pip-system-certs==5.0
|
|
24
|
+
Requires-Dist: docx2pdf
|
|
25
|
+
Requires-Dist: doc2docx
|
|
26
|
+
Requires-Dist: msal
|
|
27
|
+
Requires-Dist: exchangelib
|
|
28
|
+
Requires-Dist: CATEGORIT
|
|
29
|
+
Dynamic: keywords
|
|
30
|
+
Dynamic: requires-dist
|
|
@@ -19,6 +19,7 @@ orangecontrib/IO4IT/widgets/OWChatGpt.py
|
|
|
19
19
|
orangecontrib/IO4IT/widgets/OWDeep_Search.py
|
|
20
20
|
orangecontrib/IO4IT/widgets/OWDoclingMarkdownizerSimple.py
|
|
21
21
|
orangecontrib/IO4IT/widgets/OWExportMarkdown.py
|
|
22
|
+
orangecontrib/IO4IT/widgets/OWExtractTablesDocxToCSV.py
|
|
22
23
|
orangecontrib/IO4IT/widgets/OWInboxMailMonitoring.py
|
|
23
24
|
orangecontrib/IO4IT/widgets/OWMarkdownLoader.py
|
|
24
25
|
orangecontrib/IO4IT/widgets/OWMarkdownizer.py
|
|
@@ -33,6 +34,7 @@ orangecontrib/IO4IT/widgets/OWmailLoader.py
|
|
|
33
34
|
orangecontrib/IO4IT/widgets/OWmailSender.py
|
|
34
35
|
orangecontrib/IO4IT/widgets/OWwordpdf2docx.py
|
|
35
36
|
orangecontrib/IO4IT/widgets/__init__.py
|
|
37
|
+
orangecontrib/IO4IT/widgets/designer/OWmailSender.py
|
|
36
38
|
orangecontrib/IO4IT/widgets/designer/__init__.py
|
|
37
39
|
orangecontrib/IO4IT/widgets/designer/nogui.ui
|
|
38
40
|
orangecontrib/IO4IT/widgets/designer/ow_file_ext_selector.ui
|
|
@@ -40,6 +42,7 @@ orangecontrib/IO4IT/widgets/designer/owchatgpt.ui
|
|
|
40
42
|
orangecontrib/IO4IT/widgets/designer/owdeepsearch.ui
|
|
41
43
|
orangecontrib/IO4IT/widgets/designer/owdoclingasr.ui
|
|
42
44
|
orangecontrib/IO4IT/widgets/designer/owdoclingmarkdownizersimple.ui
|
|
45
|
+
orangecontrib/IO4IT/widgets/designer/owdocxtocsv.ui
|
|
43
46
|
orangecontrib/IO4IT/widgets/designer/owexportmarkdown.ui
|
|
44
47
|
orangecontrib/IO4IT/widgets/designer/owinboxmailmonitoring.ui
|
|
45
48
|
orangecontrib/IO4IT/widgets/designer/owmailloader.ui
|
|
@@ -2,7 +2,7 @@ import os
|
|
|
2
2
|
import sys
|
|
3
3
|
import base64
|
|
4
4
|
import ast
|
|
5
|
-
import
|
|
5
|
+
from openai import OpenAI
|
|
6
6
|
import Orange
|
|
7
7
|
from Orange.data import StringVariable
|
|
8
8
|
from Orange.widgets.widget import OWWidget, Input, Output
|
|
@@ -98,30 +98,53 @@ class ChatGpt(OWWidget):
|
|
|
98
98
|
|
|
99
99
|
def generate_answers(self):
    """Query the OpenAI Responses API and store the reply in ``self.text_response``.

    The request is built from ``self.system_prompt`` (optional),
    ``self.prompt`` (either a string or an already-built list of content
    parts) and ``self.image_paths`` (optional; a list of paths or the string
    representation of such a list). Each image is announced by a short text
    part carrying its file name and then attached as a base64 data URL.

    Any failure is reported through ``self.error`` instead of being raised.
    """
    import mimetypes  # local import: only needed to label attached images

    try:
        client = OpenAI(api_key=self.api_keys)

        user_content = []
        if isinstance(self.prompt, list):
            user_content.extend(self.prompt)
        else:
            user_content.append({"type": "input_text", "text": str(self.prompt)})

        image_paths = getattr(self, "image_paths", None)
        if image_paths:
            if isinstance(image_paths, str):
                # The paths may arrive as the repr of a list; a bare path is
                # not a valid literal, so fall back to treating it as one file.
                try:
                    image_paths = ast.literal_eval(image_paths)
                except (ValueError, SyntaxError):
                    image_paths = [image_paths]
                if isinstance(image_paths, str):
                    image_paths = [image_paths]
                # Keep the normalized value on the widget, as before.
                self.image_paths = image_paths

            for img_path in image_paths:
                filename = os.path.basename(img_path)
                user_content.append({"type": "input_text", "text": f"Photo : {filename}"})

                with open(img_path, "rb") as f:
                    b64_img = base64.b64encode(f.read()).decode("utf-8")

                # Derive the MIME type from the extension; JPEG stays the
                # fallback for unknown or non-image extensions.
                mime, _ = mimetypes.guess_type(filename)
                if not mime or not mime.startswith("image/"):
                    mime = "image/jpeg"
                user_content.append({
                    "type": "input_image",
                    "image_url": f"data:{mime};base64,{b64_img}",
                })

        messages = []
        system_prompt = getattr(self, "system_prompt", None)
        if system_prompt:
            # Only send a system message when there is something to say;
            # an empty content list carries no information.
            messages.append({
                "role": "system",
                "content": [{"type": "input_text", "text": str(system_prompt)}],
            })
        messages.append({"role": "user", "content": user_content})

        response = client.responses.create(
            model=self.model,
            input=messages,
            max_output_tokens=self.max_tokens,
        )
        self.text_response = response.output_text

        # output_text is an empty string (not None) when the model returned
        # no text, so test for falsiness.
        if not self.text_response:
            self.error("No response from model.")

    except Exception as e:
        print(e)
        self.error(f"Error: {e}")
        return
|
|
122
|
-
if self.text_response is None:
|
|
123
|
-
self.error("No response from chatgpt.")
|
|
124
|
-
|
|
125
148
|
|
|
126
149
|
def run(self):
|
|
127
150
|
self.error("")
|
|
@@ -137,30 +160,6 @@ class ChatGpt(OWWidget):
|
|
|
137
160
|
self.error("No api keys provided.")
|
|
138
161
|
return
|
|
139
162
|
|
|
140
|
-
# si on relance la génération par le bouton le prompt est déjà rempli (déjà une liste)
|
|
141
|
-
# mais on peut changer le model, la temp ou le nombre max de token
|
|
142
|
-
if isinstance(self.prompt, list):
|
|
143
|
-
self.prompt = [{"type": "text", "text": self.prompt}]
|
|
144
|
-
if self.image_paths is not None and self.image_paths != []:
|
|
145
|
-
if type(self.image_paths) == str:
|
|
146
|
-
self.image_paths = ast.literal_eval(self.image_paths)
|
|
147
|
-
for img_path in self.image_paths:
|
|
148
|
-
filename = os.path.basename(img_path)
|
|
149
|
-
|
|
150
|
-
# Ajoute une mention de l’image avant
|
|
151
|
-
self.prompt.append({
|
|
152
|
-
"type": "text",
|
|
153
|
-
"text": f"Photo : {filename}"
|
|
154
|
-
})
|
|
155
|
-
|
|
156
|
-
with open(img_path, "rb") as f:
|
|
157
|
-
b64_img = base64.b64encode(f.read()).decode("utf-8")
|
|
158
|
-
self.prompt.append({
|
|
159
|
-
"type": "image_url",
|
|
160
|
-
"image_url": {
|
|
161
|
-
"url": f"data:image/jpeg;base64,{b64_img}"
|
|
162
|
-
}
|
|
163
|
-
})
|
|
164
163
|
self.progressBarInit()
|
|
165
164
|
self.thread = thread_management.Thread(self.generate_answers)
|
|
166
165
|
self.thread.progress.connect(self.handle_progress)
|
|
@@ -0,0 +1,261 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import sys
|
|
3
|
+
import docx
|
|
4
|
+
import pandas as pd
|
|
5
|
+
import re
|
|
6
|
+
# Removed: import json
|
|
7
|
+
|
|
8
|
+
from AnyQt.QtWidgets import QApplication, QPushButton
|
|
9
|
+
from Orange.widgets import widget
|
|
10
|
+
from Orange.widgets.utils.signals import Input, Output
|
|
11
|
+
from Orange.data import Domain, StringVariable, Table, DiscreteVariable
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
if "site-packages/Orange/widgets" in os.path.dirname(os.path.abspath(__file__)).replace("\\", "/"):
|
|
15
|
+
from Orange.widgets.orangecontrib.AAIT.utils.import_uic import uic
|
|
16
|
+
from Orange.widgets.orangecontrib.AAIT.utils.initialize_from_ini import apply_modification_from_python_file
|
|
17
|
+
else:
|
|
18
|
+
from orangecontrib.AAIT.utils.import_uic import uic
|
|
19
|
+
from orangecontrib.AAIT.utils.initialize_from_ini import apply_modification_from_python_file
|
|
20
|
+
|
|
21
|
+
@apply_modification_from_python_file(filepath_original_widget=__file__)
class OWExtractTablesDocxToCSV(widget.OWWidget):
    """Orange widget that extracts the tables of Word (.docx) documents.

    Each Word table is written to its own XLSX file (one Word table = one
    XLSX file) in a ``<document name>_tables_data`` folder created next to
    the source document. The input table must contain a ``file_path`` column.
    """
    name = "Docx to XLSX Tables (Full Tables)"
    description = "Extrait toutes les tables de documents Word et les sauvegarde en fichiers XLSX distincts"
    category = "AAIT - TOOLBOX"
    icon = "icons/extract_table.png"
    if "site-packages/Orange/widgets" in os.path.dirname(os.path.abspath(__file__)).replace("\\", "/"):
        icon = "icons_dev/extract_table.png"
    gui = os.path.join(os.path.dirname(os.path.abspath(__file__)), "designer/owdocxtocsv.ui")
    want_control_area = False
    priority = 1005

    class Inputs:
        data = Input("Files Table", Table)

    class Outputs:
        data = Output("Processed Files Table", Table)
        status_data = Output("Status Table", Table)

    def __init__(self):
        """Load the designer UI, wire its controls and reset the widget state."""
        super().__init__()
        try:
            uic.loadUi(self.gui, self)
        except Exception as e:
            self.warning(f"Impossible de charger le fichier UI. {e}")

            class DummyCheckbox:
                """Inert stand-in so the attribute exists without a UI."""

                def stateChanged(self, *args):
                    pass

            self.checkBox_alpha_headers = DummyCheckbox()
            # Marks the UI as unavailable for the checks below.
            self.gui = None

        # Hook up the run button when the UI provides one.
        self.pushButton_run = self.findChild(QPushButton, "pushButton_run")
        if self.pushButton_run:
            self.pushButton_run.clicked.connect(self.run)

        self.data = None
        self.autorun = True
        self.processed_statuses = []
        self.use_alpha_headers = False

        # Only connect the checkbox when the UI really loaded AND defines it;
        # a .ui file without this control must not crash the constructor.
        checkbox = getattr(self, "checkBox_alpha_headers", None)
        if self.gui and checkbox is not None:
            checkbox.stateChanged.connect(self._update_alpha_headers_state)

        self.post_initialized()

    def _update_alpha_headers_state(self, state):
        """Mirror the checkbox state into ``use_alpha_headers``."""
        self.use_alpha_headers = bool(state)

    @Inputs.data
    def set_data(self, in_data: Table | None):
        """Store the incoming table and process it at once when autorun is on."""
        self.data = in_data
        if self.autorun:
            self.run()

    def run(self):
        """Process every file of the input table and send both output tables."""
        if self.data is None:
            self.Outputs.data.send(None)
            self.Outputs.status_data.send(None)
            return

        self.error("")
        try:
            self.data.domain["file_path"]
        except KeyError:
            self.error("Le tableau d'entrée doit contenir une colonne 'file_path'.")
            self.Outputs.data.send(None)
            self.Outputs.status_data.send(None)
            return

        self.progressBarInit()
        self.processed_statuses = []
        self.Outputs.status_data.send(None)

        try:
            result_rows = self._process_files(self.data)

            output_domain = Domain([], metas=[
                StringVariable("src_path"),
                StringVariable("output_dir_path"),
                StringVariable("status")
            ])
            result_table = Table.from_list(output_domain, result_rows)
            self.Outputs.data.send(result_table)
        finally:
            # Always close the progress bar, even if processing raised.
            self.progressBarFinished()

    def _process_files(self, in_data: Table) -> list:
        """Process each path of the ``file_path`` column.

        Returns one ``[src_path, output_dir_path, status]`` row per input
        file. The status table is re-sent after every file.
        """
        file_paths = [str(x) for x in in_data.get_column("file_path")]
        total_files = len(file_paths)
        result_rows = []

        for i, full_path in enumerate(file_paths):
            self.progressBarSet((i + 1) / total_files * 100)

            status_short, details, output_dir_path = self._process_one_file(full_path)

            self.processed_statuses.append([full_path, status_short, details])
            self._send_status_table()
            result_rows.append([full_path, output_dir_path, f"{status_short}: {details}"])

            # Let Qt repaint while the loop runs.
            QApplication.processEvents()

        return result_rows

    def _process_one_file(self, full_path):
        """Return ``(status, details, output_dir_path)`` for a single file."""
        if not full_path.lower().endswith('.docx'):
            return "skipped", "Fichier ignoré : n'est pas un fichier .docx.", "N/A"

        try:
            tables_found, output_dir_path = self._extraire_et_convertir(full_path)
        except FileNotFoundError:
            return "ko", "Fichier non trouvé.", ""
        except Exception as e:
            return "ko", f"Une erreur inattendue est survenue : {e}", ""

        if tables_found > 0:
            details = f"{tables_found} table(s) extraite(s) et convertie(s) en XLSX."
            return "ok", details, output_dir_path
        return "ko", "Aucune table valide trouvée.", output_dir_path

    def _extraire_et_convertir(self, docx_path):
        """Extract the tables of a Word document and save each one as XLSX.

        Returns ``(number_of_tables_saved, output_directory_path)``.

        Raises:
            FileNotFoundError: if ``docx_path`` is not an existing file.
        """
        # Check explicitly: python-docx reports a missing path with its own
        # exception type, so the caller's FileNotFoundError branch would
        # otherwise never be taken.
        if not os.path.isfile(docx_path):
            raise FileNotFoundError(docx_path)

        # Open the document BEFORE creating the output folder so unreadable
        # inputs do not leave an empty folder behind.
        doc = docx.Document(docx_path)

        dir_name, file_name = os.path.split(docx_path)
        base_name, _ = os.path.splitext(file_name)
        output_dir = os.path.join(dir_name, base_name + '_tables_data')
        os.makedirs(output_dir, exist_ok=True)

        total_tables_found = 0
        for table_index, table in enumerate(doc.tables, start=1):
            raw_data = [[cell.text.strip() for cell in row.cells] for row in table.rows]

            # Returns None when the table has no non-empty row.
            df = self._create_dataframe(raw_data)
            if df is None:
                continue

            # The table is kept whole; 'a' is the fixed file-name suffix.
            table_name = f"table_{table_index}_a"
            # Count only tables that were actually written to disk.
            if self._save_sub_table(df, output_dir, table_name):
                total_tables_found += 1

        return total_tables_found, output_dir

    @staticmethod
    def _alpha_header(index: int) -> str:
        """Return the spreadsheet-style label for a 0-based column index.

        0 -> 'A', 25 -> 'Z', 26 -> 'AA', 27 -> 'AB', ...
        """
        label = ""
        remaining = index + 1
        while remaining > 0:
            remaining, offset = divmod(remaining - 1, 26)
            label = chr(ord('A') + offset) + label
        return label

    def _create_dataframe(self, data):
        """Build a DataFrame from the raw rows of one table.

        Rows without any non-blank cell are dropped and short rows are padded
        with empty strings. Returns ``None`` when nothing is left.
        """
        data = [row for row in data if row and any(cell.strip() for cell in row)]
        if not data:
            return None

        max_cols = max(len(row) for row in data)
        data = [row + [''] * (max_cols - len(row)) for row in data]

        if self.use_alpha_headers or len(data) == 1:
            # Alphabetic headers: every row is data. Also used when the table
            # has a single row, which cannot serve as both header and data.
            headers = [self._alpha_header(j) for j in range(max_cols)]
            df = pd.DataFrame(data, columns=headers)
        else:
            # Standard case: first row is the header, the rest is data.
            df = pd.DataFrame(data[1:], columns=data[0])

        df.columns = df.columns.astype(str)
        return df

    def _save_sub_table(self, df, output_dir, table_full_name):
        """Save the DataFrame as XLSX only.

        Returns ``True`` on success; on failure a widget warning is shown and
        ``False`` is returned.
        """
        output_xlsx_path = os.path.join(output_dir, f"{table_full_name}.xlsx")
        try:
            df.to_excel(output_xlsx_path, index=False, engine='openpyxl')
        except Exception as e:
            self.warning(f"Impossible de sauvegarder la table '{table_full_name}' en format XLSX : {e}")
            return False
        return True

    def _send_status_table(self):
        """Send the statuses collected so far on the status output."""
        domain = Domain([], metas=[
            StringVariable("src_path"),
            DiscreteVariable("status", values=["ok", "ko", "skipped"]),
            StringVariable("details")
        ])
        status_table = Table.from_list(domain, self.processed_statuses)
        self.Outputs.status_data.send(status_table)

    def post_initialized(self):
        """Hook called at the end of ``__init__``; does nothing here."""
        pass
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
if __name__ == "__main__":
    # Manual smoke test: show the widget on its own, outside the Orange canvas.
    qt_app = QApplication(sys.argv)
    standalone_widget = OWExtractTablesDocxToCSV()
    standalone_widget.show()
    qt_app.exec()
|
|
@@ -16,7 +16,6 @@ import easyocr
|
|
|
16
16
|
|
|
17
17
|
from AnyQt.QtCore import QThread, pyqtSignal
|
|
18
18
|
from AnyQt.QtWidgets import QApplication, QLabel, QSpinBox, QTextEdit, QPushButton
|
|
19
|
-
from AnyQt import uic
|
|
20
19
|
|
|
21
20
|
from Orange.widgets import widget
|
|
22
21
|
from Orange.widgets.utils.signals import Input, Output
|