io4it 3.0.1.1__tar.gz → 3.0.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {io4it-3.0.1.1 → io4it-3.0.1.2}/PKG-INFO +1 -1
- {io4it-3.0.1.1 → io4it-3.0.1.2}/io4it.egg-info/PKG-INFO +1 -1
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/utils/config.json +15 -4
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/OWDoclingToMarkdown.py +12 -3
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/OWMarkdownLoader.py +12 -3
- io4it-3.0.1.2/orangecontrib/IO4IT/widgets/OWWebSearch.py +532 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/designer/owparserhtml.ui +2 -2
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/designer/owwebsearch.ui +58 -6
- {io4it-3.0.1.1 → io4it-3.0.1.2}/setup.py +1 -1
- io4it-3.0.1.1/orangecontrib/IO4IT/widgets/OWWebSearch.py +0 -313
- {io4it-3.0.1.1 → io4it-3.0.1.2}/io4it.egg-info/SOURCES.txt +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/io4it.egg-info/dependency_links.txt +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/io4it.egg-info/entry_points.txt +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/io4it.egg-info/namespace_packages.txt +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/io4it.egg-info/requires.txt +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/io4it.egg-info/top_level.txt +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/__init__.py +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/ocr_function/__init__.py +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/ocr_function/word_converter.py +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/utils/__init__.py +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/utils/mail.py +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/utils/offuscation_basique.py +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/utils/pool_exec_utils.py +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/utils/secret_manager.py +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/utils/utils_md.py +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/OWChatGpt.py +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/OWDeep_Search.py +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/OWDoclingASR.py +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/OWExportMarkdown.py +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/OWExtractTablesDocxToXlsx.py +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/OWInboxMailMonitoring.py +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/OWMD2HTML.py +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/OWMarkdownizer.py +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/OWOfficeNormalizer.py +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/OWParserHTML.py +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/OWPdfType.py +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/OWProcessPoolExecutor.py +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/OWS3Uploader.py +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/OWS3downloader.py +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/OWS3list.py +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/OWSpeechToText.py +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/OWmailLoader.py +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/OWmailSender.py +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/OWwordpdf2docx.py +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/__init__.py +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/designer/__init__.py +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/designer/nogui.ui +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/designer/ow_file_ext_selector.ui +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/designer/owchatgpt.ui +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/designer/owdeepsearch.ui +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/designer/owdoclingasr.ui +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/designer/owdoclingtomarkdown.ui +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/designer/owdocxtoxlsx.ui +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/designer/owexportmarkdown.ui +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/designer/owinboxmailmonitoring.ui +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/designer/owmailloader.ui +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/designer/owmailsender.ui +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/designer/owmarkdownizer.ui +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/designer/owmarkdownloader.ui +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/designer/owmd2html.ui +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/designer/owofficenormalizer.ui +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/designer/owpdftype.ui +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/designer/owprocesspoolexecutor.ui +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/designer/owspeechtotext.ui +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/designer/owvisualizationer.ui +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/designer/wordpdf2docx.ui +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/icons/__init__.py +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/icons/chatgpt.png +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/icons/check_pdf.png +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/icons/deepsearch.svg +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/icons/dep_md_old.png +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/icons/download.png +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/icons/export_md.png +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/icons/extract_table.png +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/icons/file_extensor.png +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/icons/html.png +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/icons/list_aws.png +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/icons/load_md.png +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/icons/mail_loader.png +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/icons/mail_writer.png +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/icons/md.png +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/icons/monitor-email.svg +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/icons/office_normalizer.png +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/icons/owmd2html.svg +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/icons/process_pool_executor.png +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/icons/speech_to_text.png +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/icons/upload.png +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/icons/visualizationer.png +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/icons/websearch.png +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/icons/wordpdf2docx.png +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/icons_dev/__init__.py +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/__init__.py +0 -0
- {io4it-3.0.1.1 → io4it-3.0.1.2}/setup.cfg +0 -0
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"domain_context": {
|
|
3
3
|
"prix": ["cours", "cotation", "tarif", "valeur", "coût"],
|
|
4
|
-
"marché": ["négoce", "commerce", "trading", "bourse"],
|
|
4
|
+
"marché": ["négoce", "commerce", "trading", "bourse", "mondial", "international"],
|
|
5
5
|
"économie": ["économique", "fiscal", "finance", "budget"],
|
|
6
6
|
"bourse": ["action", "indice", "trading", "investissement"],
|
|
7
7
|
|
|
@@ -9,13 +9,23 @@
|
|
|
9
9
|
"agriculture": ["agricole", "exploitation", "récolte", "production"],
|
|
10
10
|
"élevage": ["bétail", "ferme", "cheptel"],
|
|
11
11
|
|
|
12
|
+
"produits mer": ["coquillage", "crustacé", "mollusque", "pêche", "aquaculture"],
|
|
13
|
+
"saint jacques": ["coquille", "pectinidé", "noix", "scallop"],
|
|
14
|
+
"coquille": ["bivalve", "mollusque", "pêche"],
|
|
15
|
+
|
|
16
|
+
"actualités": ["news", "information", "récent", "nouveau"],
|
|
17
|
+
"tendances": ["évolution", "dynamique", "orientation", "conjoncture"],
|
|
18
|
+
"perspectives": ["prévisions", "anticipations", "outlook", "horizon"],
|
|
19
|
+
"conjoncture": ["situation", "contexte", "analyse", "bilan"],
|
|
20
|
+
|
|
21
|
+
"scientifique": ["espèce", "taxonomie", "biologie", "distribution", "habitat"],
|
|
22
|
+
"pêche": ["fishery", "capture", "exploitation", "stock"],
|
|
23
|
+
|
|
12
24
|
"ia": ["intelligence artificielle", "machine learning", "deep learning"],
|
|
13
25
|
"technologie": ["tech", "digital", "numérique", "innovation"],
|
|
14
26
|
"logiciel": ["software", "application", "programme"],
|
|
15
27
|
|
|
16
|
-
"actualités": ["news", "info", "dernières informations", "récent"],
|
|
17
28
|
"article": ["publication", "presse", "média", "journal"],
|
|
18
|
-
|
|
19
29
|
"recherche": ["étude", "scientifique", "analyse", "rapport"],
|
|
20
30
|
"étude": ["recherche", "analyse", "données", "résultat"],
|
|
21
31
|
|
|
@@ -29,6 +39,7 @@
|
|
|
29
39
|
"stop_words": [
|
|
30
40
|
"le", "la", "les", "un", "une", "des", "du", "de", "et", "en",
|
|
31
41
|
"pour", "dans", "avec", "sur", "par", "ce", "cette", "ces",
|
|
32
|
-
"ou", "où", "qui", "que", "quoi", "dont", "quel"
|
|
42
|
+
"ou", "où", "qui", "que", "quoi", "dont", "quel",
|
|
43
|
+
"informations", "disponibles", "concernant", "relatif"
|
|
33
44
|
]
|
|
34
45
|
}
|
|
@@ -1,8 +1,8 @@
|
|
|
1
|
-
import os, time
|
|
1
|
+
import os, time, sys
|
|
2
2
|
from pathlib import Path
|
|
3
3
|
from concurrent.futures import as_completed
|
|
4
4
|
|
|
5
|
-
from AnyQt.QtWidgets import QLabel
|
|
5
|
+
from AnyQt.QtWidgets import QLabel, QApplication
|
|
6
6
|
from AnyQt.QtCore import pyqtSignal
|
|
7
7
|
from Orange.widgets import widget
|
|
8
8
|
from Orange.widgets.utils.signals import Input, Output
|
|
@@ -303,4 +303,13 @@ class OWDoclingToMarkdown(widget.OWWidget):
|
|
|
303
303
|
self.Outputs.data.send(None)
|
|
304
304
|
|
|
305
305
|
def handle_finish(self):
    """Close the widget's progress bar once the background work has finished."""
    self.progressBarFinished()
|
|
307
|
+
|
|
308
|
+
if __name__ == "__main__":
    # Stand-alone launch: show the widget in its own Qt application.
    app = QApplication(sys.argv)
    my_widget = OWDoclingToMarkdown()
    my_widget.show()
    # Use exec() when the binding provides it, otherwise fall back to exec_().
    run_event_loop = app.exec if hasattr(app, "exec") else app.exec_
    run_event_loop()
|
|
@@ -1,11 +1,11 @@
|
|
|
1
|
-
import os
|
|
1
|
+
import os, sys
|
|
2
2
|
from pathlib import Path
|
|
3
3
|
import numpy as np
|
|
4
4
|
|
|
5
5
|
from Orange.widgets import widget
|
|
6
6
|
from Orange.widgets.utils.signals import Input, Output
|
|
7
7
|
from Orange.data import Domain, StringVariable, Table
|
|
8
|
-
from AnyQt.QtWidgets import QCheckBox
|
|
8
|
+
from AnyQt.QtWidgets import QCheckBox, QApplication
|
|
9
9
|
|
|
10
10
|
try:
|
|
11
11
|
from Orange.widgets.orangecontrib.AAIT.utils.import_uic import uic
|
|
@@ -111,4 +111,13 @@ class OWMarkdownLoader(widget.OWWidget):
|
|
|
111
111
|
metas = np.array(md_rows, dtype=object) if md_rows else np.empty((0, 2), dtype=object)
|
|
112
112
|
md_table = Table.from_numpy(domain, X, metas=metas)
|
|
113
113
|
|
|
114
|
-
self.Outputs.md_files.send(md_table)
|
|
114
|
+
self.Outputs.md_files.send(md_table)
|
|
115
|
+
|
|
116
|
+
if __name__ == "__main__":
    # Stand-alone launch: show the widget in its own Qt application.
    app = QApplication(sys.argv)
    my_widget = OWMarkdownLoader()
    my_widget.show()
    # Use exec() when the binding provides it, otherwise fall back to exec_().
    run_event_loop = app.exec if hasattr(app, "exec") else app.exec_
    run_event_loop()
|
|
@@ -0,0 +1,532 @@
|
|
|
1
|
+
from datetime import datetime
|
|
2
|
+
from typing import List, Dict
|
|
3
|
+
from ddgs import DDGS
|
|
4
|
+
import os
|
|
5
|
+
import sys
|
|
6
|
+
import Orange
|
|
7
|
+
import re
|
|
8
|
+
from Orange.widgets.widget import Input, Output
|
|
9
|
+
from AnyQt.QtWidgets import QApplication, QPushButton, QLineEdit, QSpinBox, QDoubleSpinBox
|
|
10
|
+
import json
|
|
11
|
+
from Orange.widgets.settings import Setting
|
|
12
|
+
|
|
13
|
+
if "site-packages/Orange/widgets" in os.path.dirname(os.path.abspath(__file__)).replace("\\", "/"):
|
|
14
|
+
from Orange.widgets.orangecontrib.AAIT.utils import thread_management, base_widget
|
|
15
|
+
from Orange.widgets.orangecontrib.HLIT_dev.remote_server_smb import convert
|
|
16
|
+
else:
|
|
17
|
+
from orangecontrib.AAIT.utils import thread_management, base_widget
|
|
18
|
+
from orangecontrib.HLIT_dev.remote_server_smb import convert
|
|
19
|
+
|
|
20
|
+
class WebSearch(base_widget.BaseListWidget):
|
|
21
|
+
# Widget metadata.
name = "WebSearch"
description = "Search url website from a query with DDG."
# Default icon; a copy located under site-packages/Orange/widgets uses the
# "icons_dev" folder instead.
icon = "icons/websearch.png"
if "site-packages/Orange/widgets" in os.path.dirname(os.path.abspath(__file__)).replace("\\", "/"):
    icon = "icons_dev/websearch.png"
priority = 3000
want_control_area = False
category = "AAIT - TOOLBOX"
# Path of the Qt Designer layout, resolved next to this module.
# (The former placeholder assignment `gui = ""` was dead code: it was
# overwritten by this line before anything could read it.)
gui = os.path.join(os.path.dirname(os.path.abspath(__file__)), "designer/owwebsearch.ui")
# Settings (declared through Orange's Setting descriptor).
selected_column_name = Setting("content")  # column that holds the query text
region = Setting('fr-fr')                  # passed as `region` to DDGS.text
time_range = Setting('y')                  # passed as `timelimit` to DDGS.text
max_results = Setting(20)                  # upper bound on returned results
relevance_threshold = Setting(0.3)         # minimum score kept by filter_by_relevance
|
|
37
|
+
|
|
38
|
+
class Inputs:
    # Single input channel "Data": the Orange table whose selected column
    # provides the query text.
    data = Input("Data", Orange.data.Table)
|
|
40
|
+
|
|
41
|
+
@Inputs.data
def set_data(self, in_data):
    """Receive the input table, refresh the column selector and start a search.

    A ``None`` input clears the output and stops there. For any other
    input the search is started; the column selector is only refreshed
    when the table is truthy.
    """
    self.data = in_data
    if in_data is not None:
        if self.data:
            selector = self.var_selector
            selector.add_variables(self.data.domain)
            selector.select_variable_by_name(self.selected_column_name)
        self.run()
        return
    self.Outputs.data.send(None)
|
|
51
|
+
|
|
52
|
+
class Outputs:
    # Single output channel "Data": the table built from the search results
    # (or None when there is nothing to send).
    data = Output("Data", Orange.data.Table)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def __init__(self):
    """Fix the window size, wire the designer controls and load the JSON config."""
    super().__init__()
    # Qt management: fixed 500x600 window.
    self.setFixedWidth(500)
    self.setFixedHeight(600)

    def wire_line_edit(object_name, placeholder, initial_text):
        # Look up a QLineEdit from the .ui file, initialise it and make it
        # push its value back into the settings when editing is finished.
        line_edit = self.findChild(QLineEdit, object_name)
        line_edit.setPlaceholderText(placeholder)
        line_edit.setText(initial_text)
        line_edit.editingFinished.connect(self.update_parameters)
        return line_edit

    self.edit_region = wire_line_edit('boxRegion', "Region", self.region)
    self.edit_time_range = wire_line_edit('boxTimeRange', "Time Range", self.time_range)

    self.edit_max_results = self.bind_spinbox("boxMaxResults", self.max_results)
    self.edit_relevance_threshold = self.bind_spinbox(
        "boxRelevanceThreshold", self.relevance_threshold, is_double=True
    )

    run_button = self.findChild(QPushButton, 'pushButton_send')
    self.pushButton_run = run_button
    run_button.clicked.connect(self.run)

    # Loads domain_context / stop_words used by the query helpers.
    self.load_config()
|
|
78
|
+
|
|
79
|
+
def bind_spinbox(self, name, value, is_double=False):
    """Find a spin box by object name, set its value and hook it to the settings.

    Args:
        name: Qt object name of the spin box in the designer file.
        value: Initial value to display.
        is_double: Look for a QDoubleSpinBox instead of a QSpinBox.

    Returns:
        The spin box widget that was found.
    """
    if is_double:
        spin_class = QDoubleSpinBox
    else:
        spin_class = QSpinBox
    spin = self.findChild(spin_class, name)
    spin.setValue(value)
    spin.editingFinished.connect(self.update_parameters)
    return spin
|
|
85
|
+
|
|
86
|
+
def update_parameters(self):
    """Copy the current values of the four controls into the widget settings."""
    result_limit = self.edit_max_results.value()
    self.max_results = result_limit
    threshold = self.edit_relevance_threshold.value()
    self.relevance_threshold = threshold
    period = self.edit_time_range.text()
    self.time_range = period
    locale_code = self.edit_region.text()
    self.region = locale_code
|
|
91
|
+
|
|
92
|
+
def load_config(self):
    """Read ``../utils/config.json`` (relative to this module) into the widget.

    Sets ``self.domain_context`` to the "domain_context" mapping and
    ``self.stop_words`` to a set built from the "stop_words" list.
    """
    module_dir = os.path.dirname(os.path.abspath(__file__))
    with open(os.path.join(module_dir, "../utils/config.json"), "r", encoding="utf-8") as handle:
        settings = json.load(handle)
        self.domain_context = settings["domain_context"]
        self.stop_words = set(settings["stop_words"])
|
|
98
|
+
|
|
99
|
+
def detect_domain(self, query: str):
    """Return the ``domain_context`` keys that occur in the query.

    Keys are matched as whole words or phrases, case-insensitively. The
    previous plain substring test made short keys such as "ia" fire inside
    unrelated words ("mondial", "commercial"), which then injected
    off-topic context terms into the search queries.

    A trailing plural "s"/"x" is tolerated, and the words of a multi-word
    key may be separated by whitespace or hyphens ("saint jacques" also
    matches "Saint-Jacques").

    Args:
        query: Free-text user query.

    Returns:
        The matching keys, in the iteration order of ``self.domain_context``.
    """
    query_lower = query.lower()
    detected = []

    for domain_key in self.domain_context:
        key_body = r"[\s\-]+".join(
            re.escape(part) for part in domain_key.lower().split()
        )
        # (?<!\w) / (?!\w) act as word boundaries that also behave correctly
        # next to apostrophes ("l'ia") and accented letters.
        if re.search(rf"(?<!\w){key_body}(?:s|x)?(?!\w)", query_lower):
            detected.append(domain_key)

    return detected
|
|
109
|
+
|
|
110
|
+
def get_contextual_terms(self, query: str):
    """Collect context terms for every domain detected in the query.

    For each key returned by ``detect_domain`` the first three terms of its
    ``domain_context`` entry are appended, in detection order. Returns an
    empty list when no domain is detected.
    """
    collected = []
    for key in self.detect_domain(query):
        collected += self.domain_context.get(key, [])[:3]
    return collected
|
|
123
|
+
|
|
124
|
+
def optimize_query(self, query: str):
    """Generate up to six de-duplicated search-query variations.

    The query is cleaned first, then variations are produced in this order:

    1. each detected scientific name in quotes, alone and with context;
    2. important words followed by the temporal expressions (only when no
       scientific name was found);
    3. the first key phrase in quotes (only when neither a scientific name
       nor a temporal expression was found);
    4. an ultra-simplified query made of the first important words;
    5. important words enriched with domain context terms;
    6. the cleaned query itself.

    Variations longer than ten words are dropped.

    Fixes over the previous version:
    * the words of a scientific name are excluded case-insensitively (the
      name is returned capitalised, so the old case-sensitive test never
      removed them and produced '"Pecten maximus" pecten ...');
    * words already contained in a temporal expression are no longer
      repeated before that expression in the scientific-name and enriched
      variations ('... 2024 2024').

    Args:
        query: Raw user query.

    Returns:
        A list of at most six distinct query strings.
    """
    query = self.clean_query(query)
    variations = []

    # Detect scientific names, dates and key phrases.
    scientific_names = self.detect_scientific_name(query)
    temporal_exprs = self.detect_temporal_expressions(query)
    key_phrases = self.extract_key_phrases(query)
    temporal_suffix = ' '.join(temporal_exprs)

    important_words = [
        w for w in query.split()
        if len(w) > 3 and w.lower() not in self.stop_words
    ]
    # Important words that are not already part of a temporal expression.
    non_temporal_words = [
        w for w in important_words
        if not any(w.lower() in expr.lower() for expr in temporal_exprs)
    ]

    # Scientific names in quotes.
    for sci_name in scientific_names:
        variations.append(f'"{sci_name}"')
        name_parts = set(sci_name.lower().split())
        other_words = [w for w in non_temporal_words if w.lower() not in name_parts]
        context = other_words[:3] + temporal_exprs
        if context:
            variations.append(f'"{sci_name}" {" ".join(context)}')

    # With temporal expressions.
    if temporal_exprs and not scientific_names and non_temporal_words:
        variations.append(f"{' '.join(non_temporal_words[:3])} {temporal_suffix}")

        if key_phrases:
            main_phrase = key_phrases[0]
            contains_temporal = any(temp in main_phrase for temp in temporal_exprs)
            if not contains_temporal and len(main_phrase.split()) >= 2:
                variations.append(f'"{main_phrase}" {temporal_suffix}')

    # Key phrase in quotes.
    if key_phrases and not scientific_names and not temporal_exprs:
        main_phrase = key_phrases[0]
        if len(main_phrase.split()) >= 2:
            variations.append(f'"{main_phrase}"')
            if len(key_phrases) > 1:
                variations.append(f'"{main_phrase}" {key_phrases[1]}')

    # Ultra-simplified.
    if len(important_words) >= 2:
        simplified = ' '.join(important_words[:4])
        for temp_expr in temporal_exprs:
            if temp_expr.lower() not in simplified.lower():
                simplified = f"{simplified} {temp_expr}"
        variations.append(simplified)
    elif len(important_words) == 1 and temporal_exprs:
        variations.append(f"{important_words[0]} {temporal_suffix}")

    # Contextual enrichment.
    context_terms = self.get_contextual_terms(query)
    if context_terms and important_words:
        enriched_parts = non_temporal_words[:3] + context_terms[:2] + temporal_exprs
        variations.append(' '.join(enriched_parts))

    variations.append(query)

    # De-duplicate while keeping the priority order.
    seen = set()
    unique_variations = []
    for candidate in variations:
        cleaned = candidate.strip()
        if cleaned and cleaned not in seen and len(cleaned.split()) <= 10:
            seen.add(cleaned)
            unique_variations.append(cleaned)

    return unique_variations[:6]
|
|
211
|
+
|
|
212
|
+
def calculate_relevance(self, query: str, title: str, snippet: str):
    """Score how well a result's title and snippet match the query.

    Every query word longer than three characters that is not a stop word
    contributes: 0.6 when found in the title, 0.4 when found in the
    snippet, otherwise 0.5 / 0.3 when it is found after accent
    normalisation in the title / snippet. The sum is divided by the number
    of query words and capped at 1.0.

    Changes over the previous version: the normalised title and snippet are
    computed at most once instead of once per unmatched word, and a
    ``None`` title or snippet is treated as empty text instead of raising.

    Args:
        query: User query.
        title: Result title (may be ``None``).
        snippet: Result body text (may be ``None``).

    Returns:
        A float in [0, 1]; 0.5 when the query has no usable word.
    """
    title_lower = (title or "").lower()
    snippet_lower = (snippet or "").lower()

    query_words = [
        w for w in query.lower().split()
        if len(w) > 3 and w not in self.stop_words
    ]
    if not query_words:
        return 0.5

    # Normalised texts are only needed for words without an exact match,
    # so they are built lazily and reused.
    title_norm = None
    snippet_norm = None

    score = 0.0
    for word in query_words:
        if word in title_lower:
            score += 0.6
        elif word in snippet_lower:
            score += 0.4
        else:
            if title_norm is None:
                title_norm = self.normalize_text(title_lower)
                snippet_norm = self.normalize_text(snippet_lower)
            word_norm = self.normalize_text(word)
            if word_norm in title_norm:
                score += 0.5
            elif word_norm in snippet_norm:
                score += 0.3

    return min(score / len(query_words), 1.0)
|
|
245
|
+
|
|
246
|
+
def normalize_text(self, text: str):
    """Return the text lower-cased with diacritics removed.

    The text is decomposed with Unicode NFD and the combining marks are
    dropped. This gives the same result as the former hand-written table
    for é è ê ë à â ä î ï ô ö ù û ü ç ñ and also covers accented letters
    the table missed (ó, í, á, ...).
    """
    import unicodedata  # local import: only this helper needs it

    decomposed = unicodedata.normalize("NFD", text.lower())
    # Category "Mn" (nonspacing mark) is what NFD splits off accented letters.
    return "".join(ch for ch in decomposed if unicodedata.category(ch) != "Mn")
|
|
262
|
+
|
|
263
|
+
def filter_by_relevance(self, results: List[Dict], query: str):
    """Score results, penalise old content and keep those above the threshold.

    Each result dict gets a ``'relevance_score'`` key (the dicts are
    modified in place). The score comes from ``calculate_relevance`` on the
    title and the body (or snippet). When a year of the form 20xx appears
    in that text, the most recent one is compared with the current year:
    more than two years old multiplies the score by 0.3, more than one
    year old by 0.7.

    Changes over the previous version: the unused ``timedelta`` import and
    the redundant local ``datetime`` import are gone, title and snippet are
    read once per result, and ``None`` or missing fields are treated as
    empty text instead of raising.

    Args:
        results: Result dicts with 'title' and 'body' (or 'snippet') keys.
        query: Query used for scoring.

    Returns:
        The results whose score is at least ``self.relevance_threshold``,
        sorted by decreasing score.
    """
    current_year = datetime.now().year
    scored_results = []

    for result in results:
        title = result.get('title') or ''
        snippet = result.get('body') or result.get('snippet') or ''
        score = self.calculate_relevance(query, title, snippet)

        # Penalise old content, based on the latest year mentioned.
        years_found = re.findall(r'\b(20\d{2})\b', title + ' ' + snippet)
        if years_found:
            year_diff = current_year - max(int(y) for y in years_found)
            if year_diff > 2:
                score *= 0.3
            elif year_diff > 1:
                score *= 0.7

        result['relevance_score'] = score
        scored_results.append(result)

    filtered = [r for r in scored_results if r['relevance_score'] >= self.relevance_threshold]
    filtered.sort(key=lambda item: item['relevance_score'], reverse=True)
    return filtered
|
|
300
|
+
|
|
301
|
+
def detect_scientific_name(self, query: str):
    """Detect binomial (Latin-looking) scientific names in the query.

    The query is scanned for pairs of adjacent words made of at least four
    ASCII letters. A pair is kept when neither word is an excluded everyday
    word, the first word is not a context word, and at least one of the
    two words has a Latin-looking ending.

    Fixes over the previous version:
    * a rejected pair no longer swallows its second word. In
      "prix pecten maximus" the pair (prix, pecten) used to consume
      "pecten", so "pecten maximus" was never examined. The scan now
      restarts at the second word of a rejected pair;
    * the duplicated suffix 'ina' and the redundant "both words are context
      words" test were removed (behaviour unchanged).

    Args:
        query: Cleaned user query.

    Returns:
        Distinct names formatted as "Genus species", in order of appearance.
    """
    excluded_words = {'prix', 'cours', 'marché', 'janvier', 'février', 'mars', 'avril', 'mai', 'juin', 'juillet',
                      'août', 'septembre', 'octobre', 'novembre', 'décembre', 'année', 'mois', 'jour', 'monde',
                      'france', 'europe', 'production', 'commerce', 'export', 'import', 'aquaculture',
                      'distribution', 'habitat', 'recherche', 'étude', 'analyse', 'rapport'}
    context_words = {'prix', 'cours', 'marché', 'production', 'recherche', 'étude', 'analyse', 'rapport', 'habitat',
                     'aquaculture', 'distribution', 'ecology', 'biology', 'genetic', 'fishery', 'cultivation',
                     'harvest', 'spawning'}
    latin_suffixes = ('us', 'a', 'is', 'um', 'ae', 'i', 'ica', 'ensis', 'anus', 'ina', 'ella')
    pair_pattern = re.compile(r'\b([a-zA-Z]{4,})\s+([a-zA-Z]{4,})\b')

    text = query.lower()
    scientific_names = []
    position = 0
    while True:
        match = pair_pattern.search(text, position)
        if match is None:
            break
        word1, word2 = match.groups()

        accepted = (
            word1 not in excluded_words
            and word2 not in excluded_words
            and word1 not in context_words
            and (word2.endswith(latin_suffixes) or word1.endswith(latin_suffixes))
        )
        if not accepted:
            # Retry with the second word as the first word of the next pair.
            position = match.start(2)
            continue

        normalized = f"{word1.capitalize()} {word2.lower()}"
        if normalized not in scientific_names:
            scientific_names.append(normalized)
        position = match.end()

    return scientific_names
|
|
335
|
+
|
|
336
|
+
def detect_temporal_expressions(self, query: str):
    """Detect temporal expressions (dates, quarters, years, relative periods).

    Patterns are tried in priority order. Text matched by one pattern is
    blanked out so that a later, more generic pattern cannot match it again
    (the year inside "janvier 2024" is not reported separately).

    Fixes over the previous version:
    * the relative pattern was ``(cette|l\\'|cette)\\s+(année|...)``. The
      duplicated alternative was useless and the mandatory whitespace
      after ``l'`` meant "l'année" could never match. It now accepts
      "cette ...", "ce ..." and "l'...";
    * an expression that occurs several times is returned once.

    Args:
        query: User query.

    Returns:
        The matched expressions as written in the query, ordered by pattern
        priority and then by position.
    """
    temporal_patterns = [
        r'\b(janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre)\s+\d{4}\b',
        r'\b\d{1,2}[/-]\d{1,2}[/-]\d{4}\b',
        r'\b[QT][1-4]\s+\d{4}\b',
        r'\b(S[12]|premier|second|1er|2ème)\s+(semestre|trimestre)\s+\d{4}\b',
        r'\b(20\d{2})\b',
        r"\b(?:(?:cette|ce)\s+|l')(année|semaine|mois)\b",
        r'\b(dernier|dernière|prochain|prochaine)\s+(année|semaine|mois|trimestre)\b',
    ]

    found = []
    seen = set()
    remaining_query = query

    for pattern in temporal_patterns:
        # Materialise the matches first: the text is rewritten in the loop.
        for match in list(re.finditer(pattern, remaining_query, re.IGNORECASE)):
            start, end = match.span()
            expr = match.group()
            # Same-length blanking keeps the other match positions valid.
            remaining_query = remaining_query[:start] + ' ' * (end - start) + remaining_query[end:]
            if expr.lower() not in seen:
                seen.add(expr.lower())
                found.append(expr)

    return found
|
|
359
|
+
|
|
360
|
+
def extract_key_phrases(self, query: str):
    """Group the important words of the query into key phrases.

    Temporal expressions are first replaced by single placeholder tokens so
    that each one counts as one word. A word is an anchor when it is a
    placeholder, or is longer than three characters and not a stop word.
    Anchors at most two positions apart are merged into the same phrase;
    the words between them are kept unless they are stop words (a stop word
    is still kept while the phrase holds a single word). Placeholders are
    turned back into the original expressions in the returned phrases.

    Returns:
        The phrases in order of appearance, or an empty list when the query
        has no anchor word.
    """
    marker = "__TEMPORAL_"
    placeholders = {}
    masked = query
    for position, expression in enumerate(self.detect_temporal_expressions(query)):
        placeholder = f"__TEMPORAL_{position}__"
        placeholders[placeholder] = expression
        masked = masked.replace(expression, placeholder)

    tokens = masked.split()

    def is_anchor(tok):
        # Placeholders always count; other words need length and meaning.
        if tok.startswith(marker):
            return True
        return len(tok) > 3 and tok.lower() not in self.stop_words

    anchors = [pos for pos, tok in enumerate(tokens) if is_anchor(tok)]
    if not anchors:
        return []

    def render(parts):
        # Join the words and restore the temporal expressions.
        text = ' '.join(parts)
        for placeholder, expression in placeholders.items():
            text = text.replace(placeholder, expression)
        return text

    phrases = []
    previous = anchors[0]
    group = [tokens[previous]]

    for current in anchors[1:]:
        if current - previous > 2:
            # Gap too wide: close the running phrase and start a new one.
            phrases.append(render(group))
            group = [tokens[current]]
        else:
            for tok in tokens[previous + 1:current + 1]:
                keep = (
                    tok.lower() not in self.stop_words
                    or len(group) == 1
                    or tok.startswith(marker)
                )
                if keep:
                    group.append(tok)
        previous = current

    phrases.append(render(group))
    return phrases
|
|
409
|
+
|
|
410
|
+
def clean_query(self, query: str):
    """Strip generic conversational prefixes from the query.

    Prefixes such as "je cherche", "peux-tu", "trouver", "donne-moi" or
    "les informations sur" are removed from the start of the query,
    case-insensitively.

    Fix over the previous version: each pattern used to be applied once, in
    a fixed order, so stacked prefixes were only partly removed ("Je
    cherche les informations sur X" kept "les informations sur X" because
    the "informations" pattern had already been tried). The patterns are
    now re-applied until the query stops changing.

    Args:
        query: Raw user query.

    Returns:
        The query without leading generic prefixes, stripped of surrounding
        whitespace.
    """
    query = query.strip()

    generic_prefixes = [
        r'^(les?\s+)?informations?\s+(disponibles?\s+)?(sur|concernant|relatif|au sujet)\s+',
        r'^(je\s+)?(cherche|recherche|veux|voudrais|souhaite)\s+',
        r'^(peux-tu|pouvez-vous|trouve|trouver)\s+',
        r'^(donne-moi|donnez-moi)\s+',
    ]

    # Every substitution shortens the query, so this loop always terminates.
    while True:
        stripped = query
        for pattern in generic_prefixes:
            stripped = re.sub(pattern, '', stripped, flags=re.IGNORECASE)
        if stripped == query:
            break
        query = stripped

    return query.strip()
|
|
425
|
+
|
|
426
|
+
def search(self, use_optimization: bool = True):
    """Run the DuckDuckGo search for ``self.query`` and return ranked results.

    Each query variation (or only the raw query when ``use_optimization``
    is false) is sent to ``DDGS.text`` with the widget's region and time
    range. The hits are scored against the original query with
    ``filter_by_relevance`` and de-duplicated by URL. The loop stops as
    soon as ``self.max_results`` results have been collected.

    A variation that raises is reported with ``print`` and skipped
    (best-effort behaviour kept from the previous version).

    Changes over the previous version: the unused ``idx`` and ``new_count``
    locals are gone. A hit without an ``'href'`` is now skipped and a
    missing ``'title'`` becomes an empty string. Previously a single hit
    missing either key raised ``KeyError`` and discarded the rest of that
    variation.

    Args:
        use_optimization: Use ``optimize_query`` variations when true.

    Returns:
        A list of at most ``self.max_results`` dicts with the keys
        url, title, snippet, source, query, query_variation,
        relevance_score, fetched_at and rank.
    """
    if use_optimization:
        queries_to_try = self.optimize_query(self.query)
    else:
        queries_to_try = [self.query]

    all_results = []
    seen_urls = set()

    for q in queries_to_try:
        if len(all_results) >= self.max_results:
            break

        try:
            with DDGS() as ddgs:
                search_results = list(ddgs.text(
                    q,
                    region=self.region,
                    safesearch='off',
                    timelimit=self.time_range,
                    # Over-fetch: relevance filtering discards part of the hits.
                    max_results=min(50, self.max_results * 3)
                ))

            for r in self.filter_by_relevance(search_results, self.query):
                url = r.get('href')
                if not url or url in seen_urls:
                    continue
                seen_urls.add(url)

                all_results.append({
                    'url': url,
                    'title': r.get('title', ''),
                    'snippet': r.get('body', ''),
                    'source': 'DuckDuckGo',
                    'query': self.query,
                    'query_variation': q,
                    'relevance_score': r['relevance_score'],
                    'fetched_at': datetime.now().isoformat(),
                    'rank': len(all_results) + 1
                })

                if len(all_results) >= self.max_results:
                    break

        except Exception as e:
            # Best effort: a failing variation must not abort the others.
            print(e)
            continue

    return all_results[:self.max_results]
|
|
479
|
+
|
|
480
|
+
def run(self):
    """Validate the input and start the search in a worker thread.

    The query is the first value of the selected column. Nothing is started
    when there is no input table, when the selected column is missing, or
    when the table has no row.

    Changes over the previous version: an empty table now yields a warning
    and a ``None`` output instead of an ``IndexError``, the query is
    coerced to ``str`` so the text helpers can always process it, and the
    membership test uses ``not in``.
    """
    self.error("")
    self.warning("")
    if self.data is None:
        self.Outputs.data.send(None)
        return

    if self.selected_column_name not in self.data.domain:
        self.warning(f'Previously selected column "{self.selected_column_name}" does not exist in your data.')
        return

    column = self.data.get_column(self.selected_column_name)
    if len(column) == 0:
        self.warning("The input table is empty: there is no query to search for.")
        self.Outputs.data.send(None)
        return

    self.query = str(column[0])

    self.progressBarInit()
    # NOTE(review): a thread started by a previous run is not stopped before
    # being replaced here - confirm thread_management.Thread tolerates this.
    self.thread = thread_management.Thread(self.search)
    self.thread.progress.connect(self.handle_progress)
    self.thread.result.connect(self.handle_result)
    self.thread.finish.connect(self.handle_finish)
    self.thread.start()
|
|
499
|
+
|
|
500
|
+
def handle_progress(self, progress) -> None:
    """Update the progress bar and the text browser from a progress message.

    ``progress`` is indexed as ``(value, text)``. A non-``None`` value is
    forwarded to the progress bar. A ``None`` text clears the text browser,
    any other text is inserted into it.
    """
    value, text = progress[0], progress[1]
    if value is not None:
        self.progressBarSet(value)
    if text is not None:
        self.textBrowser.insertPlainText(text)
    else:
        self.textBrowser.setText("")
|
|
509
|
+
|
|
510
|
+
def handle_result(self, result):
    """Send the search results as a table, or ``None`` when there are none."""
    has_rows = result is not None and len(result) != 0
    if has_rows:
        table = convert.convert_json_implicite_to_data_table(result)
        self.Outputs.data.send(table)
    else:
        self.Outputs.data.send(None)
|
|
516
|
+
|
|
517
|
+
def handle_finish(self):
    """Close the progress bar once the search thread has finished."""
    self.progressBarFinished()
|
|
519
|
+
|
|
520
|
+
def post_initialized(self):
    """Do nothing; this widget has no post-initialisation work."""
    return None
|
|
522
|
+
|
|
523
|
+
|
|
524
|
+
if __name__ == "__main__":
    # Stand-alone launch: show the widget in its own Qt application and
    # exit the process with the event loop's return code.
    app = QApplication(sys.argv)
    my_widget = WebSearch()
    my_widget.show()

    # Use exec() when the binding provides it, otherwise fall back to exec_().
    run_event_loop = app.exec if hasattr(app, "exec") else app.exec_
    sys.exit(run_event_loop())
|
|
@@ -30,12 +30,12 @@
|
|
|
30
30
|
<rect>
|
|
31
31
|
<x>10</x>
|
|
32
32
|
<y>10</y>
|
|
33
|
-
<width>
|
|
33
|
+
<width>391</width>
|
|
34
34
|
<height>51</height>
|
|
35
35
|
</rect>
|
|
36
36
|
</property>
|
|
37
37
|
<property name="text">
|
|
38
|
-
<string>This widget takes a list of url and return the
|
|
38
|
+
<string>This widget takes a list of URLs from a column "url" and returns the content of each web site.</string>
|
|
39
39
|
</property>
|
|
40
40
|
<property name="textFormat">
|
|
41
41
|
<enum>Qt::AutoText</enum>
|