io4it 3.0.1.1__tar.gz → 3.0.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93) hide show
  1. {io4it-3.0.1.1 → io4it-3.0.1.2}/PKG-INFO +1 -1
  2. {io4it-3.0.1.1 → io4it-3.0.1.2}/io4it.egg-info/PKG-INFO +1 -1
  3. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/utils/config.json +15 -4
  4. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/OWDoclingToMarkdown.py +12 -3
  5. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/OWMarkdownLoader.py +12 -3
  6. io4it-3.0.1.2/orangecontrib/IO4IT/widgets/OWWebSearch.py +532 -0
  7. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/designer/owparserhtml.ui +2 -2
  8. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/designer/owwebsearch.ui +58 -6
  9. {io4it-3.0.1.1 → io4it-3.0.1.2}/setup.py +1 -1
  10. io4it-3.0.1.1/orangecontrib/IO4IT/widgets/OWWebSearch.py +0 -313
  11. {io4it-3.0.1.1 → io4it-3.0.1.2}/io4it.egg-info/SOURCES.txt +0 -0
  12. {io4it-3.0.1.1 → io4it-3.0.1.2}/io4it.egg-info/dependency_links.txt +0 -0
  13. {io4it-3.0.1.1 → io4it-3.0.1.2}/io4it.egg-info/entry_points.txt +0 -0
  14. {io4it-3.0.1.1 → io4it-3.0.1.2}/io4it.egg-info/namespace_packages.txt +0 -0
  15. {io4it-3.0.1.1 → io4it-3.0.1.2}/io4it.egg-info/requires.txt +0 -0
  16. {io4it-3.0.1.1 → io4it-3.0.1.2}/io4it.egg-info/top_level.txt +0 -0
  17. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/__init__.py +0 -0
  18. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/ocr_function/__init__.py +0 -0
  19. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/ocr_function/word_converter.py +0 -0
  20. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/utils/__init__.py +0 -0
  21. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/utils/mail.py +0 -0
  22. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/utils/offuscation_basique.py +0 -0
  23. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/utils/pool_exec_utils.py +0 -0
  24. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/utils/secret_manager.py +0 -0
  25. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/utils/utils_md.py +0 -0
  26. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/OWChatGpt.py +0 -0
  27. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/OWDeep_Search.py +0 -0
  28. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/OWDoclingASR.py +0 -0
  29. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/OWExportMarkdown.py +0 -0
  30. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/OWExtractTablesDocxToXlsx.py +0 -0
  31. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/OWInboxMailMonitoring.py +0 -0
  32. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/OWMD2HTML.py +0 -0
  33. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/OWMarkdownizer.py +0 -0
  34. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/OWOfficeNormalizer.py +0 -0
  35. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/OWParserHTML.py +0 -0
  36. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/OWPdfType.py +0 -0
  37. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/OWProcessPoolExecutor.py +0 -0
  38. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/OWS3Uploader.py +0 -0
  39. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/OWS3downloader.py +0 -0
  40. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/OWS3list.py +0 -0
  41. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/OWSpeechToText.py +0 -0
  42. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/OWmailLoader.py +0 -0
  43. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/OWmailSender.py +0 -0
  44. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/OWwordpdf2docx.py +0 -0
  45. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/__init__.py +0 -0
  46. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/designer/__init__.py +0 -0
  47. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/designer/nogui.ui +0 -0
  48. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/designer/ow_file_ext_selector.ui +0 -0
  49. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/designer/owchatgpt.ui +0 -0
  50. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/designer/owdeepsearch.ui +0 -0
  51. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/designer/owdoclingasr.ui +0 -0
  52. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/designer/owdoclingtomarkdown.ui +0 -0
  53. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/designer/owdocxtoxlsx.ui +0 -0
  54. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/designer/owexportmarkdown.ui +0 -0
  55. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/designer/owinboxmailmonitoring.ui +0 -0
  56. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/designer/owmailloader.ui +0 -0
  57. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/designer/owmailsender.ui +0 -0
  58. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/designer/owmarkdownizer.ui +0 -0
  59. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/designer/owmarkdownloader.ui +0 -0
  60. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/designer/owmd2html.ui +0 -0
  61. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/designer/owofficenormalizer.ui +0 -0
  62. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/designer/owpdftype.ui +0 -0
  63. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/designer/owprocesspoolexecutor.ui +0 -0
  64. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/designer/owspeechtotext.ui +0 -0
  65. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/designer/owvisualizationer.ui +0 -0
  66. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/designer/wordpdf2docx.ui +0 -0
  67. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/icons/__init__.py +0 -0
  68. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/icons/chatgpt.png +0 -0
  69. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/icons/check_pdf.png +0 -0
  70. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/icons/deepsearch.svg +0 -0
  71. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/icons/dep_md_old.png +0 -0
  72. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/icons/download.png +0 -0
  73. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/icons/export_md.png +0 -0
  74. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/icons/extract_table.png +0 -0
  75. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/icons/file_extensor.png +0 -0
  76. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/icons/html.png +0 -0
  77. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/icons/list_aws.png +0 -0
  78. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/icons/load_md.png +0 -0
  79. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/icons/mail_loader.png +0 -0
  80. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/icons/mail_writer.png +0 -0
  81. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/icons/md.png +0 -0
  82. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/icons/monitor-email.svg +0 -0
  83. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/icons/office_normalizer.png +0 -0
  84. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/icons/owmd2html.svg +0 -0
  85. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/icons/process_pool_executor.png +0 -0
  86. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/icons/speech_to_text.png +0 -0
  87. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/icons/upload.png +0 -0
  88. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/icons/visualizationer.png +0 -0
  89. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/icons/websearch.png +0 -0
  90. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/icons/wordpdf2docx.png +0 -0
  91. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/IO4IT/widgets/icons_dev/__init__.py +0 -0
  92. {io4it-3.0.1.1 → io4it-3.0.1.2}/orangecontrib/__init__.py +0 -0
  93. {io4it-3.0.1.1 → io4it-3.0.1.2}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: io4it
3
- Version: 3.0.1.1
3
+ Version: 3.0.1.2
4
4
  Home-page:
5
5
  Author:
6
6
  Author-email:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: io4it
3
- Version: 3.0.1.1
3
+ Version: 3.0.1.2
4
4
  Home-page:
5
5
  Author:
6
6
  Author-email:
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "domain_context": {
3
3
  "prix": ["cours", "cotation", "tarif", "valeur", "coût"],
4
- "marché": ["négoce", "commerce", "trading", "bourse"],
4
+ "marché": ["négoce", "commerce", "trading", "bourse", "mondial", "international"],
5
5
  "économie": ["économique", "fiscal", "finance", "budget"],
6
6
  "bourse": ["action", "indice", "trading", "investissement"],
7
7
 
@@ -9,13 +9,23 @@
9
9
  "agriculture": ["agricole", "exploitation", "récolte", "production"],
10
10
  "élevage": ["bétail", "ferme", "cheptel"],
11
11
 
12
+ "produits mer": ["coquillage", "crustacé", "mollusque", "pêche", "aquaculture"],
13
+ "saint jacques": ["coquille", "pectinidé", "noix", "scallop"],
14
+ "coquille": ["bivalve", "mollusque", "pêche"],
15
+
16
+ "actualités": ["news", "information", "récent", "nouveau"],
17
+ "tendances": ["évolution", "dynamique", "orientation", "conjoncture"],
18
+ "perspectives": ["prévisions", "anticipations", "outlook", "horizon"],
19
+ "conjoncture": ["situation", "contexte", "analyse", "bilan"],
20
+
21
+ "scientifique": ["espèce", "taxonomie", "biologie", "distribution", "habitat"],
22
+ "pêche": ["fishery", "capture", "exploitation", "stock"],
23
+
12
24
  "ia": ["intelligence artificielle", "machine learning", "deep learning"],
13
25
  "technologie": ["tech", "digital", "numérique", "innovation"],
14
26
  "logiciel": ["software", "application", "programme"],
15
27
 
16
- "actualités": ["news", "info", "dernières informations", "récent"],
17
28
  "article": ["publication", "presse", "média", "journal"],
18
-
19
29
  "recherche": ["étude", "scientifique", "analyse", "rapport"],
20
30
  "étude": ["recherche", "analyse", "données", "résultat"],
21
31
 
@@ -29,6 +39,7 @@
29
39
  "stop_words": [
30
40
  "le", "la", "les", "un", "une", "des", "du", "de", "et", "en",
31
41
  "pour", "dans", "avec", "sur", "par", "ce", "cette", "ces",
32
- "ou", "où", "qui", "que", "quoi", "dont", "quel"
42
+ "ou", "où", "qui", "que", "quoi", "dont", "quel",
43
+ "informations", "disponibles", "concernant", "relatif"
33
44
  ]
34
45
  }
@@ -1,8 +1,8 @@
1
- import os, time
1
+ import os, time, sys
2
2
  from pathlib import Path
3
3
  from concurrent.futures import as_completed
4
4
 
5
- from AnyQt.QtWidgets import QLabel
5
+ from AnyQt.QtWidgets import QLabel, QApplication
6
6
  from AnyQt.QtCore import pyqtSignal
7
7
  from Orange.widgets import widget
8
8
  from Orange.widgets.utils.signals import Input, Output
@@ -303,4 +303,13 @@ class OWDoclingToMarkdown(widget.OWWidget):
303
303
  self.Outputs.data.send(None)
304
304
 
305
305
  def handle_finish(self):
306
- self.progressBarFinished()
306
+ self.progressBarFinished()
307
+
308
if __name__ == "__main__":
    # Manual smoke test: show the widget on its own, outside the Orange canvas.
    app = QApplication(sys.argv)
    my_widget = OWDoclingToMarkdown()
    my_widget.show()
    # The event-loop entry point is called exec() on some Qt bindings and
    # exec_() on others; use whichever this QApplication offers.
    start_event_loop = app.exec if hasattr(app, "exec") else app.exec_
    start_event_loop()
@@ -1,11 +1,11 @@
1
- import os
1
+ import os, sys
2
2
  from pathlib import Path
3
3
  import numpy as np
4
4
 
5
5
  from Orange.widgets import widget
6
6
  from Orange.widgets.utils.signals import Input, Output
7
7
  from Orange.data import Domain, StringVariable, Table
8
- from AnyQt.QtWidgets import QCheckBox
8
+ from AnyQt.QtWidgets import QCheckBox, QApplication
9
9
 
10
10
  try:
11
11
  from Orange.widgets.orangecontrib.AAIT.utils.import_uic import uic
@@ -111,4 +111,13 @@ class OWMarkdownLoader(widget.OWWidget):
111
111
  metas = np.array(md_rows, dtype=object) if md_rows else np.empty((0, 2), dtype=object)
112
112
  md_table = Table.from_numpy(domain, X, metas=metas)
113
113
 
114
- self.Outputs.md_files.send(md_table)
114
+ self.Outputs.md_files.send(md_table)
115
+
116
if __name__ == "__main__":
    # Manual smoke test: show the widget on its own, outside the Orange canvas.
    app = QApplication(sys.argv)
    my_widget = OWMarkdownLoader()
    my_widget.show()
    # The event-loop entry point is called exec() on some Qt bindings and
    # exec_() on others; use whichever this QApplication offers.
    start_event_loop = app.exec if hasattr(app, "exec") else app.exec_
    start_event_loop()
@@ -0,0 +1,532 @@
1
+ from datetime import datetime
2
+ from typing import List, Dict
3
+ from ddgs import DDGS
4
+ import os
5
+ import sys
6
+ import Orange
7
+ import re
8
+ from Orange.widgets.widget import Input, Output
9
+ from AnyQt.QtWidgets import QApplication, QPushButton, QLineEdit, QSpinBox, QDoubleSpinBox
10
+ import json
11
+ from Orange.widgets.settings import Setting
12
+
13
+ if "site-packages/Orange/widgets" in os.path.dirname(os.path.abspath(__file__)).replace("\\", "/"):
14
+ from Orange.widgets.orangecontrib.AAIT.utils import thread_management, base_widget
15
+ from Orange.widgets.orangecontrib.HLIT_dev.remote_server_smb import convert
16
+ else:
17
+ from orangecontrib.AAIT.utils import thread_management, base_widget
18
+ from orangecontrib.HLIT_dev.remote_server_smb import convert
19
+
20
class WebSearch(base_widget.BaseListWidget):
    """Widget that runs a DuckDuckGo (``ddgs``) text search.

    The query is read from the first row of a selected column of the input
    table; the hits are sent on the ``Data`` output.
    """

    # --- Widget metadata ---------------------------------------------------
    name = "WebSearch"
    description = "Search url website from a query with DDG."
    category = "AAIT - TOOLBOX"
    priority = 3000
    want_control_area = False
    # A different icon folder is used when this file lives under
    # site-packages/Orange/widgets.
    icon = (
        "icons_dev/websearch.png"
        if "site-packages/Orange/widgets" in os.path.dirname(os.path.abspath(__file__)).replace("\\", "/")
        else "icons/websearch.png"
    )
    # Qt Designer file, resolved relative to this module.
    gui = os.path.join(os.path.dirname(os.path.abspath(__file__)), "designer/owwebsearch.ui")

    # --- Settings ----------------------------------------------------------
    selected_column_name = Setting("content")
    region = Setting('fr-fr')
    time_range = Setting('y')
    max_results = Setting(20)
    relevance_threshold = Setting(0.3)

    class Inputs:
        data = Input("Data", Orange.data.Table)

    @Inputs.data
    def set_data(self, in_data):
        """Store the incoming table, refresh the column selector and run.

        A ``None`` input clears the output and stops there.
        """
        self.data = in_data
        if in_data is None:
            self.Outputs.data.send(None)
            return
        if in_data:
            selector = self.var_selector
            selector.add_variables(in_data.domain)
            selector.select_variable_by_name(self.selected_column_name)
        self.run()

    class Outputs:
        data = Output("Data", Orange.data.Table)
54
+
55
+
56
def __init__(self):
    """Fix the window size, bind the child widgets by object name and load the config."""
    super().__init__()
    # Qt management
    self.setFixedWidth(500)
    self.setFixedHeight(600)

    # (attribute, Qt object name, placeholder, initial text) for each line edit.
    line_edits = (
        ("edit_region", 'boxRegion', "Region", self.region),
        ("edit_time_range", 'boxTimeRange', "Time Range", self.time_range),
    )
    for attribute, object_name, placeholder, initial_text in line_edits:
        line_edit = self.findChild(QLineEdit, object_name)
        setattr(self, attribute, line_edit)
        line_edit.setPlaceholderText(placeholder)
        line_edit.setText(initial_text)
        line_edit.editingFinished.connect(self.update_parameters)

    self.edit_max_results = self.bind_spinbox("boxMaxResults", self.max_results)
    self.edit_relevance_threshold = self.bind_spinbox(
        "boxRelevanceThreshold", self.relevance_threshold, is_double=True
    )

    run_button = self.findChild(QPushButton, 'pushButton_send')
    self.pushButton_run = run_button
    run_button.clicked.connect(self.run)

    self.load_config()
78
+
79
def bind_spinbox(self, name, value, is_double=False):
    """Find a spin box by object name, set its value and hook it to the settings.

    Args:
        name: Qt object name of the spin box.
        value: Value to display initially.
        is_double: Look for a ``QDoubleSpinBox`` instead of a ``QSpinBox``.

    Returns:
        The spin box returned by ``findChild``.
    """
    if is_double:
        spinbox_class = QDoubleSpinBox
    else:
        spinbox_class = QSpinBox
    spinbox = self.findChild(spinbox_class, name)
    spinbox.setValue(value)
    spinbox.editingFinished.connect(self.update_parameters)
    return spinbox
85
+
86
def update_parameters(self):
    """Copy the current values of the four input widgets into the settings."""
    bindings = (
        ("max_results", self.edit_max_results.value),
        ("relevance_threshold", self.edit_relevance_threshold.value),
        ("time_range", self.edit_time_range.text),
        ("region", self.edit_region.text),
    )
    for setting_name, read_widget in bindings:
        setattr(self, setting_name, read_widget())
91
+
92
def load_config(self):
    """Read ``../utils/config.json`` (relative to this module) into the widget.

    Sets ``self.domain_context`` to the ``"domain_context"`` entry and
    ``self.stop_words`` to the ``"stop_words"`` entry converted to a set.
    """
    module_dir = os.path.dirname(os.path.abspath(__file__))
    with open(os.path.join(module_dir, "../utils/config.json"), "r", encoding="utf-8") as handle:
        settings = json.load(handle)
    self.domain_context = settings["domain_context"]
    self.stop_words = set(settings["stop_words"])
98
+
99
def detect_domain(self, query: str):
    """Return the configured domain keys that the query mentions.

    A key counts as mentioned when it appears at the start of a word of
    the lower-cased query. Inflected forms therefore still match
    ("marchés" for the key "marché"), while a short key no longer fires
    in the middle of an unrelated word ("ia" inside "commercial").

    Args:
        query: Free-text search query.

    Returns:
        The matching keys of ``self.domain_context``, in dictionary order.
    """
    query_lower = query.lower()
    return [
        domain_key
        for domain_key in self.domain_context
        # (?<!\w): the key must not be preceded by a letter, digit or "_".
        if re.search(r'(?<!\w)' + re.escape(domain_key.lower()), query_lower)
    ]
109
+
110
def get_contextual_terms(self, query: str):
    """Collect context terms for every domain detected in the query.

    Returns:
        For each detected domain, in order, at most the first three terms
        listed for it in ``self.domain_context``; an empty list when no
        domain is detected.
    """
    return [
        term
        for domain in self.detect_domain(query)
        for term in self.domain_context.get(domain, [])[:3]
    ]
123
+
124
def optimize_query(self, query: str):
    """Build up to six de-duplicated variations of a search query.

    The cleaned query is combined with detected scientific names, temporal
    expressions, key phrases and domain context terms. Variations longer
    than ten words are dropped, and the cleaned query itself is always a
    candidate.

    Args:
        query: Raw user query.

    Returns:
        At most six distinct, stripped query strings, in the order they
        were generated.
    """
    query = self.clean_query(query)
    variations = []

    # Detect scientific names, dates and key phrases.
    scientific_names = self.detect_scientific_name(query)
    temporal_exprs = self.detect_temporal_expressions(query)
    key_phrases = self.extract_key_phrases(query)

    important_words = [
        w for w in query.split()
        if len(w) > 3 and w.lower() not in self.stop_words
    ]

    # 1. Scientific names in quotes, alone and with surrounding context.
    for sci_name in scientific_names:
        variations.append(f'"{sci_name}"')
        # The detected name is re-capitalised, so compare case-insensitively;
        # otherwise a lower-case genus typed by the user is repeated as context.
        name_parts = {part.lower() for part in sci_name.split()}
        other_words = [w for w in important_words if w.lower() not in name_parts]
        context = other_words[:3] + temporal_exprs
        if context:
            variations.append(f'"{sci_name}" {" ".join(context)}')

    # 2. Temporal expressions combined with the non-temporal words.
    if temporal_exprs and not scientific_names and important_words:
        lowered_exprs = [expr.lower() for expr in temporal_exprs]
        non_temporal_words = [
            word for word in important_words
            if not any(word.lower() in expr for expr in lowered_exprs)
        ]
        if non_temporal_words:
            variations.append(f"{' '.join(non_temporal_words[:3])} {' '.join(temporal_exprs)}")
            if key_phrases:
                main_phrase = key_phrases[0]
                contains_temporal = any(temp in main_phrase for temp in temporal_exprs)
                if not contains_temporal and len(main_phrase.split()) >= 2:
                    variations.append(f'"{main_phrase}" {" ".join(temporal_exprs)}')

    # 3. Main key phrase in quotes (only without names or dates).
    if key_phrases and not scientific_names and not temporal_exprs:
        main_phrase = key_phrases[0]
        if len(main_phrase.split()) >= 2:
            variations.append(f'"{main_phrase}"')
            if len(key_phrases) > 1:
                variations.append(f'"{main_phrase}" {key_phrases[1]}')

    # 4. Bare-bones version: the first important words plus missing dates.
    if len(important_words) >= 2:
        simplified = ' '.join(important_words[:4])
        for temp_expr in temporal_exprs:
            if temp_expr.lower() not in simplified.lower():
                simplified = f"{simplified} {temp_expr}"
        variations.append(simplified)
    elif len(important_words) == 1 and temporal_exprs:
        variations.append(f"{important_words[0]} {' '.join(temporal_exprs)}")

    # 5. Enrichment with domain context terms.
    context_terms = self.get_contextual_terms(query)
    if context_terms and important_words:
        enriched_parts = important_words[:3] + context_terms[:2]
        if temporal_exprs:
            enriched_parts.extend(temporal_exprs)
        variations.append(' '.join(enriched_parts))

    variations.append(query)

    # De-duplicate while keeping order; skip empty or overly long variations.
    seen = set()
    unique_variations = []
    for variation in variations:
        cleaned = variation.strip()
        if cleaned and cleaned not in seen and len(cleaned.split()) <= 10:
            seen.add(cleaned)
            unique_variations.append(cleaned)

    return unique_variations[:6]
211
+
212
def calculate_relevance(self, query: str, title: str, snippet: str):
    """Score how well a result's title and snippet cover the query keywords.

    Keywords are the lower-cased query words longer than three characters
    that are not stop words. Each keyword adds 0.6 when found in the title
    or 0.4 when found in the snippet; failing both, an accent-insensitive
    comparison adds 0.5 (title) or 0.3 (snippet).

    Returns:
        The summed weights divided by the number of keywords, capped at
        1.0, or 0.5 when the query has no keyword.
    """
    haystack_title = title.lower()
    haystack_snippet = snippet.lower()
    keywords = [
        token for token in query.lower().split()
        if len(token) > 3 and token not in self.stop_words
    ]
    if not keywords:
        return 0.5

    total = 0.0
    for keyword in keywords:
        if keyword in haystack_title:
            total += 0.6
            continue
        if keyword in haystack_snippet:
            total += 0.4
            continue
        # No literal hit: retry after normalising accents on both sides.
        folded_keyword = self.normalize_text(keyword)
        folded_title = self.normalize_text(haystack_title)
        folded_snippet = self.normalize_text(haystack_snippet)
        if folded_keyword in folded_title:
            total += 0.5
        elif folded_keyword in folded_snippet:
            total += 0.3

    return min(total / len(keywords), 1.0)
245
+
246
def normalize_text(self, text: str):
    """Lower-case the text and replace a fixed set of accented letters.

    Handles é è ê ë, à â ä, î ï, ô ö, ù û ü, ç and ñ; any other character
    is left as it is.
    """
    accented = 'éèêëàâäîïôöùûüçñ'
    unaccented = 'eeeeaaaiioouuucn'
    return text.lower().translate(str.maketrans(accented, unaccented))
262
+
263
def filter_by_relevance(self, results: List[Dict], query: str):
    """Score the results, penalise old content and keep the relevant ones.

    Each result dict gets a ``'relevance_score'`` key (the input dicts are
    modified in place). When the title or body/snippet mentions years of
    the form 20xx, the most recent one is compared with the current year:
    more than two years old multiplies the score by 0.3, more than one
    year old by 0.7.

    Returns:
        The results whose score reaches ``self.relevance_threshold``,
        highest score first.
    """
    from datetime import datetime

    this_year = datetime.now().year
    year_pattern = r'\b(20\d{2})\b'

    kept = []
    for hit in results:
        heading = hit.get('title', '')
        excerpt = hit.get('body', hit.get('snippet', ''))
        relevance = self.calculate_relevance(query, heading, excerpt)

        # Look for years in the content and penalise old material.
        years = [int(year) for year in re.findall(year_pattern, heading + ' ' + excerpt)]
        if years:
            age = this_year - max(years)
            if age > 2:
                relevance *= 0.3
            elif age > 1:
                relevance *= 0.7

        hit['relevance_score'] = relevance
        if relevance >= self.relevance_threshold:
            kept.append(hit)

    return sorted(kept, key=lambda hit: hit['relevance_score'], reverse=True)
300
+
301
def detect_scientific_name(self, query: str):
    """Detect two-word names that look like Latin scientific names.

    Every pair of adjacent words of four or more ASCII letters is examined.
    A pair is rejected when either word is an excluded (generic) word or
    when the first word is a context word; otherwise it is accepted if one
    of the two words has a Latin-looking ending.

    Pairs are examined with overlap, so a rejected pair does not hide the
    pair that starts with its second word ("prix pecten maximus" yields
    "Pecten maximus"). Once a pair is accepted, its second word is not
    reused as the first word of another name.

    Args:
        query: Search query; matching is done on its lower-cased form.

    Returns:
        Distinct names formatted as "Genus species", in order of appearance.
    """
    excluded_words = {'prix', 'cours', 'marché', 'janvier', 'février', 'mars', 'avril', 'mai', 'juin', 'juillet',
                      'août', 'septembre', 'octobre', 'novembre', 'décembre', 'année', 'mois', 'jour', 'monde',
                      'france', 'europe', 'production', 'commerce', 'export', 'import', 'aquaculture',
                      'distribution', 'habitat', 'recherche', 'étude', 'analyse', 'rapport'}
    context_words = {'prix', 'cours', 'marché', 'production', 'recherche', 'étude', 'analyse', 'rapport', 'habitat',
                     'aquaculture', 'distribution', 'ecology', 'biology', 'genetic', 'fishery', 'cultivation',
                     'harvest', 'spawning'}
    latin_suffixes = ('us', 'a', 'is', 'um', 'ae', 'i', 'ica', 'ensis', 'anus', 'ina', 'ella')
    # The second word sits in a lookahead so it is not consumed and can
    # start the next candidate pair.
    pattern = r'\b([a-zA-Z]{4,})\s+(?=([a-zA-Z]{4,})\b)'

    scientific_names = []
    accepted_until = 0  # end offset of the last accepted name
    for match in re.finditer(pattern, query.lower()):
        if match.start(1) < accepted_until:
            continue  # this word already closes an accepted name

        word1, word2 = match.groups()
        if word1 in excluded_words or word2 in excluded_words:
            continue
        if word1 in context_words:
            continue
        if not (word2.endswith(latin_suffixes) or word1.endswith(latin_suffixes)):
            continue

        accepted_until = match.end(2)
        normalized = f"{word1.capitalize()} {word2}"
        if normalized not in scientific_names:
            scientific_names.append(normalized)

    return scientific_names
335
+
336
def detect_temporal_expressions(self, query: str):
    """Detect temporal expressions (mostly French) in the query.

    Patterns are tried in order, case-insensitively: "month yyyy", numeric
    dates, quarters ("T1 2024"), semesters/trimesters, bare 20xx years,
    "cette/ce/l' + année|semaine|mois" and "dernier/prochain + period".
    Each match is blanked out before the next pattern runs, so a year that
    belongs to "mars 2024" is not reported a second time on its own.

    Args:
        query: Search query.

    Returns:
        The matched substrings, grouped by pattern in the order above.
    """
    temporal_patterns = [
        r'\b(janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre)\s+\d{4}\b',
        r'\b\d{1,2}[/-]\d{1,2}[/-]\d{4}\b',
        r'\b[QT][1-4]\s+\d{4}\b',
        r'\b(S[12]|premier|second|1er|2ème)\s+(semestre|trimestre)\s+\d{4}\b',
        r'\b(20\d{2})\b',
        # "l'" is glued to the noun (no whitespace); "ce" covers "ce mois".
        r"\b(?:(?:cette|ce)\s+|l['’])(année|semaine|mois)\b",
        r'\b(dernier|dernière|prochain|prochaine)\s+(année|semaine|mois|trimestre)\b',
    ]

    found = []
    remaining_query = query

    for pattern in temporal_patterns:
        for match in re.finditer(pattern, remaining_query, re.IGNORECASE):
            expr = match.group()
            found.append(expr)
            # Blank the expression so later patterns cannot match inside it.
            remaining_query = remaining_query.replace(expr, ' ' * len(expr))

    return found
359
+
360
def extract_key_phrases(self, query: str):
    """Group the important words of the query into key phrases.

    Temporal expressions are first replaced by single placeholder tokens
    so each one counts as one word. A word is important when it is a
    placeholder, or is longer than three characters and not a stop word.
    Important words at most two positions apart are merged into the same
    phrase (together with the words between them, subject to the stop-word
    rule below); a larger gap starts a new phrase. Placeholders are turned
    back into the original expressions in the returned phrases.

    Returns:
        The phrases in order of appearance, or an empty list when the
        query has no important word.
    """
    temporal_exprs = self.detect_temporal_expressions(query)

    placeholders = {}
    masked_query = query
    for position, expr in enumerate(temporal_exprs):
        placeholder = f"__TEMPORAL_{position}__"
        placeholders[placeholder] = expr
        masked_query = masked_query.replace(expr, placeholder)

    words = masked_query.split()

    def is_placeholder(word):
        return word.startswith("__TEMPORAL_")

    def is_stop_word(word):
        return word.lower() in self.stop_words

    def render(parts):
        # Join the words and put the temporal expressions back.
        text = ' '.join(parts)
        for placeholder, expr in placeholders.items():
            text = text.replace(placeholder, expr)
        return text

    anchors = [
        position for position, word in enumerate(words)
        if is_placeholder(word) or (len(word) > 3 and not is_stop_word(word))
    ]
    if not anchors:
        return []

    phrases = []
    first_anchor, *later_anchors = anchors
    current = [words[first_anchor]]
    previous = first_anchor

    for anchor in later_anchors:
        if anchor - previous > 2:
            # Gap too large: close the running phrase and start a new one.
            phrases.append(render(current))
            current = [words[anchor]]
        else:
            for word in words[previous + 1:anchor + 1]:
                # A stop word is only kept right after the phrase's first word.
                if not is_stop_word(word) or len(current) == 1 or is_placeholder(word):
                    current.append(word)
        previous = anchor

    phrases.append(render(current))
    return phrases
409
+
410
def clean_query(self, query: str):
    """Strip the query and remove generic lead-in phrases.

    Four case-insensitive prefix patterns are removed, each applied once
    and in order: "(les) informations (disponibles) sur/concernant/…",
    "(je) cherche/recherche/veux/…", "peux-tu/pouvez-vous/trouve/trouver"
    and "donne-moi/donnez-moi".

    Returns:
        The query without those prefixes, stripped of outer whitespace.
    """
    prefix_patterns = (
        r'^(les?\s+)?informations?\s+(disponibles?\s+)?(sur|concernant|relatif|au sujet)\s+',
        r'^(je\s+)?(cherche|recherche|veux|voudrais|souhaite)\s+',
        r'^(peux-tu|pouvez-vous|trouve|trouver)\s+',
        r'^(donne-moi|donnez-moi)\s+',
    )

    cleaned = query.strip()
    for prefix_pattern in prefix_patterns:
        cleaned = re.compile(prefix_pattern, re.IGNORECASE).sub('', cleaned)
    return cleaned.strip()
425
+
426
def search(self, use_optimization: bool = True):
    """Query DuckDuckGo with the query variations and collect unique hits.

    Each variation (or only ``self.query`` when ``use_optimization`` is
    false) is sent to ``DDGS().text``. The hits are scored against the
    original query by ``filter_by_relevance`` and de-duplicated by URL.
    Collection stops once ``self.max_results`` hits are gathered. Any
    exception raised while handling a variation is printed and that
    variation is skipped.

    Returns:
        A list of at most ``self.max_results`` dicts with the keys url,
        title, snippet, source, query, query_variation, relevance_score,
        fetched_at and rank.
    """
    if use_optimization:
        candidate_queries = self.optimize_query(self.query)
    else:
        candidate_queries = [self.query]

    collected = []
    known_urls = set()

    for variation in candidate_queries:
        if len(collected) >= self.max_results:
            break

        try:
            with DDGS() as ddgs:
                raw_hits = list(ddgs.text(
                    variation,
                    region=self.region,
                    safesearch='off',
                    timelimit=self.time_range,
                    max_results=min(50, self.max_results * 3)
                ))
            ranked_hits = self.filter_by_relevance(raw_hits, self.query)

            for hit in ranked_hits:
                url = hit['href']
                if url in known_urls:
                    continue
                known_urls.add(url)

                collected.append({
                    'url': url,
                    'title': hit['title'],
                    'snippet': hit.get('body', ''),
                    'source': 'DuckDuckGo',
                    'query': self.query,
                    'query_variation': variation,
                    'relevance_score': hit['relevance_score'],
                    'fetched_at': datetime.now().isoformat(),
                    'rank': len(collected) + 1
                })

                if len(collected) >= self.max_results:
                    break

        except Exception as e:
            # Best effort: report the failure and move on to the next variation.
            print(e)
            continue

    return collected[:self.max_results]
479
+
480
def run(self):
    """Start a search for the first value of the selected column.

    Clears previous error/warning messages, then:

    * sends ``None`` when there is no input table (including the case
      where no input was ever received) or when the table has no rows;
    * shows a warning and stops when the selected column is missing;
    * otherwise stores the first value of the column as ``self.query``
      and runs ``self.search`` through ``thread_management.Thread``, with
      progress, result and finish signals wired to the handlers.
    """
    self.error("")
    self.warning("")

    # The Run button can be pressed before any input was received.
    data = getattr(self, "data", None)
    if data is None:
        self.Outputs.data.send(None)
        return

    if self.selected_column_name not in data.domain:
        self.warning(f'Previously selected column "{self.selected_column_name}" does not exist in your data.')
        return

    if len(data) == 0:
        # No row to read the query from; indexing [0] below would fail.
        self.Outputs.data.send(None)
        return

    self.query = data.get_column(self.selected_column_name)[0]

    self.progressBarInit()
    self.thread = thread_management.Thread(self.search)
    self.thread.progress.connect(self.handle_progress)
    self.thread.result.connect(self.handle_result)
    self.thread.finish.connect(self.handle_finish)
    self.thread.start()
499
+
500
def handle_progress(self, progress) -> None:
    """Apply a progress update coming from the worker thread.

    Args:
        progress: Indexable pair ``(value, text)``. A non-``None`` value
            is forwarded to the progress bar. A ``None`` text clears the
            text browser; any other text is inserted into it.
    """
    value, text = progress[0], progress[1]
    if value is not None:
        self.progressBarSet(value)
    if text is not None:
        self.textBrowser.insertPlainText(text)
    else:
        self.textBrowser.setText("")
509
+
510
def handle_result(self, result):
    """Send the search result on the output.

    A ``None`` or empty result sends ``None``; otherwise the result is
    converted with ``convert.convert_json_implicite_to_data_table`` first.
    """
    if result is not None and len(result) != 0:
        outgoing = convert.convert_json_implicite_to_data_table(result)
    else:
        outgoing = None
    self.Outputs.data.send(outgoing)
516
+
517
def handle_finish(self):
    """Mark the progress bar as finished."""
    self.progressBarFinished()
519
+
520
def post_initialized(self):
    """Do nothing; this widget has no post-initialisation work."""
    return None
522
+
523
+
524
if __name__ == "__main__":
    # Manual smoke test: show the widget on its own, outside the Orange canvas.
    app = QApplication(sys.argv)
    my_widget = WebSearch()
    my_widget.show()

    # The event-loop entry point is called exec() on some Qt bindings and
    # exec_() on others; exit the process with whatever it returns.
    start_event_loop = app.exec if hasattr(app, "exec") else app.exec_
    sys.exit(start_event_loop())
@@ -30,12 +30,12 @@
30
30
  <rect>
31
31
  <x>10</x>
32
32
  <y>10</y>
33
- <width>411</width>
33
+ <width>391</width>
34
34
  <height>51</height>
35
35
  </rect>
36
36
  </property>
37
37
  <property name="text">
38
- <string>This widget takes a list of url and return the conteent of the web site .</string>
38
+ <string>This widget takes a list of url from a column &quot;url&quot; and return the content of the web site .</string>
39
39
  </property>
40
40
  <property name="textFormat">
41
41
  <enum>Qt::AutoText</enum>