io4it 3.0.2.2__tar.gz → 3.0.3.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {io4it-3.0.2.2 → io4it-3.0.3.1}/PKG-INFO +1 -1
- {io4it-3.0.2.2 → io4it-3.0.3.1}/io4it.egg-info/PKG-INFO +1 -1
- {io4it-3.0.2.2 → io4it-3.0.3.1}/io4it.egg-info/SOURCES.txt +0 -1
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/OWExportMarkdown.py +1 -1
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/OWExtractTablesDocxToXlsx.py +1 -1
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/OWMarkdownLoader.py +1 -1
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/OWMarkdownizer.py +1 -1
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/OWParserHTML.py +7 -3
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/OWS3Uploader.py +1 -1
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/OWS3downloader.py +1 -1
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/OWS3list.py +1 -1
- io4it-3.0.3.1/orangecontrib/IO4IT/widgets/OWWebSearch.py +307 -0
- io4it-3.0.3.1/orangecontrib/IO4IT/widgets/designer/owwebsearch.ui +131 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/setup.py +1 -1
- io4it-3.0.2.2/orangecontrib/IO4IT/utils/config.json +0 -45
- io4it-3.0.2.2/orangecontrib/IO4IT/widgets/OWWebSearch.py +0 -532
- io4it-3.0.2.2/orangecontrib/IO4IT/widgets/designer/owwebsearch.ui +0 -296
- {io4it-3.0.2.2 → io4it-3.0.3.1}/io4it.egg-info/dependency_links.txt +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/io4it.egg-info/entry_points.txt +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/io4it.egg-info/namespace_packages.txt +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/io4it.egg-info/requires.txt +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/io4it.egg-info/top_level.txt +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/__init__.py +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/ocr_function/__init__.py +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/ocr_function/word_converter.py +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/utils/__init__.py +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/utils/keys_manager.py +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/utils/mail.py +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/utils/pool_exec_utils.py +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/utils/secret_manager.py +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/utils/utils_md.py +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/OWChatGpt.py +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/OWDeep_Search.py +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/OWDoclingASR.py +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/OWDoclingToMarkdown.py +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/OWInboxMailMonitoring.py +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/OWMD2HTML.py +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/OWOfficeNormalizer.py +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/OWPdfType.py +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/OWProcessPoolExecutor.py +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/OWSpeechToText.py +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/OWmailLoader.py +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/OWmailSender.py +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/OWwordpdf2docx.py +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/__init__.py +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/designer/__init__.py +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/designer/nogui.ui +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/designer/ow_file_ext_selector.ui +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/designer/owchatgpt.ui +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/designer/owdeepsearch.ui +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/designer/owdoclingasr.ui +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/designer/owdoclingtomarkdown.ui +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/designer/owdocxtoxlsx.ui +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/designer/owexportmarkdown.ui +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/designer/owinboxmailmonitoring.ui +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/designer/owmailloader.ui +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/designer/owmailsender.ui +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/designer/owmarkdownizer.ui +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/designer/owmarkdownloader.ui +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/designer/owmd2html.ui +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/designer/owofficenormalizer.ui +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/designer/owparserhtml.ui +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/designer/owpdftype.ui +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/designer/owprocesspoolexecutor.ui +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/designer/owspeechtotext.ui +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/designer/owvisualizationer.ui +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/designer/wordpdf2docx.ui +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/icons/__init__.py +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/icons/chatgpt.png +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/icons/check_pdf.png +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/icons/deepsearch.svg +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/icons/dep_md_old.png +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/icons/download.png +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/icons/export_md.png +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/icons/extract_table.png +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/icons/file_extensor.png +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/icons/html.png +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/icons/list_aws.png +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/icons/load_md.png +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/icons/mail_loader.png +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/icons/mail_writer.png +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/icons/md.png +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/icons/monitor-email.svg +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/icons/office_normalizer.png +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/icons/owmd2html.svg +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/icons/process_pool_executor.png +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/icons/speech_to_text.png +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/icons/upload.png +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/icons/visualizationer.png +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/icons/websearch.png +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/icons/wordpdf2docx.png +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/IO4IT/widgets/icons_dev/__init__.py +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/orangecontrib/__init__.py +0 -0
- {io4it-3.0.2.2 → io4it-3.0.3.1}/setup.cfg +0 -0
|
@@ -39,7 +39,6 @@ orangecontrib/IO4IT/widgets/OWmailLoader.py
|
|
|
39
39
|
orangecontrib/IO4IT/widgets/OWmailSender.py
|
|
40
40
|
orangecontrib/IO4IT/widgets/OWwordpdf2docx.py
|
|
41
41
|
orangecontrib/IO4IT/widgets/__init__.py
|
|
42
|
-
orangecontrib/IO4IT/widgets/../utils/config.json
|
|
43
42
|
orangecontrib/IO4IT/widgets/designer/__init__.py
|
|
44
43
|
orangecontrib/IO4IT/widgets/designer/nogui.ui
|
|
45
44
|
orangecontrib/IO4IT/widgets/designer/ow_file_ext_selector.ui
|
|
@@ -27,7 +27,7 @@ else:
|
|
|
27
27
|
|
|
28
28
|
class OWExportMarkdown(widget.OWWidget):
|
|
29
29
|
name = "OWExportMarkdown"
|
|
30
|
-
description = "
|
|
30
|
+
description = "Automatically export content to DOCX, PPTX, and PDF using the same base path."
|
|
31
31
|
icon = "icons/export_md.png"
|
|
32
32
|
if "site-packages/Orange/widgets" in os.path.dirname(os.path.abspath(__file__)).replace("\\", "/"):
|
|
33
33
|
icon = "icons_dev/export_md.png"
|
|
@@ -23,7 +23,7 @@ class OWExtractTablesDocxToXlsx(widget.OWWidget):
|
|
|
23
23
|
en fichiers XLSX distincts (une table Word = un fichier XLSX).
|
|
24
24
|
"""
|
|
25
25
|
name = "Docx to XLSX"
|
|
26
|
-
description = "
|
|
26
|
+
description = "Extract tables from Word documents and save them as XLSX, with an optional split feature."
|
|
27
27
|
category = "AAIT - TOOLBOX"
|
|
28
28
|
icon = "icons/extract_table.png"
|
|
29
29
|
if "site-packages/Orange/widgets" in os.path.dirname(os.path.abspath(__file__)).replace("\\", "/"):
|
|
@@ -17,7 +17,7 @@ except ImportError:
|
|
|
17
17
|
|
|
18
18
|
class OWMarkdownLoader(widget.OWWidget):
|
|
19
19
|
name = "Markdown Loader"
|
|
20
|
-
description = "
|
|
20
|
+
description = "Load all Markdown files from a folder (recursively)."
|
|
21
21
|
icon = "icons/load_md.png"
|
|
22
22
|
if "site-packages/Orange/widgets" in os.path.dirname(os.path.abspath(__file__)).replace("\\", "/"):
|
|
23
23
|
icon = "icons_dev/load_md.png"
|
|
@@ -518,7 +518,7 @@ class MarkdownConversionThread(QThread):
|
|
|
518
518
|
|
|
519
519
|
class FileProcessorApp(widget.OWWidget):
|
|
520
520
|
name = "Markdownizer"
|
|
521
|
-
description = "Convert PDFs, DOCX, PPTX to Markdown (
|
|
521
|
+
description = "[deprecated]Convert PDFs, DOCX, PPTX to Markdown (ignore image)"
|
|
522
522
|
icon = "icons/dep_md_old.png"
|
|
523
523
|
if "site-packages/Orange/widgets" in os.path.dirname(os.path.abspath(__file__)).replace("\\", "/"):
|
|
524
524
|
icon = "icons_dev/dep_md_old.png"
|
|
@@ -7,6 +7,7 @@ import asyncio
|
|
|
7
7
|
import aiohttp
|
|
8
8
|
import html2text
|
|
9
9
|
from bs4 import BeautifulSoup
|
|
10
|
+
import urllib.request
|
|
10
11
|
|
|
11
12
|
if "site-packages/Orange/widgets" in os.path.dirname(os.path.abspath(__file__)).replace("\\", "/"):
|
|
12
13
|
from Orange.widgets.orangecontrib.AAIT.utils import thread_management
|
|
@@ -112,13 +113,16 @@ class ParseHMTL(OWWidget):
|
|
|
112
113
|
'Connection': 'keep-alive',
|
|
113
114
|
'Upgrade-Insecure-Requests': '1'
|
|
114
115
|
}
|
|
115
|
-
|
|
116
|
+
proxies = urllib.request.getproxies()
|
|
117
|
+
if "http" in proxies:
|
|
118
|
+
proxies = proxies["http"]
|
|
119
|
+
else:
|
|
120
|
+
proxies = None
|
|
121
|
+
async with session.get(url, headers=headers, proxy=proxies) as response:
|
|
116
122
|
if response.status != 200:
|
|
117
123
|
raise Exception(f"HTTP {response.status}")
|
|
118
|
-
|
|
119
124
|
html = await response.text()
|
|
120
125
|
soup = BeautifulSoup(html, 'html.parser')
|
|
121
|
-
|
|
122
126
|
meta_desc = ''
|
|
123
127
|
meta_tag = soup.find('meta', attrs={'name': 'description'})
|
|
124
128
|
if not meta_tag:
|
|
@@ -9,7 +9,7 @@ from Orange.data import Table
|
|
|
9
9
|
|
|
10
10
|
class OWS3FileDownloader(OWWidget):
|
|
11
11
|
name = "S3 File Uploader"
|
|
12
|
-
description = "
|
|
12
|
+
description = "Upload the listed files from a local directory to S3."
|
|
13
13
|
icon = "icons/upload.png"
|
|
14
14
|
if "site-packages/Orange/widgets" in os.path.dirname(os.path.abspath(__file__)).replace("\\", "/"):
|
|
15
15
|
icon = "icons_dev/upload.png"
|
|
@@ -8,7 +8,7 @@ from Orange.data import Table
|
|
|
8
8
|
|
|
9
9
|
class OWS3FileDownloader(OWWidget):
|
|
10
10
|
name = "S3 File Downloader"
|
|
11
|
-
description = "
|
|
11
|
+
description = "Download the listed files from S3 to a local directory."
|
|
12
12
|
icon = "icons/download.png"
|
|
13
13
|
if "site-packages/Orange/widgets" in os.path.dirname(os.path.abspath(__file__)).replace("\\", "/"):
|
|
14
14
|
icon = "icons_dev/download.png"
|
|
@@ -8,7 +8,7 @@ import os
|
|
|
8
8
|
|
|
9
9
|
class OWS3FileLister(OWWidget):
|
|
10
10
|
name = "S3 File Lister"
|
|
11
|
-
description = "
|
|
11
|
+
description = "List the files in an S3 bucket and display their details."
|
|
12
12
|
icon = "icons/list_aws.png"
|
|
13
13
|
if "site-packages/Orange/widgets" in os.path.dirname(os.path.abspath(__file__)).replace("\\", "/"):
|
|
14
14
|
icon = "icons_dev/list_aws.png"
|
|
@@ -0,0 +1,307 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import sys
|
|
3
|
+
import Orange
|
|
4
|
+
from Orange.widgets.widget import Input, Output
|
|
5
|
+
from AnyQt.QtWidgets import QApplication, QPushButton
|
|
6
|
+
from Orange.widgets.settings import Setting
|
|
7
|
+
import requests
|
|
8
|
+
from bs4 import BeautifulSoup
|
|
9
|
+
from urllib.parse import quote, urlparse, urljoin, parse_qs
|
|
10
|
+
import unicodedata
|
|
11
|
+
import re
|
|
12
|
+
|
|
13
|
+
if "site-packages/Orange/widgets" in os.path.dirname(os.path.abspath(__file__)).replace("\\", "/"):
|
|
14
|
+
from Orange.widgets.orangecontrib.AAIT.utils import thread_management, base_widget
|
|
15
|
+
from Orange.widgets.orangecontrib.HLIT_dev.remote_server_smb import convert
|
|
16
|
+
else:
|
|
17
|
+
from orangecontrib.AAIT.utils import thread_management, base_widget
|
|
18
|
+
from orangecontrib.HLIT_dev.remote_server_smb import convert
|
|
19
|
+
|
|
20
|
+
class WebSearch(base_widget.BaseListWidget):
|
|
21
|
+
name = "WebSearch"
|
|
22
|
+
description = "Search url website from a query with DDG."
|
|
23
|
+
icon = "icons/websearch.png"
|
|
24
|
+
if "site-packages/Orange/widgets" in os.path.dirname(os.path.abspath(__file__)).replace("\\", "/"):
|
|
25
|
+
icon = "icons_dev/websearch.png"
|
|
26
|
+
priority = 3000
|
|
27
|
+
gui = ""
|
|
28
|
+
want_control_area = False
|
|
29
|
+
category = "AAIT - TOOLBOX"
|
|
30
|
+
gui = os.path.join(os.path.dirname(os.path.abspath(__file__)), "designer/owwebsearch.ui")
|
|
31
|
+
# Settings
|
|
32
|
+
selected_column_name = Setting("content")
|
|
33
|
+
|
|
34
|
+
class Inputs:
|
|
35
|
+
data = Input("Data", Orange.data.Table)
|
|
36
|
+
|
|
37
|
+
@Inputs.data
def set_data(self, in_data):
    """Receive the input table and trigger a search.

    The table is stored on ``self.data``. ``None`` clears the output and
    stops there. Otherwise, when the table is truthy, its domain is pushed
    to ``self.var_selector`` and the previously selected column is
    re-selected by name; :meth:`run` is then called in every non-``None``
    case.

    NOTE(review): ``var_selector`` is not created in this file; presumably
    it is provided by the base widget -- confirm.
    """
    self.data = in_data
    if self.data is not None:
        if self.data:
            selector = self.var_selector
            selector.add_variables(self.data.domain)
            selector.select_variable_by_name(self.selected_column_name)
        self.run()
    else:
        self.Outputs.data.send(None)
|
|
47
|
+
|
|
48
|
+
class Outputs:
|
|
49
|
+
data = Output("Data", Orange.data.Table)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def __init__(self):
    """Initialise the widget, freeze its size and wire the Run button."""
    super().__init__()

    # Qt management: the window has a fixed 480 x 450 size.
    largeur, hauteur = 480, 450
    self.setFixedHeight(hauteur)
    self.setFixedWidth(largeur)

    # The button is named 'pushButton_send' in the .ui file; clicking it
    # launches the search.
    self.pushButton_run = self.findChild(QPushButton, 'pushButton_send')
    bouton = self.pushButton_run
    bouton.clicked.connect(self.run)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def normaliser_texte(self, txt: str) -> str:
    """Return *txt* lower-cased, without accents and without outer whitespace.

    Accents are removed by decomposing the text (NFD) and dropping every
    combining character. Falsy input (``None`` or ``""``) yields ``""``.
    """
    if txt:
        decompose = unicodedata.normalize("NFD", txt.lower())
        sans_accents = [ch for ch in decompose if unicodedata.combining(ch) == 0]
        return "".join(sans_accents).strip()
    return ""
|
|
72
|
+
|
|
73
|
+
def extraire_mots_cles(self, requete: str):
    """Split the query into simple keywords.

    The query is normalised with :meth:`normaliser_texte` and tokenised on
    ``\\w+``. Very frequent French words (du, de, le, la, ...) and tokens of
    two characters or fewer are dropped. When that filter removes
    everything, the unfiltered token list is returned instead.
    """
    mots_vides = {"du", "de", "des", "le", "la", "les", "un", "une", "au", "aux", "et", "en", "pour", "sur", "a"}
    tous_les_mots = re.findall(r"\w+", self.normaliser_texte(requete))

    retenus = []
    for mot in tous_les_mots:
        if len(mot) > 2 and mot not in mots_vides:
            retenus.append(mot)

    return retenus if retenus else tous_les_mots
|
|
83
|
+
|
|
84
|
+
def recherche_duckduckgo(self, query, max_results=10):
    """Query the DuckDuckGo HTML endpoint and return ``(title, url)`` pairs.

    Args:
        query: search text; it is percent-encoded before being sent.
        max_results: maximum number of ``a.result__a`` anchors to keep.

    Returns:
        A list of ``(title, url)`` tuples. An empty list is returned (and a
        message printed) when the HTTP request fails.
    """
    adresse = "https://duckduckgo.com/html/?q=" + quote(query)

    try:
        reponse = requests.get(adresse, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
        reponse.raise_for_status()
    except requests.RequestException as e:
        print(f"[ERREUR] Problème lors de la requête DuckDuckGo : {e}")
        return []

    ancres = BeautifulSoup(reponse.text, "html.parser").select("a.result__a")
    resultats = []

    for ancre in ancres[:max_results]:
        intitule = ancre.get_text(strip=True)
        cible = ancre.get("href", "")

        # DuckDuckGo redirect links (/l/?uddg=<real url>): keep the real target.
        morceaux = urlparse(cible)
        hote = morceaux.netloc
        if hote and "duckduckgo.com" in hote and morceaux.path.startswith("/l/"):
            cible = parse_qs(morceaux.query).get("uddg", [cible])[0]

        # Relative link -> absolute link on the DuckDuckGo host.
        if cible.startswith("/"):
            cible = urljoin("https://duckduckgo.com", cible)

        resultats.append((intitule, cible))

    return resultats
|
|
117
|
+
|
|
118
|
+
def extraire_domaines(self, resultats_search):
    """Return the unique ``scheme://host`` origins found in search results.

    Args:
        resultats_search: iterable of ``(title, url)`` pairs.

    Returns:
        A list of distinct origins, in order of first appearance. URLs with
        no network location (e.g. relative paths), non-string URLs and URLs
        that ``urlparse`` rejects are skipped. A missing scheme defaults to
        ``https``.
    """
    # A dict is used as an ordered set so the output order is deterministic.
    domaines = {}
    for _titre, url in resultats_search:
        if not isinstance(url, str):
            continue
        try:
            parsed = urlparse(url)
        except ValueError:
            # urlparse raises ValueError on a malformed authority
            # (e.g. an unterminated IPv6 bracket): ignore that entry.
            continue
        if not parsed.netloc:
            continue
        scheme = parsed.scheme or "https"
        domaines.setdefault(f"{scheme}://{parsed.netloc}", None)
    return list(domaines)
|
|
131
|
+
|
|
132
|
+
def trouver_flux_rss(self, url_site):
    """Discover feed URLs advertised on the page at *url_site*.

    Two sources are combined:

    * ``<link>`` elements whose ``type`` is ``application/rss+xml`` or
      ``application/atom+xml`` (Atom is included because the feed reader in
      this widget parses both ``<item>`` and ``<entry>`` elements);
    * ``<a>`` elements whose ``href`` contains ``rss`` or ``feed``
      (case-insensitive).

    Args:
        url_site: URL of the page to inspect; also the base used to resolve
            relative links.

    Returns:
        A list of absolute feed URLs without duplicates, in order of first
        appearance. An empty list is returned (and a message printed) when
        the page cannot be fetched.
    """
    try:
        r = requests.get(url_site, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
        r.raise_for_status()
    except requests.RequestException as e:
        print(f"[ERREUR] Impossible d'accéder à {url_site} : {e}")
        return []

    soup = BeautifulSoup(r.text, "html.parser")
    flux = []

    # Feeds declared in the document head.
    types_flux = ["application/rss+xml", "application/atom+xml"]
    for link in soup.find_all("link", type=types_flux):
        href = link.get("href")
        if href:
            flux.append(urljoin(url_site, href))

    # Heuristic: ordinary links that look like feed URLs.
    for link in soup.find_all("a"):
        href = link.get("href", "")
        if not href:
            continue
        href_norm = href.lower()
        if "rss" in href_norm or "feed" in href_norm:
            flux.append(urljoin(url_site, href))

    # De-duplicate while keeping a stable, first-seen order.
    return list(dict.fromkeys(flux))
|
|
158
|
+
|
|
159
|
+
def rechercher_articles_dans_flux(self, requete, flux_list, max_results=20):
    """Collect feed entries whose title or summary mentions a query keyword.

    Args:
        requete: the user query; keywords come from :meth:`extraire_mots_cles`.
        flux_list: iterable of feed URLs to download and parse as XML.
        max_results: the search stops as soon as this many articles are found.

    Returns:
        A list of dicts with the keys ``titre``, ``url``, ``date`` and
        ``source_flux``. Feeds that cannot be downloaded are reported with a
        printed message and skipped.
    """
    mots_cles = self.extraire_mots_cles(requete)
    entetes = {"User-Agent": "Mozilla/5.0"}
    articles = []

    for flux in flux_list:
        try:
            reponse = requests.get(flux, headers=entetes, timeout=10)
            reponse.raise_for_status()
        except requests.RequestException as e:
            print(f"[ERREUR] Problème lors de la lecture du flux {flux} : {e}")
            continue

        # The feed is parsed as XML.
        document = BeautifulSoup(reponse.content, "xml")

        # RSS uses <item>, Atom uses <entry>.
        for entree in document.find_all(["item", "entry"]):
            # Title
            balise_titre = entree.find("title")
            titre = "" if not balise_titre else balise_titre.get_text(strip=True)

            # Summary / description
            balise_resume = entree.find("description") or entree.find("summary")
            resume = "" if not balise_resume else balise_resume.get_text(strip=True)

            # Link: Atom stores it in the href attribute, RSS in the element text.
            balise_lien = entree.find("link")
            if not balise_lien:
                lien = ""
            elif balise_lien.has_attr("href"):
                lien = balise_lien["href"]
            else:
                lien = balise_lien.get_text(strip=True)

            # Date: first available among pubDate, published, updated.
            balise_date = next(
                (b for b in map(entree.find, ("pubDate", "published", "updated")) if b),
                None,
            )
            date = balise_date.get_text(strip=True) if balise_date else "Date inconnue"

            # Keep the entry only if at least one keyword occurs in title + summary.
            texte_complet = self.normaliser_texte(titre + " " + resume)
            if not any(mot in texte_complet for mot in mots_cles):
                continue

            articles.append(
                {
                    "titre": titre,
                    "url": lien,
                    "date": date,
                    "source_flux": flux,
                }
            )
            if len(articles) >= max_results:
                return articles

    return articles
|
|
220
|
+
|
|
221
|
+
def pipeline_veille_requete(self,requete):
    """Run the whole monitoring pipeline for one query.

    Steps: search DuckDuckGo, derive the result origins, look for feeds on
    each origin, then search those feeds for matching articles.

    Returns:
        The matching feed articles when there are any. Otherwise the raw
        web results are returned as dicts tagged ``"source": "web"``. An
        empty list is returned when the web search itself found nothing.
    """
    resultats = self.recherche_duckduckgo(requete)
    if not resultats:
        print("Aucun résultat trouvé sur DuckDuckGo.")
        return []
    print(resultats)

    flux = []
    for d in self.extraire_domaines(resultats):
        found = self.trouver_flux_rss(d)
        if not found:
            continue
        print(f"Flux trouvés sur {d}:")
        for f in found:
            print(" ->", f)
        flux.extend(found)

    flux = list(set(flux))

    # Feeds are only searched when at least one was discovered.
    articles = self.rechercher_articles_dans_flux(requete, flux) if flux else []
    if articles:
        return articles

    # Fallback: no feed or no matching article -> plain web results.
    return [
        {"titre": t, "url": u, "date": None, "source_flux": None, "source": "web"}
        for t, u in resultats
    ]
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
def run(self):
    """Launch a search for the query stored in the selected column.

    Any previous error or warning is cleared first. Nothing is started when
    there is no input (``None`` is sent), when the selected column is
    missing from the domain (a warning is shown), or when the table has no
    rows (a warning is shown and ``None`` is sent). Otherwise the value in
    the first row of the selected column becomes the query and
    :meth:`pipeline_veille_requete` is handed to a ``thread_management.Thread``.

    NOTE(review): presumably ``thread_management.Thread`` runs the callable
    off the GUI thread and emits ``progress``/``result``/``finish`` --
    confirm against that module.
    """
    self.error("")
    self.warning("")
    if self.data is None:
        self.Outputs.data.send(None)
        return

    if self.selected_column_name not in self.data.domain:
        self.warning(f'Previously selected column "{self.selected_column_name}" does not exist in your data.')
        return

    # An empty table has no first row: indexing the column would raise IndexError.
    if len(self.data) == 0:
        self.warning("Input data is empty: there is no query to search.")
        self.Outputs.data.send(None)
        return

    # Only the first row of the selected column is used as the query.
    self.query = self.data.get_column(self.selected_column_name)[0]

    self.progressBarInit()
    self.thread = thread_management.Thread(self.pipeline_veille_requete, self.query)
    self.thread.progress.connect(self.handle_progress)
    self.thread.result.connect(self.handle_result)
    self.thread.finish.connect(self.handle_finish)
    self.thread.start()
|
|
274
|
+
|
|
275
|
+
def handle_progress(self, progress) -> None:
    """Reflect a progress notification in the UI.

    Args:
        progress: indexable pair ``(value, text)``. A non-``None`` value is
            forwarded to ``progressBarSet``. The text is appended to the
            ``textBrowser`` widget, or the widget is cleared when the text
            is ``None``.
    """
    value = progress[0]
    text = progress[1]
    if value is not None:
        self.progressBarSet(value)

    # This file never creates a textBrowser and the bundled owwebsearch.ui
    # declares none, so the widget may be absent; skip the text output
    # instead of raising AttributeError.
    # NOTE(review): confirm whether the base widget is expected to provide it.
    browser = getattr(self, "textBrowser", None)
    if browser is None:
        return
    if text is None:
        browser.setText("")
    else:
        browser.insertPlainText(text)
|
|
284
|
+
|
|
285
|
+
def handle_result(self, result):
    """Publish the search result on the ``Data`` output.

    ``None`` or an empty result sends ``None``; anything else is converted
    with ``convert.convert_json_implicite_to_data_table`` before being sent.
    """
    table = None
    if result is not None and len(result) > 0:
        table = convert.convert_json_implicite_to_data_table(result)
    self.Outputs.data.send(table)
|
|
291
|
+
|
|
292
|
+
def handle_finish(self):
    """Close the progress bar; connected to the worker thread's ``finish`` signal."""
    self.progressBarFinished()
|
|
294
|
+
|
|
295
|
+
def post_initialized(self):
    """Do nothing.

    NOTE(review): presumably a hook invoked by the base widget after
    initialisation -- confirm against ``base_widget``.
    """
    return None
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
if __name__ == "__main__":
    # Stand-alone launch of the widget, outside an Orange workflow.
    app = QApplication(sys.argv)
    my_widget = WebSearch()
    my_widget.show()

    # Use exec() when the application object has it, exec_() otherwise.
    boucle = app.exec if hasattr(app, "exec") else app.exec_
    sys.exit(boucle())
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
|
2
|
+
<ui version="4.0">
|
|
3
|
+
<class>Form</class>
|
|
4
|
+
<widget class="QWidget" name="Form">
|
|
5
|
+
<property name="geometry">
|
|
6
|
+
<rect>
|
|
7
|
+
<x>0</x>
|
|
8
|
+
<y>0</y>
|
|
9
|
+
<width>477</width>
|
|
10
|
+
<height>409</height>
|
|
11
|
+
</rect>
|
|
12
|
+
</property>
|
|
13
|
+
<property name="windowTitle">
|
|
14
|
+
<string>Form</string>
|
|
15
|
+
</property>
|
|
16
|
+
<widget class="QGroupBox" name="groupBox">
|
|
17
|
+
<property name="geometry">
|
|
18
|
+
<rect>
|
|
19
|
+
<x>20</x>
|
|
20
|
+
<y>40</y>
|
|
21
|
+
<width>431</width>
|
|
22
|
+
<height>71</height>
|
|
23
|
+
</rect>
|
|
24
|
+
</property>
|
|
25
|
+
<property name="title">
|
|
26
|
+
<string>Explanation</string>
|
|
27
|
+
</property>
|
|
28
|
+
<widget class="QLabel" name="Description">
|
|
29
|
+
<property name="geometry">
|
|
30
|
+
<rect>
|
|
31
|
+
<x>10</x>
|
|
32
|
+
<y>10</y>
|
|
33
|
+
<width>411</width>
|
|
34
|
+
<height>51</height>
|
|
35
|
+
</rect>
|
|
36
|
+
</property>
|
|
37
|
+
<property name="text">
|
|
38
|
+
<string>This widget takes a query as input and returns a list of websites related to it.</string>
|
|
39
|
+
</property>
|
|
40
|
+
<property name="textFormat">
|
|
41
|
+
<enum>Qt::AutoText</enum>
|
|
42
|
+
</property>
|
|
43
|
+
<property name="alignment">
|
|
44
|
+
<set>Qt::AlignLeading|Qt::AlignLeft|Qt::AlignVCenter</set>
|
|
45
|
+
</property>
|
|
46
|
+
<property name="wordWrap">
|
|
47
|
+
<bool>true</bool>
|
|
48
|
+
</property>
|
|
49
|
+
</widget>
|
|
50
|
+
</widget>
|
|
51
|
+
<widget class="QPushButton" name="pushButton_send">
|
|
52
|
+
<property name="enabled">
|
|
53
|
+
<bool>true</bool>
|
|
54
|
+
</property>
|
|
55
|
+
<property name="geometry">
|
|
56
|
+
<rect>
|
|
57
|
+
<x>149</x>
|
|
58
|
+
<y>344</y>
|
|
59
|
+
<width>301</width>
|
|
60
|
+
<height>31</height>
|
|
61
|
+
</rect>
|
|
62
|
+
</property>
|
|
63
|
+
<property name="text">
|
|
64
|
+
<string>Run</string>
|
|
65
|
+
</property>
|
|
66
|
+
</widget>
|
|
67
|
+
<widget class="QCheckBox" name="checkBox_send">
|
|
68
|
+
<property name="enabled">
|
|
69
|
+
<bool>false</bool>
|
|
70
|
+
</property>
|
|
71
|
+
<property name="geometry">
|
|
72
|
+
<rect>
|
|
73
|
+
<x>20</x>
|
|
74
|
+
<y>350</y>
|
|
75
|
+
<width>131</width>
|
|
76
|
+
<height>16</height>
|
|
77
|
+
</rect>
|
|
78
|
+
</property>
|
|
79
|
+
<property name="text">
|
|
80
|
+
<string>Auto send data</string>
|
|
81
|
+
</property>
|
|
82
|
+
</widget>
|
|
83
|
+
<widget class="QGroupBox" name="groupBox_3">
|
|
84
|
+
<property name="geometry">
|
|
85
|
+
<rect>
|
|
86
|
+
<x>20</x>
|
|
87
|
+
<y>150</y>
|
|
88
|
+
<width>431</width>
|
|
89
|
+
<height>161</height>
|
|
90
|
+
</rect>
|
|
91
|
+
</property>
|
|
92
|
+
<property name="title">
|
|
93
|
+
<string>Column selection</string>
|
|
94
|
+
</property>
|
|
95
|
+
<widget class="QLabel" name="Description_4">
|
|
96
|
+
<property name="geometry">
|
|
97
|
+
<rect>
|
|
98
|
+
<x>10</x>
|
|
99
|
+
<y>30</y>
|
|
100
|
+
<width>411</width>
|
|
101
|
+
<height>61</height>
|
|
102
|
+
</rect>
|
|
103
|
+
</property>
|
|
104
|
+
<property name="text">
|
|
105
|
+
<string/>
|
|
106
|
+
</property>
|
|
107
|
+
<property name="textFormat">
|
|
108
|
+
<enum>Qt::AutoText</enum>
|
|
109
|
+
</property>
|
|
110
|
+
<property name="alignment">
|
|
111
|
+
<set>Qt::AlignLeading|Qt::AlignLeft|Qt::AlignVCenter</set>
|
|
112
|
+
</property>
|
|
113
|
+
<property name="wordWrap">
|
|
114
|
+
<bool>true</bool>
|
|
115
|
+
</property>
|
|
116
|
+
</widget>
|
|
117
|
+
<widget class="QWidget" name="placeholder" native="true">
|
|
118
|
+
<property name="geometry">
|
|
119
|
+
<rect>
|
|
120
|
+
<x>0</x>
|
|
121
|
+
<y>20</y>
|
|
122
|
+
<width>431</width>
|
|
123
|
+
<height>141</height>
|
|
124
|
+
</rect>
|
|
125
|
+
</property>
|
|
126
|
+
</widget>
|
|
127
|
+
</widget>
|
|
128
|
+
</widget>
|
|
129
|
+
<resources/>
|
|
130
|
+
<connections/>
|
|
131
|
+
</ui>
|
|
@@ -1,45 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"domain_context": {
|
|
3
|
-
"prix": ["cours", "cotation", "tarif", "valeur", "coût"],
|
|
4
|
-
"marché": ["négoce", "commerce", "trading", "bourse", "mondial", "international"],
|
|
5
|
-
"économie": ["économique", "fiscal", "finance", "budget"],
|
|
6
|
-
"bourse": ["action", "indice", "trading", "investissement"],
|
|
7
|
-
|
|
8
|
-
"porc": ["porcin", "viande porcine", "cochon", "élevage"],
|
|
9
|
-
"agriculture": ["agricole", "exploitation", "récolte", "production"],
|
|
10
|
-
"élevage": ["bétail", "ferme", "cheptel"],
|
|
11
|
-
|
|
12
|
-
"produits mer": ["coquillage", "crustacé", "mollusque", "pêche", "aquaculture"],
|
|
13
|
-
"saint jacques": ["coquille", "pectinidé", "noix", "scallop"],
|
|
14
|
-
"coquille": ["bivalve", "mollusque", "pêche"],
|
|
15
|
-
|
|
16
|
-
"actualités": ["news", "information", "récent", "nouveau"],
|
|
17
|
-
"tendances": ["évolution", "dynamique", "orientation", "conjoncture"],
|
|
18
|
-
"perspectives": ["prévisions", "anticipations", "outlook", "horizon"],
|
|
19
|
-
"conjoncture": ["situation", "contexte", "analyse", "bilan"],
|
|
20
|
-
|
|
21
|
-
"scientifique": ["espèce", "taxonomie", "biologie", "distribution", "habitat"],
|
|
22
|
-
"pêche": ["fishery", "capture", "exploitation", "stock"],
|
|
23
|
-
|
|
24
|
-
"ia": ["intelligence artificielle", "machine learning", "deep learning"],
|
|
25
|
-
"technologie": ["tech", "digital", "numérique", "innovation"],
|
|
26
|
-
"logiciel": ["software", "application", "programme"],
|
|
27
|
-
|
|
28
|
-
"article": ["publication", "presse", "média", "journal"],
|
|
29
|
-
"recherche": ["étude", "scientifique", "analyse", "rapport"],
|
|
30
|
-
"étude": ["recherche", "analyse", "données", "résultat"],
|
|
31
|
-
|
|
32
|
-
"santé": ["médical", "sanitaire", "clinique", "thérapie"],
|
|
33
|
-
"maladie": ["pathologie", "syndrome", "infection"],
|
|
34
|
-
|
|
35
|
-
"politique": ["gouvernement", "législatif", "élection", "pouvoir"],
|
|
36
|
-
"loi": ["législation", "règlement", "juridique", "légal"]
|
|
37
|
-
},
|
|
38
|
-
|
|
39
|
-
"stop_words": [
|
|
40
|
-
"le", "la", "les", "un", "une", "des", "du", "de", "et", "en",
|
|
41
|
-
"pour", "dans", "avec", "sur", "par", "ce", "cette", "ces",
|
|
42
|
-
"ou", "où", "qui", "que", "quoi", "dont", "quel",
|
|
43
|
-
"informations", "disponibles", "concernant", "relatif"
|
|
44
|
-
]
|
|
45
|
-
}
|