io4it 3.0.5.99__tar.gz → 3.0.5.991__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- io4it-3.0.5.991/PKG-INFO +37 -0
- io4it-3.0.5.991/io4it.egg-info/PKG-INFO +37 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/OWParserHTML.py +119 -74
- {io4it-3.0.5.99 → io4it-3.0.5.991}/setup.py +1 -1
- io4it-3.0.5.99/PKG-INFO +0 -7
- io4it-3.0.5.99/io4it.egg-info/PKG-INFO +0 -7
- {io4it-3.0.5.99 → io4it-3.0.5.991}/io4it.egg-info/SOURCES.txt +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/io4it.egg-info/dependency_links.txt +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/io4it.egg-info/entry_points.txt +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/io4it.egg-info/namespace_packages.txt +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/io4it.egg-info/requires.txt +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/io4it.egg-info/top_level.txt +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/__init__.py +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/ocr_function/__init__.py +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/ocr_function/word_converter.py +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/toolViews/designer/key_manager_ui.ui +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/toolViews/key_manager_ui.py +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/utils/__init__.py +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/utils/audio.py +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/utils/keys_manager.py +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/utils/mail.py +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/utils/pool_exec_utils.py +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/utils/secret_manager.py +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/utils/utils_md.py +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/OWChatGpt.py +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/OWDeep_Search.py +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/OWDoclingASR.py +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/OWDoclingToMarkdown.py +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/OWExportMarkdown.py +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/OWExtractTablesDocxToXlsx.py +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/OWInboxMailMonitoring.py +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/OWJsonToDataTable.py +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/OWMD2HTML.py +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/OWMarkdownLoader.py +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/OWMarkdownizer.py +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/OWOfficeNormalizer.py +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/OWPdfType.py +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/OWProcessPoolExecutor.py +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/OWS3Uploader.py +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/OWS3downloader.py +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/OWS3list.py +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/OWSpeechToText.py +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/OWWebSearch.py +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/OWmailLoader.py +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/OWmailSender.py +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/OWwordpdf2docx.py +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/__init__.py +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/designer/__init__.py +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/designer/nogui.ui +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/designer/ow_extract_tables_docx_to_xlsx.ui +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/designer/ow_file_ext_selector.ui +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/designer/ow_json_to_data_table.ui +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/designer/owchatgpt.ui +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/designer/owdeepsearch.ui +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/designer/owdoclingasr.ui +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/designer/owdoclingtomarkdown.ui +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/designer/owexportmarkdown.ui +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/designer/owinboxmailmonitoring.ui +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/designer/owmailloader.ui +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/designer/owmailsender.ui +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/designer/owmarkdownizer.ui +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/designer/owmarkdownloader.ui +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/designer/owmd2html.ui +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/designer/owofficenormalizer.ui +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/designer/owparserhtml.ui +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/designer/owpdftype.ui +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/designer/owprocesspoolexecutor.ui +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/designer/owspeechtotext.ui +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/designer/owvisualizationer.ui +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/designer/owwebsearch.ui +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/designer/wordpdf2docx.ui +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/icons/__init__.py +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/icons/chatgpt.png +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/icons/check_pdf.png +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/icons/deepsearch.svg +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/icons/dep_md_old.png +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/icons/download.png +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/icons/export_md.png +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/icons/extract_table.png +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/icons/file_extensor.png +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/icons/html.png +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/icons/json-file.png +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/icons/list_aws.png +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/icons/load_md.png +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/icons/mail_loader.png +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/icons/mail_writer.png +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/icons/md.png +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/icons/monitor-email.svg +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/icons/office_normalizer.png +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/icons/owmd2html.svg +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/icons/process_pool_executor.png +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/icons/speech_to_text.png +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/icons/upload.png +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/icons/visualizationer.png +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/icons/websearch.png +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/icons/wordpdf2docx.png +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/icons_dev/__init__.py +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/__init__.py +0 -0
- {io4it-3.0.5.99 → io4it-3.0.5.991}/setup.cfg +0 -0
io4it-3.0.5.991/PKG-INFO
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: io4it
|
|
3
|
+
Version: 3.0.5.991
|
|
4
|
+
Summary: UNKNOWN
|
|
5
|
+
Home-page:
|
|
6
|
+
Author:
|
|
7
|
+
Author-email:
|
|
8
|
+
License: UNKNOWN
|
|
9
|
+
Keywords: orange3 add-on
|
|
10
|
+
Platform: UNKNOWN
|
|
11
|
+
Requires-Dist: torchvision
|
|
12
|
+
Requires-Dist: torchaudio
|
|
13
|
+
Requires-Dist: torch
|
|
14
|
+
Requires-Dist: pylatexenc
|
|
15
|
+
Requires-Dist: docopt
|
|
16
|
+
Requires-Dist: boto3
|
|
17
|
+
Requires-Dist: opencv-python-headless
|
|
18
|
+
Requires-Dist: docling
|
|
19
|
+
Requires-Dist: docling-core
|
|
20
|
+
Requires-Dist: speechbrain
|
|
21
|
+
Requires-Dist: whisper
|
|
22
|
+
Requires-Dist: whisper-openai
|
|
23
|
+
Requires-Dist: pyannote.audio
|
|
24
|
+
Requires-Dist: pyannote-core
|
|
25
|
+
Requires-Dist: pypandoc
|
|
26
|
+
Requires-Dist: pypandoc-binary
|
|
27
|
+
Requires-Dist: scikit-learn
|
|
28
|
+
Requires-Dist: openai
|
|
29
|
+
Requires-Dist: pip-system-certs
|
|
30
|
+
Requires-Dist: docx2pdf
|
|
31
|
+
Requires-Dist: msal
|
|
32
|
+
Requires-Dist: exchangelib
|
|
33
|
+
Requires-Dist: html2text
|
|
34
|
+
Requires-Dist: ddgs
|
|
35
|
+
Requires-Dist: CATEGORIT
|
|
36
|
+
|
|
37
|
+
UNKNOWN
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: io4it
|
|
3
|
+
Version: 3.0.5.991
|
|
4
|
+
Summary: UNKNOWN
|
|
5
|
+
Home-page:
|
|
6
|
+
Author:
|
|
7
|
+
Author-email:
|
|
8
|
+
License: UNKNOWN
|
|
9
|
+
Keywords: orange3 add-on
|
|
10
|
+
Platform: UNKNOWN
|
|
11
|
+
Requires-Dist: torchvision
|
|
12
|
+
Requires-Dist: torchaudio
|
|
13
|
+
Requires-Dist: torch
|
|
14
|
+
Requires-Dist: pylatexenc
|
|
15
|
+
Requires-Dist: docopt
|
|
16
|
+
Requires-Dist: boto3
|
|
17
|
+
Requires-Dist: opencv-python-headless
|
|
18
|
+
Requires-Dist: docling
|
|
19
|
+
Requires-Dist: docling-core
|
|
20
|
+
Requires-Dist: speechbrain
|
|
21
|
+
Requires-Dist: whisper
|
|
22
|
+
Requires-Dist: whisper-openai
|
|
23
|
+
Requires-Dist: pyannote.audio
|
|
24
|
+
Requires-Dist: pyannote-core
|
|
25
|
+
Requires-Dist: pypandoc
|
|
26
|
+
Requires-Dist: pypandoc-binary
|
|
27
|
+
Requires-Dist: scikit-learn
|
|
28
|
+
Requires-Dist: openai
|
|
29
|
+
Requires-Dist: pip-system-certs
|
|
30
|
+
Requires-Dist: docx2pdf
|
|
31
|
+
Requires-Dist: msal
|
|
32
|
+
Requires-Dist: exchangelib
|
|
33
|
+
Requires-Dist: html2text
|
|
34
|
+
Requires-Dist: ddgs
|
|
35
|
+
Requires-Dist: CATEGORIT
|
|
36
|
+
|
|
37
|
+
UNKNOWN
|
|
@@ -4,10 +4,15 @@ import Orange
|
|
|
4
4
|
from Orange.widgets.widget import OWWidget, Input, Output
|
|
5
5
|
from AnyQt.QtWidgets import QApplication
|
|
6
6
|
import asyncio
|
|
7
|
-
import aiohttp
|
|
8
7
|
import html2text
|
|
9
8
|
from bs4 import BeautifulSoup
|
|
10
9
|
import urllib.request
|
|
10
|
+
import urllib3
|
|
11
|
+
import requests
|
|
12
|
+
from requests_ntlm import HttpNtlmAuth
|
|
13
|
+
|
|
14
|
+
# Désactive les avertissements SSL pour certificats d'entreprise auto-signés
|
|
15
|
+
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
|
11
16
|
|
|
12
17
|
if "site-packages/Orange/widgets" in os.path.dirname(os.path.abspath(__file__)).replace("\\", "/"):
|
|
13
18
|
from Orange.widgets.orangecontrib.AAIT.utils import thread_management
|
|
@@ -25,7 +30,6 @@ class ParseHMTL(OWWidget):
|
|
|
25
30
|
if "site-packages/Orange/widgets" in os.path.dirname(os.path.abspath(__file__)).replace("\\", "/"):
|
|
26
31
|
icon = "icons_dev/html.png"
|
|
27
32
|
priority = 3000
|
|
28
|
-
gui = ""
|
|
29
33
|
want_control_area = False
|
|
30
34
|
category = "AAIT - TOOLBOX"
|
|
31
35
|
gui = os.path.join(os.path.dirname(os.path.abspath(__file__)), "designer/owparserhtml.ui")
|
|
@@ -33,17 +37,6 @@ class ParseHMTL(OWWidget):
|
|
|
33
37
|
class Inputs:
|
|
34
38
|
data = Input("Data", Orange.data.Table)
|
|
35
39
|
|
|
36
|
-
@Inputs.data
|
|
37
|
-
def set_data(self, in_data):
|
|
38
|
-
if in_data is None:
|
|
39
|
-
return
|
|
40
|
-
if "url" not in in_data.domain:
|
|
41
|
-
self.error("input table need a url column")
|
|
42
|
-
return
|
|
43
|
-
self.data = in_data
|
|
44
|
-
self.url_data = in_data.get_column("url")
|
|
45
|
-
self.run()
|
|
46
|
-
|
|
47
40
|
class Outputs:
|
|
48
41
|
data = Output("Data", Orange.data.Table)
|
|
49
42
|
|
|
@@ -55,93 +48,146 @@ class ParseHMTL(OWWidget):
|
|
|
55
48
|
uic.loadUi(self.gui, self)
|
|
56
49
|
|
|
57
50
|
self.data = None
|
|
51
|
+
self.url_data = []
|
|
58
52
|
self.thread = None
|
|
59
53
|
self.markdown = True
|
|
54
|
+
self.proxy_url = self._get_enterprise_proxy_url()
|
|
55
|
+
self.ntlm_auth = HttpNtlmAuth('', '')
|
|
60
56
|
self.run()
|
|
61
57
|
|
|
62
58
|
def update_parameters(self):
|
|
63
59
|
return
|
|
64
60
|
|
|
65
61
|
|
|
62
|
+
|
|
63
|
+
@Inputs.data
|
|
64
|
+
def set_data(self, in_data):
|
|
65
|
+
if in_data is None:
|
|
66
|
+
return
|
|
67
|
+
if "url" not in in_data.domain:
|
|
68
|
+
self.error("input table need a url column")
|
|
69
|
+
return
|
|
70
|
+
self.data = in_data
|
|
71
|
+
self.url_data = list(in_data.get_column("url"))
|
|
72
|
+
self.run()
|
|
73
|
+
|
|
74
|
+
def _get_enterprise_proxy_url(self):
|
|
75
|
+
proxies_dict = urllib.request.getproxies()
|
|
76
|
+
raw_proxy = proxies_dict.get("http") or proxies_dict.get("https")
|
|
77
|
+
if raw_proxy and not raw_proxy.startswith("http"):
|
|
78
|
+
return f"http://{raw_proxy}"
|
|
79
|
+
return raw_proxy
|
|
80
|
+
|
|
81
|
+
def _sync_fetch(self, url: str) -> str:
|
|
82
|
+
"""
|
|
83
|
+
Appel réseau réel (synchrone) compatible NTLM, exécuté dans un thread via run_in_executor.
|
|
84
|
+
"""
|
|
85
|
+
session = requests.Session()
|
|
86
|
+
session.verify = False # ignore certifs d'entreprise
|
|
87
|
+
|
|
88
|
+
headers = {
|
|
89
|
+
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
|
|
90
|
+
'(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
|
91
|
+
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
|
92
|
+
'Accept-Language': 'fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7',
|
|
93
|
+
'Connection': 'keep-alive',
|
|
94
|
+
'Upgrade-Insecure-Requests': '1'
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
proxies = None
|
|
98
|
+
if self.proxy_url:
|
|
99
|
+
proxies = {"http": self.proxy_url, "https": self.proxy_url}
|
|
100
|
+
|
|
101
|
+
resp = session.get(
|
|
102
|
+
url,
|
|
103
|
+
proxies=proxies,
|
|
104
|
+
auth=self.ntlm_auth,
|
|
105
|
+
headers=headers,
|
|
106
|
+
timeout=30
|
|
107
|
+
)
|
|
108
|
+
resp.raise_for_status()
|
|
109
|
+
return resp.text
|
|
110
|
+
|
|
66
111
|
def parse_html(self):
|
|
67
112
|
"""Execute le parsing"""
|
|
68
113
|
try:
|
|
69
114
|
loop = asyncio.new_event_loop()
|
|
70
115
|
asyncio.set_event_loop(loop)
|
|
71
|
-
results = loop.run_until_complete(self.parse_all_urls())
|
|
116
|
+
results = loop.run_until_complete(self.parse_all_urls(progress_callback=self._progress_cb))
|
|
72
117
|
loop.close()
|
|
73
118
|
return results
|
|
74
119
|
except Exception as e:
|
|
75
120
|
self.error(str(e))
|
|
76
121
|
return
|
|
77
122
|
|
|
123
|
+
def _progress_cb(self, value: int, text: str = None):
|
|
124
|
+
"""
|
|
125
|
+
Callback interne utilisé par parse_all_urls.
|
|
126
|
+
On renvoie un tuple (value, text) au thread Qt, comme ton handle_progress s'y attend.
|
|
127
|
+
"""
|
|
128
|
+
if self.thread is not None:
|
|
129
|
+
self.thread.progress.emit((value, text))
|
|
78
130
|
|
|
79
131
|
async def parse_all_urls(self, progress_callback=None):
|
|
80
|
-
"""
|
|
132
|
+
"""
|
|
133
|
+
Parse toutes les URLs en concurrence (comme dans le 2e code) en gardant une progression fluide.
|
|
134
|
+
"""
|
|
81
135
|
results = []
|
|
82
136
|
total = len(self.url_data)
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
results.append({
|
|
96
|
-
"url": url_data,
|
|
97
|
-
'content': '',
|
|
98
|
-
'meta_description': '',
|
|
99
|
-
'word_count': 0,
|
|
100
|
-
'status': f'error: {str(e)}'
|
|
101
|
-
})
|
|
137
|
+
if total == 0:
|
|
138
|
+
return results
|
|
139
|
+
|
|
140
|
+
tasks = [self.parse_single_url(url) for url in self.url_data]
|
|
141
|
+
|
|
142
|
+
for i, task in enumerate(asyncio.as_completed(tasks)):
|
|
143
|
+
result = await task
|
|
144
|
+
results.append(result)
|
|
145
|
+
|
|
146
|
+
if progress_callback:
|
|
147
|
+
progress_value = int(((i + 1) / total) * 100)
|
|
148
|
+
progress_callback(progress_value, None)
|
|
102
149
|
|
|
103
150
|
return results
|
|
104
151
|
|
|
105
|
-
async def parse_single_url(self,
|
|
106
|
-
"""
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
|
110
|
-
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
|
111
|
-
'Accept-Language': 'fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7',
|
|
112
|
-
'Accept-Encoding': 'gzip, deflate, br',
|
|
113
|
-
'Connection': 'keep-alive',
|
|
114
|
-
'Upgrade-Insecure-Requests': '1'
|
|
115
|
-
}
|
|
116
|
-
proxies = urllib.request.getproxies()
|
|
117
|
-
if "http" in proxies:
|
|
118
|
-
proxies = proxies["http"]
|
|
119
|
-
else:
|
|
120
|
-
proxies = None
|
|
121
|
-
async with session.get(url, headers=headers, proxy=proxies) as response:
|
|
122
|
-
if response.status != 200:
|
|
123
|
-
raise Exception(f"HTTP {response.status}")
|
|
124
|
-
html = await response.text()
|
|
125
|
-
soup = BeautifulSoup(html, 'html.parser')
|
|
126
|
-
meta_desc = ''
|
|
127
|
-
meta_tag = soup.find('meta', attrs={'name': 'description'})
|
|
128
|
-
if not meta_tag:
|
|
129
|
-
meta_tag = soup.find('meta', property='og:description')
|
|
130
|
-
if meta_tag:
|
|
131
|
-
meta_desc = meta_tag.get('content', '')
|
|
132
|
-
content = ''
|
|
152
|
+
async def parse_single_url(self, url: str):
|
|
153
|
+
"""
|
|
154
|
+
Appel sync (requests+ntlm) dans executor, puis parsing BeautifulSoup + extraction contenu.
|
|
155
|
+
"""
|
|
133
156
|
try:
|
|
134
|
-
|
|
157
|
+
loop = asyncio.get_event_loop()
|
|
158
|
+
html = await loop.run_in_executor(None, self._sync_fetch, url)
|
|
159
|
+
|
|
160
|
+
soup = BeautifulSoup(html, 'html.parser')
|
|
161
|
+
meta_desc = ''
|
|
162
|
+
meta_tag = soup.find('meta', attrs={'name': 'description'})
|
|
163
|
+
if not meta_tag:
|
|
164
|
+
meta_tag = soup.find('meta', property='og:description')
|
|
165
|
+
if meta_tag:
|
|
166
|
+
meta_desc = meta_tag.get('content', '')
|
|
167
|
+
|
|
168
|
+
try:
|
|
169
|
+
content = self._extract_main_content(soup)
|
|
170
|
+
except Exception:
|
|
171
|
+
content = ''
|
|
172
|
+
|
|
173
|
+
word_count = len(content.split()) if content else 0
|
|
174
|
+
|
|
175
|
+
return {
|
|
176
|
+
"url": url,
|
|
177
|
+
"content": content,
|
|
178
|
+
"meta_description": meta_desc,
|
|
179
|
+
"word_count": word_count,
|
|
180
|
+
"status": "success"
|
|
181
|
+
}
|
|
182
|
+
|
|
135
183
|
except Exception as e:
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
'status': 'success'
|
|
144
|
-
}
|
|
184
|
+
return {
|
|
185
|
+
"url": url,
|
|
186
|
+
"content": "",
|
|
187
|
+
"meta_description": "",
|
|
188
|
+
"word_count": 0,
|
|
189
|
+
"status": f"error: {str(e)}"
|
|
190
|
+
}
|
|
145
191
|
|
|
146
192
|
def _extract_main_content(self, soup):
|
|
147
193
|
"""Extrait le contenu principal et le convertit en Markdown"""
|
|
@@ -172,7 +218,7 @@ class ParseHMTL(OWWidget):
|
|
|
172
218
|
if paragraphs:
|
|
173
219
|
text = ' '.join([p.get_text(strip=True) for p in paragraphs])
|
|
174
220
|
if len(text) > 100:
|
|
175
|
-
|
|
221
|
+
return text
|
|
176
222
|
paragraphs = soup.find_all('p')
|
|
177
223
|
if paragraphs:
|
|
178
224
|
return ' '.join([p.get_text(strip=True) for p in paragraphs])
|
|
@@ -223,4 +269,3 @@ if __name__ == "__main__":
|
|
|
223
269
|
sys.exit(app.exec())
|
|
224
270
|
else:
|
|
225
271
|
sys.exit(app.exec_())
|
|
226
|
-
|
io4it-3.0.5.99/PKG-INFO
DELETED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/designer/ow_file_ext_selector.ui
RENAMED
|
File without changes
|
{io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/designer/ow_json_to_data_table.ui
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/designer/owdoclingtomarkdown.ui
RENAMED
|
File without changes
|
|
File without changes
|
{io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/designer/owinboxmailmonitoring.ui
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/designer/owofficenormalizer.ui
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/designer/owprocesspoolexecutor.ui
RENAMED
|
File without changes
|
|
File without changes
|
{io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/designer/owvisualizationer.ui
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/icons/process_pool_executor.png
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|