io4it 3.0.5.99__tar.gz → 3.0.5.991__tar.gz

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (99)
  1. io4it-3.0.5.991/PKG-INFO +37 -0
  2. io4it-3.0.5.991/io4it.egg-info/PKG-INFO +37 -0
  3. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/OWParserHTML.py +119 -74
  4. {io4it-3.0.5.99 → io4it-3.0.5.991}/setup.py +1 -1
  5. io4it-3.0.5.99/PKG-INFO +0 -7
  6. io4it-3.0.5.99/io4it.egg-info/PKG-INFO +0 -7
  7. {io4it-3.0.5.99 → io4it-3.0.5.991}/io4it.egg-info/SOURCES.txt +0 -0
  8. {io4it-3.0.5.99 → io4it-3.0.5.991}/io4it.egg-info/dependency_links.txt +0 -0
  9. {io4it-3.0.5.99 → io4it-3.0.5.991}/io4it.egg-info/entry_points.txt +0 -0
  10. {io4it-3.0.5.99 → io4it-3.0.5.991}/io4it.egg-info/namespace_packages.txt +0 -0
  11. {io4it-3.0.5.99 → io4it-3.0.5.991}/io4it.egg-info/requires.txt +0 -0
  12. {io4it-3.0.5.99 → io4it-3.0.5.991}/io4it.egg-info/top_level.txt +0 -0
  13. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/__init__.py +0 -0
  14. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/ocr_function/__init__.py +0 -0
  15. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/ocr_function/word_converter.py +0 -0
  16. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/toolViews/designer/key_manager_ui.ui +0 -0
  17. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/toolViews/key_manager_ui.py +0 -0
  18. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/utils/__init__.py +0 -0
  19. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/utils/audio.py +0 -0
  20. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/utils/keys_manager.py +0 -0
  21. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/utils/mail.py +0 -0
  22. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/utils/pool_exec_utils.py +0 -0
  23. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/utils/secret_manager.py +0 -0
  24. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/utils/utils_md.py +0 -0
  25. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/OWChatGpt.py +0 -0
  26. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/OWDeep_Search.py +0 -0
  27. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/OWDoclingASR.py +0 -0
  28. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/OWDoclingToMarkdown.py +0 -0
  29. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/OWExportMarkdown.py +0 -0
  30. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/OWExtractTablesDocxToXlsx.py +0 -0
  31. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/OWInboxMailMonitoring.py +0 -0
  32. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/OWJsonToDataTable.py +0 -0
  33. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/OWMD2HTML.py +0 -0
  34. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/OWMarkdownLoader.py +0 -0
  35. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/OWMarkdownizer.py +0 -0
  36. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/OWOfficeNormalizer.py +0 -0
  37. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/OWPdfType.py +0 -0
  38. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/OWProcessPoolExecutor.py +0 -0
  39. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/OWS3Uploader.py +0 -0
  40. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/OWS3downloader.py +0 -0
  41. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/OWS3list.py +0 -0
  42. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/OWSpeechToText.py +0 -0
  43. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/OWWebSearch.py +0 -0
  44. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/OWmailLoader.py +0 -0
  45. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/OWmailSender.py +0 -0
  46. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/OWwordpdf2docx.py +0 -0
  47. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/__init__.py +0 -0
  48. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/designer/__init__.py +0 -0
  49. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/designer/nogui.ui +0 -0
  50. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/designer/ow_extract_tables_docx_to_xlsx.ui +0 -0
  51. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/designer/ow_file_ext_selector.ui +0 -0
  52. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/designer/ow_json_to_data_table.ui +0 -0
  53. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/designer/owchatgpt.ui +0 -0
  54. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/designer/owdeepsearch.ui +0 -0
  55. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/designer/owdoclingasr.ui +0 -0
  56. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/designer/owdoclingtomarkdown.ui +0 -0
  57. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/designer/owexportmarkdown.ui +0 -0
  58. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/designer/owinboxmailmonitoring.ui +0 -0
  59. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/designer/owmailloader.ui +0 -0
  60. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/designer/owmailsender.ui +0 -0
  61. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/designer/owmarkdownizer.ui +0 -0
  62. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/designer/owmarkdownloader.ui +0 -0
  63. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/designer/owmd2html.ui +0 -0
  64. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/designer/owofficenormalizer.ui +0 -0
  65. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/designer/owparserhtml.ui +0 -0
  66. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/designer/owpdftype.ui +0 -0
  67. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/designer/owprocesspoolexecutor.ui +0 -0
  68. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/designer/owspeechtotext.ui +0 -0
  69. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/designer/owvisualizationer.ui +0 -0
  70. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/designer/owwebsearch.ui +0 -0
  71. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/designer/wordpdf2docx.ui +0 -0
  72. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/icons/__init__.py +0 -0
  73. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/icons/chatgpt.png +0 -0
  74. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/icons/check_pdf.png +0 -0
  75. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/icons/deepsearch.svg +0 -0
  76. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/icons/dep_md_old.png +0 -0
  77. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/icons/download.png +0 -0
  78. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/icons/export_md.png +0 -0
  79. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/icons/extract_table.png +0 -0
  80. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/icons/file_extensor.png +0 -0
  81. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/icons/html.png +0 -0
  82. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/icons/json-file.png +0 -0
  83. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/icons/list_aws.png +0 -0
  84. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/icons/load_md.png +0 -0
  85. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/icons/mail_loader.png +0 -0
  86. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/icons/mail_writer.png +0 -0
  87. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/icons/md.png +0 -0
  88. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/icons/monitor-email.svg +0 -0
  89. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/icons/office_normalizer.png +0 -0
  90. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/icons/owmd2html.svg +0 -0
  91. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/icons/process_pool_executor.png +0 -0
  92. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/icons/speech_to_text.png +0 -0
  93. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/icons/upload.png +0 -0
  94. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/icons/visualizationer.png +0 -0
  95. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/icons/websearch.png +0 -0
  96. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/icons/wordpdf2docx.png +0 -0
  97. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/IO4IT/widgets/icons_dev/__init__.py +0 -0
  98. {io4it-3.0.5.99 → io4it-3.0.5.991}/orangecontrib/__init__.py +0 -0
  99. {io4it-3.0.5.99 → io4it-3.0.5.991}/setup.cfg +0 -0
@@ -0,0 +1,37 @@
1
+ Metadata-Version: 2.1
2
+ Name: io4it
3
+ Version: 3.0.5.991
4
+ Summary: UNKNOWN
5
+ Home-page:
6
+ Author:
7
+ Author-email:
8
+ License: UNKNOWN
9
+ Keywords: orange3 add-on
10
+ Platform: UNKNOWN
11
+ Requires-Dist: torchvision
12
+ Requires-Dist: torchaudio
13
+ Requires-Dist: torch
14
+ Requires-Dist: pylatexenc
15
+ Requires-Dist: docopt
16
+ Requires-Dist: boto3
17
+ Requires-Dist: opencv-python-headless
18
+ Requires-Dist: docling
19
+ Requires-Dist: docling-core
20
+ Requires-Dist: speechbrain
21
+ Requires-Dist: whisper
22
+ Requires-Dist: whisper-openai
23
+ Requires-Dist: pyannote.audio
24
+ Requires-Dist: pyannote-core
25
+ Requires-Dist: pypandoc
26
+ Requires-Dist: pypandoc-binary
27
+ Requires-Dist: scikit-learn
28
+ Requires-Dist: openai
29
+ Requires-Dist: pip-system-certs
30
+ Requires-Dist: docx2pdf
31
+ Requires-Dist: msal
32
+ Requires-Dist: exchangelib
33
+ Requires-Dist: html2text
34
+ Requires-Dist: ddgs
35
+ Requires-Dist: CATEGORIT
36
+
37
+ UNKNOWN
@@ -0,0 +1,37 @@
1
+ Metadata-Version: 2.1
2
+ Name: io4it
3
+ Version: 3.0.5.991
4
+ Summary: UNKNOWN
5
+ Home-page:
6
+ Author:
7
+ Author-email:
8
+ License: UNKNOWN
9
+ Keywords: orange3 add-on
10
+ Platform: UNKNOWN
11
+ Requires-Dist: torchvision
12
+ Requires-Dist: torchaudio
13
+ Requires-Dist: torch
14
+ Requires-Dist: pylatexenc
15
+ Requires-Dist: docopt
16
+ Requires-Dist: boto3
17
+ Requires-Dist: opencv-python-headless
18
+ Requires-Dist: docling
19
+ Requires-Dist: docling-core
20
+ Requires-Dist: speechbrain
21
+ Requires-Dist: whisper
22
+ Requires-Dist: whisper-openai
23
+ Requires-Dist: pyannote.audio
24
+ Requires-Dist: pyannote-core
25
+ Requires-Dist: pypandoc
26
+ Requires-Dist: pypandoc-binary
27
+ Requires-Dist: scikit-learn
28
+ Requires-Dist: openai
29
+ Requires-Dist: pip-system-certs
30
+ Requires-Dist: docx2pdf
31
+ Requires-Dist: msal
32
+ Requires-Dist: exchangelib
33
+ Requires-Dist: html2text
34
+ Requires-Dist: ddgs
35
+ Requires-Dist: CATEGORIT
36
+
37
+ UNKNOWN
@@ -4,10 +4,15 @@ import Orange
4
4
  from Orange.widgets.widget import OWWidget, Input, Output
5
5
  from AnyQt.QtWidgets import QApplication
6
6
  import asyncio
7
- import aiohttp
8
7
  import html2text
9
8
  from bs4 import BeautifulSoup
10
9
  import urllib.request
10
+ import urllib3
11
+ import requests
12
+ from requests_ntlm import HttpNtlmAuth
13
+
14
+ # Désactive les avertissements SSL pour certificats d'entreprise auto-signés
15
+ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
11
16
 
12
17
  if "site-packages/Orange/widgets" in os.path.dirname(os.path.abspath(__file__)).replace("\\", "/"):
13
18
  from Orange.widgets.orangecontrib.AAIT.utils import thread_management
@@ -25,7 +30,6 @@ class ParseHMTL(OWWidget):
25
30
  if "site-packages/Orange/widgets" in os.path.dirname(os.path.abspath(__file__)).replace("\\", "/"):
26
31
  icon = "icons_dev/html.png"
27
32
  priority = 3000
28
- gui = ""
29
33
  want_control_area = False
30
34
  category = "AAIT - TOOLBOX"
31
35
  gui = os.path.join(os.path.dirname(os.path.abspath(__file__)), "designer/owparserhtml.ui")
@@ -33,17 +37,6 @@ class ParseHMTL(OWWidget):
33
37
  class Inputs:
34
38
  data = Input("Data", Orange.data.Table)
35
39
 
36
- @Inputs.data
37
- def set_data(self, in_data):
38
- if in_data is None:
39
- return
40
- if "url" not in in_data.domain:
41
- self.error("input table need a url column")
42
- return
43
- self.data = in_data
44
- self.url_data = in_data.get_column("url")
45
- self.run()
46
-
47
40
  class Outputs:
48
41
  data = Output("Data", Orange.data.Table)
49
42
 
@@ -55,93 +48,146 @@ class ParseHMTL(OWWidget):
55
48
  uic.loadUi(self.gui, self)
56
49
 
57
50
  self.data = None
51
+ self.url_data = []
58
52
  self.thread = None
59
53
  self.markdown = True
54
+ self.proxy_url = self._get_enterprise_proxy_url()
55
+ self.ntlm_auth = HttpNtlmAuth('', '')
60
56
  self.run()
61
57
 
62
58
  def update_parameters(self):
63
59
  return
64
60
 
65
61
 
62
+
63
+ @Inputs.data
64
+ def set_data(self, in_data):
65
+ if in_data is None:
66
+ return
67
+ if "url" not in in_data.domain:
68
+ self.error("input table need a url column")
69
+ return
70
+ self.data = in_data
71
+ self.url_data = list(in_data.get_column("url"))
72
+ self.run()
73
+
74
+ def _get_enterprise_proxy_url(self):
75
+ proxies_dict = urllib.request.getproxies()
76
+ raw_proxy = proxies_dict.get("http") or proxies_dict.get("https")
77
+ if raw_proxy and not raw_proxy.startswith("http"):
78
+ return f"http://{raw_proxy}"
79
+ return raw_proxy
80
+
81
+ def _sync_fetch(self, url: str) -> str:
82
+ """
83
+ Appel réseau réel (synchrone) compatible NTLM, exécuté dans un thread via run_in_executor.
84
+ """
85
+ session = requests.Session()
86
+ session.verify = False # ignore certifs d'entreprise
87
+
88
+ headers = {
89
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
90
+ '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
91
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
92
+ 'Accept-Language': 'fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7',
93
+ 'Connection': 'keep-alive',
94
+ 'Upgrade-Insecure-Requests': '1'
95
+ }
96
+
97
+ proxies = None
98
+ if self.proxy_url:
99
+ proxies = {"http": self.proxy_url, "https": self.proxy_url}
100
+
101
+ resp = session.get(
102
+ url,
103
+ proxies=proxies,
104
+ auth=self.ntlm_auth,
105
+ headers=headers,
106
+ timeout=30
107
+ )
108
+ resp.raise_for_status()
109
+ return resp.text
110
+
66
111
  def parse_html(self):
67
112
  """Execute le parsing"""
68
113
  try:
69
114
  loop = asyncio.new_event_loop()
70
115
  asyncio.set_event_loop(loop)
71
- results = loop.run_until_complete(self.parse_all_urls())
116
+ results = loop.run_until_complete(self.parse_all_urls(progress_callback=self._progress_cb))
72
117
  loop.close()
73
118
  return results
74
119
  except Exception as e:
75
120
  self.error(str(e))
76
121
  return
77
122
 
123
+ def _progress_cb(self, value: int, text: str = None):
124
+ """
125
+ Callback interne utilisé par parse_all_urls.
126
+ On renvoie un tuple (value, text) au thread Qt, comme ton handle_progress s'y attend.
127
+ """
128
+ if self.thread is not None:
129
+ self.thread.progress.emit((value, text))
78
130
 
79
131
  async def parse_all_urls(self, progress_callback=None):
80
- """Parse toutes les URLs de manière asynchrone"""
132
+ """
133
+ Parse toutes les URLs en concurrence (comme dans le 2e code) en gardant une progression fluide.
134
+ """
81
135
  results = []
82
136
  total = len(self.url_data)
83
- timeout = aiohttp.ClientTimeout(total=30)
84
- connector = aiohttp.TCPConnector(limit=5, limit_per_host=2)
85
- async with aiohttp.ClientSession(timeout=timeout, connector=connector,cookie_jar=aiohttp.CookieJar()) as session:
86
- for idx, url_data in enumerate(self.url_data):
87
- if progress_callback is not None:
88
- progress_value = int((idx / total) * 100)
89
- progress_callback(progress_value)
90
- try:
91
- parsed = await self.parse_single_url(session, url_data)
92
- results.append(parsed)
93
-
94
- except Exception as e:
95
- results.append({
96
- "url": url_data,
97
- 'content': '',
98
- 'meta_description': '',
99
- 'word_count': 0,
100
- 'status': f'error: {str(e)}'
101
- })
137
+ if total == 0:
138
+ return results
139
+
140
+ tasks = [self.parse_single_url(url) for url in self.url_data]
141
+
142
+ for i, task in enumerate(asyncio.as_completed(tasks)):
143
+ result = await task
144
+ results.append(result)
145
+
146
+ if progress_callback:
147
+ progress_value = int(((i + 1) / total) * 100)
148
+ progress_callback(progress_value, None)
102
149
 
103
150
  return results
104
151
 
105
- async def parse_single_url(self, session, url):
106
- """Parse une seule URL"""
107
- # Fetch HTML
108
- headers = {
109
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
110
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
111
- 'Accept-Language': 'fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7',
112
- 'Accept-Encoding': 'gzip, deflate, br',
113
- 'Connection': 'keep-alive',
114
- 'Upgrade-Insecure-Requests': '1'
115
- }
116
- proxies = urllib.request.getproxies()
117
- if "http" in proxies:
118
- proxies = proxies["http"]
119
- else:
120
- proxies = None
121
- async with session.get(url, headers=headers, proxy=proxies) as response:
122
- if response.status != 200:
123
- raise Exception(f"HTTP {response.status}")
124
- html = await response.text()
125
- soup = BeautifulSoup(html, 'html.parser')
126
- meta_desc = ''
127
- meta_tag = soup.find('meta', attrs={'name': 'description'})
128
- if not meta_tag:
129
- meta_tag = soup.find('meta', property='og:description')
130
- if meta_tag:
131
- meta_desc = meta_tag.get('content', '')
132
- content = ''
152
+ async def parse_single_url(self, url: str):
153
+ """
154
+ Appel sync (requests+ntlm) dans executor, puis parsing BeautifulSoup + extraction contenu.
155
+ """
133
156
  try:
134
- content = self._extract_main_content(soup)
157
+ loop = asyncio.get_event_loop()
158
+ html = await loop.run_in_executor(None, self._sync_fetch, url)
159
+
160
+ soup = BeautifulSoup(html, 'html.parser')
161
+ meta_desc = ''
162
+ meta_tag = soup.find('meta', attrs={'name': 'description'})
163
+ if not meta_tag:
164
+ meta_tag = soup.find('meta', property='og:description')
165
+ if meta_tag:
166
+ meta_desc = meta_tag.get('content', '')
167
+
168
+ try:
169
+ content = self._extract_main_content(soup)
170
+ except Exception:
171
+ content = ''
172
+
173
+ word_count = len(content.split()) if content else 0
174
+
175
+ return {
176
+ "url": url,
177
+ "content": content,
178
+ "meta_description": meta_desc,
179
+ "word_count": word_count,
180
+ "status": "success"
181
+ }
182
+
135
183
  except Exception as e:
136
- print(e)
137
- word_count = len(content.split())
138
- return {
139
- "url": url,
140
- 'content': content,
141
- 'meta_description': meta_desc,
142
- 'word_count': word_count,
143
- 'status': 'success'
144
- }
184
+ return {
185
+ "url": url,
186
+ "content": "",
187
+ "meta_description": "",
188
+ "word_count": 0,
189
+ "status": f"error: {str(e)}"
190
+ }
145
191
 
146
192
  def _extract_main_content(self, soup):
147
193
  """Extrait le contenu principal et le convertit en Markdown"""
@@ -172,7 +218,7 @@ class ParseHMTL(OWWidget):
172
218
  if paragraphs:
173
219
  text = ' '.join([p.get_text(strip=True) for p in paragraphs])
174
220
  if len(text) > 100:
175
- return text
221
+ return text
176
222
  paragraphs = soup.find_all('p')
177
223
  if paragraphs:
178
224
  return ' '.join([p.get_text(strip=True) for p in paragraphs])
@@ -223,4 +269,3 @@ if __name__ == "__main__":
223
269
  sys.exit(app.exec())
224
270
  else:
225
271
  sys.exit(app.exec_())
226
-
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
2
2
 
3
3
  # Configuration
4
4
  NAME = "io4it"
5
- VERSION = "3.0.5.99"
5
+ VERSION = "3.0.5.991"
6
6
 
7
7
  INSTALL_REQUIRES = [
8
8
  "torchvision",
io4it-3.0.5.99/PKG-INFO DELETED
@@ -1,7 +0,0 @@
1
- Metadata-Version: 2.1
2
- Name: io4it
3
- Version: 3.0.5.99
4
- Home-page:
5
- Author:
6
- Author-email:
7
- Keywords: orange3 add-on
@@ -1,7 +0,0 @@
1
- Metadata-Version: 2.1
2
- Name: io4it
3
- Version: 3.0.5.99
4
- Home-page:
5
- Author:
6
- Author-email:
7
- Keywords: orange3 add-on
File without changes