pomera-ai-commander 0.1.0 → 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (191)
  1. package/LICENSE +21 -21
  2. package/README.md +105 -680
  3. package/bin/pomera-ai-commander.js +62 -62
  4. package/core/__init__.py +65 -65
  5. package/core/app_context.py +482 -482
  6. package/core/async_text_processor.py +421 -421
  7. package/core/backup_manager.py +655 -655
  8. package/core/backup_recovery_manager.py +1033 -1033
  9. package/core/content_hash_cache.py +508 -508
  10. package/core/context_menu.py +313 -313
  11. package/core/data_validator.py +1066 -1066
  12. package/core/database_connection_manager.py +744 -744
  13. package/core/database_curl_settings_manager.py +608 -608
  14. package/core/database_promera_ai_settings_manager.py +446 -446
  15. package/core/database_schema.py +411 -411
  16. package/core/database_schema_manager.py +395 -395
  17. package/core/database_settings_manager.py +1507 -1507
  18. package/core/database_settings_manager_interface.py +456 -456
  19. package/core/dialog_manager.py +734 -734
  20. package/core/efficient_line_numbers.py +510 -510
  21. package/core/error_handler.py +746 -746
  22. package/core/error_service.py +431 -431
  23. package/core/event_consolidator.py +511 -511
  24. package/core/mcp/__init__.py +43 -43
  25. package/core/mcp/protocol.py +288 -288
  26. package/core/mcp/schema.py +251 -251
  27. package/core/mcp/server_stdio.py +299 -299
  28. package/core/mcp/tool_registry.py +2372 -2345
  29. package/core/memory_efficient_text_widget.py +711 -711
  30. package/core/migration_manager.py +914 -914
  31. package/core/migration_test_suite.py +1085 -1085
  32. package/core/migration_validator.py +1143 -1143
  33. package/core/optimized_find_replace.py +714 -714
  34. package/core/optimized_pattern_engine.py +424 -424
  35. package/core/optimized_search_highlighter.py +552 -552
  36. package/core/performance_monitor.py +674 -674
  37. package/core/persistence_manager.py +712 -712
  38. package/core/progressive_stats_calculator.py +632 -632
  39. package/core/regex_pattern_cache.py +529 -529
  40. package/core/regex_pattern_library.py +350 -350
  41. package/core/search_operation_manager.py +434 -434
  42. package/core/settings_defaults_registry.py +1087 -1087
  43. package/core/settings_integrity_validator.py +1111 -1111
  44. package/core/settings_serializer.py +557 -557
  45. package/core/settings_validator.py +1823 -1823
  46. package/core/smart_stats_calculator.py +709 -709
  47. package/core/statistics_update_manager.py +619 -619
  48. package/core/stats_config_manager.py +858 -858
  49. package/core/streaming_text_handler.py +723 -723
  50. package/core/task_scheduler.py +596 -596
  51. package/core/update_pattern_library.py +168 -168
  52. package/core/visibility_monitor.py +596 -596
  53. package/core/widget_cache.py +498 -498
  54. package/mcp.json +51 -61
  55. package/package.json +61 -57
  56. package/pomera.py +7482 -7482
  57. package/pomera_mcp_server.py +183 -144
  58. package/requirements.txt +32 -0
  59. package/tools/__init__.py +4 -4
  60. package/tools/ai_tools.py +2891 -2891
  61. package/tools/ascii_art_generator.py +352 -352
  62. package/tools/base64_tools.py +183 -183
  63. package/tools/base_tool.py +511 -511
  64. package/tools/case_tool.py +308 -308
  65. package/tools/column_tools.py +395 -395
  66. package/tools/cron_tool.py +884 -884
  67. package/tools/curl_history.py +600 -600
  68. package/tools/curl_processor.py +1207 -1207
  69. package/tools/curl_settings.py +502 -502
  70. package/tools/curl_tool.py +5467 -5467
  71. package/tools/diff_viewer.py +1071 -1071
  72. package/tools/email_extraction_tool.py +248 -248
  73. package/tools/email_header_analyzer.py +425 -425
  74. package/tools/extraction_tools.py +250 -250
  75. package/tools/find_replace.py +1750 -1750
  76. package/tools/folder_file_reporter.py +1463 -1463
  77. package/tools/folder_file_reporter_adapter.py +480 -480
  78. package/tools/generator_tools.py +1216 -1216
  79. package/tools/hash_generator.py +255 -255
  80. package/tools/html_tool.py +656 -656
  81. package/tools/jsonxml_tool.py +729 -729
  82. package/tools/line_tools.py +419 -419
  83. package/tools/markdown_tools.py +561 -561
  84. package/tools/mcp_widget.py +1417 -1417
  85. package/tools/notes_widget.py +973 -973
  86. package/tools/number_base_converter.py +372 -372
  87. package/tools/regex_extractor.py +571 -571
  88. package/tools/slug_generator.py +310 -310
  89. package/tools/sorter_tools.py +458 -458
  90. package/tools/string_escape_tool.py +392 -392
  91. package/tools/text_statistics_tool.py +365 -365
  92. package/tools/text_wrapper.py +430 -430
  93. package/tools/timestamp_converter.py +421 -421
  94. package/tools/tool_loader.py +710 -710
  95. package/tools/translator_tools.py +522 -522
  96. package/tools/url_link_extractor.py +261 -261
  97. package/tools/url_parser.py +204 -204
  98. package/tools/whitespace_tools.py +355 -355
  99. package/tools/word_frequency_counter.py +146 -146
  100. package/core/__pycache__/__init__.cpython-313.pyc +0 -0
  101. package/core/__pycache__/app_context.cpython-313.pyc +0 -0
  102. package/core/__pycache__/async_text_processor.cpython-313.pyc +0 -0
  103. package/core/__pycache__/backup_manager.cpython-313.pyc +0 -0
  104. package/core/__pycache__/backup_recovery_manager.cpython-313.pyc +0 -0
  105. package/core/__pycache__/content_hash_cache.cpython-313.pyc +0 -0
  106. package/core/__pycache__/context_menu.cpython-313.pyc +0 -0
  107. package/core/__pycache__/data_validator.cpython-313.pyc +0 -0
  108. package/core/__pycache__/database_connection_manager.cpython-313.pyc +0 -0
  109. package/core/__pycache__/database_curl_settings_manager.cpython-313.pyc +0 -0
  110. package/core/__pycache__/database_promera_ai_settings_manager.cpython-313.pyc +0 -0
  111. package/core/__pycache__/database_schema.cpython-313.pyc +0 -0
  112. package/core/__pycache__/database_schema_manager.cpython-313.pyc +0 -0
  113. package/core/__pycache__/database_settings_manager.cpython-313.pyc +0 -0
  114. package/core/__pycache__/database_settings_manager_interface.cpython-313.pyc +0 -0
  115. package/core/__pycache__/dialog_manager.cpython-313.pyc +0 -0
  116. package/core/__pycache__/efficient_line_numbers.cpython-313.pyc +0 -0
  117. package/core/__pycache__/error_handler.cpython-313.pyc +0 -0
  118. package/core/__pycache__/error_service.cpython-313.pyc +0 -0
  119. package/core/__pycache__/event_consolidator.cpython-313.pyc +0 -0
  120. package/core/__pycache__/memory_efficient_text_widget.cpython-313.pyc +0 -0
  121. package/core/__pycache__/migration_manager.cpython-313.pyc +0 -0
  122. package/core/__pycache__/migration_test_suite.cpython-313.pyc +0 -0
  123. package/core/__pycache__/migration_validator.cpython-313.pyc +0 -0
  124. package/core/__pycache__/optimized_find_replace.cpython-313.pyc +0 -0
  125. package/core/__pycache__/optimized_pattern_engine.cpython-313.pyc +0 -0
  126. package/core/__pycache__/optimized_search_highlighter.cpython-313.pyc +0 -0
  127. package/core/__pycache__/performance_monitor.cpython-313.pyc +0 -0
  128. package/core/__pycache__/persistence_manager.cpython-313.pyc +0 -0
  129. package/core/__pycache__/progressive_stats_calculator.cpython-313.pyc +0 -0
  130. package/core/__pycache__/regex_pattern_cache.cpython-313.pyc +0 -0
  131. package/core/__pycache__/regex_pattern_library.cpython-313.pyc +0 -0
  132. package/core/__pycache__/search_operation_manager.cpython-313.pyc +0 -0
  133. package/core/__pycache__/settings_defaults_registry.cpython-313.pyc +0 -0
  134. package/core/__pycache__/settings_integrity_validator.cpython-313.pyc +0 -0
  135. package/core/__pycache__/settings_serializer.cpython-313.pyc +0 -0
  136. package/core/__pycache__/settings_validator.cpython-313.pyc +0 -0
  137. package/core/__pycache__/smart_stats_calculator.cpython-313.pyc +0 -0
  138. package/core/__pycache__/statistics_update_manager.cpython-313.pyc +0 -0
  139. package/core/__pycache__/stats_config_manager.cpython-313.pyc +0 -0
  140. package/core/__pycache__/streaming_text_handler.cpython-313.pyc +0 -0
  141. package/core/__pycache__/task_scheduler.cpython-313.pyc +0 -0
  142. package/core/__pycache__/visibility_monitor.cpython-313.pyc +0 -0
  143. package/core/__pycache__/widget_cache.cpython-313.pyc +0 -0
  144. package/core/mcp/__pycache__/__init__.cpython-313.pyc +0 -0
  145. package/core/mcp/__pycache__/protocol.cpython-313.pyc +0 -0
  146. package/core/mcp/__pycache__/schema.cpython-313.pyc +0 -0
  147. package/core/mcp/__pycache__/server_stdio.cpython-313.pyc +0 -0
  148. package/core/mcp/__pycache__/tool_registry.cpython-313.pyc +0 -0
  149. package/tools/__pycache__/__init__.cpython-313.pyc +0 -0
  150. package/tools/__pycache__/ai_tools.cpython-313.pyc +0 -0
  151. package/tools/__pycache__/ascii_art_generator.cpython-313.pyc +0 -0
  152. package/tools/__pycache__/base64_tools.cpython-313.pyc +0 -0
  153. package/tools/__pycache__/base_tool.cpython-313.pyc +0 -0
  154. package/tools/__pycache__/case_tool.cpython-313.pyc +0 -0
  155. package/tools/__pycache__/column_tools.cpython-313.pyc +0 -0
  156. package/tools/__pycache__/cron_tool.cpython-313.pyc +0 -0
  157. package/tools/__pycache__/curl_history.cpython-313.pyc +0 -0
  158. package/tools/__pycache__/curl_processor.cpython-313.pyc +0 -0
  159. package/tools/__pycache__/curl_settings.cpython-313.pyc +0 -0
  160. package/tools/__pycache__/curl_tool.cpython-313.pyc +0 -0
  161. package/tools/__pycache__/diff_viewer.cpython-313.pyc +0 -0
  162. package/tools/__pycache__/email_extraction_tool.cpython-313.pyc +0 -0
  163. package/tools/__pycache__/email_header_analyzer.cpython-313.pyc +0 -0
  164. package/tools/__pycache__/extraction_tools.cpython-313.pyc +0 -0
  165. package/tools/__pycache__/find_replace.cpython-313.pyc +0 -0
  166. package/tools/__pycache__/folder_file_reporter.cpython-313.pyc +0 -0
  167. package/tools/__pycache__/folder_file_reporter_adapter.cpython-313.pyc +0 -0
  168. package/tools/__pycache__/generator_tools.cpython-313.pyc +0 -0
  169. package/tools/__pycache__/hash_generator.cpython-313.pyc +0 -0
  170. package/tools/__pycache__/html_tool.cpython-313.pyc +0 -0
  171. package/tools/__pycache__/huggingface_helper.cpython-313.pyc +0 -0
  172. package/tools/__pycache__/jsonxml_tool.cpython-313.pyc +0 -0
  173. package/tools/__pycache__/line_tools.cpython-313.pyc +0 -0
  174. package/tools/__pycache__/list_comparator.cpython-313.pyc +0 -0
  175. package/tools/__pycache__/markdown_tools.cpython-313.pyc +0 -0
  176. package/tools/__pycache__/mcp_widget.cpython-313.pyc +0 -0
  177. package/tools/__pycache__/notes_widget.cpython-313.pyc +0 -0
  178. package/tools/__pycache__/number_base_converter.cpython-313.pyc +0 -0
  179. package/tools/__pycache__/regex_extractor.cpython-313.pyc +0 -0
  180. package/tools/__pycache__/slug_generator.cpython-313.pyc +0 -0
  181. package/tools/__pycache__/sorter_tools.cpython-313.pyc +0 -0
  182. package/tools/__pycache__/string_escape_tool.cpython-313.pyc +0 -0
  183. package/tools/__pycache__/text_statistics_tool.cpython-313.pyc +0 -0
  184. package/tools/__pycache__/text_wrapper.cpython-313.pyc +0 -0
  185. package/tools/__pycache__/timestamp_converter.cpython-313.pyc +0 -0
  186. package/tools/__pycache__/tool_loader.cpython-313.pyc +0 -0
  187. package/tools/__pycache__/translator_tools.cpython-313.pyc +0 -0
  188. package/tools/__pycache__/url_link_extractor.cpython-313.pyc +0 -0
  189. package/tools/__pycache__/url_parser.cpython-313.pyc +0 -0
  190. package/tools/__pycache__/whitespace_tools.cpython-313.pyc +0 -0
  191. package/tools/__pycache__/word_frequency_counter.cpython-313.pyc +0 -0
package/tools/html_tool.py
@@ -1,657 +1,657 @@
(Every line of this file was deleted and re-added with identical text, so the rewrite most likely changed only whitespace or line endings; the content is shown once below.)

#!/usr/bin/env python3
"""
HTML Extraction Tool Module for Pomera AI Commander

This module provides HTML processing capabilities including:
- Extracting visible text from HTML (as it would appear in a browser)
- Cleaning up HTML by removing unnecessary tags
- Extracting specific HTML elements
- Converting HTML to plain text with proper formatting

Author: Pomera AI Commander
"""

import re
import html
from typing import Dict, Any, List, Optional
import logging


class HTMLExtractionTool:
    """
    HTML Extraction Tool for processing HTML content and extracting useful information.

    Features:
    - Extract visible text from HTML (browser-rendered text)
    - Clean HTML by removing unnecessary tags
    - Extract specific elements (links, images, headings, etc.)
    - Convert HTML to formatted plain text
    - Remove scripts, styles, and other non-visible content
    """

    def __init__(self, logger=None):
        """
        Initialize the HTML Extraction Tool.

        Args:
            logger: Logger instance for debugging
        """
        self.logger = logger or logging.getLogger(__name__)

        # Tags that should be completely removed along with their content
        self.script_style_tags = ['script', 'style', 'noscript', 'meta', 'head', 'title']

        # Block-level tags that should add line breaks
        self.block_tags = [
            'div', 'p', 'br', 'hr', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
            'ul', 'ol', 'li', 'dl', 'dt', 'dd', 'blockquote', 'pre',
            'table', 'tr', 'td', 'th', 'thead', 'tbody', 'tfoot',
            'section', 'article', 'header', 'footer', 'nav', 'aside',
            'main', 'figure', 'figcaption', 'address'
        ]

        # Inline tags that should preserve spacing
        self.inline_tags = [
            'span', 'a', 'strong', 'b', 'em', 'i', 'u', 'small', 'mark',
            'del', 'ins', 'sub', 'sup', 'code', 'kbd', 'samp', 'var',
            'abbr', 'acronym', 'cite', 'dfn', 'q', 'time'
        ]

    def process_text(self, html_content: str, settings: Dict[str, Any]) -> str:
        """
        Process HTML content based on the selected extraction method.

        Args:
            html_content: HTML content to process
            settings: Tool settings dictionary

        Returns:
            Processed text based on the selected method
        """
        try:
            if not html_content.strip():
                return "No HTML content provided."

            extraction_method = settings.get("extraction_method", "visible_text")

            if extraction_method == "visible_text":
                return self.extract_visible_text(html_content, settings)
            elif extraction_method == "clean_html":
                return self.clean_html(html_content, settings)
            elif extraction_method == "extract_links":
                return self.extract_links(html_content, settings)
            elif extraction_method == "extract_images":
                return self.extract_images(html_content, settings)
            elif extraction_method == "extract_headings":
                return self.extract_headings(html_content, settings)
            elif extraction_method == "extract_tables":
                return self.extract_tables(html_content, settings)
            elif extraction_method == "extract_forms":
                return self.extract_forms(html_content, settings)
            else:
                return self.extract_visible_text(html_content, settings)

        except Exception as e:
            self.logger.error(f"Error processing HTML: {e}")
            return f"Error processing HTML: {str(e)}"

    def extract_visible_text(self, html_content: str, settings: Dict[str, Any]) -> str:
        """
        Extract visible text from HTML as it would appear in a browser.

        Args:
            html_content: HTML content to process
            settings: Tool settings

        Returns:
            Visible text with proper formatting
        """
        try:
            # Remove script and style tags with their content
            html_content = self._remove_script_style_tags(html_content)

            # Remove HTML comments
            html_content = re.sub(r'<!--.*?-->', '', html_content, flags=re.DOTALL)

            # Handle block-level tags by adding line breaks
            for tag in self.block_tags:
                # Add line breaks before and after block tags
                html_content = re.sub(f'<{tag}[^>]*>', f'\n<{tag}>', html_content, flags=re.IGNORECASE)
                html_content = re.sub(f'</{tag}>', f'</{tag}>\n', html_content, flags=re.IGNORECASE)

            # Handle list items specially
            html_content = re.sub(r'<li[^>]*>', '\n• ', html_content, flags=re.IGNORECASE)
            html_content = re.sub(r'</li>', '', html_content, flags=re.IGNORECASE)

            # Handle table cells
            html_content = re.sub(r'<td[^>]*>', '\t', html_content, flags=re.IGNORECASE)
            html_content = re.sub(r'</td>', '', html_content, flags=re.IGNORECASE)
            html_content = re.sub(r'<th[^>]*>', '\t', html_content, flags=re.IGNORECASE)
            html_content = re.sub(r'</th>', '', html_content, flags=re.IGNORECASE)

            # Remove all remaining HTML tags
            html_content = re.sub(r'<[^>]+>', '', html_content)

            # Decode HTML entities
            html_content = html.unescape(html_content)

            # Clean up whitespace
            lines = html_content.split('\n')
            cleaned_lines = []

            for line in lines:
                line = line.strip()
                if line: # Only keep non-empty lines
                    cleaned_lines.append(line)

            # Join lines and clean up multiple line breaks
            result = '\n'.join(cleaned_lines)

            # Remove excessive line breaks
            result = re.sub(r'\n{3,}', '\n\n', result)

            # Add formatting options
            if settings.get("preserve_links", False):
                result = self._add_link_references(html_content, result)

            return result.strip()

        except Exception as e:
            self.logger.error(f"Error extracting visible text: {e}")
            return f"Error extracting visible text: {str(e)}"

    def clean_html(self, html_content: str, settings: Dict[str, Any]) -> str:
        """
        Clean HTML by removing unnecessary tags and attributes.

        Args:
            html_content: HTML content to clean
            settings: Tool settings

        Returns:
            Cleaned HTML
        """
        try:
            # Remove script and style tags if requested
            if settings.get("remove_scripts", True):
                html_content = self._remove_script_style_tags(html_content)

            # Remove HTML comments
            if settings.get("remove_comments", True):
                html_content = re.sub(r'<!--.*?-->', '', html_content, flags=re.DOTALL)

            # Remove specific attributes if requested
            if settings.get("remove_style_attrs", True):
                html_content = re.sub(r'\s+style\s*=\s*["\'][^"\']*["\']', '', html_content, flags=re.IGNORECASE)

            if settings.get("remove_class_attrs", False):
                html_content = re.sub(r'\s+class\s*=\s*["\'][^"\']*["\']', '', html_content, flags=re.IGNORECASE)

            if settings.get("remove_id_attrs", False):
                html_content = re.sub(r'\s+id\s*=\s*["\'][^"\']*["\']', '', html_content, flags=re.IGNORECASE)

            # Remove empty tags if requested
            if settings.get("remove_empty_tags", True):
                # Remove tags that are completely empty
                html_content = re.sub(r'<(\w+)[^>]*>\s*</\1>', '', html_content, flags=re.IGNORECASE)

            # Clean up whitespace
            html_content = re.sub(r'\n\s*\n', '\n', html_content)
            html_content = re.sub(r'>\s+<', '><', html_content)

            return html_content.strip()

        except Exception as e:
            self.logger.error(f"Error cleaning HTML: {e}")
            return f"Error cleaning HTML: {str(e)}"

    def extract_links(self, html_content: str, settings: Dict[str, Any]) -> str:
        """
        Extract all links from HTML content.

        Args:
            html_content: HTML content to process
            settings: Tool settings

        Returns:
            List of links with their text
        """
        try:
            # Find all anchor tags
            link_pattern = r'<a[^>]*href\s*=\s*["\']([^"\']*)["\'][^>]*>(.*?)</a>'
            links = re.findall(link_pattern, html_content, flags=re.IGNORECASE | re.DOTALL)

            if not links:
                return "No links found in the HTML content."

            result_lines = []
            include_text = settings.get("include_link_text", True)
            absolute_only = settings.get("absolute_links_only", False)

            for href, link_text in links:
                # Clean up link text
                link_text = re.sub(r'<[^>]+>', '', link_text).strip()
                link_text = html.unescape(link_text)

                # Filter absolute links if requested
                if absolute_only and not (href.startswith('http://') or href.startswith('https://')):
                    continue

                if include_text and link_text:
                    result_lines.append(f"{link_text}: {href}")
                else:
                    result_lines.append(href)

            return '\n'.join(result_lines) if result_lines else "No links match the specified criteria."

        except Exception as e:
            self.logger.error(f"Error extracting links: {e}")
            return f"Error extracting links: {str(e)}"

    def extract_images(self, html_content: str, settings: Dict[str, Any]) -> str:
        """
        Extract all images from HTML content.

        Args:
            html_content: HTML content to process
            settings: Tool settings

        Returns:
            List of images with their attributes
        """
        try:
            # Find all img tags
            img_pattern = r'<img[^>]*>'
            images = re.findall(img_pattern, html_content, flags=re.IGNORECASE)

            if not images:
                return "No images found in the HTML content."

            result_lines = []
            include_alt = settings.get("include_alt_text", True)
            include_title = settings.get("include_title", False)

            for img_tag in images:
                # Extract src attribute
                src_match = re.search(r'src\s*=\s*["\']([^"\']*)["\']', img_tag, re.IGNORECASE)
                src = src_match.group(1) if src_match else "No src"

                # Extract alt attribute
                alt_match = re.search(r'alt\s*=\s*["\']([^"\']*)["\']', img_tag, re.IGNORECASE)
                alt = alt_match.group(1) if alt_match else ""

                # Extract title attribute
                title_match = re.search(r'title\s*=\s*["\']([^"\']*)["\']', img_tag, re.IGNORECASE)
                title = title_match.group(1) if title_match else ""

                # Build result line
                parts = [src]
                if include_alt and alt:
                    parts.append(f"Alt: {alt}")
                if include_title and title:
                    parts.append(f"Title: {title}")

                result_lines.append(" | ".join(parts))

            return '\n'.join(result_lines)

        except Exception as e:
            self.logger.error(f"Error extracting images: {e}")
            return f"Error extracting images: {str(e)}"

    def extract_headings(self, html_content: str, settings: Dict[str, Any]) -> str:
        """
        Extract all headings from HTML content.

        Args:
            html_content: HTML content to process
            settings: Tool settings

        Returns:
            List of headings with their levels
        """
        try:
            # Find all heading tags
            heading_pattern = r'<(h[1-6])[^>]*>(.*?)</\1>'
            headings = re.findall(heading_pattern, html_content, flags=re.IGNORECASE | re.DOTALL)

            if not headings:
                return "No headings found in the HTML content."

            result_lines = []
            include_level = settings.get("include_heading_level", True)

            for tag, content in headings:
                # Clean up heading content
                content = re.sub(r'<[^>]+>', '', content).strip()
                content = html.unescape(content)

                if include_level:
                    level = tag.upper()
                    result_lines.append(f"{level}: {content}")
                else:
                    result_lines.append(content)

            return '\n'.join(result_lines)

        except Exception as e:
            self.logger.error(f"Error extracting headings: {e}")
            return f"Error extracting headings: {str(e)}"

    def extract_tables(self, html_content: str, settings: Dict[str, Any]) -> str:
        """
        Extract table data from HTML content.

        Args:
            html_content: HTML content to process
            settings: Tool settings

        Returns:
            Formatted table data
        """
        try:
            # Find all table tags
            table_pattern = r'<table[^>]*>(.*?)</table>'
            tables = re.findall(table_pattern, html_content, flags=re.IGNORECASE | re.DOTALL)

            if not tables:
                return "No tables found in the HTML content."

            result_lines = []
            separator = settings.get("column_separator", "\t")

            for i, table_content in enumerate(tables):
                if len(tables) > 1:
                    result_lines.append(f"\n--- Table {i + 1} ---")

                # Find all rows
                row_pattern = r'<tr[^>]*>(.*?)</tr>'
                rows = re.findall(row_pattern, table_content, flags=re.IGNORECASE | re.DOTALL)

                for row_content in rows:
                    # Find all cells (td or th)
                    cell_pattern = r'<(?:td|th)[^>]*>(.*?)</(?:td|th)>'
                    cells = re.findall(cell_pattern, row_content, flags=re.IGNORECASE | re.DOTALL)

                    # Clean up cell content
                    cleaned_cells = []
                    for cell in cells:
                        cell = re.sub(r'<[^>]+>', '', cell).strip()
                        cell = html.unescape(cell)
                        cleaned_cells.append(cell)

                    if cleaned_cells:
                        result_lines.append(separator.join(cleaned_cells))

            return '\n'.join(result_lines)

        except Exception as e:
            self.logger.error(f"Error extracting tables: {e}")
            return f"Error extracting tables: {str(e)}"

    def extract_forms(self, html_content: str, settings: Dict[str, Any]) -> str:
        """
        Extract form information from HTML content.

        Args:
            html_content: HTML content to process
            settings: Tool settings

        Returns:
            Form structure information
        """
        try:
            # Find all form tags
            form_pattern = r'<form[^>]*>(.*?)</form>'
            forms = re.findall(form_pattern, html_content, flags=re.IGNORECASE | re.DOTALL)

            if not forms:
                return "No forms found in the HTML content."

            result_lines = []

            for i, form_content in enumerate(forms):
                if len(forms) > 1:
                    result_lines.append(f"\n--- Form {i + 1} ---")

                # Extract form attributes
                form_tag_match = re.search(r'<form([^>]*)>', html_content, re.IGNORECASE)
                if form_tag_match:
                    form_attrs = form_tag_match.group(1)

                    # Extract action
                    action_match = re.search(r'action\s*=\s*["\']([^"\']*)["\']', form_attrs, re.IGNORECASE)
                    if action_match:
                        result_lines.append(f"Action: {action_match.group(1)}")

                    # Extract method
                    method_match = re.search(r'method\s*=\s*["\']([^"\']*)["\']', form_attrs, re.IGNORECASE)
                    if method_match:
                        result_lines.append(f"Method: {method_match.group(1)}")

                # Find all input fields
                input_pattern = r'<input[^>]*>'
                inputs = re.findall(input_pattern, form_content, flags=re.IGNORECASE)

                if inputs:
                    result_lines.append("Input Fields:")
                    for input_tag in inputs:
                        # Extract input attributes
                        name_match = re.search(r'name\s*=\s*["\']([^"\']*)["\']', input_tag, re.IGNORECASE)
                        type_match = re.search(r'type\s*=\s*["\']([^"\']*)["\']', input_tag, re.IGNORECASE)

                        name = name_match.group(1) if name_match else "unnamed"
                        input_type = type_match.group(1) if type_match else "text"

                        result_lines.append(f" - {name} ({input_type})")

                # Find all textarea fields
                textarea_pattern = r'<textarea[^>]*name\s*=\s*["\']([^"\']*)["\'][^>]*>'
                textareas = re.findall(textarea_pattern, form_content, flags=re.IGNORECASE)

                if textareas:
                    result_lines.append("Textarea Fields:")
                    for name in textareas:
                        result_lines.append(f" - {name}")

                # Find all select fields
                select_pattern = r'<select[^>]*name\s*=\s*["\']([^"\']*)["\'][^>]*>'
                selects = re.findall(select_pattern, form_content, flags=re.IGNORECASE)

                if selects:
                    result_lines.append("Select Fields:")
                    for name in selects:
                        result_lines.append(f" - {name}")

            return '\n'.join(result_lines)

        except Exception as e:
            self.logger.error(f"Error extracting forms: {e}")
            return f"Error extracting forms: {str(e)}"

    def _remove_script_style_tags(self, html_content: str) -> str:
        """Remove script and style tags with their content."""
        for tag in self.script_style_tags:
            pattern = f'<{tag}[^>]*>.*?</{tag}>'
            html_content = re.sub(pattern, '', html_content, flags=re.IGNORECASE | re.DOTALL)
        return html_content

    def _add_link_references(self, original_html: str, text_result: str) -> str:
        """Add link references to the text result."""
        try:
            # This is a simplified implementation
            # In a full implementation, you might want to add footnote-style references
            link_pattern = r'<a[^>]*href\s*=\s*["\']([^"\']*)["\'][^>]*>(.*?)</a>'
            links = re.findall(link_pattern, original_html, flags=re.IGNORECASE | re.DOTALL)

            if links:
                text_result += "\n\nLinks found in document:\n"
                for i, (href, link_text) in enumerate(links, 1):
                    link_text = re.sub(r'<[^>]+>', '', link_text).strip()
                    link_text = html.unescape(link_text)
                    text_result += f"{i}. {link_text}: {href}\n"

            return text_result
        except Exception:
            return text_result


# Tool settings configuration
def get_default_settings():
    """Get default settings for the HTML Extraction Tool."""
    return {
        "extraction_method": "visible_text",
        "preserve_links": False,
        "remove_scripts": True,
        "remove_comments": True,
        "remove_style_attrs": True,
        "remove_class_attrs": False,
        "remove_id_attrs": False,
        "remove_empty_tags": True,
        "include_link_text": True,
        "absolute_links_only": False,
        "include_alt_text": True,
        "include_title": False,
        "include_heading_level": True,
        "column_separator": "\t"
    }


def get_settings_ui_config():
    """Get UI configuration for the HTML Extraction Tool settings."""
    return {
        "extraction_method": {
            "type": "dropdown",
            "label": "Extraction Method",
            "options": [
                ("Extract Visible Text", "visible_text"),
                ("Clean HTML", "clean_html"),
                ("Extract Links", "extract_links"),
                ("Extract Images", "extract_images"),
                ("Extract Headings", "extract_headings"),
                ("Extract Tables", "extract_tables"),
                ("Extract Forms", "extract_forms")
            ],
            "default": "visible_text"
        },
        "preserve_links": {
            "type": "checkbox",
            "label": "Add link references to visible text",
            "default": False,
            "show_when": {"extraction_method": "visible_text"}
        },
        "remove_scripts": {
            "type": "checkbox",
            "label": "Remove script and style tags",
            "default": True,
            "show_when": {"extraction_method": "clean_html"}
        },
        "remove_comments": {
            "type": "checkbox",
            "label": "Remove HTML comments",
            "default": True,
            "show_when": {"extraction_method": "clean_html"}
        },
        "remove_style_attrs": {
            "type": "checkbox",
            "label": "Remove style attributes",
            "default": True,
            "show_when": {"extraction_method": "clean_html"}
        },
        "remove_class_attrs": {
            "type": "checkbox",
            "label": "Remove class attributes",
            "default": False,
            "show_when": {"extraction_method": "clean_html"}
        },
        "remove_id_attrs": {
            "type": "checkbox",
            "label": "Remove ID attributes",
            "default": False,
            "show_when": {"extraction_method": "clean_html"}
        },
        "remove_empty_tags": {
            "type": "checkbox",
            "label": "Remove empty tags",
            "default": True,
            "show_when": {"extraction_method": "clean_html"}
        },
        "include_link_text": {
            "type": "checkbox",
            "label": "Include link text",
            "default": True,
            "show_when": {"extraction_method": "extract_links"}
        },
        "absolute_links_only": {
            "type": "checkbox",
            "label": "Only absolute links (http/https)",
            "default": False,
            "show_when": {"extraction_method": "extract_links"}
        },
        "include_alt_text": {
            "type": "checkbox",
            "label": "Include alt text",
            "default": True,
            "show_when": {"extraction_method": "extract_images"}
        },
        "include_title": {
            "type": "checkbox",
            "label": "Include title attribute",
            "default": False,
            "show_when": {"extraction_method": "extract_images"}
        },
        "include_heading_level": {
            "type": "checkbox",
            "label": "Include heading level (H1, H2, etc.)",
            "default": True,
            "show_when": {"extraction_method": "extract_headings"}
        },
        "column_separator": {
            "type": "entry",
            "label": "Column separator",
            "default": "\t",
            "show_when": {"extraction_method": "extract_tables"}
        }
    }


# BaseTool-compatible wrapper
try:
    from tools.base_tool import ToolWithOptions
    import tkinter as tk
    from tkinter import ttk

    class HTMLToolV2(ToolWithOptions):
        """
        BaseTool-compatible version of HTMLExtractionTool.
        """

        TOOL_NAME = "HTML Tool"
        TOOL_DESCRIPTION = "Extract and process HTML content"
        TOOL_VERSION = "2.0.0"

        OPTIONS = [
            ("Visible Text", "visible_text"),
            ("Clean HTML", "clean_html"),
            ("Extract Links", "extract_links"),
            ("Extract Images", "extract_images"),
            ("Extract Headings", "extract_headings"),
            ("Extract Tables", "extract_tables"),
            ("Extract Forms", "extract_forms"),
        ]
        OPTIONS_LABEL = "Operation"
        USE_DROPDOWN = True
        DEFAULT_OPTION = "visible_text"

        def __init__(self):
            super().__init__()
            self._tool = HTMLExtractionTool()

        def process_text(self, input_text: str, settings: Dict[str, Any]) -> str:
            """Process HTML content."""
            mode = settings.get("mode", "visible_text")
            tool_settings = {"extraction_method": mode}
            return self._tool.process_text(input_text, tool_settings)

except ImportError:
    pass
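
For orientation, here is a minimal usage sketch of the HTMLExtractionTool API shown in this diff. The module path tools.html_tool is assumed from the file list above (the package root must be on sys.path); everything else follows the code as published, which dispatches on the "extraction_method" key of the settings dict:

    # Minimal sketch, assuming the file imports as tools.html_tool
    from tools.html_tool import HTMLExtractionTool, get_default_settings

    tool = HTMLExtractionTool()
    settings = get_default_settings()

    doc = '<div><h1>Title</h1><p>See <a href="https://example.com">Example</a>.</p></div>'

    # Default method "visible_text": block tags become line breaks, inline tags do not
    print(tool.process_text(doc, settings))
    # Prints:
    #   Title
    #   See Example.

    # Other methods are selected through the settings dict
    settings["extraction_method"] = "extract_links"
    print(tool.process_text(doc, settings))
    # Prints: Example: https://example.com

Note that all extraction is regex-based rather than parser-based, so output on malformed or heavily nested HTML may differ from what a real browser renders.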