pomera-ai-commander 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (192) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +680 -0
  3. package/bin/pomera-ai-commander.js +62 -0
  4. package/core/__init__.py +66 -0
  5. package/core/__pycache__/__init__.cpython-313.pyc +0 -0
  6. package/core/__pycache__/app_context.cpython-313.pyc +0 -0
  7. package/core/__pycache__/async_text_processor.cpython-313.pyc +0 -0
  8. package/core/__pycache__/backup_manager.cpython-313.pyc +0 -0
  9. package/core/__pycache__/backup_recovery_manager.cpython-313.pyc +0 -0
  10. package/core/__pycache__/content_hash_cache.cpython-313.pyc +0 -0
  11. package/core/__pycache__/context_menu.cpython-313.pyc +0 -0
  12. package/core/__pycache__/data_validator.cpython-313.pyc +0 -0
  13. package/core/__pycache__/database_connection_manager.cpython-313.pyc +0 -0
  14. package/core/__pycache__/database_curl_settings_manager.cpython-313.pyc +0 -0
  15. package/core/__pycache__/database_promera_ai_settings_manager.cpython-313.pyc +0 -0
  16. package/core/__pycache__/database_schema.cpython-313.pyc +0 -0
  17. package/core/__pycache__/database_schema_manager.cpython-313.pyc +0 -0
  18. package/core/__pycache__/database_settings_manager.cpython-313.pyc +0 -0
  19. package/core/__pycache__/database_settings_manager_interface.cpython-313.pyc +0 -0
  20. package/core/__pycache__/dialog_manager.cpython-313.pyc +0 -0
  21. package/core/__pycache__/efficient_line_numbers.cpython-313.pyc +0 -0
  22. package/core/__pycache__/error_handler.cpython-313.pyc +0 -0
  23. package/core/__pycache__/error_service.cpython-313.pyc +0 -0
  24. package/core/__pycache__/event_consolidator.cpython-313.pyc +0 -0
  25. package/core/__pycache__/memory_efficient_text_widget.cpython-313.pyc +0 -0
  26. package/core/__pycache__/migration_manager.cpython-313.pyc +0 -0
  27. package/core/__pycache__/migration_test_suite.cpython-313.pyc +0 -0
  28. package/core/__pycache__/migration_validator.cpython-313.pyc +0 -0
  29. package/core/__pycache__/optimized_find_replace.cpython-313.pyc +0 -0
  30. package/core/__pycache__/optimized_pattern_engine.cpython-313.pyc +0 -0
  31. package/core/__pycache__/optimized_search_highlighter.cpython-313.pyc +0 -0
  32. package/core/__pycache__/performance_monitor.cpython-313.pyc +0 -0
  33. package/core/__pycache__/persistence_manager.cpython-313.pyc +0 -0
  34. package/core/__pycache__/progressive_stats_calculator.cpython-313.pyc +0 -0
  35. package/core/__pycache__/regex_pattern_cache.cpython-313.pyc +0 -0
  36. package/core/__pycache__/regex_pattern_library.cpython-313.pyc +0 -0
  37. package/core/__pycache__/search_operation_manager.cpython-313.pyc +0 -0
  38. package/core/__pycache__/settings_defaults_registry.cpython-313.pyc +0 -0
  39. package/core/__pycache__/settings_integrity_validator.cpython-313.pyc +0 -0
  40. package/core/__pycache__/settings_serializer.cpython-313.pyc +0 -0
  41. package/core/__pycache__/settings_validator.cpython-313.pyc +0 -0
  42. package/core/__pycache__/smart_stats_calculator.cpython-313.pyc +0 -0
  43. package/core/__pycache__/statistics_update_manager.cpython-313.pyc +0 -0
  44. package/core/__pycache__/stats_config_manager.cpython-313.pyc +0 -0
  45. package/core/__pycache__/streaming_text_handler.cpython-313.pyc +0 -0
  46. package/core/__pycache__/task_scheduler.cpython-313.pyc +0 -0
  47. package/core/__pycache__/visibility_monitor.cpython-313.pyc +0 -0
  48. package/core/__pycache__/widget_cache.cpython-313.pyc +0 -0
  49. package/core/app_context.py +482 -0
  50. package/core/async_text_processor.py +422 -0
  51. package/core/backup_manager.py +656 -0
  52. package/core/backup_recovery_manager.py +1034 -0
  53. package/core/content_hash_cache.py +509 -0
  54. package/core/context_menu.py +313 -0
  55. package/core/data_validator.py +1067 -0
  56. package/core/database_connection_manager.py +745 -0
  57. package/core/database_curl_settings_manager.py +609 -0
  58. package/core/database_promera_ai_settings_manager.py +447 -0
  59. package/core/database_schema.py +412 -0
  60. package/core/database_schema_manager.py +396 -0
  61. package/core/database_settings_manager.py +1508 -0
  62. package/core/database_settings_manager_interface.py +457 -0
  63. package/core/dialog_manager.py +735 -0
  64. package/core/efficient_line_numbers.py +511 -0
  65. package/core/error_handler.py +747 -0
  66. package/core/error_service.py +431 -0
  67. package/core/event_consolidator.py +512 -0
  68. package/core/mcp/__init__.py +43 -0
  69. package/core/mcp/__pycache__/__init__.cpython-313.pyc +0 -0
  70. package/core/mcp/__pycache__/protocol.cpython-313.pyc +0 -0
  71. package/core/mcp/__pycache__/schema.cpython-313.pyc +0 -0
  72. package/core/mcp/__pycache__/server_stdio.cpython-313.pyc +0 -0
  73. package/core/mcp/__pycache__/tool_registry.cpython-313.pyc +0 -0
  74. package/core/mcp/protocol.py +288 -0
  75. package/core/mcp/schema.py +251 -0
  76. package/core/mcp/server_stdio.py +299 -0
  77. package/core/mcp/tool_registry.py +2345 -0
  78. package/core/memory_efficient_text_widget.py +712 -0
  79. package/core/migration_manager.py +915 -0
  80. package/core/migration_test_suite.py +1086 -0
  81. package/core/migration_validator.py +1144 -0
  82. package/core/optimized_find_replace.py +715 -0
  83. package/core/optimized_pattern_engine.py +424 -0
  84. package/core/optimized_search_highlighter.py +553 -0
  85. package/core/performance_monitor.py +675 -0
  86. package/core/persistence_manager.py +713 -0
  87. package/core/progressive_stats_calculator.py +632 -0
  88. package/core/regex_pattern_cache.py +530 -0
  89. package/core/regex_pattern_library.py +351 -0
  90. package/core/search_operation_manager.py +435 -0
  91. package/core/settings_defaults_registry.py +1087 -0
  92. package/core/settings_integrity_validator.py +1112 -0
  93. package/core/settings_serializer.py +558 -0
  94. package/core/settings_validator.py +1824 -0
  95. package/core/smart_stats_calculator.py +710 -0
  96. package/core/statistics_update_manager.py +619 -0
  97. package/core/stats_config_manager.py +858 -0
  98. package/core/streaming_text_handler.py +723 -0
  99. package/core/task_scheduler.py +596 -0
  100. package/core/update_pattern_library.py +169 -0
  101. package/core/visibility_monitor.py +596 -0
  102. package/core/widget_cache.py +498 -0
  103. package/mcp.json +61 -0
  104. package/package.json +57 -0
  105. package/pomera.py +7483 -0
  106. package/pomera_mcp_server.py +144 -0
  107. package/tools/__init__.py +5 -0
  108. package/tools/__pycache__/__init__.cpython-313.pyc +0 -0
  109. package/tools/__pycache__/ai_tools.cpython-313.pyc +0 -0
  110. package/tools/__pycache__/ascii_art_generator.cpython-313.pyc +0 -0
  111. package/tools/__pycache__/base64_tools.cpython-313.pyc +0 -0
  112. package/tools/__pycache__/base_tool.cpython-313.pyc +0 -0
  113. package/tools/__pycache__/case_tool.cpython-313.pyc +0 -0
  114. package/tools/__pycache__/column_tools.cpython-313.pyc +0 -0
  115. package/tools/__pycache__/cron_tool.cpython-313.pyc +0 -0
  116. package/tools/__pycache__/curl_history.cpython-313.pyc +0 -0
  117. package/tools/__pycache__/curl_processor.cpython-313.pyc +0 -0
  118. package/tools/__pycache__/curl_settings.cpython-313.pyc +0 -0
  119. package/tools/__pycache__/curl_tool.cpython-313.pyc +0 -0
  120. package/tools/__pycache__/diff_viewer.cpython-313.pyc +0 -0
  121. package/tools/__pycache__/email_extraction_tool.cpython-313.pyc +0 -0
  122. package/tools/__pycache__/email_header_analyzer.cpython-313.pyc +0 -0
  123. package/tools/__pycache__/extraction_tools.cpython-313.pyc +0 -0
  124. package/tools/__pycache__/find_replace.cpython-313.pyc +0 -0
  125. package/tools/__pycache__/folder_file_reporter.cpython-313.pyc +0 -0
  126. package/tools/__pycache__/folder_file_reporter_adapter.cpython-313.pyc +0 -0
  127. package/tools/__pycache__/generator_tools.cpython-313.pyc +0 -0
  128. package/tools/__pycache__/hash_generator.cpython-313.pyc +0 -0
  129. package/tools/__pycache__/html_tool.cpython-313.pyc +0 -0
  130. package/tools/__pycache__/huggingface_helper.cpython-313.pyc +0 -0
  131. package/tools/__pycache__/jsonxml_tool.cpython-313.pyc +0 -0
  132. package/tools/__pycache__/line_tools.cpython-313.pyc +0 -0
  133. package/tools/__pycache__/list_comparator.cpython-313.pyc +0 -0
  134. package/tools/__pycache__/markdown_tools.cpython-313.pyc +0 -0
  135. package/tools/__pycache__/mcp_widget.cpython-313.pyc +0 -0
  136. package/tools/__pycache__/notes_widget.cpython-313.pyc +0 -0
  137. package/tools/__pycache__/number_base_converter.cpython-313.pyc +0 -0
  138. package/tools/__pycache__/regex_extractor.cpython-313.pyc +0 -0
  139. package/tools/__pycache__/slug_generator.cpython-313.pyc +0 -0
  140. package/tools/__pycache__/sorter_tools.cpython-313.pyc +0 -0
  141. package/tools/__pycache__/string_escape_tool.cpython-313.pyc +0 -0
  142. package/tools/__pycache__/text_statistics_tool.cpython-313.pyc +0 -0
  143. package/tools/__pycache__/text_wrapper.cpython-313.pyc +0 -0
  144. package/tools/__pycache__/timestamp_converter.cpython-313.pyc +0 -0
  145. package/tools/__pycache__/tool_loader.cpython-313.pyc +0 -0
  146. package/tools/__pycache__/translator_tools.cpython-313.pyc +0 -0
  147. package/tools/__pycache__/url_link_extractor.cpython-313.pyc +0 -0
  148. package/tools/__pycache__/url_parser.cpython-313.pyc +0 -0
  149. package/tools/__pycache__/whitespace_tools.cpython-313.pyc +0 -0
  150. package/tools/__pycache__/word_frequency_counter.cpython-313.pyc +0 -0
  151. package/tools/ai_tools.py +2892 -0
  152. package/tools/ascii_art_generator.py +353 -0
  153. package/tools/base64_tools.py +184 -0
  154. package/tools/base_tool.py +511 -0
  155. package/tools/case_tool.py +309 -0
  156. package/tools/column_tools.py +396 -0
  157. package/tools/cron_tool.py +885 -0
  158. package/tools/curl_history.py +601 -0
  159. package/tools/curl_processor.py +1208 -0
  160. package/tools/curl_settings.py +503 -0
  161. package/tools/curl_tool.py +5467 -0
  162. package/tools/diff_viewer.py +1072 -0
  163. package/tools/email_extraction_tool.py +249 -0
  164. package/tools/email_header_analyzer.py +426 -0
  165. package/tools/extraction_tools.py +250 -0
  166. package/tools/find_replace.py +1751 -0
  167. package/tools/folder_file_reporter.py +1463 -0
  168. package/tools/folder_file_reporter_adapter.py +480 -0
  169. package/tools/generator_tools.py +1217 -0
  170. package/tools/hash_generator.py +256 -0
  171. package/tools/html_tool.py +657 -0
  172. package/tools/huggingface_helper.py +449 -0
  173. package/tools/jsonxml_tool.py +730 -0
  174. package/tools/line_tools.py +419 -0
  175. package/tools/list_comparator.py +720 -0
  176. package/tools/markdown_tools.py +562 -0
  177. package/tools/mcp_widget.py +1417 -0
  178. package/tools/notes_widget.py +973 -0
  179. package/tools/number_base_converter.py +373 -0
  180. package/tools/regex_extractor.py +572 -0
  181. package/tools/slug_generator.py +311 -0
  182. package/tools/sorter_tools.py +459 -0
  183. package/tools/string_escape_tool.py +393 -0
  184. package/tools/text_statistics_tool.py +366 -0
  185. package/tools/text_wrapper.py +431 -0
  186. package/tools/timestamp_converter.py +422 -0
  187. package/tools/tool_loader.py +710 -0
  188. package/tools/translator_tools.py +523 -0
  189. package/tools/url_link_extractor.py +262 -0
  190. package/tools/url_parser.py +205 -0
  191. package/tools/whitespace_tools.py +356 -0
  192. package/tools/word_frequency_counter.py +147 -0
@@ -0,0 +1,657 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ HTML Extraction Tool Module for Pomera AI Commander
4
+
5
+ This module provides HTML processing capabilities including:
6
+ - Extracting visible text from HTML (as it would appear in a browser)
7
+ - Cleaning up HTML by removing unnecessary tags
8
+ - Extracting specific HTML elements
9
+ - Converting HTML to plain text with proper formatting
10
+
11
+ Author: Pomera AI Commander
12
+ """
13
+
14
+ import re
15
+ import html
16
+ from typing import Dict, Any, List, Optional
17
+ import logging
18
+
19
+
20
class HTMLExtractionTool:
    """
    HTML Extraction Tool for processing HTML content and extracting useful information.

    Features:
    - Extract visible text from HTML (browser-rendered text)
    - Clean HTML by removing unnecessary tags
    - Extract specific elements (links, images, headings, etc.)
    - Convert HTML to formatted plain text
    - Remove scripts, styles, and other non-visible content
    """

    def __init__(self, logger=None):
        """
        Initialize the HTML Extraction Tool.

        Args:
            logger: Logger instance for debugging; defaults to a module logger.
        """
        self.logger = logger if logger is not None else logging.getLogger(__name__)

        # Tags whose entire content is invisible in a browser and is removed
        # together with the tag itself.
        self.script_style_tags = [
            'script', 'style', 'noscript', 'meta', 'head', 'title',
        ]

        # Block-level tags: rendering implies line breaks around these.
        self.block_tags = [
            'div', 'p', 'br', 'hr', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
            'ul', 'ol', 'li', 'dl', 'dt', 'dd', 'blockquote', 'pre',
            'table', 'tr', 'td', 'th', 'thead', 'tbody', 'tfoot',
            'section', 'article', 'header', 'footer', 'nav', 'aside',
            'main', 'figure', 'figcaption', 'address',
        ]

        # Inline tags: no implied line break, surrounding spacing is kept.
        self.inline_tags = [
            'span', 'a', 'strong', 'b', 'em', 'i', 'u', 'small', 'mark',
            'del', 'ins', 'sub', 'sup', 'code', 'kbd', 'samp', 'var',
            'abbr', 'acronym', 'cite', 'dfn', 'q', 'time',
        ]
59
+
60
+ def process_text(self, html_content: str, settings: Dict[str, Any]) -> str:
61
+ """
62
+ Process HTML content based on the selected extraction method.
63
+
64
+ Args:
65
+ html_content: HTML content to process
66
+ settings: Tool settings dictionary
67
+
68
+ Returns:
69
+ Processed text based on the selected method
70
+ """
71
+ try:
72
+ if not html_content.strip():
73
+ return "No HTML content provided."
74
+
75
+ extraction_method = settings.get("extraction_method", "visible_text")
76
+
77
+ if extraction_method == "visible_text":
78
+ return self.extract_visible_text(html_content, settings)
79
+ elif extraction_method == "clean_html":
80
+ return self.clean_html(html_content, settings)
81
+ elif extraction_method == "extract_links":
82
+ return self.extract_links(html_content, settings)
83
+ elif extraction_method == "extract_images":
84
+ return self.extract_images(html_content, settings)
85
+ elif extraction_method == "extract_headings":
86
+ return self.extract_headings(html_content, settings)
87
+ elif extraction_method == "extract_tables":
88
+ return self.extract_tables(html_content, settings)
89
+ elif extraction_method == "extract_forms":
90
+ return self.extract_forms(html_content, settings)
91
+ else:
92
+ return self.extract_visible_text(html_content, settings)
93
+
94
+ except Exception as e:
95
+ self.logger.error(f"Error processing HTML: {e}")
96
+ return f"Error processing HTML: {str(e)}"
97
+
98
+ def extract_visible_text(self, html_content: str, settings: Dict[str, Any]) -> str:
99
+ """
100
+ Extract visible text from HTML as it would appear in a browser.
101
+
102
+ Args:
103
+ html_content: HTML content to process
104
+ settings: Tool settings
105
+
106
+ Returns:
107
+ Visible text with proper formatting
108
+ """
109
+ try:
110
+ # Remove script and style tags with their content
111
+ html_content = self._remove_script_style_tags(html_content)
112
+
113
+ # Remove HTML comments
114
+ html_content = re.sub(r'<!--.*?-->', '', html_content, flags=re.DOTALL)
115
+
116
+ # Handle block-level tags by adding line breaks
117
+ for tag in self.block_tags:
118
+ # Add line breaks before and after block tags
119
+ html_content = re.sub(f'<{tag}[^>]*>', f'\n<{tag}>', html_content, flags=re.IGNORECASE)
120
+ html_content = re.sub(f'</{tag}>', f'</{tag}>\n', html_content, flags=re.IGNORECASE)
121
+
122
+ # Handle list items specially
123
+ html_content = re.sub(r'<li[^>]*>', '\n• ', html_content, flags=re.IGNORECASE)
124
+ html_content = re.sub(r'</li>', '', html_content, flags=re.IGNORECASE)
125
+
126
+ # Handle table cells
127
+ html_content = re.sub(r'<td[^>]*>', '\t', html_content, flags=re.IGNORECASE)
128
+ html_content = re.sub(r'</td>', '', html_content, flags=re.IGNORECASE)
129
+ html_content = re.sub(r'<th[^>]*>', '\t', html_content, flags=re.IGNORECASE)
130
+ html_content = re.sub(r'</th>', '', html_content, flags=re.IGNORECASE)
131
+
132
+ # Remove all remaining HTML tags
133
+ html_content = re.sub(r'<[^>]+>', '', html_content)
134
+
135
+ # Decode HTML entities
136
+ html_content = html.unescape(html_content)
137
+
138
+ # Clean up whitespace
139
+ lines = html_content.split('\n')
140
+ cleaned_lines = []
141
+
142
+ for line in lines:
143
+ line = line.strip()
144
+ if line: # Only keep non-empty lines
145
+ cleaned_lines.append(line)
146
+
147
+ # Join lines and clean up multiple line breaks
148
+ result = '\n'.join(cleaned_lines)
149
+
150
+ # Remove excessive line breaks
151
+ result = re.sub(r'\n{3,}', '\n\n', result)
152
+
153
+ # Add formatting options
154
+ if settings.get("preserve_links", False):
155
+ result = self._add_link_references(html_content, result)
156
+
157
+ return result.strip()
158
+
159
+ except Exception as e:
160
+ self.logger.error(f"Error extracting visible text: {e}")
161
+ return f"Error extracting visible text: {str(e)}"
162
+
163
+ def clean_html(self, html_content: str, settings: Dict[str, Any]) -> str:
164
+ """
165
+ Clean HTML by removing unnecessary tags and attributes.
166
+
167
+ Args:
168
+ html_content: HTML content to clean
169
+ settings: Tool settings
170
+
171
+ Returns:
172
+ Cleaned HTML
173
+ """
174
+ try:
175
+ # Remove script and style tags if requested
176
+ if settings.get("remove_scripts", True):
177
+ html_content = self._remove_script_style_tags(html_content)
178
+
179
+ # Remove HTML comments
180
+ if settings.get("remove_comments", True):
181
+ html_content = re.sub(r'<!--.*?-->', '', html_content, flags=re.DOTALL)
182
+
183
+ # Remove specific attributes if requested
184
+ if settings.get("remove_style_attrs", True):
185
+ html_content = re.sub(r'\s+style\s*=\s*["\'][^"\']*["\']', '', html_content, flags=re.IGNORECASE)
186
+
187
+ if settings.get("remove_class_attrs", False):
188
+ html_content = re.sub(r'\s+class\s*=\s*["\'][^"\']*["\']', '', html_content, flags=re.IGNORECASE)
189
+
190
+ if settings.get("remove_id_attrs", False):
191
+ html_content = re.sub(r'\s+id\s*=\s*["\'][^"\']*["\']', '', html_content, flags=re.IGNORECASE)
192
+
193
+ # Remove empty tags if requested
194
+ if settings.get("remove_empty_tags", True):
195
+ # Remove tags that are completely empty
196
+ html_content = re.sub(r'<(\w+)[^>]*>\s*</\1>', '', html_content, flags=re.IGNORECASE)
197
+
198
+ # Clean up whitespace
199
+ html_content = re.sub(r'\n\s*\n', '\n', html_content)
200
+ html_content = re.sub(r'>\s+<', '><', html_content)
201
+
202
+ return html_content.strip()
203
+
204
+ except Exception as e:
205
+ self.logger.error(f"Error cleaning HTML: {e}")
206
+ return f"Error cleaning HTML: {str(e)}"
207
+
208
+ def extract_links(self, html_content: str, settings: Dict[str, Any]) -> str:
209
+ """
210
+ Extract all links from HTML content.
211
+
212
+ Args:
213
+ html_content: HTML content to process
214
+ settings: Tool settings
215
+
216
+ Returns:
217
+ List of links with their text
218
+ """
219
+ try:
220
+ # Find all anchor tags
221
+ link_pattern = r'<a[^>]*href\s*=\s*["\']([^"\']*)["\'][^>]*>(.*?)</a>'
222
+ links = re.findall(link_pattern, html_content, flags=re.IGNORECASE | re.DOTALL)
223
+
224
+ if not links:
225
+ return "No links found in the HTML content."
226
+
227
+ result_lines = []
228
+ include_text = settings.get("include_link_text", True)
229
+ absolute_only = settings.get("absolute_links_only", False)
230
+
231
+ for href, link_text in links:
232
+ # Clean up link text
233
+ link_text = re.sub(r'<[^>]+>', '', link_text).strip()
234
+ link_text = html.unescape(link_text)
235
+
236
+ # Filter absolute links if requested
237
+ if absolute_only and not (href.startswith('http://') or href.startswith('https://')):
238
+ continue
239
+
240
+ if include_text and link_text:
241
+ result_lines.append(f"{link_text}: {href}")
242
+ else:
243
+ result_lines.append(href)
244
+
245
+ return '\n'.join(result_lines) if result_lines else "No links match the specified criteria."
246
+
247
+ except Exception as e:
248
+ self.logger.error(f"Error extracting links: {e}")
249
+ return f"Error extracting links: {str(e)}"
250
+
251
+ def extract_images(self, html_content: str, settings: Dict[str, Any]) -> str:
252
+ """
253
+ Extract all images from HTML content.
254
+
255
+ Args:
256
+ html_content: HTML content to process
257
+ settings: Tool settings
258
+
259
+ Returns:
260
+ List of images with their attributes
261
+ """
262
+ try:
263
+ # Find all img tags
264
+ img_pattern = r'<img[^>]*>'
265
+ images = re.findall(img_pattern, html_content, flags=re.IGNORECASE)
266
+
267
+ if not images:
268
+ return "No images found in the HTML content."
269
+
270
+ result_lines = []
271
+ include_alt = settings.get("include_alt_text", True)
272
+ include_title = settings.get("include_title", False)
273
+
274
+ for img_tag in images:
275
+ # Extract src attribute
276
+ src_match = re.search(r'src\s*=\s*["\']([^"\']*)["\']', img_tag, re.IGNORECASE)
277
+ src = src_match.group(1) if src_match else "No src"
278
+
279
+ # Extract alt attribute
280
+ alt_match = re.search(r'alt\s*=\s*["\']([^"\']*)["\']', img_tag, re.IGNORECASE)
281
+ alt = alt_match.group(1) if alt_match else ""
282
+
283
+ # Extract title attribute
284
+ title_match = re.search(r'title\s*=\s*["\']([^"\']*)["\']', img_tag, re.IGNORECASE)
285
+ title = title_match.group(1) if title_match else ""
286
+
287
+ # Build result line
288
+ parts = [src]
289
+ if include_alt and alt:
290
+ parts.append(f"Alt: {alt}")
291
+ if include_title and title:
292
+ parts.append(f"Title: {title}")
293
+
294
+ result_lines.append(" | ".join(parts))
295
+
296
+ return '\n'.join(result_lines)
297
+
298
+ except Exception as e:
299
+ self.logger.error(f"Error extracting images: {e}")
300
+ return f"Error extracting images: {str(e)}"
301
+
302
+ def extract_headings(self, html_content: str, settings: Dict[str, Any]) -> str:
303
+ """
304
+ Extract all headings from HTML content.
305
+
306
+ Args:
307
+ html_content: HTML content to process
308
+ settings: Tool settings
309
+
310
+ Returns:
311
+ List of headings with their levels
312
+ """
313
+ try:
314
+ # Find all heading tags
315
+ heading_pattern = r'<(h[1-6])[^>]*>(.*?)</\1>'
316
+ headings = re.findall(heading_pattern, html_content, flags=re.IGNORECASE | re.DOTALL)
317
+
318
+ if not headings:
319
+ return "No headings found in the HTML content."
320
+
321
+ result_lines = []
322
+ include_level = settings.get("include_heading_level", True)
323
+
324
+ for tag, content in headings:
325
+ # Clean up heading content
326
+ content = re.sub(r'<[^>]+>', '', content).strip()
327
+ content = html.unescape(content)
328
+
329
+ if include_level:
330
+ level = tag.upper()
331
+ result_lines.append(f"{level}: {content}")
332
+ else:
333
+ result_lines.append(content)
334
+
335
+ return '\n'.join(result_lines)
336
+
337
+ except Exception as e:
338
+ self.logger.error(f"Error extracting headings: {e}")
339
+ return f"Error extracting headings: {str(e)}"
340
+
341
+ def extract_tables(self, html_content: str, settings: Dict[str, Any]) -> str:
342
+ """
343
+ Extract table data from HTML content.
344
+
345
+ Args:
346
+ html_content: HTML content to process
347
+ settings: Tool settings
348
+
349
+ Returns:
350
+ Formatted table data
351
+ """
352
+ try:
353
+ # Find all table tags
354
+ table_pattern = r'<table[^>]*>(.*?)</table>'
355
+ tables = re.findall(table_pattern, html_content, flags=re.IGNORECASE | re.DOTALL)
356
+
357
+ if not tables:
358
+ return "No tables found in the HTML content."
359
+
360
+ result_lines = []
361
+ separator = settings.get("column_separator", "\t")
362
+
363
+ for i, table_content in enumerate(tables):
364
+ if len(tables) > 1:
365
+ result_lines.append(f"\n--- Table {i + 1} ---")
366
+
367
+ # Find all rows
368
+ row_pattern = r'<tr[^>]*>(.*?)</tr>'
369
+ rows = re.findall(row_pattern, table_content, flags=re.IGNORECASE | re.DOTALL)
370
+
371
+ for row_content in rows:
372
+ # Find all cells (td or th)
373
+ cell_pattern = r'<(?:td|th)[^>]*>(.*?)</(?:td|th)>'
374
+ cells = re.findall(cell_pattern, row_content, flags=re.IGNORECASE | re.DOTALL)
375
+
376
+ # Clean up cell content
377
+ cleaned_cells = []
378
+ for cell in cells:
379
+ cell = re.sub(r'<[^>]+>', '', cell).strip()
380
+ cell = html.unescape(cell)
381
+ cleaned_cells.append(cell)
382
+
383
+ if cleaned_cells:
384
+ result_lines.append(separator.join(cleaned_cells))
385
+
386
+ return '\n'.join(result_lines)
387
+
388
+ except Exception as e:
389
+ self.logger.error(f"Error extracting tables: {e}")
390
+ return f"Error extracting tables: {str(e)}"
391
+
392
+ def extract_forms(self, html_content: str, settings: Dict[str, Any]) -> str:
393
+ """
394
+ Extract form information from HTML content.
395
+
396
+ Args:
397
+ html_content: HTML content to process
398
+ settings: Tool settings
399
+
400
+ Returns:
401
+ Form structure information
402
+ """
403
+ try:
404
+ # Find all form tags
405
+ form_pattern = r'<form[^>]*>(.*?)</form>'
406
+ forms = re.findall(form_pattern, html_content, flags=re.IGNORECASE | re.DOTALL)
407
+
408
+ if not forms:
409
+ return "No forms found in the HTML content."
410
+
411
+ result_lines = []
412
+
413
+ for i, form_content in enumerate(forms):
414
+ if len(forms) > 1:
415
+ result_lines.append(f"\n--- Form {i + 1} ---")
416
+
417
+ # Extract form attributes
418
+ form_tag_match = re.search(r'<form([^>]*)>', html_content, re.IGNORECASE)
419
+ if form_tag_match:
420
+ form_attrs = form_tag_match.group(1)
421
+
422
+ # Extract action
423
+ action_match = re.search(r'action\s*=\s*["\']([^"\']*)["\']', form_attrs, re.IGNORECASE)
424
+ if action_match:
425
+ result_lines.append(f"Action: {action_match.group(1)}")
426
+
427
+ # Extract method
428
+ method_match = re.search(r'method\s*=\s*["\']([^"\']*)["\']', form_attrs, re.IGNORECASE)
429
+ if method_match:
430
+ result_lines.append(f"Method: {method_match.group(1)}")
431
+
432
+ # Find all input fields
433
+ input_pattern = r'<input[^>]*>'
434
+ inputs = re.findall(input_pattern, form_content, flags=re.IGNORECASE)
435
+
436
+ if inputs:
437
+ result_lines.append("Input Fields:")
438
+ for input_tag in inputs:
439
+ # Extract input attributes
440
+ name_match = re.search(r'name\s*=\s*["\']([^"\']*)["\']', input_tag, re.IGNORECASE)
441
+ type_match = re.search(r'type\s*=\s*["\']([^"\']*)["\']', input_tag, re.IGNORECASE)
442
+
443
+ name = name_match.group(1) if name_match else "unnamed"
444
+ input_type = type_match.group(1) if type_match else "text"
445
+
446
+ result_lines.append(f" - {name} ({input_type})")
447
+
448
+ # Find all textarea fields
449
+ textarea_pattern = r'<textarea[^>]*name\s*=\s*["\']([^"\']*)["\'][^>]*>'
450
+ textareas = re.findall(textarea_pattern, form_content, flags=re.IGNORECASE)
451
+
452
+ if textareas:
453
+ result_lines.append("Textarea Fields:")
454
+ for name in textareas:
455
+ result_lines.append(f" - {name}")
456
+
457
+ # Find all select fields
458
+ select_pattern = r'<select[^>]*name\s*=\s*["\']([^"\']*)["\'][^>]*>'
459
+ selects = re.findall(select_pattern, form_content, flags=re.IGNORECASE)
460
+
461
+ if selects:
462
+ result_lines.append("Select Fields:")
463
+ for name in selects:
464
+ result_lines.append(f" - {name}")
465
+
466
+ return '\n'.join(result_lines)
467
+
468
+ except Exception as e:
469
+ self.logger.error(f"Error extracting forms: {e}")
470
+ return f"Error extracting forms: {str(e)}"
471
+
472
+ def _remove_script_style_tags(self, html_content: str) -> str:
473
+ """Remove script and style tags with their content."""
474
+ for tag in self.script_style_tags:
475
+ pattern = f'<{tag}[^>]*>.*?</{tag}>'
476
+ html_content = re.sub(pattern, '', html_content, flags=re.IGNORECASE | re.DOTALL)
477
+ return html_content
478
+
479
+ def _add_link_references(self, original_html: str, text_result: str) -> str:
480
+ """Add link references to the text result."""
481
+ try:
482
+ # This is a simplified implementation
483
+ # In a full implementation, you might want to add footnote-style references
484
+ link_pattern = r'<a[^>]*href\s*=\s*["\']([^"\']*)["\'][^>]*>(.*?)</a>'
485
+ links = re.findall(link_pattern, original_html, flags=re.IGNORECASE | re.DOTALL)
486
+
487
+ if links:
488
+ text_result += "\n\nLinks found in document:\n"
489
+ for i, (href, link_text) in enumerate(links, 1):
490
+ link_text = re.sub(r'<[^>]+>', '', link_text).strip()
491
+ link_text = html.unescape(link_text)
492
+ text_result += f"{i}. {link_text}: {href}\n"
493
+
494
+ return text_result
495
+ except Exception:
496
+ return text_result
497
+
498
+
499
# Tool settings configuration
def get_default_settings():
    """Get default settings for the HTML Extraction Tool."""
    defaults = dict(
        extraction_method="visible_text",
        preserve_links=False,
        remove_scripts=True,
        remove_comments=True,
        remove_style_attrs=True,
        remove_class_attrs=False,
        remove_id_attrs=False,
        remove_empty_tags=True,
        include_link_text=True,
        absolute_links_only=False,
        include_alt_text=True,
        include_title=False,
        include_heading_level=True,
        column_separator="\t",
    )
    return defaults
518
+
519
+
520
def get_settings_ui_config():
    """Get UI configuration for the HTML Extraction Tool settings."""

    def checkbox(label, default, method):
        # All checkbox settings share the same shape and are shown only for
        # one particular extraction method.
        return {
            "type": "checkbox",
            "label": label,
            "default": default,
            "show_when": {"extraction_method": method},
        }

    return {
        "extraction_method": {
            "type": "dropdown",
            "label": "Extraction Method",
            "options": [
                ("Extract Visible Text", "visible_text"),
                ("Clean HTML", "clean_html"),
                ("Extract Links", "extract_links"),
                ("Extract Images", "extract_images"),
                ("Extract Headings", "extract_headings"),
                ("Extract Tables", "extract_tables"),
                ("Extract Forms", "extract_forms"),
            ],
            "default": "visible_text",
        },
        "preserve_links": checkbox("Add link references to visible text", False, "visible_text"),
        "remove_scripts": checkbox("Remove script and style tags", True, "clean_html"),
        "remove_comments": checkbox("Remove HTML comments", True, "clean_html"),
        "remove_style_attrs": checkbox("Remove style attributes", True, "clean_html"),
        "remove_class_attrs": checkbox("Remove class attributes", False, "clean_html"),
        "remove_id_attrs": checkbox("Remove ID attributes", False, "clean_html"),
        "remove_empty_tags": checkbox("Remove empty tags", True, "clean_html"),
        "include_link_text": checkbox("Include link text", True, "extract_links"),
        "absolute_links_only": checkbox("Only absolute links (http/https)", False, "extract_links"),
        "include_alt_text": checkbox("Include alt text", True, "extract_images"),
        "include_title": checkbox("Include title attribute", False, "extract_images"),
        "include_heading_level": checkbox("Include heading level (H1, H2, etc.)", True, "extract_headings"),
        "column_separator": {
            "type": "entry",
            "label": "Column separator",
            "default": "\t",
            "show_when": {"extraction_method": "extract_tables"},
        },
    }
616
+
617
+
618
# BaseTool-compatible wrapper
try:
    from tools.base_tool import ToolWithOptions
    import tkinter as tk
    from tkinter import ttk

    class HTMLToolV2(ToolWithOptions):
        """
        BaseTool-compatible version of HTMLExtractionTool.
        """

        TOOL_NAME = "HTML Tool"
        TOOL_DESCRIPTION = "Extract and process HTML content"
        TOOL_VERSION = "2.0.0"

        OPTIONS = [
            ("Visible Text", "visible_text"),
            ("Clean HTML", "clean_html"),
            ("Extract Links", "extract_links"),
            ("Extract Images", "extract_images"),
            ("Extract Headings", "extract_headings"),
            ("Extract Tables", "extract_tables"),
            ("Extract Forms", "extract_forms"),
        ]
        OPTIONS_LABEL = "Operation"
        USE_DROPDOWN = True
        DEFAULT_OPTION = "visible_text"

        def __init__(self):
            super().__init__()
            # Delegate all real work to the standalone extraction tool.
            self._tool = HTMLExtractionTool()

        def process_text(self, input_text: str, settings: Dict[str, Any]) -> str:
            """Process HTML content with the operation selected in the UI."""
            selected_mode = settings.get("mode", "visible_text")
            return self._tool.process_text(input_text, {"extraction_method": selected_mode})

except ImportError:
    # base_tool (or tkinter) unavailable: expose only HTMLExtractionTool.
    pass