pomera-ai-commander 1.1.1 → 1.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -21
- package/README.md +105 -680
- package/bin/pomera-ai-commander.js +62 -62
- package/core/__init__.py +65 -65
- package/core/app_context.py +482 -482
- package/core/async_text_processor.py +421 -421
- package/core/backup_manager.py +655 -655
- package/core/backup_recovery_manager.py +1199 -1033
- package/core/content_hash_cache.py +508 -508
- package/core/context_menu.py +313 -313
- package/core/data_directory.py +549 -0
- package/core/data_validator.py +1066 -1066
- package/core/database_connection_manager.py +744 -744
- package/core/database_curl_settings_manager.py +608 -608
- package/core/database_promera_ai_settings_manager.py +446 -446
- package/core/database_schema.py +411 -411
- package/core/database_schema_manager.py +395 -395
- package/core/database_settings_manager.py +1507 -1507
- package/core/database_settings_manager_interface.py +456 -456
- package/core/dialog_manager.py +734 -734
- package/core/diff_utils.py +239 -0
- package/core/efficient_line_numbers.py +540 -510
- package/core/error_handler.py +746 -746
- package/core/error_service.py +431 -431
- package/core/event_consolidator.py +511 -511
- package/core/mcp/__init__.py +43 -43
- package/core/mcp/find_replace_diff.py +334 -0
- package/core/mcp/protocol.py +288 -288
- package/core/mcp/schema.py +251 -251
- package/core/mcp/server_stdio.py +299 -299
- package/core/mcp/tool_registry.py +2699 -2345
- package/core/memento.py +275 -0
- package/core/memory_efficient_text_widget.py +711 -711
- package/core/migration_manager.py +914 -914
- package/core/migration_test_suite.py +1085 -1085
- package/core/migration_validator.py +1143 -1143
- package/core/optimized_find_replace.py +714 -714
- package/core/optimized_pattern_engine.py +424 -424
- package/core/optimized_search_highlighter.py +552 -552
- package/core/performance_monitor.py +674 -674
- package/core/persistence_manager.py +712 -712
- package/core/progressive_stats_calculator.py +632 -632
- package/core/regex_pattern_cache.py +529 -529
- package/core/regex_pattern_library.py +350 -350
- package/core/search_operation_manager.py +434 -434
- package/core/settings_defaults_registry.py +1087 -1087
- package/core/settings_integrity_validator.py +1111 -1111
- package/core/settings_serializer.py +557 -557
- package/core/settings_validator.py +1823 -1823
- package/core/smart_stats_calculator.py +709 -709
- package/core/statistics_update_manager.py +619 -619
- package/core/stats_config_manager.py +858 -858
- package/core/streaming_text_handler.py +723 -723
- package/core/task_scheduler.py +596 -596
- package/core/update_pattern_library.py +168 -168
- package/core/visibility_monitor.py +596 -596
- package/core/widget_cache.py +498 -498
- package/mcp.json +51 -61
- package/migrate_data.py +127 -0
- package/package.json +64 -57
- package/pomera.py +7883 -7482
- package/pomera_mcp_server.py +183 -144
- package/requirements.txt +33 -0
- package/scripts/Dockerfile.alpine +43 -0
- package/scripts/Dockerfile.gui-test +54 -0
- package/scripts/Dockerfile.linux +43 -0
- package/scripts/Dockerfile.test-linux +80 -0
- package/scripts/Dockerfile.ubuntu +39 -0
- package/scripts/README.md +53 -0
- package/scripts/build-all.bat +113 -0
- package/scripts/build-docker.bat +53 -0
- package/scripts/build-docker.sh +55 -0
- package/scripts/build-optimized.bat +101 -0
- package/scripts/build.sh +78 -0
- package/scripts/docker-compose.test.yml +27 -0
- package/scripts/docker-compose.yml +32 -0
- package/scripts/postinstall.js +62 -0
- package/scripts/requirements-minimal.txt +33 -0
- package/scripts/test-linux-simple.bat +28 -0
- package/scripts/validate-release-workflow.py +450 -0
- package/tools/__init__.py +4 -4
- package/tools/ai_tools.py +2891 -2891
- package/tools/ascii_art_generator.py +352 -352
- package/tools/base64_tools.py +183 -183
- package/tools/base_tool.py +511 -511
- package/tools/case_tool.py +308 -308
- package/tools/column_tools.py +395 -395
- package/tools/cron_tool.py +884 -884
- package/tools/curl_history.py +600 -600
- package/tools/curl_processor.py +1207 -1207
- package/tools/curl_settings.py +502 -502
- package/tools/curl_tool.py +5467 -5467
- package/tools/diff_viewer.py +1817 -1072
- package/tools/email_extraction_tool.py +248 -248
- package/tools/email_header_analyzer.py +425 -425
- package/tools/extraction_tools.py +250 -250
- package/tools/find_replace.py +2289 -1750
- package/tools/folder_file_reporter.py +1463 -1463
- package/tools/folder_file_reporter_adapter.py +480 -480
- package/tools/generator_tools.py +1216 -1216
- package/tools/hash_generator.py +255 -255
- package/tools/html_tool.py +656 -656
- package/tools/jsonxml_tool.py +729 -729
- package/tools/line_tools.py +419 -419
- package/tools/markdown_tools.py +561 -561
- package/tools/mcp_widget.py +1417 -1417
- package/tools/notes_widget.py +978 -973
- package/tools/number_base_converter.py +372 -372
- package/tools/regex_extractor.py +571 -571
- package/tools/slug_generator.py +310 -310
- package/tools/sorter_tools.py +458 -458
- package/tools/string_escape_tool.py +392 -392
- package/tools/text_statistics_tool.py +365 -365
- package/tools/text_wrapper.py +430 -430
- package/tools/timestamp_converter.py +421 -421
- package/tools/tool_loader.py +710 -710
- package/tools/translator_tools.py +522 -522
- package/tools/url_link_extractor.py +261 -261
- package/tools/url_parser.py +204 -204
- package/tools/whitespace_tools.py +355 -355
- package/tools/word_frequency_counter.py +146 -146
- package/core/__pycache__/__init__.cpython-313.pyc +0 -0
- package/core/__pycache__/app_context.cpython-313.pyc +0 -0
- package/core/__pycache__/async_text_processor.cpython-313.pyc +0 -0
- package/core/__pycache__/backup_manager.cpython-313.pyc +0 -0
- package/core/__pycache__/backup_recovery_manager.cpython-313.pyc +0 -0
- package/core/__pycache__/content_hash_cache.cpython-313.pyc +0 -0
- package/core/__pycache__/context_menu.cpython-313.pyc +0 -0
- package/core/__pycache__/data_validator.cpython-313.pyc +0 -0
- package/core/__pycache__/database_connection_manager.cpython-313.pyc +0 -0
- package/core/__pycache__/database_curl_settings_manager.cpython-313.pyc +0 -0
- package/core/__pycache__/database_promera_ai_settings_manager.cpython-313.pyc +0 -0
- package/core/__pycache__/database_schema.cpython-313.pyc +0 -0
- package/core/__pycache__/database_schema_manager.cpython-313.pyc +0 -0
- package/core/__pycache__/database_settings_manager.cpython-313.pyc +0 -0
- package/core/__pycache__/database_settings_manager_interface.cpython-313.pyc +0 -0
- package/core/__pycache__/dialog_manager.cpython-313.pyc +0 -0
- package/core/__pycache__/efficient_line_numbers.cpython-313.pyc +0 -0
- package/core/__pycache__/error_handler.cpython-313.pyc +0 -0
- package/core/__pycache__/error_service.cpython-313.pyc +0 -0
- package/core/__pycache__/event_consolidator.cpython-313.pyc +0 -0
- package/core/__pycache__/memory_efficient_text_widget.cpython-313.pyc +0 -0
- package/core/__pycache__/migration_manager.cpython-313.pyc +0 -0
- package/core/__pycache__/migration_test_suite.cpython-313.pyc +0 -0
- package/core/__pycache__/migration_validator.cpython-313.pyc +0 -0
- package/core/__pycache__/optimized_find_replace.cpython-313.pyc +0 -0
- package/core/__pycache__/optimized_pattern_engine.cpython-313.pyc +0 -0
- package/core/__pycache__/optimized_search_highlighter.cpython-313.pyc +0 -0
- package/core/__pycache__/performance_monitor.cpython-313.pyc +0 -0
- package/core/__pycache__/persistence_manager.cpython-313.pyc +0 -0
- package/core/__pycache__/progressive_stats_calculator.cpython-313.pyc +0 -0
- package/core/__pycache__/regex_pattern_cache.cpython-313.pyc +0 -0
- package/core/__pycache__/regex_pattern_library.cpython-313.pyc +0 -0
- package/core/__pycache__/search_operation_manager.cpython-313.pyc +0 -0
- package/core/__pycache__/settings_defaults_registry.cpython-313.pyc +0 -0
- package/core/__pycache__/settings_integrity_validator.cpython-313.pyc +0 -0
- package/core/__pycache__/settings_serializer.cpython-313.pyc +0 -0
- package/core/__pycache__/settings_validator.cpython-313.pyc +0 -0
- package/core/__pycache__/smart_stats_calculator.cpython-313.pyc +0 -0
- package/core/__pycache__/statistics_update_manager.cpython-313.pyc +0 -0
- package/core/__pycache__/stats_config_manager.cpython-313.pyc +0 -0
- package/core/__pycache__/streaming_text_handler.cpython-313.pyc +0 -0
- package/core/__pycache__/task_scheduler.cpython-313.pyc +0 -0
- package/core/__pycache__/visibility_monitor.cpython-313.pyc +0 -0
- package/core/__pycache__/widget_cache.cpython-313.pyc +0 -0
- package/core/mcp/__pycache__/__init__.cpython-313.pyc +0 -0
- package/core/mcp/__pycache__/protocol.cpython-313.pyc +0 -0
- package/core/mcp/__pycache__/schema.cpython-313.pyc +0 -0
- package/core/mcp/__pycache__/server_stdio.cpython-313.pyc +0 -0
- package/core/mcp/__pycache__/tool_registry.cpython-313.pyc +0 -0
- package/tools/__pycache__/__init__.cpython-313.pyc +0 -0
- package/tools/__pycache__/ai_tools.cpython-313.pyc +0 -0
- package/tools/__pycache__/ascii_art_generator.cpython-313.pyc +0 -0
- package/tools/__pycache__/base64_tools.cpython-313.pyc +0 -0
- package/tools/__pycache__/base_tool.cpython-313.pyc +0 -0
- package/tools/__pycache__/case_tool.cpython-313.pyc +0 -0
- package/tools/__pycache__/column_tools.cpython-313.pyc +0 -0
- package/tools/__pycache__/cron_tool.cpython-313.pyc +0 -0
- package/tools/__pycache__/curl_history.cpython-313.pyc +0 -0
- package/tools/__pycache__/curl_processor.cpython-313.pyc +0 -0
- package/tools/__pycache__/curl_settings.cpython-313.pyc +0 -0
- package/tools/__pycache__/curl_tool.cpython-313.pyc +0 -0
- package/tools/__pycache__/diff_viewer.cpython-313.pyc +0 -0
- package/tools/__pycache__/email_extraction_tool.cpython-313.pyc +0 -0
- package/tools/__pycache__/email_header_analyzer.cpython-313.pyc +0 -0
- package/tools/__pycache__/extraction_tools.cpython-313.pyc +0 -0
- package/tools/__pycache__/find_replace.cpython-313.pyc +0 -0
- package/tools/__pycache__/folder_file_reporter.cpython-313.pyc +0 -0
- package/tools/__pycache__/folder_file_reporter_adapter.cpython-313.pyc +0 -0
- package/tools/__pycache__/generator_tools.cpython-313.pyc +0 -0
- package/tools/__pycache__/hash_generator.cpython-313.pyc +0 -0
- package/tools/__pycache__/html_tool.cpython-313.pyc +0 -0
- package/tools/__pycache__/huggingface_helper.cpython-313.pyc +0 -0
- package/tools/__pycache__/jsonxml_tool.cpython-313.pyc +0 -0
- package/tools/__pycache__/line_tools.cpython-313.pyc +0 -0
- package/tools/__pycache__/list_comparator.cpython-313.pyc +0 -0
- package/tools/__pycache__/markdown_tools.cpython-313.pyc +0 -0
- package/tools/__pycache__/mcp_widget.cpython-313.pyc +0 -0
- package/tools/__pycache__/notes_widget.cpython-313.pyc +0 -0
- package/tools/__pycache__/number_base_converter.cpython-313.pyc +0 -0
- package/tools/__pycache__/regex_extractor.cpython-313.pyc +0 -0
- package/tools/__pycache__/slug_generator.cpython-313.pyc +0 -0
- package/tools/__pycache__/sorter_tools.cpython-313.pyc +0 -0
- package/tools/__pycache__/string_escape_tool.cpython-313.pyc +0 -0
- package/tools/__pycache__/text_statistics_tool.cpython-313.pyc +0 -0
- package/tools/__pycache__/text_wrapper.cpython-313.pyc +0 -0
- package/tools/__pycache__/timestamp_converter.cpython-313.pyc +0 -0
- package/tools/__pycache__/tool_loader.cpython-313.pyc +0 -0
- package/tools/__pycache__/translator_tools.cpython-313.pyc +0 -0
- package/tools/__pycache__/url_link_extractor.cpython-313.pyc +0 -0
- package/tools/__pycache__/url_parser.cpython-313.pyc +0 -0
- package/tools/__pycache__/whitespace_tools.cpython-313.pyc +0 -0
- package/tools/__pycache__/word_frequency_counter.cpython-313.pyc +0 -0
package/tools/html_tool.py
CHANGED
|
@@ -1,657 +1,657 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
"""
|
|
3
|
-
HTML Extraction Tool Module for Pomera AI Commander
|
|
4
|
-
|
|
5
|
-
This module provides HTML processing capabilities including:
|
|
6
|
-
- Extracting visible text from HTML (as it would appear in a browser)
|
|
7
|
-
- Cleaning up HTML by removing unnecessary tags
|
|
8
|
-
- Extracting specific HTML elements
|
|
9
|
-
- Converting HTML to plain text with proper formatting
|
|
10
|
-
|
|
11
|
-
Author: Pomera AI Commander
|
|
12
|
-
"""
|
|
13
|
-
|
|
14
|
-
import re
|
|
15
|
-
import html
|
|
16
|
-
from typing import Dict, Any, List, Optional
|
|
17
|
-
import logging
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
class HTMLExtractionTool:
    """HTML processing tool: text extraction, cleanup, and element harvesting.

    Capabilities:
    - Extract visible text from HTML (browser-rendered text)
    - Clean HTML by removing unnecessary tags
    - Extract specific elements (links, images, headings, etc.)
    - Convert HTML to formatted plain text
    - Remove scripts, styles, and other non-visible content
    """

    def __init__(self, logger=None):
        """Set up the tool.

        Args:
            logger: Logger instance for debugging; defaults to a
                module-level logger when omitted.
        """
        self.logger = logger or logging.getLogger(__name__)

        # Tags whose entire content is invisible in a browser and must be
        # dropped wholesale (not merely un-tagged).
        self.script_style_tags = 'script style noscript meta head title'.split()

        # Block-level elements: rendering inserts a line break around these.
        self.block_tags = (
            'div p br hr h1 h2 h3 h4 h5 h6 '
            'ul ol li dl dt dd blockquote pre '
            'table tr td th thead tbody tfoot '
            'section article header footer nav aside '
            'main figure figcaption address'
        ).split()

        # Inline elements: these flow with the surrounding text.
        self.inline_tags = (
            'span a strong b em i u small mark '
            'del ins sub sup code kbd samp var '
            'abbr acronym cite dfn q time'
        ).split()
|
|
59
|
-
|
|
60
|
-
def process_text(self, html_content: str, settings: Dict[str, Any]) -> str:
    """Dispatch the HTML content to the extractor selected in *settings*.

    Args:
        html_content: Raw HTML to process.
        settings: Tool settings; "extraction_method" picks the extractor.

    Returns:
        The selected extractor's output, or an error message string.
    """
    try:
        if not html_content.strip():
            return "No HTML content provided."

        # Table-driven dispatch; unknown methods fall back to visible text.
        handlers = {
            "visible_text": self.extract_visible_text,
            "clean_html": self.clean_html,
            "extract_links": self.extract_links,
            "extract_images": self.extract_images,
            "extract_headings": self.extract_headings,
            "extract_tables": self.extract_tables,
            "extract_forms": self.extract_forms,
        }
        selected = settings.get("extraction_method", "visible_text")
        handler = handlers.get(selected, self.extract_visible_text)
        return handler(html_content, settings)

    except Exception as e:
        self.logger.error(f"Error processing HTML: {e}")
        return f"Error processing HTML: {str(e)}"
|
|
97
|
-
|
|
98
|
-
def extract_visible_text(self, html_content: str, settings: Dict[str, Any]) -> str:
    """
    Extract visible text from HTML as it would appear in a browser.

    Args:
        html_content: HTML content to process
        settings: Tool settings ("preserve_links" appends a link list)

    Returns:
        Visible text with proper formatting, or an error message string.
    """
    try:
        # Bug fix: keep a pristine copy of the markup. The previous version
        # passed the already-tag-stripped text to _add_link_references, so
        # the <a ...> pattern could never match and "preserve_links" was a
        # silent no-op.
        original_html = html_content

        # Remove script and style tags with their content
        html_content = self._remove_script_style_tags(html_content)

        # Remove HTML comments
        html_content = re.sub(r'<!--.*?-->', '', html_content, flags=re.DOTALL)

        # Handle block-level tags by adding line breaks around them
        for tag in self.block_tags:
            html_content = re.sub(f'<{tag}[^>]*>', f'\n<{tag}>', html_content, flags=re.IGNORECASE)
            html_content = re.sub(f'</{tag}>', f'</{tag}>\n', html_content, flags=re.IGNORECASE)

        # Handle list items specially (bullet marker instead of raw break)
        html_content = re.sub(r'<li[^>]*>', '\n• ', html_content, flags=re.IGNORECASE)
        html_content = re.sub(r'</li>', '', html_content, flags=re.IGNORECASE)

        # Handle table cells: tab-separate columns
        html_content = re.sub(r'<td[^>]*>', '\t', html_content, flags=re.IGNORECASE)
        html_content = re.sub(r'</td>', '', html_content, flags=re.IGNORECASE)
        html_content = re.sub(r'<th[^>]*>', '\t', html_content, flags=re.IGNORECASE)
        html_content = re.sub(r'</th>', '', html_content, flags=re.IGNORECASE)

        # Remove all remaining HTML tags
        html_content = re.sub(r'<[^>]+>', '', html_content)

        # Decode HTML entities (&amp;, &nbsp;, ...)
        html_content = html.unescape(html_content)

        # Clean up whitespace: drop blank lines, trim the rest
        cleaned_lines = [line.strip() for line in html_content.split('\n') if line.strip()]
        result = '\n'.join(cleaned_lines)

        # Collapse runs of 3+ line breaks into a paragraph break
        result = re.sub(r'\n{3,}', '\n\n', result)

        # Optionally append a numbered list of links found in the document;
        # must run against the ORIGINAL markup, which still has its tags.
        if settings.get("preserve_links", False):
            result = self._add_link_references(original_html, result)

        return result.strip()

    except Exception as e:
        self.logger.error(f"Error extracting visible text: {e}")
        return f"Error extracting visible text: {str(e)}"
|
|
162
|
-
|
|
163
|
-
def clean_html(self, html_content: str, settings: Dict[str, Any]) -> str:
    """Strip scripts, comments, selected attributes, and empty tags.

    Args:
        html_content: HTML content to clean.
        settings: Tool settings controlling each cleanup step.

    Returns:
        The cleaned HTML, or an error message string.
    """
    try:
        if settings.get("remove_scripts", True):
            html_content = self._remove_script_style_tags(html_content)

        if settings.get("remove_comments", True):
            html_content = re.sub(r'<!--.*?-->', '', html_content, flags=re.DOTALL)

        # Attribute stripping: (settings key, default, attribute name).
        attribute_rules = (
            ("remove_style_attrs", True, "style"),
            ("remove_class_attrs", False, "class"),
            ("remove_id_attrs", False, "id"),
        )
        for option, default, attr in attribute_rules:
            if settings.get(option, default):
                pattern = r'\s+' + attr + r'\s*=\s*["\'][^"\']*["\']'
                html_content = re.sub(pattern, '', html_content, flags=re.IGNORECASE)

        if settings.get("remove_empty_tags", True):
            # Drop tags with nothing but whitespace between open and close.
            html_content = re.sub(r'<(\w+)[^>]*>\s*</\1>', '', html_content, flags=re.IGNORECASE)

        # Tidy whitespace between lines and between adjacent tags.
        html_content = re.sub(r'\n\s*\n', '\n', html_content)
        html_content = re.sub(r'>\s+<', '><', html_content)

        return html_content.strip()

    except Exception as e:
        self.logger.error(f"Error cleaning HTML: {e}")
        return f"Error cleaning HTML: {str(e)}"
|
|
207
|
-
|
|
208
|
-
def extract_links(self, html_content: str, settings: Dict[str, Any]) -> str:
    """List every anchor href (optionally with its text) found in the HTML.

    Args:
        html_content: HTML content to process.
        settings: "include_link_text" and "absolute_links_only" options.

    Returns:
        One link per line, or a no-match message string.
    """
    try:
        anchor_pattern = r'<a[^>]*href\s*=\s*["\']([^"\']*)["\'][^>]*>(.*?)</a>'
        anchors = re.findall(anchor_pattern, html_content, flags=re.IGNORECASE | re.DOTALL)

        if not anchors:
            return "No links found in the HTML content."

        show_text = settings.get("include_link_text", True)
        only_absolute = settings.get("absolute_links_only", False)

        lines = []
        for target, raw_text in anchors:
            # Strip nested markup from the anchor text and decode entities.
            label = html.unescape(re.sub(r'<[^>]+>', '', raw_text).strip())

            # Skip relative URLs when only absolute ones are wanted.
            if only_absolute and not target.startswith(('http://', 'https://')):
                continue

            lines.append(f"{label}: {target}" if show_text and label else target)

        return '\n'.join(lines) if lines else "No links match the specified criteria."

    except Exception as e:
        self.logger.error(f"Error extracting links: {e}")
        return f"Error extracting links: {str(e)}"
|
|
250
|
-
|
|
251
|
-
def extract_images(self, html_content: str, settings: Dict[str, Any]) -> str:
    """List every image source (optionally with alt/title text).

    Args:
        html_content: HTML content to process.
        settings: "include_alt_text" and "include_title" options.

    Returns:
        One image per line ("src | Alt: ... | Title: ..."), or a
        no-match message string.
    """
    try:
        image_tags = re.findall(r'<img[^>]*>', html_content, flags=re.IGNORECASE)
        if not image_tags:
            return "No images found in the HTML content."

        def read_attr(tag, name):
            # Returns the attribute value, or None when absent.
            m = re.search(name + r'\s*=\s*["\']([^"\']*)["\']', tag, re.IGNORECASE)
            return m.group(1) if m else None

        want_alt = settings.get("include_alt_text", True)
        want_title = settings.get("include_title", False)

        lines = []
        for tag in image_tags:
            src = read_attr(tag, 'src')
            pieces = ["No src" if src is None else src]

            alt = read_attr(tag, 'alt') or ""
            if want_alt and alt:
                pieces.append(f"Alt: {alt}")

            title = read_attr(tag, 'title') or ""
            if want_title and title:
                pieces.append(f"Title: {title}")

            lines.append(" | ".join(pieces))

        return '\n'.join(lines)

    except Exception as e:
        self.logger.error(f"Error extracting images: {e}")
        return f"Error extracting images: {str(e)}"
|
|
301
|
-
|
|
302
|
-
def extract_headings(self, html_content: str, settings: Dict[str, Any]) -> str:
    """List every h1–h6 heading, optionally prefixed with its level.

    Args:
        html_content: HTML content to process.
        settings: "include_heading_level" option.

    Returns:
        One heading per line, or a no-match message string.
    """
    try:
        # Backreference \1 ensures the closing tag matches the opening one.
        matches = re.findall(r'<(h[1-6])[^>]*>(.*?)</\1>', html_content,
                             flags=re.IGNORECASE | re.DOTALL)
        if not matches:
            return "No headings found in the HTML content."

        with_level = settings.get("include_heading_level", True)

        out = []
        for tag, body in matches:
            # Strip nested markup and decode entities.
            text = html.unescape(re.sub(r'<[^>]+>', '', body).strip())
            out.append(f"{tag.upper()}: {text}" if with_level else text)

        return '\n'.join(out)

    except Exception as e:
        self.logger.error(f"Error extracting headings: {e}")
        return f"Error extracting headings: {str(e)}"
|
|
340
|
-
|
|
341
|
-
def extract_tables(self, html_content: str, settings: Dict[str, Any]) -> str:
    """Flatten every HTML table into separator-joined rows.

    Args:
        html_content: HTML content to process.
        settings: "column_separator" option (defaults to a tab).

    Returns:
        Rows of cell text, one row per line; multiple tables are
        separated by "--- Table N ---" headers. No-match message when
        the HTML has no tables.
    """
    try:
        scan_flags = re.IGNORECASE | re.DOTALL
        tables = re.findall(r'<table[^>]*>(.*?)</table>', html_content, flags=scan_flags)
        if not tables:
            return "No tables found in the HTML content."

        sep = settings.get("column_separator", "\t")
        multiple = len(tables) > 1

        out = []
        for number, body in enumerate(tables, start=1):
            if multiple:
                out.append(f"\n--- Table {number} ---")

            for row in re.findall(r'<tr[^>]*>(.*?)</tr>', body, flags=scan_flags):
                raw_cells = re.findall(r'<(?:td|th)[^>]*>(.*?)</(?:td|th)>', row, flags=scan_flags)
                # Strip nested markup and decode entities in each cell.
                cells = [html.unescape(re.sub(r'<[^>]+>', '', c).strip()) for c in raw_cells]
                if cells:
                    out.append(sep.join(cells))

        return '\n'.join(out)

    except Exception as e:
        self.logger.error(f"Error extracting tables: {e}")
        return f"Error extracting tables: {str(e)}"
|
|
391
|
-
|
|
392
|
-
def extract_forms(self, html_content: str, settings: Dict[str, Any]) -> str:
    """
    Extract form information (action, method, fields) from HTML content.

    Args:
        html_content: HTML content to process
        settings: Tool settings (currently unused here)

    Returns:
        Form structure information, one item per line, or a no-match
        message string.
    """
    try:
        # Bug fix: capture each form's own attribute string alongside its
        # body. The previous version re-searched the WHOLE document for
        # '<form ...>' on every iteration, so every form reported the
        # first form's action/method.
        forms = re.findall(r'<form([^>]*)>(.*?)</form>', html_content,
                           flags=re.IGNORECASE | re.DOTALL)

        if not forms:
            return "No forms found in the HTML content."

        result_lines = []

        for i, (form_attrs, form_content) in enumerate(forms):
            if len(forms) > 1:
                result_lines.append(f"\n--- Form {i + 1} ---")

            # Action / method come from this form's own attribute string.
            action_match = re.search(r'action\s*=\s*["\']([^"\']*)["\']', form_attrs, re.IGNORECASE)
            if action_match:
                result_lines.append(f"Action: {action_match.group(1)}")

            method_match = re.search(r'method\s*=\s*["\']([^"\']*)["\']', form_attrs, re.IGNORECASE)
            if method_match:
                result_lines.append(f"Method: {method_match.group(1)}")

            # Input fields: name plus type (defaulting like a browser does).
            inputs = re.findall(r'<input[^>]*>', form_content, flags=re.IGNORECASE)
            if inputs:
                result_lines.append("Input Fields:")
                for input_tag in inputs:
                    name_match = re.search(r'name\s*=\s*["\']([^"\']*)["\']', input_tag, re.IGNORECASE)
                    type_match = re.search(r'type\s*=\s*["\']([^"\']*)["\']', input_tag, re.IGNORECASE)
                    name = name_match.group(1) if name_match else "unnamed"
                    input_type = type_match.group(1) if type_match else "text"
                    result_lines.append(f"  - {name} ({input_type})")

            # Textarea fields (name attribute only).
            textareas = re.findall(r'<textarea[^>]*name\s*=\s*["\']([^"\']*)["\'][^>]*>',
                                   form_content, flags=re.IGNORECASE)
            if textareas:
                result_lines.append("Textarea Fields:")
                for name in textareas:
                    result_lines.append(f"  - {name}")

            # Select fields (name attribute only).
            selects = re.findall(r'<select[^>]*name\s*=\s*["\']([^"\']*)["\'][^>]*>',
                                 form_content, flags=re.IGNORECASE)
            if selects:
                result_lines.append("Select Fields:")
                for name in selects:
                    result_lines.append(f"  - {name}")

        return '\n'.join(result_lines)

    except Exception as e:
        self.logger.error(f"Error extracting forms: {e}")
        return f"Error extracting forms: {str(e)}"
|
|
471
|
-
|
|
472
|
-
def _remove_script_style_tags(self, html_content: str) -> str:
    """Strip each tag in self.script_style_tags together with its content."""
    strip_flags = re.IGNORECASE | re.DOTALL
    for tag_name in self.script_style_tags:
        html_content = re.sub(f'<{tag_name}[^>]*>.*?</{tag_name}>',
                              '', html_content, flags=strip_flags)
    return html_content
|
|
478
|
-
|
|
479
|
-
def _add_link_references(self, original_html: str, text_result: str) -> str:
|
|
480
|
-
"""Add link references to the text result."""
|
|
481
|
-
try:
|
|
482
|
-
# This is a simplified implementation
|
|
483
|
-
# In a full implementation, you might want to add footnote-style references
|
|
484
|
-
link_pattern = r'<a[^>]*href\s*=\s*["\']([^"\']*)["\'][^>]*>(.*?)</a>'
|
|
485
|
-
links = re.findall(link_pattern, original_html, flags=re.IGNORECASE | re.DOTALL)
|
|
486
|
-
|
|
487
|
-
if links:
|
|
488
|
-
text_result += "\n\nLinks found in document:\n"
|
|
489
|
-
for i, (href, link_text) in enumerate(links, 1):
|
|
490
|
-
link_text = re.sub(r'<[^>]+>', '', link_text).strip()
|
|
491
|
-
link_text = html.unescape(link_text)
|
|
492
|
-
text_result += f"{i}. {link_text}: {href}\n"
|
|
493
|
-
|
|
494
|
-
return text_result
|
|
495
|
-
except Exception:
|
|
496
|
-
return text_result
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
# Tool settings configuration
|
|
500
|
-
def get_default_settings():
|
|
501
|
-
"""Get default settings for the HTML Extraction Tool."""
|
|
502
|
-
return {
|
|
503
|
-
"extraction_method": "visible_text",
|
|
504
|
-
"preserve_links": False,
|
|
505
|
-
"remove_scripts": True,
|
|
506
|
-
"remove_comments": True,
|
|
507
|
-
"remove_style_attrs": True,
|
|
508
|
-
"remove_class_attrs": False,
|
|
509
|
-
"remove_id_attrs": False,
|
|
510
|
-
"remove_empty_tags": True,
|
|
511
|
-
"include_link_text": True,
|
|
512
|
-
"absolute_links_only": False,
|
|
513
|
-
"include_alt_text": True,
|
|
514
|
-
"include_title": False,
|
|
515
|
-
"include_heading_level": True,
|
|
516
|
-
"column_separator": "\t"
|
|
517
|
-
}
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
def get_settings_ui_config():
|
|
521
|
-
"""Get UI configuration for the HTML Extraction Tool settings."""
|
|
522
|
-
return {
|
|
523
|
-
"extraction_method": {
|
|
524
|
-
"type": "dropdown",
|
|
525
|
-
"label": "Extraction Method",
|
|
526
|
-
"options": [
|
|
527
|
-
("Extract Visible Text", "visible_text"),
|
|
528
|
-
("Clean HTML", "clean_html"),
|
|
529
|
-
("Extract Links", "extract_links"),
|
|
530
|
-
("Extract Images", "extract_images"),
|
|
531
|
-
("Extract Headings", "extract_headings"),
|
|
532
|
-
("Extract Tables", "extract_tables"),
|
|
533
|
-
("Extract Forms", "extract_forms")
|
|
534
|
-
],
|
|
535
|
-
"default": "visible_text"
|
|
536
|
-
},
|
|
537
|
-
"preserve_links": {
|
|
538
|
-
"type": "checkbox",
|
|
539
|
-
"label": "Add link references to visible text",
|
|
540
|
-
"default": False,
|
|
541
|
-
"show_when": {"extraction_method": "visible_text"}
|
|
542
|
-
},
|
|
543
|
-
"remove_scripts": {
|
|
544
|
-
"type": "checkbox",
|
|
545
|
-
"label": "Remove script and style tags",
|
|
546
|
-
"default": True,
|
|
547
|
-
"show_when": {"extraction_method": "clean_html"}
|
|
548
|
-
},
|
|
549
|
-
"remove_comments": {
|
|
550
|
-
"type": "checkbox",
|
|
551
|
-
"label": "Remove HTML comments",
|
|
552
|
-
"default": True,
|
|
553
|
-
"show_when": {"extraction_method": "clean_html"}
|
|
554
|
-
},
|
|
555
|
-
"remove_style_attrs": {
|
|
556
|
-
"type": "checkbox",
|
|
557
|
-
"label": "Remove style attributes",
|
|
558
|
-
"default": True,
|
|
559
|
-
"show_when": {"extraction_method": "clean_html"}
|
|
560
|
-
},
|
|
561
|
-
"remove_class_attrs": {
|
|
562
|
-
"type": "checkbox",
|
|
563
|
-
"label": "Remove class attributes",
|
|
564
|
-
"default": False,
|
|
565
|
-
"show_when": {"extraction_method": "clean_html"}
|
|
566
|
-
},
|
|
567
|
-
"remove_id_attrs": {
|
|
568
|
-
"type": "checkbox",
|
|
569
|
-
"label": "Remove ID attributes",
|
|
570
|
-
"default": False,
|
|
571
|
-
"show_when": {"extraction_method": "clean_html"}
|
|
572
|
-
},
|
|
573
|
-
"remove_empty_tags": {
|
|
574
|
-
"type": "checkbox",
|
|
575
|
-
"label": "Remove empty tags",
|
|
576
|
-
"default": True,
|
|
577
|
-
"show_when": {"extraction_method": "clean_html"}
|
|
578
|
-
},
|
|
579
|
-
"include_link_text": {
|
|
580
|
-
"type": "checkbox",
|
|
581
|
-
"label": "Include link text",
|
|
582
|
-
"default": True,
|
|
583
|
-
"show_when": {"extraction_method": "extract_links"}
|
|
584
|
-
},
|
|
585
|
-
"absolute_links_only": {
|
|
586
|
-
"type": "checkbox",
|
|
587
|
-
"label": "Only absolute links (http/https)",
|
|
588
|
-
"default": False,
|
|
589
|
-
"show_when": {"extraction_method": "extract_links"}
|
|
590
|
-
},
|
|
591
|
-
"include_alt_text": {
|
|
592
|
-
"type": "checkbox",
|
|
593
|
-
"label": "Include alt text",
|
|
594
|
-
"default": True,
|
|
595
|
-
"show_when": {"extraction_method": "extract_images"}
|
|
596
|
-
},
|
|
597
|
-
"include_title": {
|
|
598
|
-
"type": "checkbox",
|
|
599
|
-
"label": "Include title attribute",
|
|
600
|
-
"default": False,
|
|
601
|
-
"show_when": {"extraction_method": "extract_images"}
|
|
602
|
-
},
|
|
603
|
-
"include_heading_level": {
|
|
604
|
-
"type": "checkbox",
|
|
605
|
-
"label": "Include heading level (H1, H2, etc.)",
|
|
606
|
-
"default": True,
|
|
607
|
-
"show_when": {"extraction_method": "extract_headings"}
|
|
608
|
-
},
|
|
609
|
-
"column_separator": {
|
|
610
|
-
"type": "entry",
|
|
611
|
-
"label": "Column separator",
|
|
612
|
-
"default": "\t",
|
|
613
|
-
"show_when": {"extraction_method": "extract_tables"}
|
|
614
|
-
}
|
|
615
|
-
}
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
# BaseTool-compatible wrapper
|
|
619
|
-
try:
|
|
620
|
-
from tools.base_tool import ToolWithOptions
|
|
621
|
-
import tkinter as tk
|
|
622
|
-
from tkinter import ttk
|
|
623
|
-
|
|
624
|
-
class HTMLToolV2(ToolWithOptions):
|
|
625
|
-
"""
|
|
626
|
-
BaseTool-compatible version of HTMLExtractionTool.
|
|
627
|
-
"""
|
|
628
|
-
|
|
629
|
-
TOOL_NAME = "HTML Tool"
|
|
630
|
-
TOOL_DESCRIPTION = "Extract and process HTML content"
|
|
631
|
-
TOOL_VERSION = "2.0.0"
|
|
632
|
-
|
|
633
|
-
OPTIONS = [
|
|
634
|
-
("Visible Text", "visible_text"),
|
|
635
|
-
("Clean HTML", "clean_html"),
|
|
636
|
-
("Extract Links", "extract_links"),
|
|
637
|
-
("Extract Images", "extract_images"),
|
|
638
|
-
("Extract Headings", "extract_headings"),
|
|
639
|
-
("Extract Tables", "extract_tables"),
|
|
640
|
-
("Extract Forms", "extract_forms"),
|
|
641
|
-
]
|
|
642
|
-
OPTIONS_LABEL = "Operation"
|
|
643
|
-
USE_DROPDOWN = True
|
|
644
|
-
DEFAULT_OPTION = "visible_text"
|
|
645
|
-
|
|
646
|
-
def __init__(self):
|
|
647
|
-
super().__init__()
|
|
648
|
-
self._tool = HTMLExtractionTool()
|
|
649
|
-
|
|
650
|
-
def process_text(self, input_text: str, settings: Dict[str, Any]) -> str:
|
|
651
|
-
"""Process HTML content."""
|
|
652
|
-
mode = settings.get("mode", "visible_text")
|
|
653
|
-
tool_settings = {"extraction_method": mode}
|
|
654
|
-
return self._tool.process_text(input_text, tool_settings)
|
|
655
|
-
|
|
656
|
-
except ImportError:
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
HTML Extraction Tool Module for Pomera AI Commander
|
|
4
|
+
|
|
5
|
+
This module provides HTML processing capabilities including:
|
|
6
|
+
- Extracting visible text from HTML (as it would appear in a browser)
|
|
7
|
+
- Cleaning up HTML by removing unnecessary tags
|
|
8
|
+
- Extracting specific HTML elements
|
|
9
|
+
- Converting HTML to plain text with proper formatting
|
|
10
|
+
|
|
11
|
+
Author: Pomera AI Commander
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import re
|
|
15
|
+
import html
|
|
16
|
+
from typing import Dict, Any, List, Optional
|
|
17
|
+
import logging
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class HTMLExtractionTool:
|
|
21
|
+
"""
|
|
22
|
+
HTML Extraction Tool for processing HTML content and extracting useful information.
|
|
23
|
+
|
|
24
|
+
Features:
|
|
25
|
+
- Extract visible text from HTML (browser-rendered text)
|
|
26
|
+
- Clean HTML by removing unnecessary tags
|
|
27
|
+
- Extract specific elements (links, images, headings, etc.)
|
|
28
|
+
- Convert HTML to formatted plain text
|
|
29
|
+
- Remove scripts, styles, and other non-visible content
|
|
30
|
+
"""
|
|
31
|
+
|
|
32
|
+
def __init__(self, logger=None):
    """
    Initialize the HTML Extraction Tool.

    Args:
        logger: Logger instance for debugging; falls back to a
            module-level logger when omitted.
    """
    self.logger = logger if logger is not None else logging.getLogger(__name__)

    # Tags whose entire content is invisible in a browser; these are
    # removed wholesale (tag plus body) before text extraction.
    self.script_style_tags = ['script', 'style', 'noscript', 'meta', 'head', 'title']

    # Block-level elements: each occurrence forces a line break in the
    # extracted text output.
    self.block_tags = [
        'div', 'p', 'br', 'hr',
        'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'ul', 'ol', 'li', 'dl', 'dt', 'dd',
        'blockquote', 'pre',
        'table', 'tr', 'td', 'th', 'thead', 'tbody', 'tfoot',
        'section', 'article', 'header', 'footer', 'nav', 'aside',
        'main', 'figure', 'figcaption', 'address',
    ]

    # Inline elements: contribute text without breaking the line.
    self.inline_tags = [
        'span', 'a', 'strong', 'b', 'em', 'i', 'u', 'small', 'mark',
        'del', 'ins', 'sub', 'sup', 'code', 'kbd', 'samp', 'var',
        'abbr', 'acronym', 'cite', 'dfn', 'q', 'time',
    ]
|
|
59
|
+
|
|
60
|
+
def process_text(self, html_content: str, settings: Dict[str, Any]) -> str:
    """
    Process HTML content based on the selected extraction method.

    Args:
        html_content: HTML content to process
        settings: Tool settings dictionary; ``extraction_method`` selects
            the handler (defaults to ``"visible_text"``).

    Returns:
        Processed text based on the selected method, or an error message
        string if processing fails.
    """
    try:
        if not html_content.strip():
            return "No HTML content provided."

        # Map each method name onto the handler attribute; unknown values
        # fall back to visible-text extraction, matching the default.
        dispatch = {
            "visible_text": "extract_visible_text",
            "clean_html": "clean_html",
            "extract_links": "extract_links",
            "extract_images": "extract_images",
            "extract_headings": "extract_headings",
            "extract_tables": "extract_tables",
            "extract_forms": "extract_forms",
        }
        method = settings.get("extraction_method", "visible_text")
        handler = getattr(self, dispatch.get(method, "extract_visible_text"))
        return handler(html_content, settings)

    except Exception as e:
        self.logger.error(f"Error processing HTML: {e}")
        return f"Error processing HTML: {str(e)}"
|
|
97
|
+
|
|
98
|
+
def extract_visible_text(self, html_content: str, settings: Dict[str, Any]) -> str:
    """
    Extract visible text from HTML as it would appear in a browser.

    Args:
        html_content: HTML content to process
        settings: Tool settings; ``preserve_links`` appends a numbered
            list of the document's links to the output.

    Returns:
        Visible text with proper formatting, or an error message string.
    """
    try:
        # Keep the untouched markup: the link-reference pass below needs
        # to see the original <a href> tags.  (Bug fix: previously the
        # tag-stripped text was passed to _add_link_references, so no
        # links were ever found when preserve_links was enabled.)
        original_html = html_content

        # Remove script and style tags with their content
        html_content = self._remove_script_style_tags(html_content)

        # Remove HTML comments
        html_content = re.sub(r'<!--.*?-->', '', html_content, flags=re.DOTALL)

        # Block-level tags become line breaks so paragraphs stay separated.
        for tag in self.block_tags:
            html_content = re.sub(f'<{tag}[^>]*>', f'\n<{tag}>', html_content, flags=re.IGNORECASE)
            html_content = re.sub(f'</{tag}>', f'</{tag}>\n', html_content, flags=re.IGNORECASE)

        # List items render as bullet points.
        html_content = re.sub(r'<li[^>]*>', '\n• ', html_content, flags=re.IGNORECASE)
        html_content = re.sub(r'</li>', '', html_content, flags=re.IGNORECASE)

        # Table cells are separated with tabs.
        html_content = re.sub(r'<td[^>]*>', '\t', html_content, flags=re.IGNORECASE)
        html_content = re.sub(r'</td>', '', html_content, flags=re.IGNORECASE)
        html_content = re.sub(r'<th[^>]*>', '\t', html_content, flags=re.IGNORECASE)
        html_content = re.sub(r'</th>', '', html_content, flags=re.IGNORECASE)

        # Remove all remaining HTML tags
        html_content = re.sub(r'<[^>]+>', '', html_content)

        # Decode HTML entities (&amp;, &lt;, ...)
        html_content = html.unescape(html_content)

        # Drop blank lines and surrounding whitespace.
        cleaned_lines = [line.strip() for line in html_content.split('\n') if line.strip()]
        result = '\n'.join(cleaned_lines)

        # Collapse runs of 3+ newlines to a single blank line.
        result = re.sub(r'\n{3,}', '\n\n', result)

        if settings.get("preserve_links", False):
            # Pass the ORIGINAL markup so the anchor tags are still present.
            result = self._add_link_references(original_html, result)

        return result.strip()

    except Exception as e:
        self.logger.error(f"Error extracting visible text: {e}")
        return f"Error extracting visible text: {str(e)}"
|
|
162
|
+
|
|
163
|
+
def clean_html(self, html_content: str, settings: Dict[str, Any]) -> str:
    """
    Clean HTML by removing unnecessary tags and attributes.

    Args:
        html_content: HTML content to clean
        settings: Tool settings controlling which cleanups run

    Returns:
        Cleaned HTML, or an error message string on failure.
    """
    try:
        out = html_content

        # Scripts/styles and friends carry no visible content.
        if settings.get("remove_scripts", True):
            out = self._remove_script_style_tags(out)

        if settings.get("remove_comments", True):
            out = re.sub(r'<!--.*?-->', '', out, flags=re.DOTALL)

        # Optionally strip presentational / selector attributes.
        attribute_rules = [
            ("remove_style_attrs", True, r'\s+style\s*=\s*["\'][^"\']*["\']'),
            ("remove_class_attrs", False, r'\s+class\s*=\s*["\'][^"\']*["\']'),
            ("remove_id_attrs", False, r'\s+id\s*=\s*["\'][^"\']*["\']'),
        ]
        for setting_key, default, pattern in attribute_rules:
            if settings.get(setting_key, default):
                out = re.sub(pattern, '', out, flags=re.IGNORECASE)

        # Drop tags with nothing but whitespace between open and close.
        if settings.get("remove_empty_tags", True):
            out = re.sub(r'<(\w+)[^>]*>\s*</\1>', '', out, flags=re.IGNORECASE)

        # Collapse blank lines and inter-tag whitespace.
        out = re.sub(r'\n\s*\n', '\n', out)
        out = re.sub(r'>\s+<', '><', out)

        return out.strip()

    except Exception as e:
        self.logger.error(f"Error cleaning HTML: {e}")
        return f"Error cleaning HTML: {str(e)}"
|
|
207
|
+
|
|
208
|
+
def extract_links(self, html_content: str, settings: Dict[str, Any]) -> str:
    """
    Extract all links from HTML content.

    Args:
        html_content: HTML content to process
        settings: Tool settings (``include_link_text``,
            ``absolute_links_only``)

    Returns:
        One link per line ("text: href" or bare href), or a message when
        nothing matched.
    """
    try:
        anchor_pattern = r'<a[^>]*href\s*=\s*["\']([^"\']*)["\'][^>]*>(.*?)</a>'
        found = re.findall(anchor_pattern, html_content, flags=re.IGNORECASE | re.DOTALL)

        if not found:
            return "No links found in the HTML content."

        want_text = settings.get("include_link_text", True)
        absolute_only = settings.get("absolute_links_only", False)

        lines = []
        for href, raw_text in found:
            # Optionally skip relative / non-http(s) targets.
            if absolute_only and not href.startswith(('http://', 'https://')):
                continue

            # Strip nested markup and decode entities inside the anchor.
            text = html.unescape(re.sub(r'<[^>]+>', '', raw_text).strip())

            lines.append(f"{text}: {href}" if want_text and text else href)

        return '\n'.join(lines) if lines else "No links match the specified criteria."

    except Exception as e:
        self.logger.error(f"Error extracting links: {e}")
        return f"Error extracting links: {str(e)}"
|
|
250
|
+
|
|
251
|
+
def extract_images(self, html_content: str, settings: Dict[str, Any]) -> str:
    """
    Extract all images from HTML content.

    Args:
        html_content: HTML content to process
        settings: Tool settings (``include_alt_text``, ``include_title``)

    Returns:
        One image per line ("src | Alt: ... | Title: ..."), or a message
        when no images are present.
    """
    try:
        img_tags = re.findall(r'<img[^>]*>', html_content, flags=re.IGNORECASE)

        if not img_tags:
            return "No images found in the HTML content."

        want_alt = settings.get("include_alt_text", True)
        want_title = settings.get("include_title", False)

        def attr(tag, name, default=""):
            # Pull a quoted attribute value; `default` distinguishes a
            # missing attribute from an empty one.
            m = re.search(rf'{name}\s*=\s*["\']([^"\']*)["\']', tag, re.IGNORECASE)
            return m.group(1) if m else default

        lines = []
        for tag in img_tags:
            pieces = [attr(tag, 'src', "No src")]
            alt = attr(tag, 'alt')
            title = attr(tag, 'title')
            if want_alt and alt:
                pieces.append(f"Alt: {alt}")
            if want_title and title:
                pieces.append(f"Title: {title}")
            lines.append(" | ".join(pieces))

        return '\n'.join(lines)

    except Exception as e:
        self.logger.error(f"Error extracting images: {e}")
        return f"Error extracting images: {str(e)}"
|
|
301
|
+
|
|
302
|
+
def extract_headings(self, html_content: str, settings: Dict[str, Any]) -> str:
    """
    Extract all headings from HTML content.

    Args:
        html_content: HTML content to process
        settings: Tool settings (``include_heading_level``)

    Returns:
        One heading per line, optionally prefixed with its level
        ("H1: Title"), or a message when no headings exist.
    """
    try:
        # Backreference \1 ensures open/close tag levels match.
        found = re.findall(r'<(h[1-6])[^>]*>(.*?)</\1>', html_content,
                           flags=re.IGNORECASE | re.DOTALL)

        if not found:
            return "No headings found in the HTML content."

        with_level = settings.get("include_heading_level", True)

        out = []
        for tag, body in found:
            # Strip nested markup and decode entities.
            text = html.unescape(re.sub(r'<[^>]+>', '', body).strip())
            out.append(f"{tag.upper()}: {text}" if with_level else text)

        return '\n'.join(out)

    except Exception as e:
        self.logger.error(f"Error extracting headings: {e}")
        return f"Error extracting headings: {str(e)}"
|
|
340
|
+
|
|
341
|
+
def extract_tables(self, html_content: str, settings: Dict[str, Any]) -> str:
    """
    Extract table data from HTML content.

    Args:
        html_content: HTML content to process
        settings: Tool settings (``column_separator`` joins cells, tab by
            default)

    Returns:
        Rows of separator-joined cell text; multiple tables get
        "--- Table N ---" headers. A message when no tables exist.
    """
    try:
        tables = re.findall(r'<table[^>]*>(.*?)</table>', html_content,
                            flags=re.IGNORECASE | re.DOTALL)

        if not tables:
            return "No tables found in the HTML content."

        sep = settings.get("column_separator", "\t")
        multiple = len(tables) > 1

        out = []
        for table_no, table_body in enumerate(tables, 1):
            if multiple:
                out.append(f"\n--- Table {table_no} ---")

            rows = re.findall(r'<tr[^>]*>(.*?)</tr>', table_body,
                              flags=re.IGNORECASE | re.DOTALL)
            for row_body in rows:
                cells = re.findall(r'<(?:td|th)[^>]*>(.*?)</(?:td|th)>', row_body,
                                   flags=re.IGNORECASE | re.DOTALL)
                # Strip nested markup and decode entities per cell.
                texts = [html.unescape(re.sub(r'<[^>]+>', '', c).strip()) for c in cells]
                if texts:
                    out.append(sep.join(texts))

        return '\n'.join(out)

    except Exception as e:
        self.logger.error(f"Error extracting tables: {e}")
        return f"Error extracting tables: {str(e)}"
|
|
391
|
+
|
|
392
|
+
def extract_forms(self, html_content: str, settings: Dict[str, Any]) -> str:
    """
    Extract form information from HTML content.

    Args:
        html_content: HTML content to process
        settings: Tool settings (currently unused by this method)

    Returns:
        Form structure information (action, method, input/textarea/select
        fields), or a message when no forms exist.
    """
    try:
        # Capture each form's attribute string AND its inner content in
        # one pass.  (Bug fix: the attribute lookup previously re-searched
        # the whole document inside the loop, so every form reported the
        # FIRST form's action/method.)
        form_pattern = r'<form([^>]*)>(.*?)</form>'
        forms = re.findall(form_pattern, html_content, flags=re.IGNORECASE | re.DOTALL)

        if not forms:
            return "No forms found in the HTML content."

        result_lines = []

        for i, (form_attrs, form_content) in enumerate(forms):
            if len(forms) > 1:
                result_lines.append(f"\n--- Form {i + 1} ---")

            # Action / method come from THIS form's own attributes.
            action_match = re.search(r'action\s*=\s*["\']([^"\']*)["\']', form_attrs, re.IGNORECASE)
            if action_match:
                result_lines.append(f"Action: {action_match.group(1)}")

            method_match = re.search(r'method\s*=\s*["\']([^"\']*)["\']', form_attrs, re.IGNORECASE)
            if method_match:
                result_lines.append(f"Method: {method_match.group(1)}")

            # Input fields: report name and type, with HTML's defaults.
            inputs = re.findall(r'<input[^>]*>', form_content, flags=re.IGNORECASE)
            if inputs:
                result_lines.append("Input Fields:")
                for input_tag in inputs:
                    name_match = re.search(r'name\s*=\s*["\']([^"\']*)["\']', input_tag, re.IGNORECASE)
                    type_match = re.search(r'type\s*=\s*["\']([^"\']*)["\']', input_tag, re.IGNORECASE)

                    name = name_match.group(1) if name_match else "unnamed"
                    input_type = type_match.group(1) if type_match else "text"

                    result_lines.append(f" - {name} ({input_type})")

            # Textarea fields (only those with a name attribute).
            textarea_pattern = r'<textarea[^>]*name\s*=\s*["\']([^"\']*)["\'][^>]*>'
            textareas = re.findall(textarea_pattern, form_content, flags=re.IGNORECASE)
            if textareas:
                result_lines.append("Textarea Fields:")
                result_lines.extend(f" - {name}" for name in textareas)

            # Select fields (only those with a name attribute).
            select_pattern = r'<select[^>]*name\s*=\s*["\']([^"\']*)["\'][^>]*>'
            selects = re.findall(select_pattern, form_content, flags=re.IGNORECASE)
            if selects:
                result_lines.append("Select Fields:")
                result_lines.extend(f" - {name}" for name in selects)

        return '\n'.join(result_lines)

    except Exception as e:
        self.logger.error(f"Error extracting forms: {e}")
        return f"Error extracting forms: {str(e)}"
|
|
471
|
+
|
|
472
|
+
def _remove_script_style_tags(self, html_content: str) -> str:
    """Strip every configured invisible tag together with its contents."""
    result = html_content
    for tag in self.script_style_tags:
        result = re.sub(
            f'<{tag}[^>]*>.*?</{tag}>',
            '',
            result,
            flags=re.IGNORECASE | re.DOTALL,
        )
    return result
|
|
478
|
+
|
|
479
|
+
def _add_link_references(self, original_html: str, text_result: str) -> str:
    """Append a numbered list of the document's links to the text.

    A simplified footnote-style listing; on any failure the text is
    returned unchanged.
    """
    try:
        anchor_pattern = r'<a[^>]*href\s*=\s*["\']([^"\']*)["\'][^>]*>(.*?)</a>'
        found = re.findall(anchor_pattern, original_html,
                           flags=re.IGNORECASE | re.DOTALL)

        if not found:
            return text_result

        parts = [text_result, "\n\nLinks found in document:\n"]
        for i, (href, raw_text) in enumerate(found, 1):
            label = html.unescape(re.sub(r'<[^>]+>', '', raw_text).strip())
            parts.append(f"{i}. {label}: {href}\n")
        return "".join(parts)
    except Exception:
        # Best-effort feature: never let it break the main extraction.
        return text_result
|
|
497
|
+
|
|
498
|
+
|
|
499
|
+
# Tool settings configuration
|
|
500
|
+
def get_default_settings():
    """Return the default settings dict for the HTML Extraction Tool."""
    return dict(
        extraction_method="visible_text",
        preserve_links=False,
        remove_scripts=True,
        remove_comments=True,
        remove_style_attrs=True,
        remove_class_attrs=False,
        remove_id_attrs=False,
        remove_empty_tags=True,
        include_link_text=True,
        absolute_links_only=False,
        include_alt_text=True,
        include_title=False,
        include_heading_level=True,
        column_separator="\t",
    )
|
|
518
|
+
|
|
519
|
+
|
|
520
|
+
def get_settings_ui_config():
    """Return the UI configuration describing each tool setting widget."""

    def _checkbox(label, default, method):
        # All checkbox settings share the same shape and are shown only
        # for a single extraction method.
        return {
            "type": "checkbox",
            "label": label,
            "default": default,
            "show_when": {"extraction_method": method},
        }

    return {
        "extraction_method": {
            "type": "dropdown",
            "label": "Extraction Method",
            "options": [
                ("Extract Visible Text", "visible_text"),
                ("Clean HTML", "clean_html"),
                ("Extract Links", "extract_links"),
                ("Extract Images", "extract_images"),
                ("Extract Headings", "extract_headings"),
                ("Extract Tables", "extract_tables"),
                ("Extract Forms", "extract_forms"),
            ],
            "default": "visible_text",
        },
        "preserve_links": _checkbox("Add link references to visible text", False, "visible_text"),
        "remove_scripts": _checkbox("Remove script and style tags", True, "clean_html"),
        "remove_comments": _checkbox("Remove HTML comments", True, "clean_html"),
        "remove_style_attrs": _checkbox("Remove style attributes", True, "clean_html"),
        "remove_class_attrs": _checkbox("Remove class attributes", False, "clean_html"),
        "remove_id_attrs": _checkbox("Remove ID attributes", False, "clean_html"),
        "remove_empty_tags": _checkbox("Remove empty tags", True, "clean_html"),
        "include_link_text": _checkbox("Include link text", True, "extract_links"),
        "absolute_links_only": _checkbox("Only absolute links (http/https)", False, "extract_links"),
        "include_alt_text": _checkbox("Include alt text", True, "extract_images"),
        "include_title": _checkbox("Include title attribute", False, "extract_images"),
        "include_heading_level": _checkbox("Include heading level (H1, H2, etc.)", True, "extract_headings"),
        "column_separator": {
            "type": "entry",
            "label": "Column separator",
            "default": "\t",
            "show_when": {"extraction_method": "extract_tables"},
        },
    }
|
|
616
|
+
|
|
617
|
+
|
|
618
|
+
# BaseTool-compatible wrapper.
# Defined only when the optional tool framework is importable; on a
# headless / minimal install the ImportError below silently skips it.
try:
    from tools.base_tool import ToolWithOptions
    # NOTE(review): tk/ttk are imported but never referenced in this block —
    # presumably an availability probe so the wrapper is skipped when no GUI
    # toolkit is installed; confirm before removing.
    import tkinter as tk
    from tkinter import ttk

    class HTMLToolV2(ToolWithOptions):
        """
        BaseTool-compatible version of HTMLExtractionTool.

        Adapts the standalone HTMLExtractionTool to the ToolWithOptions
        interface: the selected dropdown option ("mode") is mapped onto
        the underlying tool's "extraction_method" setting.
        """

        TOOL_NAME = "HTML Tool"
        TOOL_DESCRIPTION = "Extract and process HTML content"
        TOOL_VERSION = "2.0.0"

        # (label, value) pairs shown in the operation dropdown; values
        # match the extraction_method names understood by the tool.
        OPTIONS = [
            ("Visible Text", "visible_text"),
            ("Clean HTML", "clean_html"),
            ("Extract Links", "extract_links"),
            ("Extract Images", "extract_images"),
            ("Extract Headings", "extract_headings"),
            ("Extract Tables", "extract_tables"),
            ("Extract Forms", "extract_forms"),
        ]
        OPTIONS_LABEL = "Operation"
        USE_DROPDOWN = True
        DEFAULT_OPTION = "visible_text"

        def __init__(self):
            super().__init__()
            # Delegate all actual HTML processing to the standalone tool.
            self._tool = HTMLExtractionTool()

        def process_text(self, input_text: str, settings: Dict[str, Any]) -> str:
            """Process HTML content using the operation selected in *settings*.

            Only the dropdown selection ("mode") is forwarded; every other
            setting of the underlying tool keeps its default value.
            """
            mode = settings.get("mode", "visible_text")
            tool_settings = {"extraction_method": mode}
            return self._tool.process_text(input_text, tool_settings)

except ImportError:
    pass
|