pomera-ai-commander 1.2.7 → 1.2.9
This diff shows the changes between publicly released versions of the package, as they appear in the supported public registries. It is provided for informational purposes only.
- package/bin/pomera-create-shortcut.js +51 -0
- package/bin/pomera.js +68 -0
- package/core/database_schema.py +24 -1
- package/core/database_schema_manager.py +4 -2
- package/core/database_settings_manager.py +25 -2
- package/core/dialog_manager.py +4 -4
- package/core/efficient_line_numbers.py +5 -4
- package/core/load_presets_dialog.py +460 -0
- package/core/mcp/tool_registry.py +327 -0
- package/core/settings_defaults_registry.py +159 -15
- package/core/tool_search_widget.py +85 -5
- package/create_shortcut.py +12 -4
- package/mcp.json +1 -1
- package/package.json +4 -2
- package/pomera.py +760 -25
- package/tools/base64_tools.py +4 -4
- package/tools/case_tool.py +4 -4
- package/tools/curl_settings.py +12 -1
- package/tools/curl_tool.py +176 -11
- package/tools/notes_widget.py +8 -1
- package/tools/tool_loader.py +20 -9
- package/tools/url_content_reader.py +402 -0
- package/tools/web_search.py +522 -0
package/tools/url_content_reader.py
@@ -0,0 +1,402 @@
+"""
+URL Content Reader Module for Pomera AI Commander
+
+Fetches web content and converts HTML to Markdown.
+Features:
+- HTTP/HTTPS URL fetching
+- Main content extraction (skips nav, header, footer)
+- HTML to Markdown conversion
+- Proper error handling and timeout support
+
+Author: Pomera AI Commander
+"""
+
+import re
+import urllib.request
+import urllib.error
+from typing import Optional, List, Tuple
+from html.parser import HTMLParser
+from html import unescape
+import logging
+
+
+class HTMLToMarkdownConverter(HTMLParser):
+    """Convert HTML to Markdown format."""
+
+    # Tags to completely skip (including content)
+    SKIP_TAGS = {'script', 'style', 'noscript', 'iframe', 'svg', 'canvas',
+                 'nav', 'header', 'footer', 'aside', 'form', 'button'}
+
+    # Block-level tags that need newlines
+    BLOCK_TAGS = {'p', 'div', 'section', 'article', 'main', 'h1', 'h2', 'h3',
+                  'h4', 'h5', 'h6', 'blockquote', 'pre', 'li', 'tr', 'td', 'th'}
+
+    def __init__(self):
+        super().__init__()
+        self.output: List[str] = []
+        self.tag_stack: List[str] = []
+        self.skip_depth = 0
+        self.list_depth = 0
+        self.in_pre = False
+        self.in_code = False
+        self.current_link_url = ""
+        self.current_link_text = ""
+        self.in_link = False
+
+    def handle_starttag(self, tag, attrs):
+        tag = tag.lower()
+
+        # Track skip depth for nested skip tags
+        if tag in self.SKIP_TAGS:
+            self.skip_depth += 1
+            return
+
+        if self.skip_depth > 0:
+            return
+
+        self.tag_stack.append(tag)
+        attrs_dict = dict(attrs)
+
+        # Headings
+        if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
+            level = int(tag[1])
+            self.output.append('\n\n' + '#' * level + ' ')
+
+        # Paragraphs and divs
+        elif tag in ('p', 'div', 'section', 'article', 'main'):
+            self.output.append('\n\n')
+
+        # Line break
+        elif tag == 'br':
+            self.output.append('\n')
+
+        # Horizontal rule
+        elif tag == 'hr':
+            self.output.append('\n\n---\n\n')
+
+        # Bold
+        elif tag in ('strong', 'b'):
+            self.output.append('**')
+
+        # Italic
+        elif tag in ('em', 'i'):
+            self.output.append('*')
+
+        # Code
+        elif tag == 'code':
+            if not self.in_pre:
+                self.output.append('`')
+            self.in_code = True
+
+        # Preformatted
+        elif tag == 'pre':
+            self.output.append('\n\n```\n')
+            self.in_pre = True
+
+        # Links
+        elif tag == 'a':
+            href = attrs_dict.get('href', '')
+            if href and not href.startswith('#') and not href.startswith('javascript:'):
+                self.current_link_url = href
+                self.current_link_text = ""
+                self.in_link = True
+                self.output.append('[')
+
+        # Images
+        elif tag == 'img':
+            src = attrs_dict.get('src', '')
+            alt = attrs_dict.get('alt', 'image')
+            if src:
+                self.output.append(f'\n\n![{alt}]({src})\n\n')
+
+        # Unordered list
+        elif tag == 'ul':
+            self.list_depth += 1
+            self.output.append('\n')
+
+        # Ordered list
+        elif tag == 'ol':
+            self.list_depth += 1
+            self.output.append('\n')
+
+        # List item
+        elif tag == 'li':
+            indent = '  ' * (self.list_depth - 1)
+            parent = self.tag_stack[-2] if len(self.tag_stack) > 1 else 'ul'
+            if parent == 'ol':
+                self.output.append(f'\n{indent}1. ')
+            else:
+                self.output.append(f'\n{indent}- ')
+
+        # Blockquote
+        elif tag == 'blockquote':
+            self.output.append('\n\n> ')
+
+        # Table elements
+        elif tag == 'table':
+            self.output.append('\n\n')
+        elif tag == 'tr':
+            self.output.append('\n')
+        elif tag in ('td', 'th'):
+            self.output.append(' | ')
+
+    def handle_endtag(self, tag):
+        tag = tag.lower()
+
+        if tag in self.SKIP_TAGS:
+            self.skip_depth = max(0, self.skip_depth - 1)
+            return
+
+        if self.skip_depth > 0:
+            return
+
+        if self.tag_stack and self.tag_stack[-1] == tag:
+            self.tag_stack.pop()
+
+        # Bold
+        if tag in ('strong', 'b'):
+            self.output.append('**')
+
+        # Italic
+        elif tag in ('em', 'i'):
+            self.output.append('*')
+
+        # Code
+        elif tag == 'code':
+            if not self.in_pre:
+                self.output.append('`')
+            self.in_code = False
+
+        # Preformatted
+        elif tag == 'pre':
+            self.output.append('\n```\n\n')
+            self.in_pre = False
+
+        # Links
+        elif tag == 'a' and self.in_link:
+            self.output.append(f']({self.current_link_url})')
+            self.in_link = False
+            self.current_link_url = ""
+
+        # Lists
+        elif tag in ('ul', 'ol'):
+            self.list_depth = max(0, self.list_depth - 1)
+            if self.list_depth == 0:
+                self.output.append('\n')
+
+        # Block elements
+        elif tag in self.BLOCK_TAGS:
+            self.output.append('\n')
+
+    def handle_data(self, data):
+        if self.skip_depth > 0:
+            return
+
+        # Preserve whitespace in pre/code blocks
+        if self.in_pre:
+            self.output.append(data)
+        else:
+            # Normalize whitespace
+            text = re.sub(r'\s+', ' ', data)
+            if text.strip():
+                self.output.append(text)
+
+    def handle_entityref(self, name):
+        if self.skip_depth > 0:
+            return
+        char = unescape(f'&{name};')
+        self.output.append(char)
+
+    def handle_charref(self, name):
+        if self.skip_depth > 0:
+            return
+        char = unescape(f'&#{name};')
+        self.output.append(char)
+
+    def get_markdown(self) -> str:
+        """Get the converted markdown text."""
+        text = ''.join(self.output)
+
+        # Clean up excessive newlines
+        text = re.sub(r'\n{3,}', '\n\n', text)
+
+        # Clean up spaces around markdown elements
+        text = re.sub(r'\*\* +', '**', text)
+        text = re.sub(r' +\*\*', '**', text)
+        text = re.sub(r'\* +', '*', text)
+        text = re.sub(r' +\*', '*', text)
+
+        # Clean up empty list items
+        text = re.sub(r'\n- \n', '\n', text)
+        text = re.sub(r'\n1\. \n', '\n', text)
+
+        return text.strip()
+
+
+class URLContentReader:
+    """Fetch URLs and convert content to Markdown."""
+
+    USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
+
+    def __init__(self, logger=None):
+        self.logger = logger or logging.getLogger(__name__)
+
+    def fetch_url(self, url: str, timeout: int = 30) -> str:
+        """
+        Fetch content from a URL.
+
+        Args:
+            url: URL to fetch
+            timeout: Request timeout in seconds
+
+        Returns:
+            HTML content as string
+        """
+        # Validate URL
+        if not url.startswith(('http://', 'https://')):
+            url = 'https://' + url
+
+        try:
+            req = urllib.request.Request(url)
+            req.add_header('User-Agent', self.USER_AGENT)
+            req.add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
+            req.add_header('Accept-Language', 'en-US,en;q=0.5')
+
+            with urllib.request.urlopen(req, timeout=timeout) as response:
+                # Detect encoding
+                charset = response.headers.get_content_charset()
+                if not charset:
+                    charset = 'utf-8'
+
+                content = response.read()
+                try:
+                    return content.decode(charset, errors='replace')
+                except (UnicodeDecodeError, LookupError):
+                    return content.decode('utf-8', errors='replace')
+
+        except urllib.error.HTTPError as e:
+            raise Exception(f"HTTP Error {e.code}: {e.reason}")
+        except urllib.error.URLError as e:
+            raise Exception(f"URL Error: {e.reason}")
+        except Exception as e:
+            raise Exception(f"Fetch error: {str(e)}")
+
+    def html_to_markdown(self, html: str, extract_main_content: bool = True) -> str:
+        """
+        Convert HTML to Markdown.
+
+        Args:
+            html: HTML content
+            extract_main_content: If True, try to extract main content area
+
+        Returns:
+            Markdown formatted text
+        """
+        if extract_main_content:
+            html = self._extract_main_content(html)
+
+        converter = HTMLToMarkdownConverter()
+        try:
+            converter.feed(html)
+            return converter.get_markdown()
+        except Exception as e:
+            self.logger.error(f"HTML parsing error: {e}")
+            # Fallback: simple text extraction
+            return self._simple_text_extraction(html)
+
+    def _extract_main_content(self, html: str) -> str:
+        """Try to extract main content area from HTML."""
+        # Try to find main content containers
+        patterns = [
+            r'<main[^>]*>(.*?)</main>',
+            r'<article[^>]*>(.*?)</article>',
+            r'<div[^>]*class="[^"]*content[^"]*"[^>]*>(.*?)</div>',
+            r'<div[^>]*id="[^"]*content[^"]*"[^>]*>(.*?)</div>',
+            r'<div[^>]*class="[^"]*main[^"]*"[^>]*>(.*?)</div>',
+            r'<body[^>]*>(.*?)</body>',
+        ]
+
+        for pattern in patterns:
+            match = re.search(pattern, html, re.IGNORECASE | re.DOTALL)
+            if match:
+                return match.group(1)
+
+        return html
+
+    def _simple_text_extraction(self, html: str) -> str:
+        """Simple fallback text extraction."""
+        # Remove script and style
+        text = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.IGNORECASE | re.DOTALL)
+        text = re.sub(r'<style[^>]*>.*?</style>', '', text, flags=re.IGNORECASE | re.DOTALL)
+
+        # Remove HTML tags
+        text = re.sub(r'<[^>]+>', ' ', text)
+
+        # Decode entities
+        text = unescape(text)
+
+        # Normalize whitespace
+        text = re.sub(r'\s+', ' ', text)
+
+        return text.strip()
+
+    def fetch_and_convert(self, url: str, timeout: int = 30,
+                          extract_main_content: bool = True) -> str:
+        """
+        Fetch URL and convert to Markdown in one step.
+
+        Args:
+            url: URL to fetch
+            timeout: Request timeout in seconds
+            extract_main_content: If True, extract main content only
+
+        Returns:
+            Markdown formatted content
+        """
+        html = self.fetch_url(url, timeout)
+        return self.html_to_markdown(html, extract_main_content)
+
+
+# CLI support
+def main():
+    """CLI entry point for URL content reading."""
+    import argparse
+
+    parser = argparse.ArgumentParser(description="Fetch URLs and convert to Markdown")
+    parser.add_argument("url", nargs="?", help="URL to fetch")
+    parser.add_argument("--timeout", "-t", type=int, default=30, help="Timeout in seconds")
+    parser.add_argument("--no-extract", action="store_true",
+                        help="Don't try to extract main content")
+    parser.add_argument("--output", "-o", type=str, help="Output file path")
+
+    args = parser.parse_args()
+
+    if not args.url:
+        parser.print_help()
+        return
+
+    reader = URLContentReader()
+
+    try:
+        markdown = reader.fetch_and_convert(
+            args.url,
+            timeout=args.timeout,
+            extract_main_content=not args.no_extract
+        )
+
+        if args.output:
+            with open(args.output, 'w', encoding='utf-8') as f:
+                f.write(markdown)
+            print(f"Saved to: {args.output}")
+        else:
+            print(markdown)

+    except Exception as e:
+        print(f"Error: {e}")
+        return 1
+
+    return 0
+
+
+if __name__ == "__main__":
+    exit(main() or 0)
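For reference, a minimal usage sketch for the new module, based only on the definitions visible in the hunk above. The import path is an assumption inferred from the file's location (package/tools/url_content_reader.py) and is not confirmed by the diff; example.com is a placeholder URL.

    # Hypothetical import path -- inferred from package/tools/, not confirmed by the diff.
    from tools.url_content_reader import HTMLToMarkdownConverter, URLContentReader

    # Offline check of the converter alone: feed HTML, read back Markdown.
    converter = HTMLToMarkdownConverter()
    converter.feed("<h1>Title</h1><p>Hello world</p>")
    print(converter.get_markdown())  # -> "# Title\n\nHello world"

    # Full fetch-and-convert path (requires network access). Defaults per the
    # signatures above: timeout=30, extract_main_content=True.
    reader = URLContentReader()
    markdown = reader.fetch_and_convert("https://example.com", timeout=10)

The module also runs directly as a CLI, per its argparse setup: python package/tools/url_content_reader.py https://example.com --timeout 10 -o page.md, with --no-extract to skip main-content extraction.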