pomera-ai-commander 1.2.8 → 1.2.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +113 -89
- package/core/database_schema.py +24 -1
- package/core/database_schema_manager.py +4 -2
- package/core/database_settings_manager.py +25 -2
- package/core/dialog_manager.py +4 -4
- package/core/efficient_line_numbers.py +5 -4
- package/core/load_presets_dialog.py +460 -0
- package/core/mcp/tool_registry.py +327 -0
- package/core/settings_defaults_registry.py +159 -15
- package/mcp.json +1 -1
- package/package.json +2 -1
- package/pomera.py +755 -22
- package/tools/case_tool.py +4 -4
- package/tools/curl_settings.py +12 -1
- package/tools/curl_tool.py +176 -11
- package/tools/tool_loader.py +18 -0
- package/tools/url_content_reader.py +402 -0
- package/tools/web_search.py +522 -0
package/tools/case_tool.py
CHANGED
|
@@ -188,7 +188,8 @@ class CaseTool:
|
|
|
188
188
|
"""Get default settings for the Case Tool.
|
|
189
189
|
|
|
190
190
|
Uses the centralized Settings Defaults Registry if available,
|
|
191
|
-
otherwise falls back to
|
|
191
|
+
otherwise falls back to minimal defaults. Full exclusions list
|
|
192
|
+
is maintained only in the registry.
|
|
192
193
|
"""
|
|
193
194
|
try:
|
|
194
195
|
from core.settings_defaults_registry import get_registry
|
|
@@ -199,11 +200,10 @@ class CaseTool:
|
|
|
199
200
|
except Exception:
|
|
200
201
|
pass
|
|
201
202
|
|
|
202
|
-
#
|
|
203
|
-
# Exclusions: a, an, the, and, but, or, for, nor, on, at, to, from, by, with, in, of
|
|
203
|
+
# Minimal fallback - registry has the full exclusions list
|
|
204
204
|
return {
|
|
205
205
|
"mode": "Sentence",
|
|
206
|
-
"exclusions": "
|
|
206
|
+
"exclusions": ""
|
|
207
207
|
}
|
|
208
208
|
|
|
209
209
|
|
package/tools/curl_settings.py
CHANGED
|
@@ -91,7 +91,18 @@ class CurlSettingsManager:
|
|
|
91
91
|
# Version and metadata
|
|
92
92
|
"settings_version": "1.0",
|
|
93
93
|
"last_updated": None,
|
|
94
|
-
"created_date": None
|
|
94
|
+
"created_date": None,
|
|
95
|
+
|
|
96
|
+
# UI State Persistence (NEW - persist between restarts)
|
|
97
|
+
"last_url": "",
|
|
98
|
+
"last_method": "GET",
|
|
99
|
+
"last_headers": "",
|
|
100
|
+
"last_body": "",
|
|
101
|
+
"last_body_type": "None",
|
|
102
|
+
"last_auth_type": "None",
|
|
103
|
+
"last_auth_data": {}, # Encrypted auth tokens stored here
|
|
104
|
+
"last_complex_options": "",
|
|
105
|
+
"persist_ui_state": True # User preference to persist UI state
|
|
95
106
|
}
|
|
96
107
|
|
|
97
108
|
# Current settings (loaded from file or defaults)
|
package/tools/curl_tool.py
CHANGED
|
@@ -284,6 +284,9 @@ class CurlToolWidget:
|
|
|
284
284
|
# Create the UI
|
|
285
285
|
self.create_widgets()
|
|
286
286
|
|
|
287
|
+
# Restore saved UI state (URL, method, headers, body, auth)
|
|
288
|
+
self._restore_ui_state()
|
|
289
|
+
|
|
287
290
|
# Save settings when the window is closed
|
|
288
291
|
if hasattr(self.parent, 'protocol'):
|
|
289
292
|
self.parent.protocol("WM_DELETE_WINDOW", self._on_closing)
|
|
@@ -4232,17 +4235,8 @@ curl -X POST https://api.example.com/users \\
|
|
|
4232
4235
|
if self.logger:
|
|
4233
4236
|
self.logger.error(f"Error saving settings: {e}")
|
|
4234
4237
|
|
|
4235
|
-
|
|
4236
|
-
|
|
4237
|
-
try:
|
|
4238
|
-
self._save_current_settings()
|
|
4239
|
-
except Exception as e:
|
|
4240
|
-
if self.logger:
|
|
4241
|
-
self.logger.error(f"Error saving settings on close: {e}")
|
|
4242
|
-
|
|
4243
|
-
# Continue with normal closing
|
|
4244
|
-
if hasattr(self.parent, 'destroy'):
|
|
4245
|
-
self.parent.destroy()
|
|
4238
|
+
# NOTE: _on_closing method has been moved to end of class (line ~5490)
|
|
4239
|
+
# to consolidate with UI state persistence logic
|
|
4246
4240
|
|
|
4247
4241
|
def _export_curl(self):
|
|
4248
4242
|
"""Export current request as cURL command."""
|
|
@@ -5483,6 +5477,177 @@ Timestamp: {timestamp}{additional_details}"""
|
|
|
5483
5477
|
if item:
|
|
5484
5478
|
self.history_tree.selection_set(item)
|
|
5485
5479
|
self.history_context_menu.post(event.x_root, event.y_root)
|
|
5480
|
+
|
|
5481
|
+
def _on_closing(self):
    """Handle widget/window closing - save all settings and UI state.

    Runs three independent steps in order:

    1. ``self._save_current_settings()``
    2. ``self._save_ui_state()``
    3. ``self.parent.destroy()`` (only when the parent exposes ``destroy``)

    Both save steps are best-effort: a failure is logged (when a logger is
    configured) and never prevents the parent from being destroyed.
    """
    # Step 1: regular tool settings.
    try:
        self._save_current_settings()
    except Exception as e:
        if self.logger:
            self.logger.error(f"Error saving settings on close: {e}")

    # Step 2: UI state (URL, method, headers, body, auth).
    # The logger is optional, so every logging call is guarded; an unguarded
    # call inside the except clause would raise and skip the destroy below.
    try:
        self._save_ui_state()
        if self.logger:
            self.logger.info("cURL Tool UI state saved on close")
    except Exception as e:
        if self.logger:
            self.logger.error(f"Error saving UI state on close: {e}")

    # Step 3: continue with normal closing.
    if hasattr(self.parent, 'destroy'):
        self.parent.destroy()
|
|
5500
|
+
|
|
5501
|
+
def _save_ui_state(self):
    """Save current UI state to settings for persistence.

    Collects the URL, method, headers, body, body type, auth type, auth data
    and complex options from the widget, writes each one through
    ``self.settings_manager.set_setting`` under its ``last_*`` key, then calls
    ``self.settings_manager.save_settings()``.

    Does nothing when there is no settings manager or when the
    ``persist_ui_state`` setting is false.  Secrets (bearer token, basic
    password, API key value) are passed through ``encrypt_auth_value`` before
    being stored; the basic username and the API key name/location are stored
    as entered.  Any exception is logged (when a logger is configured) and
    swallowed so that saving UI state can never break the caller.
    """
    if not self.settings_manager:
        return

    # Check if UI state persistence is enabled
    if not self.settings.get("persist_ui_state", True):
        return

    def read_text(attr_name):
        """Return the stripped content of a text widget, or "" if it is absent or destroyed."""
        widget = getattr(self, attr_name, None)
        if widget is not None and widget.winfo_exists():
            return widget.get("1.0", tk.END).strip()
        return ""

    def read_var(attr_name, default=""):
        """Return the value of a Tk variable attribute, or `default` if it is absent."""
        var = getattr(self, attr_name, None)
        return var.get() if var is not None else default

    try:
        auth_type = read_var('auth_type_var', "None")

        # Only the fields belonging to the selected auth type are stored.
        auth_data = {}
        if auth_type == "Bearer":
            token = read_var('bearer_token_var')
            if token:
                auth_data["bearer_token"] = encrypt_auth_value(token)
        elif auth_type == "Basic":
            username = read_var('basic_username_var')
            password = read_var('basic_password_var')
            if username or password:
                auth_data["basic_username"] = username
                auth_data["basic_password"] = encrypt_auth_value(password)
        elif auth_type == "API Key":
            key_value = read_var('api_key_value_var')
            if key_value:
                auth_data["api_key_name"] = read_var('api_key_name_var')
                auth_data["api_key_value"] = encrypt_auth_value(key_value)
                auth_data["api_key_location"] = read_var('api_key_location_var', "header")

        ui_state = {
            "last_url": read_text('url_text'),
            "last_method": read_var('method_var', "GET"),
            "last_headers": read_text('headers_text'),
            "last_body": read_text('body_text'),
            "last_body_type": read_var('body_type_var', "None"),
            "last_auth_type": auth_type,
            "last_auth_data": auth_data,
            "last_complex_options": read_var('complex_options_var'),
        }
        for key, value in ui_state.items():
            self.settings_manager.set_setting(key, value)

        # Save to persistent storage
        self.settings_manager.save_settings()
        if self.logger:
            self.logger.debug("UI state saved successfully")

    except Exception as e:
        # The logger is optional; guard it so the handler itself cannot raise.
        if self.logger:
            self.logger.error(f"Error saving UI state: {e}")
|
|
5575
|
+
|
|
5576
|
+
def _restore_ui_state(self):
    """Restore UI state from saved settings.

    Reads the ``last_*`` keys from ``self.settings`` and pushes them back
    into the widgets and Tk variables.  Text widgets (URL, headers, body) and
    the complex-options variable are only touched when a non-empty value was
    saved; method, body type and auth type are always set (falling back to
    ``"GET"`` / ``"None"``).  Stored secrets are passed through
    ``decrypt_auth_value`` before being placed in their variables.

    Does nothing when there is no settings manager or when the
    ``persist_ui_state`` setting is false.  Any exception is logged (when a
    logger is configured) and swallowed so a corrupt settings entry cannot
    prevent the widget from opening.
    """
    if not self.settings_manager:
        return

    # Check if UI state persistence is enabled
    if not self.settings.get("persist_ui_state", True):
        return

    def fill_text(attr_name, text):
        """Replace a text widget's content with `text` if both the text and a live widget exist."""
        widget = getattr(self, attr_name, None)
        if text and widget is not None and widget.winfo_exists():
            widget.delete("1.0", tk.END)
            widget.insert("1.0", text)

    def set_var(attr_name, value):
        """Set a Tk variable attribute when it exists."""
        var = getattr(self, attr_name, None)
        if var is not None:
            var.set(value)

    try:
        saved = self.settings

        fill_text('url_text', saved.get("last_url", ""))
        set_var('method_var', saved.get("last_method", "GET"))
        fill_text('headers_text', saved.get("last_headers", ""))
        fill_text('body_text', saved.get("last_body", ""))
        set_var('body_type_var', saved.get("last_body_type", "None"))

        last_auth_type = saved.get("last_auth_type", "None")
        set_var('auth_type_var', last_auth_type)

        # Restore auth data (decrypted); only the fields of the saved auth
        # type are applied, and decryption is skipped when the target
        # variable does not exist.
        last_auth_data = saved.get("last_auth_data", {})
        if last_auth_data:
            if last_auth_type == "Bearer":
                if hasattr(self, 'bearer_token_var'):
                    token = last_auth_data.get("bearer_token", "")
                    set_var('bearer_token_var', decrypt_auth_value(token))
            elif last_auth_type == "Basic":
                set_var('basic_username_var', last_auth_data.get("basic_username", ""))
                if hasattr(self, 'basic_password_var'):
                    password = last_auth_data.get("basic_password", "")
                    set_var('basic_password_var', decrypt_auth_value(password))
            elif last_auth_type == "API Key":
                set_var('api_key_name_var', last_auth_data.get("api_key_name", ""))
                if hasattr(self, 'api_key_value_var'):
                    key_value = last_auth_data.get("api_key_value", "")
                    set_var('api_key_value_var', decrypt_auth_value(key_value))
                set_var('api_key_location_var', last_auth_data.get("api_key_location", "header"))

        last_complex_options = saved.get("last_complex_options", "")
        if last_complex_options:
            set_var('complex_options_var', last_complex_options)

        if self.logger:
            self.logger.debug("UI state restored successfully")

    except Exception as e:
        # The logger is optional; guard it so the handler itself cannot raise.
        if self.logger:
            self.logger.error(f"Error restoring UI state: {e}")
|
|
5486
5651
|
|
|
5487
5652
|
|
|
5488
5653
|
# For standalone testing
|
package/tools/tool_loader.py
CHANGED
|
@@ -366,6 +366,24 @@ TOOL_SPECS: Dict[str, ToolSpec] = {
|
|
|
366
366
|
description="Model Context Protocol server management",
|
|
367
367
|
available_flag="MCP_WIDGET_MODULE_AVAILABLE"
|
|
368
368
|
),
|
|
369
|
+
|
|
370
|
+
# Web Tools - handled inline in pomera.py (tabbed interface like AI Tools)
|
|
371
|
+
"Web Search": ToolSpec(
|
|
372
|
+
name="Web Search",
|
|
373
|
+
module_path="tools.web_search", # Core module, UI created inline
|
|
374
|
+
class_name="search", # Function, not class
|
|
375
|
+
category=ToolCategory.UTILITY,
|
|
376
|
+
description="Search the web using DuckDuckGo, Tavily, Google, Brave, SerpApi, Serper",
|
|
377
|
+
available_flag="" # Always available
|
|
378
|
+
),
|
|
379
|
+
"URL Reader": ToolSpec(
|
|
380
|
+
name="URL Reader",
|
|
381
|
+
module_path="tools.url_content_reader",
|
|
382
|
+
class_name="URLContentReader",
|
|
383
|
+
category=ToolCategory.UTILITY,
|
|
384
|
+
description="Fetch URL content and convert to HTML, JSON, or Markdown",
|
|
385
|
+
available_flag="" # Always available
|
|
386
|
+
),
|
|
369
387
|
}
|
|
370
388
|
|
|
371
389
|
# These sub-tools appear as tabs within their parent tool
|
package/tools/url_content_reader.py
ADDED
|
@@ -0,0 +1,402 @@
|
|
|
1
|
+
"""
|
|
2
|
+
URL Content Reader Module for Pomera AI Commander
|
|
3
|
+
|
|
4
|
+
Fetches web content and converts HTML to Markdown.
|
|
5
|
+
Features:
|
|
6
|
+
- HTTP/HTTPS URL fetching
|
|
7
|
+
- Main content extraction (skips nav, header, footer)
|
|
8
|
+
- HTML to Markdown conversion
|
|
9
|
+
- Proper error handling and timeout support
|
|
10
|
+
|
|
11
|
+
Author: Pomera AI Commander
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import re
|
|
15
|
+
import urllib.request
|
|
16
|
+
import urllib.error
|
|
17
|
+
from typing import Optional, List, Tuple
|
|
18
|
+
from html.parser import HTMLParser
|
|
19
|
+
from html import unescape
|
|
20
|
+
import logging
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class HTMLToMarkdownConverter(HTMLParser):
    """Convert HTML to Markdown format.

    Usage: call ``feed(html)`` (followed by ``close()`` to flush any text the
    parser is still buffering) and read the result with ``get_markdown()``.

    The converter is a streaming, best-effort translator: it emits Markdown
    fragments into ``self.output`` as tags and text arrive, and
    ``get_markdown()`` joins and tidies them.  Content inside ``SKIP_TAGS`` is
    dropped entirely.
    """

    # Tags to completely skip (including content)
    SKIP_TAGS = {'script', 'style', 'noscript', 'iframe', 'svg', 'canvas',
                 'nav', 'header', 'footer', 'aside', 'form', 'button'}

    # Block-level tags that need newlines
    BLOCK_TAGS = {'p', 'div', 'section', 'article', 'main', 'h1', 'h2', 'h3',
                  'h4', 'h5', 'h6', 'blockquote', 'pre', 'li', 'tr', 'td', 'th'}

    # Void elements never receive an end tag, so they must not be pushed on
    # the open-tag stack (they would never be popped again).
    VOID_TAGS = {'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
                 'link', 'meta', 'source', 'track', 'wbr'}

    # Inline emphasis tags mapped to their Markdown marker
    EMPHASIS_MARKERS = {'strong': '**', 'b': '**', 'em': '*', 'i': '*'}

    # Indent per nesting level; four spaces nests under both "- " and "1. "
    LIST_INDENT = '    '

    def __init__(self):
        super().__init__()
        self.output: List[str] = []      # Markdown fragments, joined by get_markdown()
        self.tag_stack: List[str] = []   # currently open (non-void, non-skipped) tags
        self.skip_depth = 0              # > 0 while inside any SKIP_TAGS element
        self.list_depth = 0              # nesting level of <ul>/<ol>
        self.in_pre = False
        self.in_code = False
        self.current_link_url = ""
        self.current_link_text = ""
        self.in_link = False
        # True right after an opening emphasis marker: the next text chunk
        # has its leading spaces removed so the marker hugs the text.
        self._strip_next_text = False

    def _enclosing_list(self) -> str:
        """Return 'ul' or 'ol' for the innermost open list ('ul' if none)."""
        for open_tag in reversed(self.tag_stack):
            if open_tag in ('ul', 'ol'):
                return open_tag
        return 'ul'

    def _close_emphasis(self, marker: str) -> None:
        """Append a closing emphasis marker, moving trailing padding outside it.

        ``bold </b>`` becomes ``bold** `` rather than ``bold **`` so that the
        emphasis renders while the word separation is kept.
        """
        suffix = ''
        if self.output and not self.in_pre:
            last = self.output[-1]
            trimmed = last.rstrip(' ')
            if trimmed and trimmed != last:
                self.output[-1] = trimmed
                suffix = ' '
        self.output.append(marker + suffix)

    def handle_starttag(self, tag, attrs):
        tag = tag.lower()

        # Track skip depth for nested skip tags
        if tag in self.SKIP_TAGS:
            self.skip_depth += 1
            return

        if self.skip_depth > 0:
            return

        if tag not in self.VOID_TAGS:
            self.tag_stack.append(tag)
        attrs_dict = dict(attrs)

        # Headings
        if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
            level = int(tag[1])
            self.output.append('\n\n' + '#' * level + ' ')

        # Paragraphs and divs
        elif tag in ('p', 'div', 'section', 'article', 'main'):
            self.output.append('\n\n')

        # Line break
        elif tag == 'br':
            self.output.append('\n')

        # Horizontal rule
        elif tag == 'hr':
            self.output.append('\n\n---\n\n')

        # Bold / italic
        elif tag in self.EMPHASIS_MARKERS:
            self.output.append(self.EMPHASIS_MARKERS[tag])
            self._strip_next_text = True

        # Code
        elif tag == 'code':
            # Inside <pre> the fence already marks the code block
            if not self.in_pre:
                self.output.append('`')
            self.in_code = True

        # Preformatted
        elif tag == 'pre':
            self.output.append('\n\n```\n')
            self.in_pre = True

        # Links (in-page anchors and javascript: pseudo-links are emitted as plain text)
        elif tag == 'a':
            href = attrs_dict.get('href', '')
            if href and not href.startswith('#') and not href.startswith('javascript:'):
                self.current_link_url = href
                self.current_link_text = ""
                self.in_link = True
                self.output.append('[')

        # Images
        elif tag == 'img':
            src = attrs_dict.get('src', '')
            # A valueless alt attribute parses as None; treat it as empty
            alt = attrs_dict.get('alt', 'image') or ''
            if src:
                self.output.append(f'\n![{alt}]({src})\n')

        # Unordered / ordered list
        elif tag in ('ul', 'ol'):
            self.list_depth += 1
            self.output.append('\n')

        # List item
        elif tag == 'li':
            indent = self.LIST_INDENT * max(0, self.list_depth - 1)
            bullet = '1. ' if self._enclosing_list() == 'ol' else '- '
            self.output.append(f'\n{indent}{bullet}')

        # Blockquote
        elif tag == 'blockquote':
            self.output.append('\n\n> ')

        # Table elements
        elif tag == 'table':
            self.output.append('\n\n')
        elif tag == 'tr':
            self.output.append('\n')
        elif tag in ('td', 'th'):
            self.output.append(' | ')

    def handle_endtag(self, tag):
        tag = tag.lower()

        if tag in self.SKIP_TAGS:
            self.skip_depth = max(0, self.skip_depth - 1)
            return

        if self.skip_depth > 0:
            return

        # Pop through unclosed children (e.g. <li> or <p> without an end tag)
        # so the stack keeps reflecting the real nesting.
        if tag in self.tag_stack:
            while self.tag_stack.pop() != tag:
                pass

        # Bold / italic
        if tag in self.EMPHASIS_MARKERS:
            self._close_emphasis(self.EMPHASIS_MARKERS[tag])

        # Code
        elif tag == 'code':
            if not self.in_pre:
                self.output.append('`')
            self.in_code = False

        # Preformatted
        elif tag == 'pre':
            self.output.append('\n```\n\n')
            self.in_pre = False

        # Links
        elif tag == 'a' and self.in_link:
            self.output.append(f']({self.current_link_url})')
            self.in_link = False
            self.current_link_url = ""

        # Lists
        elif tag in ('ul', 'ol'):
            self.list_depth = max(0, self.list_depth - 1)
            if self.list_depth == 0:
                self.output.append('\n')

        # Block elements
        elif tag in self.BLOCK_TAGS:
            self.output.append('\n')

    def handle_data(self, data):
        if self.skip_depth > 0:
            return

        # Preserve whitespace in pre/code blocks
        if self.in_pre:
            self._strip_next_text = False
            self.output.append(data)
            return

        # Normalize whitespace
        text = re.sub(r'\s+', ' ', data)
        if not text.strip():
            return

        # Drop leading spaces right after an opening emphasis marker, or when
        # the previous fragment already ends in a space (avoids "a  b").
        if self._strip_next_text or (self.output and self.output[-1].endswith(' ')):
            text = text.lstrip(' ')
        self._strip_next_text = False
        self.output.append(text)

    def handle_entityref(self, name):
        if self.skip_depth > 0:
            return
        char = unescape(f'&{name};')
        self.output.append(char)

    def handle_charref(self, name):
        if self.skip_depth > 0:
            return
        char = unescape(f'&#{name};')
        self.output.append(char)

    def get_markdown(self) -> str:
        """Get the converted markdown text.

        Joins the emitted fragments, collapses runs of three or more newlines
        to a single blank line, removes empty list items, and strips leading
        and trailing whitespace.  Spacing around emphasis markers is handled
        while emitting, so no asterisk rewriting happens here (which would
        also corrupt literal asterisks, e.g. inside code blocks).
        """
        text = ''.join(self.output)

        # Clean up excessive newlines
        text = re.sub(r'\n{3,}', '\n\n', text)

        # Clean up empty list items
        text = re.sub(r'\n- \n', '\n', text)
        text = re.sub(r'\n1\. \n', '\n', text)

        return text.strip()
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
class URLContentReader:
    """Fetch URLs and convert content to Markdown."""

    USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"

    # Matches any opening or closing <div> tag; group 1 is "/" for a closing tag
    _DIV_TAG = re.compile(r'<(/?)div\b[^>]*>', re.IGNORECASE)

    def __init__(self, logger=None):
        self.logger = logger or logging.getLogger(__name__)

    def fetch_url(self, url: str, timeout: int = 30) -> str:
        """
        Fetch content from a URL.

        Args:
            url: URL to fetch. Surrounding whitespace is ignored; when no
                http:// or https:// scheme is present (case-insensitive),
                https:// is prepended.
            timeout: Request timeout in seconds

        Returns:
            HTML content as string, decoded with the charset announced by the
            response (UTF-8 when absent or unknown); undecodable bytes are
            replaced.

        Raises:
            Exception: On any failure, with a message prefixed by
                "HTTP Error", "URL Error" or "Fetch error". The original
                exception is attached as ``__cause__``.
        """
        # Validate URL
        url = url.strip()
        if not url.lower().startswith(('http://', 'https://')):
            url = 'https://' + url

        try:
            req = urllib.request.Request(url)
            req.add_header('User-Agent', self.USER_AGENT)
            req.add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
            req.add_header('Accept-Language', 'en-US,en;q=0.5')

            with urllib.request.urlopen(req, timeout=timeout) as response:
                # Detect encoding
                charset = response.headers.get_content_charset() or 'utf-8'
                content = response.read()
                try:
                    return content.decode(charset, errors='replace')
                except (UnicodeDecodeError, LookupError):
                    # LookupError: the server announced a codec Python does not know
                    return content.decode('utf-8', errors='replace')

        # HTTPError is a subclass of URLError, so it must be handled first
        except urllib.error.HTTPError as e:
            raise Exception(f"HTTP Error {e.code}: {e.reason}") from e
        except urllib.error.URLError as e:
            raise Exception(f"URL Error: {e.reason}") from e
        except Exception as e:
            raise Exception(f"Fetch error: {str(e)}") from e

    def html_to_markdown(self, html: str, extract_main_content: bool = True) -> str:
        """
        Convert HTML to Markdown.

        Args:
            html: HTML content
            extract_main_content: If True, try to extract main content area

        Returns:
            Markdown formatted text. If the HTML parser raises, the error is
            logged and a plain-text extraction is returned instead.
        """
        if extract_main_content:
            html = self._extract_main_content(html)

        converter = HTMLToMarkdownConverter()
        try:
            converter.feed(html)
            # close() flushes text the parser may still be holding back
            converter.close()
            return converter.get_markdown()
        except Exception as e:
            self.logger.error(f"HTML parsing error: {e}")
            # Fallback: simple text extraction
            return self._simple_text_extraction(html)

    def _balanced_div_body(self, html: str, start: int) -> Optional[str]:
        """Return the markup between `start` and the </div> that closes it.

        `start` is the offset just past an opening <div ...> tag. Nested
        <div> elements are counted so the result ends at the matching close
        tag rather than the first one. Returns None when the div is never
        closed.
        """
        depth = 1
        for tag in self._DIV_TAG.finditer(html, start):
            depth += -1 if tag.group(1) else 1
            if depth == 0:
                return html[start:tag.start()]
        return None

    def _extract_main_content(self, html: str) -> str:
        """Try to extract main content area from HTML.

        Candidates are tried in order: <main>, <article>, a <div> whose class
        contains "content", a <div> whose id contains "content", a <div> whose
        class contains "main", and finally <body>. The inner markup of the
        first candidate found is returned; the input is returned unchanged
        when none is found.
        """
        flags = re.IGNORECASE | re.DOTALL

        for pattern in (r'<main[^>]*>(.*?)</main>',
                        r'<article[^>]*>(.*?)</article>'):
            match = re.search(pattern, html, flags)
            if match:
                return match.group(1)

        # Content <div>s almost always contain nested <div>s, so the closing
        # tag is found by depth counting instead of a non-greedy regex.
        div_openers = (
            r'<div[^>]*class="[^"]*content[^"]*"[^>]*>',
            r'<div[^>]*id="[^"]*content[^"]*"[^>]*>',
            r'<div[^>]*class="[^"]*main[^"]*"[^>]*>',
        )
        for pattern in div_openers:
            opener = re.search(pattern, html, re.IGNORECASE)
            if opener:
                inner = self._balanced_div_body(html, opener.end())
                if inner is not None:
                    return inner

        match = re.search(r'<body[^>]*>(.*?)</body>', html, flags)
        if match:
            return match.group(1)

        return html

    def _simple_text_extraction(self, html: str) -> str:
        """Simple fallback text extraction.

        Removes <script>/<style> elements, replaces remaining tags with
        spaces, decodes entities and collapses whitespace.
        """
        # Remove script and style
        text = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.IGNORECASE | re.DOTALL)
        text = re.sub(r'<style[^>]*>.*?</style>', '', text, flags=re.IGNORECASE | re.DOTALL)

        # Remove HTML tags
        text = re.sub(r'<[^>]+>', ' ', text)

        # Decode entities
        text = unescape(text)

        # Normalize whitespace
        text = re.sub(r'\s+', ' ', text)

        return text.strip()

    def fetch_and_convert(self, url: str, timeout: int = 30,
                          extract_main_content: bool = True) -> str:
        """
        Fetch URL and convert to Markdown in one step.

        Args:
            url: URL to fetch
            timeout: Request timeout in seconds
            extract_main_content: If True, extract main content only

        Returns:
            Markdown formatted content

        Raises:
            Exception: Propagated from fetch_url when the download fails.
        """
        html = self.fetch_url(url, timeout)
        return self.html_to_markdown(html, extract_main_content)
|
|
358
|
+
|
|
359
|
+
|
|
360
|
+
# CLI support
|
|
361
|
+
def main():
    """CLI entry point for URL content reading.

    Parses ``sys.argv``, fetches the given URL and writes the Markdown either
    to the ``--output`` file or to stdout.

    Returns:
        0 on success, 1 when fetching/converting/writing fails, and None when
        no URL was given (the help text is printed instead).
    """
    import argparse
    import sys

    parser = argparse.ArgumentParser(description="Fetch URLs and convert to Markdown")
    parser.add_argument("url", nargs="?", help="URL to fetch")
    parser.add_argument("--timeout", "-t", type=int, default=30, help="Timeout in seconds")
    parser.add_argument("--no-extract", action="store_true",
                        help="Don't try to extract main content")
    parser.add_argument("--output", "-o", type=str, help="Output file path")

    args = parser.parse_args()

    if not args.url:
        parser.print_help()
        return

    reader = URLContentReader()

    try:
        markdown = reader.fetch_and_convert(
            args.url,
            timeout=args.timeout,
            extract_main_content=not args.no_extract
        )

        if args.output:
            with open(args.output, 'w', encoding='utf-8') as f:
                f.write(markdown)
            print(f"Saved to: {args.output}")
        else:
            print(markdown)

    except Exception as e:
        # Diagnostics go to stderr so piped/redirected Markdown stays clean
        print(f"Error: {e}", file=sys.stderr)
        return 1

    return 0
|
|
399
|
+
|
|
400
|
+
|
|
401
|
+
if __name__ == "__main__":
    # Propagate main()'s status as the process exit code (None counts as 0).
    # SystemExit is raised directly because the exit() builtin is only
    # injected by the site module and is missing under `python -S` or in
    # frozen applications.
    raise SystemExit(main() or 0)
|