pomera-ai-commander 1.2.7 → 1.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,400 @@
+ """
+ URL Content Reader Module for Pomera AI Commander
+
+ Fetches web content and converts HTML to Markdown.
+ Features:
+ - HTTP/HTTPS URL fetching
+ - Main content extraction (skips nav, header, footer)
+ - HTML to Markdown conversion
+ - Proper error handling and timeout support
+
+ Author: Pomera AI Commander
+ """
+
+ import re
+ import urllib.request
+ import urllib.error
+ from typing import List
+ from html.parser import HTMLParser
+ from html import unescape
+ import logging
+
+
+ class HTMLToMarkdownConverter(HTMLParser):
+     """Convert HTML to Markdown format."""
+
+     # Tags to completely skip (including content)
+     SKIP_TAGS = {'script', 'style', 'noscript', 'iframe', 'svg', 'canvas',
+                  'nav', 'header', 'footer', 'aside', 'form', 'button'}
+
+     # Block-level tags that need newlines
+     BLOCK_TAGS = {'p', 'div', 'section', 'article', 'main', 'h1', 'h2', 'h3',
+                   'h4', 'h5', 'h6', 'blockquote', 'pre', 'li', 'tr', 'td', 'th'}
+
+     def __init__(self):
+         super().__init__()
+         self.output: List[str] = []
+         self.tag_stack: List[str] = []
+         self.skip_depth = 0
+         self.list_depth = 0
+         self.in_pre = False
+         self.in_code = False
+         self.current_link_url = ""
+         self.in_link = False
+
+     def handle_starttag(self, tag, attrs):
+         tag = tag.lower()
+
+         # Track skip depth for nested skip tags
+         if tag in self.SKIP_TAGS:
+             self.skip_depth += 1
+             return
+
+         if self.skip_depth > 0:
+             return
+
+         self.tag_stack.append(tag)
+         attrs_dict = dict(attrs)
+
+         # Headings
+         if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
+             level = int(tag[1])
+             self.output.append('\n\n' + '#' * level + ' ')
+
+         # Paragraphs and divs
+         elif tag in ('p', 'div', 'section', 'article', 'main'):
+             self.output.append('\n\n')
+
+         # Line break
+         elif tag == 'br':
+             self.output.append('\n')
+
+         # Horizontal rule
+         elif tag == 'hr':
+             self.output.append('\n\n---\n\n')
+
+         # Bold
+         elif tag in ('strong', 'b'):
+             self.output.append('**')
+
+         # Italic
+         elif tag in ('em', 'i'):
+             self.output.append('*')
+
+         # Code
+         elif tag == 'code':
+             if not self.in_pre:
+                 self.output.append('`')
+                 self.in_code = True
+
+         # Preformatted
+         elif tag == 'pre':
+             self.output.append('\n\n```\n')
+             self.in_pre = True
+
+         # Links
+         elif tag == 'a':
+             href = attrs_dict.get('href', '')
+             if href and not href.startswith('#') and not href.startswith('javascript:'):
+                 self.current_link_url = href
+                 self.in_link = True
+                 self.output.append('[')
+
+         # Images
+         elif tag == 'img':
+             src = attrs_dict.get('src', '')
+             alt = attrs_dict.get('alt', 'image')
+             if src:
+                 self.output.append(f'\n![{alt}]({src})\n')
+
+         # Unordered list
+         elif tag == 'ul':
+             self.list_depth += 1
+             self.output.append('\n')
+
+         # Ordered list
+         elif tag == 'ol':
+             self.list_depth += 1
+             self.output.append('\n')
+
+         # List item
+         elif tag == 'li':
+             indent = ' ' * (self.list_depth - 1)
+             parent = self.tag_stack[-2] if len(self.tag_stack) > 1 else 'ul'
+             if parent == 'ol':
+                 self.output.append(f'\n{indent}1. ')
+             else:
+                 self.output.append(f'\n{indent}- ')
+
+         # Blockquote
+         elif tag == 'blockquote':
+             self.output.append('\n\n> ')
+
+         # Table elements
+         elif tag == 'table':
+             self.output.append('\n\n')
+         elif tag == 'tr':
+             self.output.append('\n')
+         elif tag in ('td', 'th'):
+             self.output.append(' | ')
+
+     def handle_endtag(self, tag):
+         tag = tag.lower()
+
+         if tag in self.SKIP_TAGS:
+             self.skip_depth = max(0, self.skip_depth - 1)
+             return
+
+         if self.skip_depth > 0:
+             return
+
+         if self.tag_stack and self.tag_stack[-1] == tag:
+             self.tag_stack.pop()
+
+         # Bold
+         if tag in ('strong', 'b'):
+             self.output.append('**')
+
+         # Italic
+         elif tag in ('em', 'i'):
+             self.output.append('*')
+
+         # Code
+         elif tag == 'code':
+             if not self.in_pre:
+                 self.output.append('`')
+                 self.in_code = False
+
+         # Preformatted
+         elif tag == 'pre':
+             self.output.append('\n```\n\n')
+             self.in_pre = False
+
+         # Links
+         elif tag == 'a' and self.in_link:
+             self.output.append(f']({self.current_link_url})')
+             self.in_link = False
+             self.current_link_url = ""
+
+         # Lists
+         elif tag in ('ul', 'ol'):
+             self.list_depth = max(0, self.list_depth - 1)
+             if self.list_depth == 0:
+                 self.output.append('\n')
+
+         # Block elements
+         elif tag in self.BLOCK_TAGS:
+             self.output.append('\n')
+
+     def handle_data(self, data):
+         if self.skip_depth > 0:
+             return
+
+         # Preserve whitespace in pre/code blocks
+         if self.in_pre:
+             self.output.append(data)
+         else:
+             # Normalize whitespace
+             text = re.sub(r'\s+', ' ', data)
+             if text.strip():
+                 self.output.append(text)
+
+     # Note: with HTMLParser's default convert_charrefs=True, the two handlers
+     # below never fire; references reach handle_data already decoded.
+     def handle_entityref(self, name):
+         if self.skip_depth > 0:
+             return
+         char = unescape(f'&{name};')
+         self.output.append(char)
+
+     def handle_charref(self, name):
+         if self.skip_depth > 0:
+             return
+         char = unescape(f'&#{name};')
+         self.output.append(char)
+
+     def get_markdown(self) -> str:
+         """Get the converted markdown text."""
+         text = ''.join(self.output)
+
+         # Clean up excessive newlines
+         text = re.sub(r'\n{3,}', '\n\n', text)
+
+         # Trim stray spaces just inside paired emphasis markers
+         # (e.g. '** bold **' -> '**bold**') without eating word spacing
+         text = re.sub(r'\*\*[ \t]*([^*\n]+?)[ \t]*\*\*', r'**\1**', text)
+         text = re.sub(r'(?<!\*)\*[ \t]*([^*\n]+?)[ \t]*\*(?!\*)', r'*\1*', text)
+
+         # Clean up empty list items
+         text = re.sub(r'\n- \n', '\n', text)
+         text = re.sub(r'\n1\. \n', '\n', text)
+
+         return text.strip()
+
+
+ class URLContentReader:
+     """Fetch URLs and convert content to Markdown."""
+
+     USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
+
+     def __init__(self, logger=None):
+         self.logger = logger or logging.getLogger(__name__)
+
+     def fetch_url(self, url: str, timeout: int = 30) -> str:
+         """
+         Fetch content from a URL.
+
+         Args:
+             url: URL to fetch
+             timeout: Request timeout in seconds
+
+         Returns:
+             HTML content as string
+         """
+         # Default to HTTPS when no scheme is given
+         if not url.startswith(('http://', 'https://')):
+             url = 'https://' + url
+
+         try:
+             req = urllib.request.Request(url)
+             req.add_header('User-Agent', self.USER_AGENT)
+             req.add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
+             req.add_header('Accept-Language', 'en-US,en;q=0.5')
+
+             with urllib.request.urlopen(req, timeout=timeout) as response:
+                 # Detect encoding
+                 charset = response.headers.get_content_charset()
+                 if not charset:
+                     charset = 'utf-8'
+
+                 content = response.read()
+                 try:
+                     return content.decode(charset, errors='replace')
+                 except (UnicodeDecodeError, LookupError):
+                     return content.decode('utf-8', errors='replace')
+
+         except urllib.error.HTTPError as e:
+             raise Exception(f"HTTP Error {e.code}: {e.reason}") from e
+         except urllib.error.URLError as e:
+             raise Exception(f"URL Error: {e.reason}") from e
+         except Exception as e:
+             raise Exception(f"Fetch error: {str(e)}") from e
+
+     def html_to_markdown(self, html: str, extract_main_content: bool = True) -> str:
+         """
+         Convert HTML to Markdown.
+
+         Args:
+             html: HTML content
+             extract_main_content: If True, try to extract main content area
+
+         Returns:
+             Markdown formatted text
+         """
+         if extract_main_content:
+             html = self._extract_main_content(html)
+
+         converter = HTMLToMarkdownConverter()
+         try:
+             converter.feed(html)
+             return converter.get_markdown()
+         except Exception as e:
+             self.logger.error(f"HTML parsing error: {e}")
+             # Fallback: simple text extraction
+             return self._simple_text_extraction(html)
+
+     def _extract_main_content(self, html: str) -> str:
+         """Try to extract main content area from HTML."""
+         # Try common main-content containers (non-greedy, so nested divs may truncate)
+         patterns = [
+             r'<main[^>]*>(.*?)</main>',
+             r'<article[^>]*>(.*?)</article>',
+             r'<div[^>]*class="[^"]*content[^"]*"[^>]*>(.*?)</div>',
+             r'<div[^>]*id="[^"]*content[^"]*"[^>]*>(.*?)</div>',
+             r'<div[^>]*class="[^"]*main[^"]*"[^>]*>(.*?)</div>',
+             r'<body[^>]*>(.*?)</body>',
+         ]
+
+         for pattern in patterns:
+             match = re.search(pattern, html, re.IGNORECASE | re.DOTALL)
+             if match:
+                 return match.group(1)
+
+         return html
+
+     def _simple_text_extraction(self, html: str) -> str:
+         """Simple fallback text extraction."""
+         # Remove script and style
+         text = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.IGNORECASE | re.DOTALL)
+         text = re.sub(r'<style[^>]*>.*?</style>', '', text, flags=re.IGNORECASE | re.DOTALL)
+
+         # Remove HTML tags
+         text = re.sub(r'<[^>]+>', ' ', text)
+
+         # Decode entities
+         text = unescape(text)
+
+         # Normalize whitespace
+         text = re.sub(r'\s+', ' ', text)
+
+         return text.strip()
+
+     def fetch_and_convert(self, url: str, timeout: int = 30,
+                           extract_main_content: bool = True) -> str:
+         """
+         Fetch URL and convert to Markdown in one step.
+
+         Args:
+             url: URL to fetch
+             timeout: Request timeout in seconds
+             extract_main_content: If True, extract main content only
+
+         Returns:
+             Markdown formatted content
+         """
+         html = self.fetch_url(url, timeout)
+         return self.html_to_markdown(html, extract_main_content)
+
+
+ # CLI support
+ def main():
+     """CLI entry point for URL content reading."""
+     import argparse
+
+     parser = argparse.ArgumentParser(description="Fetch URLs and convert to Markdown")
+     parser.add_argument("url", nargs="?", help="URL to fetch")
+     parser.add_argument("--timeout", "-t", type=int, default=30, help="Timeout in seconds")
+     parser.add_argument("--no-extract", action="store_true",
+                         help="Don't try to extract main content")
+     parser.add_argument("--output", "-o", type=str, help="Output file path")
+
+     args = parser.parse_args()
+
+     if not args.url:
+         parser.print_help()
+         return 0
+
+     reader = URLContentReader()
+
+     try:
+         markdown = reader.fetch_and_convert(
+             args.url,
+             timeout=args.timeout,
+             extract_main_content=not args.no_extract
+         )
+
+         if args.output:
+             with open(args.output, 'w', encoding='utf-8') as f:
+                 f.write(markdown)
+             print(f"Saved to: {args.output}")
+         else:
+             print(markdown)
+
+     except Exception as e:
+         print(f"Error: {e}")
+         return 1
+
+     return 0
+
+
+ if __name__ == "__main__":
+     raise SystemExit(main())
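For orientation, a minimal usage sketch of the module added above. It is illustrative only: the import path `url_content_reader` is an assumption, since the diff does not show the file's path within the package.

```python
# Minimal usage sketch -- the module name "url_content_reader" is an
# assumption; the diff does not name the file.
from url_content_reader import URLContentReader

reader = URLContentReader()

# One-step fetch and convert; network failures surface as a generic Exception.
markdown = reader.fetch_and_convert("https://example.com", timeout=10)
print(markdown)

# Convert HTML you already have, bypassing the main-content heuristics.
html = "<h1>Title</h1><p>Some <strong>bold</strong> text.</p>"
print(reader.html_to_markdown(html, extract_main_content=False))
# Prints:
# # Title
#
# Some **bold** text.
```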
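The file also doubles as a small command-line tool; assuming the same (hypothetical) file name, it can be invoked directly:

```
python url_content_reader.py https://example.com --timeout 15 --output page.md
python url_content_reader.py https://example.com --no-extract
```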