webscout-6.8-py3-none-any.whl → webscout-7.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of webscout might be problematic.

Files changed (45)
  1. webscout/AIbase.py +12 -2
  2. webscout/DWEBS.py +38 -22
  3. webscout/Extra/YTToolkit/YTdownloader.py +7 -2
  4. webscout/Extra/YTToolkit/ytapi/channel.py +1 -1
  5. webscout/Extra/YTToolkit/ytapi/query.py +3 -0
  6. webscout/Extra/YTToolkit/ytapi/stream.py +3 -0
  7. webscout/Extra/YTToolkit/ytapi/video.py +3 -1
  8. webscout/Extra/autocoder/autocoder_utiles.py +68 -7
  9. webscout/Extra/autollama.py +0 -16
  10. webscout/Extra/gguf.py +0 -13
  11. webscout/Provider/AISEARCH/DeepFind.py +251 -0
  12. webscout/Provider/AISEARCH/__init__.py +2 -2
  13. webscout/Provider/AISEARCH/felo_search.py +167 -118
  14. webscout/Provider/Blackboxai.py +1 -1
  15. webscout/Provider/Glider.py +207 -0
  16. webscout/Provider/HF_space/__init__.py +0 -0
  17. webscout/Provider/HF_space/qwen_qwen2.py +206 -0
  18. webscout/Provider/TextPollinationsAI.py +201 -0
  19. webscout/Provider/Youchat.py +28 -22
  20. webscout/Provider/__init__.py +11 -3
  21. webscout/Provider/askmyai.py +2 -2
  22. webscout/Provider/cerebras.py +3 -3
  23. webscout/Provider/chatglm.py +205 -0
  24. webscout/Provider/dgaf.py +186 -0
  25. webscout/Provider/hermes.py +219 -0
  26. webscout/Provider/llmchat.py +1 -0
  27. webscout/__init__.py +0 -1
  28. webscout/litagent/__init__.py +3 -146
  29. webscout/litagent/agent.py +120 -0
  30. webscout/litagent/constants.py +31 -0
  31. webscout/swiftcli/__init__.py +1 -0
  32. webscout/tempid.py +0 -4
  33. webscout/version.py +1 -1
  34. webscout/webscout_search.py +1140 -1104
  35. webscout/webscout_search_async.py +635 -361
  36. {webscout-6.8.dist-info → webscout-7.0.dist-info}/METADATA +23 -39
  37. {webscout-6.8.dist-info → webscout-7.0.dist-info}/RECORD +41 -35
  38. {webscout-6.8.dist-info → webscout-7.0.dist-info}/WHEEL +1 -1
  39. webscout/Extra/markdownlite/__init__.py +0 -862
  40. webscout/Provider/AISEARCH/ooai.py +0 -155
  41. webscout/Provider/Deepseek.py +0 -227
  42. webscout/zerodir/__init__.py +0 -225
  43. {webscout-6.8.dist-info → webscout-7.0.dist-info}/LICENSE.md +0 -0
  44. {webscout-6.8.dist-info → webscout-7.0.dist-info}/entry_points.txt +0 -0
  45. {webscout-6.8.dist-info → webscout-7.0.dist-info}/top_level.txt +0 -0
webscout/Extra/markdownlite/__init__.py (deleted)
@@ -1,862 +0,0 @@
- import sys
- import os
- from webscout.Litlogger import LitLogger, LogFormat, ColorScheme
- from webscout.scout import Scout, Tag
- from textwrap import fill
- import re
- import six
- import html
- import json
- from typing import Union, Dict, Any, Optional, List
- import functools
-
- # Initialize Litlogger
- logger = LitLogger(
-     name="MarkdownLite",
-     format=LogFormat.DETAILED,
-     color_scheme=ColorScheme.OCEAN
- )
-
- # Decorator for error handling and logging
- def markdown_conversion_error_handler(func):
-     @functools.wraps(func)
-     def wrapper(*args, **kwargs):
-         try:
-             return func(*args, **kwargs)
-         except Exception as e:
-             logger.error(f"Markdown conversion error: {e}", exc_info=True)
-             raise
-     return wrapper
-
- # Constants and configuration
- MARKDOWN_CONVERSION_OPTIONS = {
-     'SEMANTIC_CONVERSION': True,
-     'PRESERVE_METADATA': True,
-     'SMART_LISTS': True,
-     'LINK_REWRITING': None,
-     'CUSTOM_ANALYZERS': [],
-     'STRUCTURED_OUTPUT': True,
-     'DEBUG_MODE': False
- }
-
- # Existing utility functions
- def chomp(text):
-     """
-     Strip leading/trailing spaces while preserving prefix/suffix spaces.
-
-     Args:
-         text (str): Input text to process
-
-     Returns:
-         tuple: (prefix, suffix, stripped_text)
-     """
-     prefix = ' ' if text and text[0] == ' ' else ''
-     suffix = ' ' if text and text[-1] == ' ' else ''
-     text = text.strip()
-     return (prefix, suffix, text)
-
- def abstract_inline_conversion(markup_fn):
-     """
-     Abstract inline tag conversion with enhanced flexibility.
-
-     Args:
-         markup_fn (callable): Function to generate markup
-
-     Returns:
-         callable: Conversion implementation
-     """
-     def implementation(self, el, text, convert_as_inline):
-         markup_prefix = markup_fn(self)
-         if markup_prefix.startswith('<') and markup_prefix.endswith('>'):
-             markup_suffix = '</' + markup_prefix[1:]
-         else:
-             markup_suffix = markup_prefix
-
-         if el.find_parent(['pre', 'code', 'kbd', 'samp']):
-             return text
-
-         prefix, suffix, text = chomp(text)
-         if not text:
-             return ''
-
-         return f'{prefix}{markup_prefix}{text}{markup_suffix}{suffix}'
-     return implementation
-
- class MarkdownConverter(object):
-     class DefaultOptions:
-         autolinks = True
-         bullets = '*+-'  # An iterable of bullet types.
-         code_language = ''
-         code_language_callback = None
-         convert = None
-         default_title = False
-         escape_asterisks = True
-         escape_underscores = True
-         escape_misc = False
-         heading_style = 'underlined'
-         keep_inline_images_in = []
-         newline_style = 'spaces'
-         strip = None
-         strong_em_symbol = '*'
-         sub_symbol = ''
-         sup_symbol = ''
-         wrap = False
-         wrap_width = 80
-
-         # New options for Scout integration
-         semantic_conversion = False  # Enable semantic-aware conversion
-         preserve_metadata = False  # Keep HTML metadata in markdown
-         smart_lists = True  # Smart list handling
-         link_rewriting = None  # Function for rewriting URLs
-         custom_analyzers = []  # List of custom text analyzers
-         structured_output = False  # Return structured output with metadata
-
-         # Existing options
-         debug_mode = False
-         handle_unknown_tags = 'ignore'  # 'ignore', 'warn', 'error'
-         preserve_html_comments = False
-         max_depth = 100  # Prevent potential infinite recursion
-         custom_tag_handlers = {}  # Allow custom tag conversion functions
-
-     class Options(DefaultOptions):
-         pass
-
-     # Inline conversion methods
-     convert_b = abstract_inline_conversion(lambda self: 2 * self.options['strong_em_symbol'])
-     convert_del = abstract_inline_conversion(lambda self: '~~')
-     convert_em = abstract_inline_conversion(lambda self: self.options['strong_em_symbol'])
-     convert_i = convert_em
-     convert_s = convert_del
-     convert_strong = convert_b
-     convert_sub = abstract_inline_conversion(lambda self: self.options['sub_symbol'])
-     convert_sup = abstract_inline_conversion(lambda self: self.options['sup_symbol'])
-
-     def __init__(self, **options):
-         # Merge default and user-provided options
-         default_options = {
-             'SEMANTIC_CONVERSION': True,
-             'PRESERVE_METADATA': True,
-             'SMART_LISTS': True,
-             'LINK_REWRITING': None,
-             'CUSTOM_ANALYZERS': [],
-             'STRUCTURED_OUTPUT': True,
-             'DEBUG_MODE': False,
-             # Add max_depth for conversion
-             'max_depth': 10,
-             # Inherit existing default options
-             'strip': None,
-             'convert': None,
-             'heading_style': 'underlined',
-             'newline_style': 'spaces',
-             'strong_em_symbol': '*',
-             'escape_asterisks': True,
-             'escape_underscores': True,
-             'escape_misc': False,
-             'keep_inline_images_in': [],
-             'sub_symbol': '',
-             'sup_symbol': '',
-             'wrap': False
-         }
-
-         # Update with user options
-         default_options.update(options)
-         self.options = default_options
-
-         # Setup logging based on debug mode
-         if self.options['DEBUG_MODE']:
-             logger.setLevel(logging.DEBUG)
-
-         # Initialize metadata and structure
-         self._metadata = {}
-         self._structure = {}
-         self._semantic_info = {}
-
-     @markdown_conversion_error_handler
-     def convert(self, html):
-         """
-         Enhanced conversion with metadata and structure analysis.
-
-         Args:
-             html (str): HTML content to convert
-
-         Returns:
-             Union[str, Dict[str, Any]]: Markdown text or structured output
-         """
-         # Handle different Scout result types
-         scout = html if hasattr(html, '_soup') or hasattr(html, 'name') else Scout(html, features='html.parser')
-
-         # If scout is a search result, get the first result or the original scout
-         if hasattr(scout, '_results') and scout._results:
-             scout = scout._results[0]
-
-         # Ensure we have a valid Scout object or Tag
-         if not hasattr(scout, '_soup') and not hasattr(scout, 'name'):
-             raise ValueError("Unable to convert input to a valid Scout object")
-
-         logger.debug(f"Parsing HTML: {str(scout)[:100]}...")
-
-         # Extract additional information if needed
-         if self.options['PRESERVE_METADATA']:
-             self._metadata = self._extract_metadata(scout)
-
-         if self.options['SEMANTIC_CONVERSION']:
-             self._structure = self._analyze_structure(scout)
-             self._semantic_info = self._extract_semantic_info(scout)
-
-         # Convert to markdown
-         markdown = self.convert_soup(scout._soup if hasattr(scout, '_soup') else scout)
-
-         # Return structured output if requested
-         if self.options['STRUCTURED_OUTPUT']:
-             return {
-                 'markdown': markdown,
-                 'metadata': self._metadata,
-                 'structure': self._structure,
-                 'semantic_info': self._semantic_info
-             }
-
-         return markdown
-
-     def convert_soup(self, soup):
-         """Convert Scout's internal soup object."""
-         return self.process_tag(soup, convert_as_inline=False, children_only=True)
-
-     def process_tag(self, node, convert_as_inline, children_only=False, depth=0):
-         """Enhanced tag processing with semantic awareness."""
-         if depth > self.options['max_depth']:
-             logger.warning(f"Max recursion depth reached at tag: {node.name}")
-             return ''
-
-         # Check for custom tag handlers
-         if hasattr(node, 'name') and node.name in self.options['custom_tag_handlers']:
-             custom_handler = self.options['custom_tag_handlers'][node.name]
-             return custom_handler(node, convert_as_inline)
-
-         text = ''
-
-         # markdown headings or cells can't include
-         # block elements (elements w/newlines)
-         isHeading = re.match(r'h[1-6]', node.name) if hasattr(node, 'name') else False
-         isCell = hasattr(node, 'name') and node.name in ['td', 'th']
-         convert_children_as_inline = convert_as_inline
-
-         if not children_only and (isHeading or isCell):
-             convert_children_as_inline = True
-
-         # Remove whitespace-only textnodes
-         should_remove_inside = should_remove_whitespace_inside(node)
-
-         # Iterate through children
-         for el in node.children:
-             # Skip script, style, and comment-like elements
-             if hasattr(el, 'name') and el.name in ['script', 'style', 'comment']:
-                 continue
-
-             # Check if element is a text node that can be stripped
-             if (isinstance(el, str) or
-                     (hasattr(el, 'string') and el.string and str(el.string).strip() == '')):
-                 if should_remove_inside and (not el.previous_sibling or not el.next_sibling):
-                     continue
-
-             # Process child elements
-             if isinstance(el, str):
-                 text += el
-             elif hasattr(el, 'name'):
-                 text_strip = text.rstrip('\n')
-                 newlines_left = len(text) - len(text_strip)
-                 next_text = self.process_tag(el, convert_children_as_inline)
-                 next_text_strip = next_text.lstrip('\n')
-                 newlines_right = len(next_text) - len(next_text_strip)
-                 newlines = '\n' * max(newlines_left, newlines_right)
-                 text = text_strip + newlines + next_text_strip
-
-         if not children_only and hasattr(node, 'name'):
-             convert_fn = getattr(self, 'convert_%s' % node.name, None)
-             if convert_fn and self.should_convert_tag(node.name):
-                 text = convert_fn(node, text, convert_as_inline)
-
-         # Apply custom analyzers
-         for analyzer in self.options['custom_analyzers']:
-             text = analyzer(text, node)
-
-         return text
-
-     def _validate_options(self):
-         """Validate and sanitize converter options."""
-         if self.options['max_depth'] < 1:
-             raise ValueError("max_depth must be a positive integer")
-
-         if self.options['handle_unknown_tags'] not in ['ignore', 'warn', 'error']:
-             raise ValueError("handle_unknown_tags must be 'ignore', 'warn', or 'error'")
-
-     def process_text(self, el):
-         text = six.text_type(el) or ''
-
-         # normalize whitespace if we're not inside a preformatted element
-         if not el.find_parent('pre'):
-             if self.options['wrap']:
-                 text = re.sub(r'[\t ]+', ' ', text)
-             else:
-                 text = re.sub(r'[\t \r\n]*[\r\n][\t \r\n]*', '\n', text)
-                 text = re.sub(r'[\t ]+', ' ', text)
-
-         # escape special characters if we're not inside a preformatted or code element
-         if not el.find_parent(['pre', 'code', 'kbd', 'samp']):
-             text = self.escape(text)
-
-         # remove leading whitespace at the start or just after a
-         # block-level element; remove trailing whitespace at the end
-         # or just before a block-level element.
-         if (should_remove_whitespace_outside(el.previous_sibling)
-                 or (should_remove_whitespace_inside(el.parent)
-                     and not el.previous_sibling)):
-             text = text.lstrip()
-         if (should_remove_whitespace_outside(el.next_sibling)
-                 or (should_remove_whitespace_inside(el.parent)
-                     and not el.next_sibling)):
-             text = text.rstrip()
-
-         return text
-
-     def __getattr__(self, attr):
-         # Handle headings
-         m = re.match(r'convert_h(\d+)', attr)
-         if m:
-             n = int(m.group(1))
-
-             def convert_tag(el, text, convert_as_inline):
-                 return self._convert_hn(n, el, text, convert_as_inline)
-
-             convert_tag.__name__ = 'convert_h%s' % n
-             setattr(self, convert_tag.__name__, convert_tag)
-             return convert_tag
-
-         raise AttributeError(attr)
-
-     def should_convert_tag(self, tag):
-         tag = tag.lower()
-         strip = self.options['strip']
-         convert = self.options['convert']
-         if strip is not None:
-             return tag not in strip
-         elif convert is not None:
-             return tag in convert
-         else:
-             return True
-
-     def escape(self, text):
-         if not text:
-             return ''
-         if self.options['escape_misc']:
-             text = re.sub(r'([\\&<`[>~=+|])', r'\\\1', text)
-             # A sequence of one or more consecutive '-', preceded and
-             # followed by whitespace or start/end of fragment, might
-             # be confused with an underline of a header, or with a
-             # list marker.
-             text = re.sub(r'(\s|^)(-+(?:\s|$))', r'\1\\\2', text)
-             # A sequence of up to six consecutive '#', preceded and
-             # followed by whitespace or start/end of fragment, might
-             # be confused with an ATX heading.
-             text = re.sub(r'(\s|^)(#{1,6}(?:\s|$))', r'\1\\\2', text)
-             # '.' or ')' preceded by up to nine digits might be
-             # confused with a list item.
-             text = re.sub(r'((?:\s|^)[0-9]{1,9})([.)](?:\s|$))', r'\1\\\2',
-                           text)
-         if self.options['escape_asterisks']:
-             text = text.replace('*', r'\*')
-         if self.options['escape_underscores']:
-             text = text.replace('_', r'\_')
-         return text
-
-     def indent(self, text, columns):
-         return re.sub(r'^', ' ' * columns, text, flags=re.MULTILINE) if text else ''
-
-     def underline(self, text, pad_char):
-         text = (text or '').rstrip()
-         return '\n\n%s\n%s\n\n' % (text, pad_char * len(text)) if text else ''
-
-     def convert_a(self, el, text, convert_as_inline):
-         """Enhanced link conversion with URL rewriting."""
-         if self.options['link_rewriting'] and callable(self.options['link_rewriting']):
-             href = el.get('href')
-             if href:
-                 href = self.options['link_rewriting'](href)
-                 el['href'] = href
-
-         prefix, suffix, text = chomp(text)
-         if not text:
-             return ''
-         href = el.get('href')
-         title = el.get('title')
-         # For the replacement see #29: text nodes underscores are escaped
-         if (self.options['autolinks']
-                 and text.replace(r'\_', '_') == href
-                 and not title
-                 and not self.options['default_title']):
-             # Shortcut syntax
-             return '<%s>' % href
-         if self.options['default_title'] and not title:
-             title = href
-         title_part = ' "%s"' % title.replace('"', r'\"') if title else ''
-         return '%s[%s](%s%s)%s' % (prefix, text, href, title_part, suffix) if href else text
-
-     def convert_blockquote(self, el, text, convert_as_inline):
-
-         if convert_as_inline:
-             return ' ' + text.strip() + ' '
-
-         return '\n' + (re.sub(r'^', '> ', text.strip(), flags=re.MULTILINE) + '\n\n') if text else ''
-
-     def convert_br(self, el, text, convert_as_inline):
-         if convert_as_inline:
-             return ""
-
-         if self.options['newline_style'].lower() == 'backslash':
-             return '\\\n'
-         else:
-             return ' \n'
-
-     def convert_code(self, el, text, convert_as_inline):
-         if el.parent.name == 'pre':
-             return text
-         converter = abstract_inline_conversion(lambda self: '`')
-         return converter(self, el, text, convert_as_inline)
-
-     def convert_kbd(self, el, text, convert_as_inline):
-         return self.convert_code(el, text, convert_as_inline)
-
-     def _convert_hn(self, n, el, text, convert_as_inline):
-         """ Method name prefixed with _ to prevent <hn> to call this """
-         if convert_as_inline:
-             return text
-
-         # prevent MemoryErrors in case of very large n
-         n = max(1, min(6, n))
-
-         style = self.options['heading_style'].lower()
-         text = text.strip()
-         if style == 'underlined' and n <= 2:
-             line = '=' if n == 1 else '-'
-             return self.underline(text, line)
-         text = re.sub(r'[\t ]+', ' ', text)
-         hashes = '#' * n
-         if style == 'atx_closed':
-             return '\n%s %s %s\n\n' % (hashes, text, hashes)
-         return '\n%s %s\n\n' % (hashes, text)
-
-     def convert_hr(self, el, text, convert_as_inline):
-         return '\n\n---\n\n'
-
-     def convert_img(self, el, text, convert_as_inline):
-         alt = el.attrs.get('alt', None) or ''
-         src = el.attrs.get('src', None) or ''
-         title = el.attrs.get('title', None) or ''
-         title_part = ' "%s"' % title.replace('"', r'\"') if title else ''
-         if (convert_as_inline
-                 and el.parent.name not in self.options['keep_inline_images_in']):
-             return alt
-
-         return '![%s](%s%s)' % (alt, src, title_part)
-
-     def convert_list(self, el, text, convert_as_inline):
-         """Enhanced list conversion with smart handling."""
-         if not self.options['smart_lists']:
-             return super().convert_list(el, text, convert_as_inline)
-
-         nested = False
-         before_paragraph = False
-
-         # Smart list processing
-         list_type = el.name
-         is_ordered = list_type == 'ol'
-         start = el.get('start', 1) if is_ordered else None
-
-         # Process list items
-         items = el.find_all('li', recursive=False)
-         processed_items = []
-
-         for i, item in enumerate(items):
-             item_text = self.process_tag(item, convert_as_inline)
-             if is_ordered:
-                 number = start + i if start else i + 1
-                 processed_items.append(f"{number}. {item_text}")
-             else:
-                 processed_items.append(f"* {item_text}")
-
-         return '\n'.join(processed_items)
-
-     def convert_ul(self, el, text, convert_as_inline):
-         return self.convert_list(el, text, convert_as_inline)
-
-     def convert_ol(self, el, text, convert_as_inline):
-         return self.convert_list(el, text, convert_as_inline)
-
-     def convert_li(self, el, text, convert_as_inline):
-         parent = el.parent
-         if parent is not None and parent.name == 'ol':
-             if parent.get("start") and str(parent.get("start")).isnumeric():
-                 start = int(parent.get("start"))
-             else:
-                 start = 1
-             bullet = '%s.' % (start + parent.index(el))
-         else:
-             depth = -1
-             while el:
-                 if el.name == 'ul':
-                     depth += 1
-                 el = el.parent
-             bullets = self.options['bullets']
-             bullet = bullets[depth % len(bullets)]
-         bullet = bullet + ' '
-         text = (text or '').strip()
-         text = self.indent(text, len(bullet))
-         if text:
-             text = bullet + text[len(bullet):]
-         return '%s\n' % text
-
-     def convert_p(self, el, text, convert_as_inline):
-         if convert_as_inline:
-             return ' ' + text.strip() + ' '
-         if self.options['wrap']:
-             # Preserve newlines (and preceding whitespace) resulting
-             # from <br> tags. Newlines in the input have already been
-             # replaced by spaces.
-             lines = text.split('\n')
-             new_lines = []
-             for line in lines:
-                 line = line.lstrip()
-                 line_no_trailing = line.rstrip()
-                 trailing = line[len(line_no_trailing):]
-                 line = fill(line,
-                             width=self.options['wrap_width'],
-                             break_long_words=False,
-                             break_on_hyphens=False)
-                 new_lines.append(line + trailing)
-             text = '\n'.join(new_lines)
-         return '\n\n%s\n\n' % text if text else ''
-
-     def convert_pre(self, el, text, convert_as_inline):
-         if not text:
-             return ''
-         code_language = self.options['code_language']
-
-         if self.options['code_language_callback']:
-             code_language = self.options['code_language_callback'](el) or code_language
-
-         return '\n```%s\n%s\n```\n' % (code_language, text)
-
-     def convert_script(self, el, text, convert_as_inline):
-         return ''
-
-     def convert_style(self, el, text, convert_as_inline):
-         return ''
-
-     def convert_comment(self, el, text, convert_as_inline):
-         """Handle comment-like elements based on configuration."""
-         if self.options['preserve_html_comments']:
-             return f'<!-- {text} -->'
-         return ''
-
-     def convert_details(self, el, text, convert_as_inline):
-         """Convert HTML5 details and summary tags."""
-         summary = el.find('summary')
-         summary_text = summary.text if summary else 'Details'
-         return f'\n<details>\n<summary>{summary_text}</summary>\n\n{text}\n</details>\n'
-
-     def convert_mark(self, el, text, convert_as_inline):
-         """Convert mark tag with highlighting."""
-         return f'`{text}`'
-
-     def convert_table(self, el, text, convert_as_inline):
-         return '\n\n' + text + '\n'
-
-     def convert_caption(self, el, text, convert_as_inline):
-         return text + '\n'
-
-     def convert_figcaption(self, el, text, convert_as_inline):
-         return '\n\n' + text + '\n\n'
-
-     def convert_td(self, el, text, convert_as_inline):
-         colspan = 1
-         if 'colspan' in el.attrs and el['colspan'].isdigit():
-             colspan = int(el['colspan'])
-         return ' ' + text.strip().replace("\n", " ") + ' |' * colspan
-
-     def convert_th(self, el, text, convert_as_inline):
-         colspan = 1
-         if 'colspan' in el.attrs and el['colspan'].isdigit():
-             colspan = int(el['colspan'])
-         return ' ' + text.strip().replace("\n", " ") + ' |' * colspan
-
-     def convert_tr(self, el, text, convert_as_inline):
-         cells = el.find_all(['td', 'th'])
-         is_headrow = (
-             all([cell.name == 'th' for cell in cells])
-             or (not el.previous_sibling and not el.parent.name == 'tbody')
-             or (not el.previous_sibling and el.parent.name == 'tbody' and len(el.parent.parent.find_all(['thead'])) < 1)
-         )
-         overline = ''
-         underline = ''
-         if is_headrow and not el.previous_sibling:
-             # first row and is headline: print headline underline
-             full_colspan = 0
-             for cell in cells:
-                 if 'colspan' in cell.attrs and cell['colspan'].isdigit():
-                     full_colspan += int(cell["colspan"])
-                 else:
-                     full_colspan += 1
-             underline += '| ' + ' | '.join(['---'] * full_colspan) + ' |' + '\n'
-         elif (not el.previous_sibling
-               and (el.parent.name == 'table'
-                    or (el.parent.name == 'tbody'
-                        and not el.parent.previous_sibling))):
-             # first row, not headline, and:
-             # - the parent is table or
-             # - the parent is tbody at the beginning of a table.
-             # print empty headline above this row
-             overline += '| ' + ' | '.join([''] * len(cells)) + ' |' + '\n'
-             overline += '| ' + ' | '.join(['---'] * len(cells)) + ' |' + '\n'
-         return overline + '|' + text + '\n' + underline
-
-     def _extract_metadata(self, scout):
-         """
-         Extract metadata from the parsed document.
-
-         Args:
-             scout (Union[Scout, Tag, ScoutSearchResult]): Parsed object
-
-         Returns:
-             Dict[str, Any]: Extracted metadata
-         """
-         metadata = {}
-         try:
-             # Handle ScoutSearchResult
-             if hasattr(scout, '_results'):
-                 scout = scout._results[0] if scout._results else None
-
-             if scout is None:
-                 return metadata
-
-             # Find head tag
-             head = scout.find('head')
-             if not head and hasattr(scout, 'find_all'):
-                 head_list = scout.find_all('head')
-                 head = head_list[0] if head_list else None
-
-             if head:
-                 # Extract title
-                 title_tag = head.find('title') or (head.find_all('title')[0] if head.find_all('title') else None)
-                 metadata['title'] = title_tag.get_text() if title_tag else None
-
-                 # Extract meta tags
-                 metadata['meta'] = {}
-                 meta_tags = head.find_all('meta') if hasattr(head, 'find_all') else []
-                 for meta in meta_tags:
-                     name = meta.get('name') or meta.get('property')
-                     content = meta.get('content')
-                     if name and content:
-                         metadata['meta'][name] = content
-
-         except Exception as e:
-             logger.warning(f"Metadata extraction failed: {e}")
-
-         return metadata
-
-     def _extract_semantic_info(self, scout):
-         """
-         Extract semantic information from the document.
-
-         Args:
-             scout (Union[Scout, Tag, ScoutSearchResult]): Parsed object
-
-         Returns:
-             Dict[str, Any]: Semantic information
-         """
-         # Handle ScoutSearchResult
-         if hasattr(scout, '_results'):
-             scout = scout._results[0] if scout._results else None
-
-         if scout is None:
-             return {
-                 'language': 'unknown',
-                 'text_density': 0,
-                 'content_types': {}
-             }
-
-         semantic_info = {
-             'language': 'unknown',
-             'text_density': 0,
-             'content_types': {}
-         }
-
-         try:
-             # Try to find language
-             html_tag = scout.find('html')
-             if not html_tag and hasattr(scout, 'find_all'):
-                 html_tags = scout.find_all('html')
-                 html_tag = html_tags[0] if html_tags else None
-
-             semantic_info['language'] = html_tag.get('lang', 'unknown') if html_tag else 'unknown'
-
-             # Calculate text density
-             total_text = scout.get_text() if hasattr(scout, 'get_text') else ''
-             total_html = str(scout)
-             semantic_info['text_density'] = len(total_text) / len(total_html) * 100 if total_html else 0
-
-             # Analyze content types
-             content_types = {}
-             for tag in scout.find_all() if hasattr(scout, 'find_all') else [scout]:
-                 tag_type = tag.name
-                 content_types[tag_type] = content_types.get(tag_type, 0) + 1
-
-             semantic_info['content_types'] = content_types
-
-         except Exception as e:
-             logger.warning(f"Semantic info extraction failed: {e}")
-
-         return semantic_info
-
-     def _analyze_structure(self, scout):
-         """
-         Analyze document structure.
-
-         Args:
-             scout (Scout): Parsed Scout object
-
-         Returns:
-             Dict[str, Any]: Document structure information
-         """
-         structure = {
-             'headings': [
-                 {'level': h.name, 'text': h.get_text(strip=True)}
-                 for h in scout.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
-             ],
-             'sections': [],
-             'links': [
-                 {'href': a.get('href'), 'text': a.get_text(strip=True)}
-                 for a in scout.find_all('a')
-             ]
-         }
-         return structure
-
-     def _calculate_text_density(self, scout):
-         """
-         Calculate text density of the document.
-
-         Args:
-             scout (Scout): Parsed Scout object
-
-         Returns:
-             float: Text density percentage
-         """
-         try:
-             total_text = scout.get_text()
-             total_html = str(scout)
-             return len(total_text) / len(total_html) * 100 if total_html else 0
-         except Exception as e:
-             logger.warning(f"Text density calculation failed: {e}")
-             return 0
-
-     def _analyze_content_types(self, scout):
-         """
-         Analyze content types in the document.
-
-         Args:
-             scout (Scout): Parsed Scout object
-
-         Returns:
-             Dict[str, int]: Content type counts
-         """
-         content_types = {}
-         try:
-             for tag in scout.find_all():
-                 tag_type = tag.name
-                 content_types[tag_type] = content_types.get(tag_type, 0) + 1
-         except Exception as e:
-             logger.warning(f"Content type analysis failed: {e}")
-         return content_types
-
- def markdownify(html: str, **options) -> Union[str, Dict[str, Any]]:
-     """
-     Convert HTML to Markdown with advanced options.
-
-     Args:
-         html (str): HTML content to convert
-         **options: Conversion options
-
-     Returns:
-         Union[str, Dict[str, Any]]: Markdown text or structured output
-     """
-     try:
-         # Use Scout's native markdown conversion
-         scout = Scout(html, features='html.parser')
-
-         # Handle ScoutSearchResult
-         if hasattr(scout, '_results'):
-             scout = scout._results[0] if scout._results else scout
-
-         # Determine conversion style based on options
-         heading_style = options.get('heading_style', 'ATX')
-
-         # Custom markdown conversion to preserve formatting
-         def convert_tag(tag):
-             # Handle specific tag types
-             if tag.name == 'h1':
-                 return f"# {tag.get_text(strip=True)}\n\n"
-             elif tag.name == 'h2':
-                 return f"## {tag.get_text(strip=True)}\n\n"
-             elif tag.name == 'h3':
-                 return f"### {tag.get_text(strip=True)}\n\n"
-             elif tag.name == 'p':
-                 return f"{tag.get_text(strip=True)}\n\n"
-             elif tag.name == 'strong':
-                 return f"**{tag.get_text(strip=True)}**"
-             elif tag.name == 'em':
-                 return f"*{tag.get_text(strip=True)}*"
-             elif tag.name == 'ul':
-                 return ''.join(f"* {li.get_text(strip=True)}\n" for li in tag.find_all('li'))
-             elif tag.name == 'ol':
-                 return ''.join(f"{i+1}. {li.get_text(strip=True)}\n" for i, li in enumerate(tag.find_all('li')))
-             elif tag.name == 'a':
-                 return f"[{tag.get_text(strip=True)}]({tag.get('href', '')})"
-             return tag.get_text(strip=True)
-
-         # Traverse and convert tags
-         markdown_parts = []
-         for tag in scout.find_all():
-             if tag.name in ['h1', 'h2', 'h3', 'p', 'strong', 'em', 'ul', 'ol', 'a']:
-                 markdown_parts.append(convert_tag(tag))
-
-         markdown = '\n'.join(markdown_parts)
-
-         # If structured output is requested, include additional metadata
-         if options.get('STRUCTURED_OUTPUT', False):
-             # Custom metadata extraction
-             metadata = {}
-             try:
-                 head = scout.find('head') or scout.find_all('head')[0] if scout.find_all('head') else None
-                 if head:
-                     # Extract title
-                     title_tag = head.find('title') or head.find_all('title')[0] if head.find_all('title') else None
-                     metadata['title'] = title_tag.get_text() if title_tag else None
-
-                     # Extract meta tags
-                     metadata['meta'] = {
-                         meta.get('name', meta.get('property')): meta.get('content')
-                         for meta in head.find_all('meta')
-                         if meta.get('name') or meta.get('property')
-                     }
-             except Exception as e:
-                 logger.warning(f"Metadata extraction failed: {e}")
-
-             return {
-                 'markdown': markdown,
-                 'metadata': metadata,
-                 'structure': scout.analyze_page_structure(),
-                 'semantic_info': scout.extract_semantic_info()
-             }
-
-         return markdown
-     except Exception as e:
-         logger.error(f"Markdownify failed: {e}", exc_info=True)
-         return str(e)