webscout 6.4-py3-none-any.whl → 6.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of webscout might be problematic.
- webscout/AIutel.py +7 -54
- webscout/DWEBS.py +48 -26
- webscout/{YTdownloader.py → Extra/YTToolkit/YTdownloader.py} +990 -1103
- webscout/Extra/YTToolkit/__init__.py +3 -0
- webscout/{transcriber.py → Extra/YTToolkit/transcriber.py} +1 -1
- webscout/Extra/YTToolkit/ytapi/__init__.py +6 -0
- webscout/Extra/YTToolkit/ytapi/channel.py +307 -0
- webscout/Extra/YTToolkit/ytapi/errors.py +13 -0
- webscout/Extra/YTToolkit/ytapi/extras.py +45 -0
- webscout/Extra/YTToolkit/ytapi/https.py +88 -0
- webscout/Extra/YTToolkit/ytapi/patterns.py +61 -0
- webscout/Extra/YTToolkit/ytapi/playlist.py +59 -0
- webscout/Extra/YTToolkit/ytapi/pool.py +8 -0
- webscout/Extra/YTToolkit/ytapi/query.py +37 -0
- webscout/Extra/YTToolkit/ytapi/stream.py +60 -0
- webscout/Extra/YTToolkit/ytapi/utils.py +62 -0
- webscout/Extra/YTToolkit/ytapi/video.py +102 -0
- webscout/Extra/__init__.py +2 -1
- webscout/Extra/autocoder/autocoder_utiles.py +119 -101
- webscout/Extra/autocoder/rawdog.py +679 -680
- webscout/Extra/gguf.py +441 -441
- webscout/Extra/markdownlite/__init__.py +862 -0
- webscout/Extra/weather_ascii.py +2 -2
- webscout/Provider/AISEARCH/__init__.py +2 -0
- webscout/Provider/AISEARCH/ooai.py +155 -0
- webscout/Provider/Amigo.py +70 -85
- webscout/Provider/{prefind.py → Jadve.py} +72 -70
- webscout/Provider/Netwrck.py +235 -0
- webscout/Provider/Openai.py +4 -3
- webscout/Provider/PI.py +292 -221
- webscout/Provider/PizzaGPT.py +3 -3
- webscout/Provider/Reka.py +0 -1
- webscout/Provider/TTS/__init__.py +5 -1
- webscout/Provider/TTS/deepgram.py +183 -0
- webscout/Provider/TTS/elevenlabs.py +137 -0
- webscout/Provider/TTS/gesserit.py +151 -0
- webscout/Provider/TTS/murfai.py +139 -0
- webscout/Provider/TTS/parler.py +134 -107
- webscout/Provider/TTS/streamElements.py +360 -275
- webscout/Provider/TTS/utils.py +280 -0
- webscout/Provider/TTS/voicepod.py +116 -116
- webscout/Provider/TeachAnything.py +15 -2
- webscout/Provider/Youchat.py +42 -8
- webscout/Provider/__init__.py +8 -21
- webscout/Provider/meta.py +794 -779
- webscout/Provider/multichat.py +230 -0
- webscout/Provider/promptrefine.py +2 -2
- webscout/Provider/talkai.py +10 -13
- webscout/Provider/turboseek.py +5 -4
- webscout/Provider/tutorai.py +8 -112
- webscout/Provider/typegpt.py +5 -7
- webscout/Provider/x0gpt.py +81 -9
- webscout/Provider/yep.py +123 -361
- webscout/__init__.py +33 -28
- webscout/conversation.py +24 -9
- webscout/exceptions.py +188 -20
- webscout/litprinter/__init__.py +719 -831
- webscout/litprinter/colors.py +54 -0
- webscout/optimizers.py +420 -270
- webscout/prompt_manager.py +279 -279
- webscout/scout/__init__.py +8 -0
- webscout/scout/core/__init__.py +7 -0
- webscout/scout/core/crawler.py +140 -0
- webscout/scout/core/scout.py +571 -0
- webscout/scout/core/search_result.py +96 -0
- webscout/scout/core/text_analyzer.py +63 -0
- webscout/scout/core/text_utils.py +277 -0
- webscout/scout/core/web_analyzer.py +52 -0
- webscout/scout/core.py +884 -0
- webscout/scout/element.py +460 -0
- webscout/scout/parsers/__init__.py +69 -0
- webscout/scout/parsers/html5lib_parser.py +172 -0
- webscout/scout/parsers/html_parser.py +236 -0
- webscout/scout/parsers/lxml_parser.py +178 -0
- webscout/scout/utils.py +38 -0
- webscout/update_checker.py +184 -125
- webscout/version.py +1 -1
- webscout/zeroart/__init__.py +55 -0
- webscout/zeroart/base.py +60 -0
- webscout/zeroart/effects.py +99 -0
- webscout/zeroart/fonts.py +816 -0
- webscout/zerodir/__init__.py +225 -0
- {webscout-6.4.dist-info → webscout-6.6.dist-info}/METADATA +18 -231
- webscout-6.6.dist-info/RECORD +197 -0
- webscout-6.6.dist-info/top_level.txt +2 -0
- webstoken/__init__.py +30 -0
- webstoken/classifier.py +189 -0
- webstoken/keywords.py +216 -0
- webstoken/language.py +128 -0
- webstoken/ner.py +164 -0
- webstoken/normalizer.py +35 -0
- webstoken/processor.py +77 -0
- webstoken/sentiment.py +206 -0
- webstoken/stemmer.py +73 -0
- webstoken/t.py +75 -0
- webstoken/tagger.py +60 -0
- webstoken/tokenizer.py +158 -0
- webscout/Agents/Onlinesearcher.py +0 -182
- webscout/Agents/__init__.py +0 -2
- webscout/Agents/functioncall.py +0 -248
- webscout/Bing_search.py +0 -251
- webscout/Provider/Perplexity.py +0 -599
- webscout/Provider/RoboCoders.py +0 -206
- webscout/Provider/genspark.py +0 -225
- webscout/Provider/perplexitylabs.py +0 -265
- webscout/Provider/twitterclone.py +0 -251
- webscout/Provider/upstage.py +0 -230
- webscout/gpt4free.py +0 -666
- webscout/requestsHTMLfix.py +0 -775
- webscout/webai.py +0 -2590
- webscout-6.4.dist-info/RECORD +0 -154
- webscout-6.4.dist-info/top_level.txt +0 -1
- /webscout/Provider/{felo_search.py → AISEARCH/felo_search.py} +0 -0
- {webscout-6.4.dist-info → webscout-6.6.dist-info}/LICENSE.md +0 -0
- {webscout-6.4.dist-info → webscout-6.6.dist-info}/WHEEL +0 -0
- {webscout-6.4.dist-info → webscout-6.6.dist-info}/entry_points.txt +0 -0

webscout/Extra/markdownlite/__init__.py (new file, +862 lines):

@@ -0,0 +1,862 @@
+import sys
+import os
+from webscout.Litlogger import LitLogger, LogFormat, ColorScheme
+from webscout.scout import Scout, Tag
+from textwrap import fill
+import re
+import six
+import html
+import json
+from typing import Union, Dict, Any, Optional, List
+import functools
+
+# Initialize Litlogger
+logger = LitLogger(
+    name="MarkdownLite",
+    format=LogFormat.DETAILED,
+    color_scheme=ColorScheme.OCEAN
+)
+
+# Decorator for error handling and logging
+def markdown_conversion_error_handler(func):
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+        try:
+            return func(*args, **kwargs)
+        except Exception as e:
+            logger.error(f"Markdown conversion error: {e}", exc_info=True)
+            raise
+    return wrapper
+
+# Constants and configuration
+MARKDOWN_CONVERSION_OPTIONS = {
+    'SEMANTIC_CONVERSION': True,
+    'PRESERVE_METADATA': True,
+    'SMART_LISTS': True,
+    'LINK_REWRITING': None,
+    'CUSTOM_ANALYZERS': [],
+    'STRUCTURED_OUTPUT': True,
+    'DEBUG_MODE': False
+}
+
+# Existing utility functions
+def chomp(text):
+    """
+    Strip leading/trailing spaces while preserving prefix/suffix spaces.
+
+    Args:
+        text (str): Input text to process
+
+    Returns:
+        tuple: (prefix, suffix, stripped_text)
+    """
+    prefix = ' ' if text and text[0] == ' ' else ''
+    suffix = ' ' if text and text[-1] == ' ' else ''
+    text = text.strip()
+    return (prefix, suffix, text)
+
+def abstract_inline_conversion(markup_fn):
+    """
+    Abstract inline tag conversion with enhanced flexibility.
+
+    Args:
+        markup_fn (callable): Function to generate markup
+
+    Returns:
+        callable: Conversion implementation
+    """
+    def implementation(self, el, text, convert_as_inline):
+        markup_prefix = markup_fn(self)
+        if markup_prefix.startswith('<') and markup_prefix.endswith('>'):
+            markup_suffix = '</' + markup_prefix[1:]
+        else:
+            markup_suffix = markup_prefix
+
+        if el.find_parent(['pre', 'code', 'kbd', 'samp']):
+            return text
+
+        prefix, suffix, text = chomp(text)
+        if not text:
+            return ''
+
+        return f'{prefix}{markup_prefix}{text}{markup_suffix}{suffix}'
+    return implementation
+
+class MarkdownConverter(object):
+    class DefaultOptions:
+        autolinks = True
+        bullets = '*+-'  # An iterable of bullet types.
+        code_language = ''
+        code_language_callback = None
+        convert = None
+        default_title = False
+        escape_asterisks = True
+        escape_underscores = True
+        escape_misc = False
+        heading_style = 'underlined'
+        keep_inline_images_in = []
+        newline_style = 'spaces'
+        strip = None
+        strong_em_symbol = '*'
+        sub_symbol = ''
+        sup_symbol = ''
+        wrap = False
+        wrap_width = 80
+
+        # New options for Scout integration
+        semantic_conversion = False  # Enable semantic-aware conversion
+        preserve_metadata = False  # Keep HTML metadata in markdown
+        smart_lists = True  # Smart list handling
+        link_rewriting = None  # Function for rewriting URLs
+        custom_analyzers = []  # List of custom text analyzers
+        structured_output = False  # Return structured output with metadata
+
+        # Existing options
+        debug_mode = False
+        handle_unknown_tags = 'ignore'  # 'ignore', 'warn', 'error'
+        preserve_html_comments = False
+        max_depth = 100  # Prevent potential infinite recursion
+        custom_tag_handlers = {}  # Allow custom tag conversion functions
+
+    class Options(DefaultOptions):
+        pass
+
+    # Inline conversion methods
+    convert_b = abstract_inline_conversion(lambda self: 2 * self.options['strong_em_symbol'])
+    convert_del = abstract_inline_conversion(lambda self: '~~')
+    convert_em = abstract_inline_conversion(lambda self: self.options['strong_em_symbol'])
+    convert_i = convert_em
+    convert_s = convert_del
+    convert_strong = convert_b
+    convert_sub = abstract_inline_conversion(lambda self: self.options['sub_symbol'])
+    convert_sup = abstract_inline_conversion(lambda self: self.options['sup_symbol'])
+
+    def __init__(self, **options):
+        # Merge default and user-provided options
+        default_options = {
+            'SEMANTIC_CONVERSION': True,
+            'PRESERVE_METADATA': True,
+            'SMART_LISTS': True,
+            'LINK_REWRITING': None,
+            'CUSTOM_ANALYZERS': [],
+            'STRUCTURED_OUTPUT': True,
+            'DEBUG_MODE': False,
+            # Add max_depth for conversion
+            'max_depth': 10,
+            # Inherit existing default options
+            'strip': None,
+            'convert': None,
+            'heading_style': 'underlined',
+            'newline_style': 'spaces',
+            'strong_em_symbol': '*',
+            'escape_asterisks': True,
+            'escape_underscores': True,
+            'escape_misc': False,
+            'keep_inline_images_in': [],
+            'sub_symbol': '',
+            'sup_symbol': '',
+            'wrap': False
+        }
+
+        # Update with user options
+        default_options.update(options)
+        self.options = default_options
+
+        # Setup logging based on debug mode
+        if self.options['DEBUG_MODE']:
+            logger.setLevel(logging.DEBUG)
+
+        # Initialize metadata and structure
+        self._metadata = {}
+        self._structure = {}
+        self._semantic_info = {}
+
+    @markdown_conversion_error_handler
+    def convert(self, html):
+        """
+        Enhanced conversion with metadata and structure analysis.
+
+        Args:
+            html (str): HTML content to convert
+
+        Returns:
+            Union[str, Dict[str, Any]]: Markdown text or structured output
+        """
+        # Handle different Scout result types
+        scout = html if hasattr(html, '_soup') or hasattr(html, 'name') else Scout(html, features='html.parser')
+
+        # If scout is a search result, get the first result or the original scout
+        if hasattr(scout, '_results') and scout._results:
+            scout = scout._results[0]
+
+        # Ensure we have a valid Scout object or Tag
+        if not hasattr(scout, '_soup') and not hasattr(scout, 'name'):
+            raise ValueError("Unable to convert input to a valid Scout object")
+
+        logger.debug(f"Parsing HTML: {str(scout)[:100]}...")
+
+        # Extract additional information if needed
+        if self.options['PRESERVE_METADATA']:
+            self._metadata = self._extract_metadata(scout)
+
+        if self.options['SEMANTIC_CONVERSION']:
+            self._structure = self._analyze_structure(scout)
+            self._semantic_info = self._extract_semantic_info(scout)
+
+        # Convert to markdown
+        markdown = self.convert_soup(scout._soup if hasattr(scout, '_soup') else scout)
+
+        # Return structured output if requested
+        if self.options['STRUCTURED_OUTPUT']:
+            return {
+                'markdown': markdown,
+                'metadata': self._metadata,
+                'structure': self._structure,
+                'semantic_info': self._semantic_info
+            }
+
+        return markdown
+
+    def convert_soup(self, soup):
+        """Convert Scout's internal soup object."""
+        return self.process_tag(soup, convert_as_inline=False, children_only=True)
+
+    def process_tag(self, node, convert_as_inline, children_only=False, depth=0):
+        """Enhanced tag processing with semantic awareness."""
+        if depth > self.options['max_depth']:
+            logger.warning(f"Max recursion depth reached at tag: {node.name}")
+            return ''
+
+        # Check for custom tag handlers
+        if hasattr(node, 'name') and node.name in self.options['custom_tag_handlers']:
+            custom_handler = self.options['custom_tag_handlers'][node.name]
+            return custom_handler(node, convert_as_inline)
+
+        text = ''
+
+        # markdown headings or cells can't include
+        # block elements (elements w/newlines)
+        isHeading = re.match(r'h[1-6]', node.name) if hasattr(node, 'name') else False
+        isCell = hasattr(node, 'name') and node.name in ['td', 'th']
+        convert_children_as_inline = convert_as_inline
+
+        if not children_only and (isHeading or isCell):
+            convert_children_as_inline = True
+
+        # Remove whitespace-only textnodes
+        should_remove_inside = should_remove_whitespace_inside(node)
+
+        # Iterate through children
+        for el in node.children:
+            # Skip script, style, and comment-like elements
+            if hasattr(el, 'name') and el.name in ['script', 'style', 'comment']:
+                continue
+
+            # Check if element is a text node that can be stripped
+            if (isinstance(el, str) or
+                    (hasattr(el, 'string') and el.string and str(el.string).strip() == '')):
+                if should_remove_inside and (not el.previous_sibling or not el.next_sibling):
+                    continue
+
+            # Process child elements
+            if isinstance(el, str):
+                text += el
+            elif hasattr(el, 'name'):
+                text_strip = text.rstrip('\n')
+                newlines_left = len(text) - len(text_strip)
+                next_text = self.process_tag(el, convert_children_as_inline)
+                next_text_strip = next_text.lstrip('\n')
+                newlines_right = len(next_text) - len(next_text_strip)
+                newlines = '\n' * max(newlines_left, newlines_right)
+                text = text_strip + newlines + next_text_strip
+
+        if not children_only and hasattr(node, 'name'):
+            convert_fn = getattr(self, 'convert_%s' % node.name, None)
+            if convert_fn and self.should_convert_tag(node.name):
+                text = convert_fn(node, text, convert_as_inline)
+
+        # Apply custom analyzers
+        for analyzer in self.options['custom_analyzers']:
+            text = analyzer(text, node)
+
+        return text
+
+    def _validate_options(self):
+        """Validate and sanitize converter options."""
+        if self.options['max_depth'] < 1:
+            raise ValueError("max_depth must be a positive integer")
+
+        if self.options['handle_unknown_tags'] not in ['ignore', 'warn', 'error']:
+            raise ValueError("handle_unknown_tags must be 'ignore', 'warn', or 'error'")
+
+    def process_text(self, el):
+        text = six.text_type(el) or ''
+
+        # normalize whitespace if we're not inside a preformatted element
+        if not el.find_parent('pre'):
+            if self.options['wrap']:
+                text = re.sub(r'[\t ]+', ' ', text)
+            else:
+                text = re.sub(r'[\t \r\n]*[\r\n][\t \r\n]*', '\n', text)
+                text = re.sub(r'[\t ]+', ' ', text)
+
+        # escape special characters if we're not inside a preformatted or code element
+        if not el.find_parent(['pre', 'code', 'kbd', 'samp']):
+            text = self.escape(text)
+
+        # remove leading whitespace at the start or just after a
+        # block-level element; remove traliing whitespace at the end
+        # or just before a block-level element.
+        if (should_remove_whitespace_outside(el.previous_sibling)
+                or (should_remove_whitespace_inside(el.parent)
+                    and not el.previous_sibling)):
+            text = text.lstrip()
+        if (should_remove_whitespace_outside(el.next_sibling)
+                or (should_remove_whitespace_inside(el.parent)
+                    and not el.next_sibling)):
+            text = text.rstrip()
+
+        return text
+
+    def __getattr__(self, attr):
+        # Handle headings
+        m = re.match(r'convert_h(\d+)', attr)
+        if m:
+            n = int(m.group(1))
+
+            def convert_tag(el, text, convert_as_inline):
+                return self._convert_hn(n, el, text, convert_as_inline)
+
+            convert_tag.__name__ = 'convert_h%s' % n
+            setattr(self, convert_tag.__name__, convert_tag)
+            return convert_tag
+
+        raise AttributeError(attr)
+
+    def should_convert_tag(self, tag):
+        tag = tag.lower()
+        strip = self.options['strip']
+        convert = self.options['convert']
+        if strip is not None:
+            return tag not in strip
+        elif convert is not None:
+            return tag in convert
+        else:
+            return True
+
+    def escape(self, text):
+        if not text:
+            return ''
+        if self.options['escape_misc']:
+            text = re.sub(r'([\\&<`[>~=+|])', r'\\\1', text)
+            # A sequence of one or more consecutive '-', preceded and
+            # followed by whitespace or start/end of fragment, might
+            # be confused with an underline of a header, or with a
+            # list marker.
+            text = re.sub(r'(\s|^)(-+(?:\s|$))', r'\1\\\2', text)
+            # A sequence of up to six consecutive '#', preceded and
+            # followed by whitespace or start/end of fragment, might
+            # be confused with an ATX heading.
+            text = re.sub(r'(\s|^)(#{1,6}(?:\s|$))', r'\1\\\2', text)
+            # '.' or ')' preceded by up to nine digits might be
+            # confused with a list item.
+            text = re.sub(r'((?:\s|^)[0-9]{1,9})([.)](?:\s|$))', r'\1\\\2',
+                          text)
+        if self.options['escape_asterisks']:
+            text = text.replace('*', r'\*')
+        if self.options['escape_underscores']:
+            text = text.replace('_', r'\_')
+        return text
+
+    def indent(self, text, columns):
+        return re.sub(r'^', ' ' * columns, text, flags=re.MULTILINE) if text else ''
+
+    def underline(self, text, pad_char):
+        text = (text or '').rstrip()
+        return '\n\n%s\n%s\n\n' % (text, pad_char * len(text)) if text else ''
+
+    def convert_a(self, el, text, convert_as_inline):
+        """Enhanced link conversion with URL rewriting."""
+        if self.options['link_rewriting'] and callable(self.options['link_rewriting']):
+            href = el.get('href')
+            if href:
+                href = self.options['link_rewriting'](href)
+                el['href'] = href
+
+        prefix, suffix, text = chomp(text)
+        if not text:
+            return ''
+        href = el.get('href')
+        title = el.get('title')
+        # For the replacement see #29: text nodes underscores are escaped
+        if (self.options['autolinks']
+                and text.replace(r'\_', '_') == href
+                and not title
+                and not self.options['default_title']):
+            # Shortcut syntax
+            return '<%s>' % href
+        if self.options['default_title'] and not title:
+            title = href
+        title_part = ' "%s"' % title.replace('"', r'\"') if title else ''
+        return '%s[%s](%s%s)%s' % (prefix, text, href, title_part, suffix) if href else text
+
+    def convert_blockquote(self, el, text, convert_as_inline):
+
+        if convert_as_inline:
+            return ' ' + text.strip() + ' '
+
+        return '\n' + (re.sub(r'^', '> ', text.strip(), flags=re.MULTILINE) + '\n\n') if text else ''
+
+    def convert_br(self, el, text, convert_as_inline):
+        if convert_as_inline:
+            return ""
+
+        if self.options['newline_style'].lower() == 'backslash':
+            return '\\\n'
+        else:
+            return '  \n'
+
+    def convert_code(self, el, text, convert_as_inline):
+        if el.parent.name == 'pre':
+            return text
+        converter = abstract_inline_conversion(lambda self: '`')
+        return converter(self, el, text, convert_as_inline)
+
+    def convert_kbd(self, el, text, convert_as_inline):
+        return self.convert_code(el, text, convert_as_inline)
+
+    def _convert_hn(self, n, el, text, convert_as_inline):
+        """ Method name prefixed with _ to prevent <hn> to call this """
+        if convert_as_inline:
+            return text
+
+        # prevent MemoryErrors in case of very large n
+        n = max(1, min(6, n))
+
+        style = self.options['heading_style'].lower()
+        text = text.strip()
+        if style == 'underlined' and n <= 2:
+            line = '=' if n == 1 else '-'
+            return self.underline(text, line)
+        text = re.sub(r'[\t ]+', ' ', text)
+        hashes = '#' * n
+        if style == 'atx_closed':
+            return '\n%s %s %s\n\n' % (hashes, text, hashes)
+        return '\n%s %s\n\n' % (hashes, text)
+
+    def convert_hr(self, el, text, convert_as_inline):
+        return '\n\n---\n\n'
+
+    def convert_img(self, el, text, convert_as_inline):
+        alt = el.attrs.get('alt', None) or ''
+        src = el.attrs.get('src', None) or ''
+        title = el.attrs.get('title', None) or ''
+        title_part = ' "%s"' % title.replace('"', r'\"') if title else ''
+        if (convert_as_inline
+                and el.parent.name not in self.options['keep_inline_images_in']):
+            return alt
+
+        return '![%s](%s%s)' % (alt, src, title_part)
+
+    def convert_list(self, el, text, convert_as_inline):
+        """Enhanced list conversion with smart handling."""
+        if not self.options['smart_lists']:
+            return super().convert_list(el, text, convert_as_inline)
+
+        nested = False
+        before_paragraph = False
+
+        # Smart list processing
+        list_type = el.name
+        is_ordered = list_type == 'ol'
+        start = el.get('start', 1) if is_ordered else None
+
+        # Process list items
+        items = el.find_all('li', recursive=False)
+        processed_items = []
+
+        for i, item in enumerate(items):
+            item_text = self.process_tag(item, convert_as_inline)
+            if is_ordered:
+                number = start + i if start else i + 1
+                processed_items.append(f"{number}. {item_text}")
+            else:
+                processed_items.append(f"* {item_text}")
+
+        return '\n'.join(processed_items)
+
+    def convert_ul(self, el, text, convert_as_inline):
+        return self.convert_list(el, text, convert_as_inline)
+
+    def convert_ol(self, el, text, convert_as_inline):
+        return self.convert_list(el, text, convert_as_inline)
+
+    def convert_li(self, el, text, convert_as_inline):
+        parent = el.parent
+        if parent is not None and parent.name == 'ol':
+            if parent.get("start") and str(parent.get("start")).isnumeric():
+                start = int(parent.get("start"))
+            else:
+                start = 1
+            bullet = '%s.' % (start + parent.index(el))
+        else:
+            depth = -1
+            while el:
+                if el.name == 'ul':
+                    depth += 1
+                el = el.parent
+            bullets = self.options['bullets']
+            bullet = bullets[depth % len(bullets)]
+        bullet = bullet + ' '
+        text = (text or '').strip()
+        text = self.indent(text, len(bullet))
+        if text:
+            text = bullet + text[len(bullet):]
+        return '%s\n' % text
+
+    def convert_p(self, el, text, convert_as_inline):
+        if convert_as_inline:
+            return ' ' + text.strip() + ' '
+        if self.options['wrap']:
+            # Preserve newlines (and preceding whitespace) resulting
+            # from <br> tags. Newlines in the input have already been
+            # replaced by spaces.
+            lines = text.split('\n')
+            new_lines = []
+            for line in lines:
+                line = line.lstrip()
+                line_no_trailing = line.rstrip()
+                trailing = line[len(line_no_trailing):]
+                line = fill(line,
+                            width=self.options['wrap_width'],
+                            break_long_words=False,
+                            break_on_hyphens=False)
+                new_lines.append(line + trailing)
+            text = '\n'.join(new_lines)
+        return '\n\n%s\n\n' % text if text else ''
+
+    def convert_pre(self, el, text, convert_as_inline):
+        if not text:
+            return ''
+        code_language = self.options['code_language']
+
+        if self.options['code_language_callback']:
+            code_language = self.options['code_language_callback'](el) or code_language
+
+        return '\n```%s\n%s\n```\n' % (code_language, text)
+
+    def convert_script(self, el, text, convert_as_inline):
+        return ''
+
+    def convert_style(self, el, text, convert_as_inline):
+        return ''
+
+    def convert_comment(self, el, text, convert_as_inline):
+        """Handle comment-like elements based on configuration."""
+        if self.options['preserve_html_comments']:
+            return f'<!-- {text} -->'
+        return ''
+
+    def convert_details(self, el, text, convert_as_inline):
+        """Convert HTML5 details and summary tags."""
+        summary = el.find('summary')
+        summary_text = summary.text if summary else 'Details'
+        return f'\n<details>\n<summary>{summary_text}</summary>\n\n{text}\n</details>\n'
+
+    def convert_mark(self, el, text, convert_as_inline):
+        """Convert mark tag with highlighting."""
+        return f'`{text}`'
+
+    def convert_table(self, el, text, convert_as_inline):
+        return '\n\n' + text + '\n'
+
+    def convert_caption(self, el, text, convert_as_inline):
+        return text + '\n'
+
+    def convert_figcaption(self, el, text, convert_as_inline):
+        return '\n\n' + text + '\n\n'
+
+    def convert_td(self, el, text, convert_as_inline):
+        colspan = 1
+        if 'colspan' in el.attrs and el['colspan'].isdigit():
+            colspan = int(el['colspan'])
+        return ' ' + text.strip().replace("\n", " ") + ' |' * colspan
+
+    def convert_th(self, el, text, convert_as_inline):
+        colspan = 1
+        if 'colspan' in el.attrs and el['colspan'].isdigit():
+            colspan = int(el['colspan'])
+        return ' ' + text.strip().replace("\n", " ") + ' |' * colspan
+
+    def convert_tr(self, el, text, convert_as_inline):
+        cells = el.find_all(['td', 'th'])
+        is_headrow = (
+            all([cell.name == 'th' for cell in cells])
+            or (not el.previous_sibling and not el.parent.name == 'tbody')
+            or (not el.previous_sibling and el.parent.name == 'tbody' and len(el.parent.parent.find_all(['thead'])) < 1)
+        )
+        overline = ''
+        underline = ''
+        if is_headrow and not el.previous_sibling:
+            # first row and is headline: print headline underline
+            full_colspan = 0
+            for cell in cells:
+                if 'colspan' in cell.attrs and cell['colspan'].isdigit():
+                    full_colspan += int(cell["colspan"])
+                else:
+                    full_colspan += 1
+            underline += '| ' + ' | '.join(['---'] * full_colspan) + ' |' + '\n'
+        elif (not el.previous_sibling
+              and (el.parent.name == 'table'
+                   or (el.parent.name == 'tbody'
+                       and not el.parent.previous_sibling))):
+            # first row, not headline, and:
+            # - the parent is table or
+            # - the parent is tbody at the beginning of a table.
+            # print empty headline above this row
+            overline += '| ' + ' | '.join([''] * len(cells)) + ' |' + '\n'
+            overline += '| ' + ' | '.join(['---'] * len(cells)) + ' |' + '\n'
+        return overline + '|' + text + '\n' + underline
+
+    def _extract_metadata(self, scout):
+        """
+        Extract metadata from the parsed document.
+
+        Args:
+            scout (Union[Scout, Tag, ScoutSearchResult]): Parsed object
+
+        Returns:
+            Dict[str, Any]: Extracted metadata
+        """
+        metadata = {}
+        try:
+            # Handle ScoutSearchResult
+            if hasattr(scout, '_results'):
+                scout = scout._results[0] if scout._results else None
+
+            if scout is None:
+                return metadata
+
+            # Find head tag
+            head = scout.find('head')
+            if not head and hasattr(scout, 'find_all'):
+                head_list = scout.find_all('head')
+                head = head_list[0] if head_list else None
+
+            if head:
+                # Extract title
+                title_tag = head.find('title') or (head.find_all('title')[0] if head.find_all('title') else None)
+                metadata['title'] = title_tag.get_text() if title_tag else None
+
+                # Extract meta tags
+                metadata['meta'] = {}
+                meta_tags = head.find_all('meta') if hasattr(head, 'find_all') else []
+                for meta in meta_tags:
+                    name = meta.get('name') or meta.get('property')
+                    content = meta.get('content')
+                    if name and content:
+                        metadata['meta'][name] = content
+
+        except Exception as e:
+            logger.warning(f"Metadata extraction failed: {e}")
+
+        return metadata
+
+    def _extract_semantic_info(self, scout):
+        """
+        Extract semantic information from the document.
+
+        Args:
+            scout (Union[Scout, Tag, ScoutSearchResult]): Parsed object
+
+        Returns:
+            Dict[str, Any]: Semantic information
+        """
+        # Handle ScoutSearchResult
+        if hasattr(scout, '_results'):
+            scout = scout._results[0] if scout._results else None
+
+        if scout is None:
+            return {
+                'language': 'unknown',
+                'text_density': 0,
+                'content_types': {}
+            }
+
+        semantic_info = {
+            'language': 'unknown',
+            'text_density': 0,
+            'content_types': {}
+        }
+
+        try:
+            # Try to find language
+            html_tag = scout.find('html')
+            if not html_tag and hasattr(scout, 'find_all'):
+                html_tags = scout.find_all('html')
+                html_tag = html_tags[0] if html_tags else None
+
+            semantic_info['language'] = html_tag.get('lang', 'unknown') if html_tag else 'unknown'
+
+            # Calculate text density
+            total_text = scout.get_text() if hasattr(scout, 'get_text') else ''
+            total_html = str(scout)
+            semantic_info['text_density'] = len(total_text) / len(total_html) * 100 if total_html else 0
+
+            # Analyze content types
+            content_types = {}
+            for tag in scout.find_all() if hasattr(scout, 'find_all') else [scout]:
+                tag_type = tag.name
+                content_types[tag_type] = content_types.get(tag_type, 0) + 1
+
+            semantic_info['content_types'] = content_types
+
+        except Exception as e:
+            logger.warning(f"Semantic info extraction failed: {e}")
+
+        return semantic_info
+
+    def _analyze_structure(self, scout):
+        """
+        Analyze document structure.
+
+        Args:
+            scout (Scout): Parsed Scout object
+
+        Returns:
+            Dict[str, Any]: Document structure information
+        """
+        structure = {
+            'headings': [
+                {'level': h.name, 'text': h.get_text(strip=True)}
+                for h in scout.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
+            ],
+            'sections': [],
+            'links': [
+                {'href': a.get('href'), 'text': a.get_text(strip=True)}
+                for a in scout.find_all('a')
+            ]
+        }
+        return structure
+
+    def _calculate_text_density(self, scout):
+        """
+        Calculate text density of the document.
+
+        Args:
+            scout (Scout): Parsed Scout object
+
+        Returns:
+            float: Text density percentage
+        """
+        try:
+            total_text = scout.get_text()
+            total_html = str(scout)
+            return len(total_text) / len(total_html) * 100 if total_html else 0
+        except Exception as e:
+            logger.warning(f"Text density calculation failed: {e}")
+            return 0
+
+    def _analyze_content_types(self, scout):
+        """
+        Analyze content types in the document.
+
+        Args:
+            scout (Scout): Parsed Scout object
+
+        Returns:
+            Dict[str, int]: Content type counts
+        """
+        content_types = {}
+        try:
+            for tag in scout.find_all():
+                tag_type = tag.name
+                content_types[tag_type] = content_types.get(tag_type, 0) + 1
+        except Exception as e:
+            logger.warning(f"Content type analysis failed: {e}")
+        return content_types
+
+def markdownify(html: str, **options) -> Union[str, Dict[str, Any]]:
+    """
+    Convert HTML to Markdown with advanced options.
+
+    Args:
+        html (str): HTML content to convert
+        **options: Conversion options
+
+    Returns:
+        Union[str, Dict[str, Any]]: Markdown text or structured output
+    """
+    try:
+        # Use Scout's native markdown conversion
+        scout = Scout(html, features='html.parser')
+
+        # Handle ScoutSearchResult
+        if hasattr(scout, '_results'):
+            scout = scout._results[0] if scout._results else scout
+
+        # Determine conversion style based on options
+        heading_style = options.get('heading_style', 'ATX')
+
+        # Custom markdown conversion to preserve formatting
+        def convert_tag(tag):
+            # Handle specific tag types
+            if tag.name == 'h1':
+                return f"# {tag.get_text(strip=True)}\n\n"
+            elif tag.name == 'h2':
+                return f"## {tag.get_text(strip=True)}\n\n"
+            elif tag.name == 'h3':
+                return f"### {tag.get_text(strip=True)}\n\n"
+            elif tag.name == 'p':
+                return f"{tag.get_text(strip=True)}\n\n"
+            elif tag.name == 'strong':
+                return f"**{tag.get_text(strip=True)}**"
+            elif tag.name == 'em':
+                return f"*{tag.get_text(strip=True)}*"
+            elif tag.name == 'ul':
+                return ''.join(f"* {li.get_text(strip=True)}\n" for li in tag.find_all('li'))
+            elif tag.name == 'ol':
+                return ''.join(f"{i+1}. {li.get_text(strip=True)}\n" for i, li in enumerate(tag.find_all('li')))
+            elif tag.name == 'a':
+                return f"[{tag.get_text(strip=True)}]({tag.get('href', '')})"
+            return tag.get_text(strip=True)
+
+        # Traverse and convert tags
+        markdown_parts = []
+        for tag in scout.find_all():
+            if tag.name in ['h1', 'h2', 'h3', 'p', 'strong', 'em', 'ul', 'ol', 'a']:
+                markdown_parts.append(convert_tag(tag))
+
+        markdown = '\n'.join(markdown_parts)
+
+        # If structured output is requested, include additional metadata
+        if options.get('STRUCTURED_OUTPUT', False):
+            # Custom metadata extraction
+            metadata = {}
+            try:
+                head = scout.find('head') or scout.find_all('head')[0] if scout.find_all('head') else None
+                if head:
+                    # Extract title
+                    title_tag = head.find('title') or head.find_all('title')[0] if head.find_all('title') else None
+                    metadata['title'] = title_tag.get_text() if title_tag else None
+
+                    # Extract meta tags
+                    metadata['meta'] = {
+                        meta.get('name', meta.get('property')): meta.get('content')
+                        for meta in head.find_all('meta')
+                        if meta.get('name') or meta.get('property')
+                    }
+            except Exception as e:
+                logger.warning(f"Metadata extraction failed: {e}")
+
+            return {
+                'markdown': markdown,
+                'metadata': metadata,
+                'structure': scout.analyze_page_structure(),
+                'semantic_info': scout.extract_semantic_info()
+            }
+
+        return markdown
+    except Exception as e:
+        logger.error(f"Markdownify failed: {e}", exc_info=True)
+        return str(e)
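
For context, a minimal usage sketch of the new markdownlite module, based only on the code in the hunk above. The sample HTML and variable names are illustrative, and whether the STRUCTURED_OUTPUT path works end to end depends on Scout helpers (analyze_page_structure, extract_semantic_info) that this diff calls but does not define.

```python
# Minimal sketch, assuming webscout 6.6 exposes the module exactly as diffed above.
from webscout.Extra.markdownlite import markdownify

sample_html = """
<html>
  <head><title>Demo</title></head>
  <body>
    <h1>Hello</h1>
    <p>Read the <a href="https://example.com">docs</a>.</p>
    <ul><li>one</li><li>two</li></ul>
  </body>
</html>
"""

# Plain conversion: markdownify() parses the HTML with Scout, walks h1-h3, p,
# strong, em, ul, ol and a tags, and returns a Markdown string; per the outer
# try/except it returns the exception message instead of raising on failure.
print(markdownify(sample_html))

# Structured output (the STRUCTURED_OUTPUT branch) returns a dict with
# 'markdown', 'metadata', 'structure' and 'semantic_info' keys; the last two
# rely on Scout.analyze_page_structure() / Scout.extract_semantic_info(), so
# this call assumes those helpers exist in webscout.scout.
result = markdownify(sample_html, STRUCTURED_OUTPUT=True)
```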