auto-coder 0.1.200__py3-none-any.whl → 0.1.201__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of auto-coder might be problematic.
- {auto_coder-0.1.200.dist-info → auto_coder-0.1.201.dist-info}/METADATA +9 -1
- {auto_coder-0.1.200.dist-info → auto_coder-0.1.201.dist-info}/RECORD +16 -15
- autocoder/rag/cache/base_cache.py +1 -1
- autocoder/rag/cache/byzer_storage_cache.py +33 -10
- autocoder/rag/cache/simple_cache.py +65 -24
- autocoder/rag/document_retriever.py +5 -10
- autocoder/rag/loaders/docx_loader.py +15 -2
- autocoder/rag/loaders/pdf_loader.py +14 -2
- autocoder/rag/long_context_rag.py +3 -0
- autocoder/rag/utils.py +9 -13
- autocoder/utils/_markitdown.py +1298 -0
- autocoder/version.py +1 -1
- {auto_coder-0.1.200.dist-info → auto_coder-0.1.201.dist-info}/LICENSE +0 -0
- {auto_coder-0.1.200.dist-info → auto_coder-0.1.201.dist-info}/WHEEL +0 -0
- {auto_coder-0.1.200.dist-info → auto_coder-0.1.201.dist-info}/entry_points.txt +0 -0
- {auto_coder-0.1.200.dist-info → auto_coder-0.1.201.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1298 @@
# type: ignore
import base64
import binascii
import copy
import html
import json
import mimetypes
import io
import os
import re
import shutil
import subprocess
import sys
import tempfile
import traceback
from typing import Any, Dict, List, Optional, Union
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse

import mammoth
import markdownify
import pandas as pd
import pdfminer
import pdfminer.high_level
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTImage, LTFigure
from pdfminer.pdfdevice import PDFDevice
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
import pptx
from pdfminer.image import ImageWriter

import numpy as np
from PIL import Image

# File-format detection
import puremagic
import requests
from bs4 import BeautifulSoup

# Optional transcription support. Initialize the capability flag to False so
# it is always defined, even when the optional dependencies are missing.
IS_AUDIO_TRANSCRIPTION_CAPABLE = False
try:
    import pydub
    import speech_recognition as sr

    IS_AUDIO_TRANSCRIPTION_CAPABLE = True
except ModuleNotFoundError:
    pass

# Optional YouTube transcription support
IS_YOUTUBE_TRANSCRIPT_CAPABLE = False
try:
    from youtube_transcript_api import YouTubeTranscriptApi

    IS_YOUTUBE_TRANSCRIPT_CAPABLE = True
except ModuleNotFoundError:
    pass

class _CustomMarkdownify(markdownify.MarkdownConverter):
    """
    A custom version of markdownify's MarkdownConverter. Changes include:

    - Altering the default heading style to use '#', '##', etc.
    - Removing javascript hyperlinks.
    - Truncating images with large data:uri sources.
    - Ensuring URIs are properly escaped, and do not conflict with Markdown syntax
    """

    def __init__(self, **options: Any):
        options["heading_style"] = options.get("heading_style", markdownify.ATX)
        # Explicitly cast options to the expected type if necessary
        super().__init__(**options)

    def convert_hn(self, n: int, el: Any, text: str, convert_as_inline: bool) -> str:
        """Same as usual, but be sure to start with a new line"""
        if not convert_as_inline:
            if not re.search(r"^\n", text):
                return "\n" + super().convert_hn(n, el, text, convert_as_inline)  # type: ignore

        return super().convert_hn(n, el, text, convert_as_inline)  # type: ignore

    def convert_a(self, el: Any, text: str, convert_as_inline: bool):
        """Same as usual converter, but removes Javascript links and escapes URIs."""
        prefix, suffix, text = markdownify.chomp(text)  # type: ignore
        if not text:
            return ""
        href = el.get("href")
        title = el.get("title")

        # Escape URIs and skip non-http or file schemes
        if href:
            try:
                parsed_url = urlparse(href)  # type: ignore
                if parsed_url.scheme and parsed_url.scheme.lower() not in [
                    "http",
                    "https",
                    "file",
                ]:  # type: ignore
                    return "%s%s%s" % (prefix, text, suffix)
                href = urlunparse(
                    parsed_url._replace(path=quote(unquote(parsed_url.path)))
                )  # type: ignore
            except ValueError:  # It's not clear if this ever gets thrown
                return "%s%s%s" % (prefix, text, suffix)

        # For the replacement see #29: text nodes underscores are escaped
        if (
            self.options["autolinks"]
            and text.replace(r"\_", "_") == href
            and not title
            and not self.options["default_title"]
        ):
            # Shortcut syntax
            return "<%s>" % href
        if self.options["default_title"] and not title:
            title = href
        title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
        return (
            "%s[%s](%s%s)%s" % (prefix, text, href, title_part, suffix)
            if href
            else text
        )

    def convert_img(self, el: Any, text: str, convert_as_inline: bool) -> str:
        """Same as usual converter, but removes data URIs"""

        alt = el.attrs.get("alt", None) or ""
        src = el.attrs.get("src", None) or ""
        title = el.attrs.get("title", None) or ""
        title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
        if (
            convert_as_inline
            and el.parent.name not in self.options["keep_inline_images_in"]
        ):
            return alt

        # Remove dataURIs
        if src.startswith("data:"):
            src = src.split(",")[0] + "..."

        return "![%s](%s%s)" % (alt, src, title_part)

    def convert_soup(self, soup: Any) -> str:
        return super().convert_soup(soup)  # type: ignore

class DocumentConverterResult:
    """The result of converting a document to text."""

    def __init__(self, title: Union[str, None] = None, text_content: str = ""):
        self.title: Union[str, None] = title
        self.text_content: str = text_content


class DocumentConverter:
    """Abstract superclass of all DocumentConverters."""

    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        raise NotImplementedError()


class PlainTextConverter(DocumentConverter):
    """Anything with content type text/plain"""

    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        # Guess the content type from any file extension that might be around
        content_type, _ = mimetypes.guess_type(
            "__placeholder" + kwargs.get("file_extension", "")
        )

        # Only accept text files
        if content_type is None:
            return None
        elif "text/" not in content_type.lower():
            return None

        text_content = ""
        with open(local_path, "rt", encoding="utf-8") as fh:
            text_content = fh.read()
        return DocumentConverterResult(
            title=None,
            text_content=text_content,
        )

class HtmlConverter(DocumentConverter):
    """Anything with content type text/html"""

    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        # Bail if not html
        extension = kwargs.get("file_extension", "")
        if extension.lower() not in [".html", ".htm"]:
            return None

        result = None
        with open(local_path, "rt", encoding="utf-8") as fh:
            result = self._convert(fh.read())

        return result

    def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]:
        """Helper function that converts an HTML string."""

        # Parse the string
        soup = BeautifulSoup(html_content, "html.parser")

        # Remove javascript and style blocks
        for script in soup(["script", "style"]):
            script.extract()

        # Print only the main content
        body_elm = soup.find("body")
        webpage_text = ""
        if body_elm:
            webpage_text = _CustomMarkdownify().convert_soup(body_elm)
        else:
            webpage_text = _CustomMarkdownify().convert_soup(soup)

        assert isinstance(webpage_text, str)

        return DocumentConverterResult(
            title=None if soup.title is None else soup.title.string,
            text_content=webpage_text,
        )

class WikipediaConverter(DocumentConverter):
    """Handle Wikipedia pages separately, focusing only on the main document content."""

    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        # Bail if not Wikipedia
        extension = kwargs.get("file_extension", "")
        if extension.lower() not in [".html", ".htm"]:
            return None
        url = kwargs.get("url", "")
        if not re.search(r"^https?:\/\/[a-zA-Z]{2,3}\.wikipedia.org\/", url):
            return None

        # Parse the file
        soup = None
        with open(local_path, "rt", encoding="utf-8") as fh:
            soup = BeautifulSoup(fh.read(), "html.parser")

        # Remove javascript and style blocks
        for script in soup(["script", "style"]):
            script.extract()

        # Print only the main content
        body_elm = soup.find("div", {"id": "mw-content-text"})
        title_elm = soup.find("span", {"class": "mw-page-title-main"})

        webpage_text = ""
        main_title = None if soup.title is None else soup.title.string

        if body_elm:
            # What's the title
            if title_elm and len(title_elm) > 0:
                main_title = title_elm.string  # type: ignore
                assert isinstance(main_title, str)

            # Convert the page
            webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify().convert_soup(
                body_elm
            )
        else:
            webpage_text = _CustomMarkdownify().convert_soup(soup)

        return DocumentConverterResult(
            title=main_title,
            text_content=webpage_text,
        )

class YouTubeConverter(DocumentConverter):
    """Handle YouTube specially, focusing on the video title, description, and transcript."""

    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        # Bail if not YouTube
        extension = kwargs.get("file_extension", "")
        if extension.lower() not in [".html", ".htm"]:
            return None
        url = kwargs.get("url", "")
        if not url.startswith("https://www.youtube.com/watch?"):
            return None

        # Parse the file
        soup = None
        with open(local_path, "rt", encoding="utf-8") as fh:
            soup = BeautifulSoup(fh.read(), "html.parser")

        # Read the meta tags
        assert soup.title is not None and soup.title.string is not None
        metadata: Dict[str, str] = {"title": soup.title.string}
        for meta in soup(["meta"]):
            for a in meta.attrs:
                if a in ["itemprop", "property", "name"]:
                    metadata[meta[a]] = meta.get("content", "")
                    break

        # We can also try to read the full description. This is more prone to
        # breaking, since it reaches into the page implementation.
        try:
            for script in soup(["script"]):
                content = script.text
                if "ytInitialData" in content:
                    lines = re.split(r"\r?\n", content)
                    obj_start = lines[0].find("{")
                    obj_end = lines[0].rfind("}")
                    if obj_start >= 0 and obj_end >= 0:
                        data = json.loads(lines[0][obj_start : obj_end + 1])
                        attrdesc = self._findKey(
                            data, "attributedDescriptionBodyText"
                        )  # type: ignore
                        if attrdesc:
                            metadata["description"] = str(attrdesc["content"])
                    break
        except Exception:
            pass

        # Start preparing the page
        webpage_text = "# YouTube\n"

        title = self._get(metadata, ["title", "og:title", "name"])  # type: ignore
        assert isinstance(title, str)

        if title:
            webpage_text += f"\n## {title}\n"

        stats = ""
        views = self._get(metadata, ["interactionCount"])  # type: ignore
        if views:
            stats += f"- **Views:** {views}\n"

        keywords = self._get(metadata, ["keywords"])  # type: ignore
        if keywords:
            stats += f"- **Keywords:** {keywords}\n"

        runtime = self._get(metadata, ["duration"])  # type: ignore
        if runtime:
            stats += f"- **Runtime:** {runtime}\n"

        if len(stats) > 0:
            webpage_text += f"\n### Video Metadata\n{stats}\n"

        description = self._get(
            metadata, ["description", "og:description"]
        )  # type: ignore
        if description:
            webpage_text += f"\n### Description\n{description}\n"

        if IS_YOUTUBE_TRANSCRIPT_CAPABLE:
            transcript_text = ""
            parsed_url = urlparse(url)  # type: ignore
            params = parse_qs(parsed_url.query)  # type: ignore
            if "v" in params:
                assert isinstance(params["v"][0], str)
                video_id = str(params["v"][0])
                try:
                    # Must be a single transcript.
                    transcript = YouTubeTranscriptApi.get_transcript(
                        video_id
                    )  # type: ignore
                    transcript_text = " ".join(
                        [part["text"] for part in transcript]
                    )  # type: ignore
                    # Alternative formatting:
                    # formatter = TextFormatter()
                    # formatter.format_transcript(transcript)
                except Exception:
                    pass
            if transcript_text:
                webpage_text += f"\n### Transcript\n{transcript_text}\n"

        title = title if title else soup.title.string
        assert isinstance(title, str)

        return DocumentConverterResult(
            title=title,
            text_content=webpage_text,
        )

    def _get(
        self,
        metadata: Dict[str, str],
        keys: List[str],
        default: Union[str, None] = None,
    ) -> Union[str, None]:
        for k in keys:
            if k in metadata:
                return metadata[k]
        return default

    # TODO: Fix json type
    def _findKey(self, json: Any, key: str) -> Union[str, None]:
        if isinstance(json, list):
            for elm in json:
                ret = self._findKey(elm, key)
                if ret is not None:
                    return ret
        elif isinstance(json, dict):
            for k in json:
                if k == key:
                    return json[k]
                else:
                    ret = self._findKey(json[k], key)
                    if ret is not None:
                        return ret
        return None

class BingSerpConverter(DocumentConverter):
    """
    Handle Bing results pages (only the organic search results).
    NOTE: It is better to use the Bing API.
    """

    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
        # Bail if not a Bing SERP
        extension = kwargs.get("file_extension", "")
        if extension.lower() not in [".html", ".htm"]:
            return None
        url = kwargs.get("url", "")
        if not re.search(r"^https://www\.bing\.com/search\?q=", url):
            return None

        # Parse the query parameters
        parsed_params = parse_qs(urlparse(url).query)
        query = parsed_params.get("q", [""])[0]

        # Parse the file
        soup = None
        with open(local_path, "rt", encoding="utf-8") as fh:
            soup = BeautifulSoup(fh.read(), "html.parser")

        # Clean up some formatting
        for tptt in soup.find_all(class_="tptt"):
            if hasattr(tptt, "string") and tptt.string:
                tptt.string += " "
        for slug in soup.find_all(class_="algoSlug_icon"):
            slug.extract()

        # Parse the algorithmic results
        _markdownify = _CustomMarkdownify()
        results = list()
        for result in soup.find_all(class_="b_algo"):
            # Rewrite redirect urls
            for a in result.find_all("a", href=True):
                parsed_href = urlparse(a["href"])
                qs = parse_qs(parsed_href.query)

                # The destination is contained in the u parameter,
                # but appears to be base64 encoded, with some prefix
                if "u" in qs:
                    u = (
                        qs["u"][0][2:].strip() + "=="
                    )  # Python 3 doesn't care about extra padding

                    try:
                        # RFC 4648 / "Base64URL" variant, which uses "-" and "_"
                        a["href"] = base64.b64decode(u, altchars="-_").decode("utf-8")
                    except UnicodeDecodeError:
                        pass
                    except binascii.Error:
                        pass

            # Convert to markdown
            md_result = _markdownify.convert_soup(result).strip()
            lines = [line.strip() for line in re.split(r"\n+", md_result)]
            results.append("\n".join([line for line in lines if len(line) > 0]))

        webpage_text = (
            f"## A Bing search for '{query}' found the following results:\n\n"
            + "\n\n".join(results)
        )

        return DocumentConverterResult(
            title=None if soup.title is None else soup.title.string,
            text_content=webpage_text,
        )

class PdfConverter(DocumentConverter):
    """
    Converts PDFs to Markdown with support for extracting and including images.
    """

    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
        # Bail if not a PDF
        extension = kwargs.get("file_extension", "")
        if extension.lower() != ".pdf":
            return None

        image_output_dir = None
        if kwargs.get("image_output_dir", None):
            image_output_dir = kwargs.get("image_output_dir")
        else:
            # Create output directory for images if it doesn't exist
            image_output_dir = os.path.join(
                os.path.dirname(local_path), "_images", os.path.basename(local_path)
            )
        os.makedirs(image_output_dir, exist_ok=True)

        text_content = []
        image_count = 0

        # Open and process PDF
        with open(local_path, "rb") as file:
            # Create PDF parser and document
            parser = PDFParser(file)
            document = PDFDocument(parser)
            rsrcmgr = PDFResourceManager()
            laparams = LAParams()
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            # Process each page
            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
                layout = device.get_result()

                # Extract text and images from the page
                page_content = self._process_layout(
                    layout, image_output_dir, image_count
                )
                text_content.extend(page_content)
                image_count += len([c for c in page_content if c.startswith("![Image")])

        return DocumentConverterResult(
            title=None,
            text_content="\n".join(text_content),
        )

    def _process_layout(
        self, layout, image_output_dir: str, image_count: int
    ) -> List[str]:
        """Process the layout of a PDF page, extracting both text and images."""
        content = []
        iw = ImageWriter(image_output_dir)

        for lt_obj in layout:
            # Handle images
            if isinstance(lt_obj, LTImage) or (
                isinstance(lt_obj, LTFigure) and lt_obj.name.startswith("Im")
            ):
                image_count += 1
                image_data = None
                image_meta = {}
                image_path = os.path.join(image_output_dir, f"image_{image_count}.png")

                if hasattr(lt_obj, "stream"):
                    image_data = lt_obj.stream.get_data()
                    image_meta = lt_obj.stream.attrs
                elif hasattr(lt_obj, "filter"):
                    image_data = lt_obj.filter

                if image_data:
                    if isinstance(lt_obj, LTImage):
                        name = iw.export_image(lt_obj)
                        suffix = os.path.splitext(name)[1]
                        temp_path = os.path.join(image_output_dir, name)
                        image_path = os.path.join(
                            image_output_dir, f"image_{image_count}{suffix}"
                        )
                        os.rename(temp_path, image_path)
                        content.append(f"![Image {image_count}]({image_path})")
                        continue
                    try:
                        # Try to handle raw pixel data
                        if "BitsPerComponent" in image_meta:
                            width = image_meta["Width"]
                            height = image_meta["Height"]
                            bits = image_meta["BitsPerComponent"]
                            colorspace = image_meta["ColorSpace"].name
                            new_image_data = np.frombuffer(image_data, dtype=np.uint8)
                            # Normalize to 8-bit if necessary
                            if bits != 8:
                                max_val = (1 << bits) - 1
                                new_image_data = (
                                    new_image_data.astype("float32") * 255 / max_val
                                ).astype("uint8")

                            if colorspace == "DeviceRGB":
                                new_image_data = new_image_data.reshape(
                                    (height, width, 3)
                                )
                                img = Image.fromarray(new_image_data, "RGB")
                                img.save(image_path)
                                content.append(
                                    f"![Image {image_count}]({image_path})\n"
                                )
                                continue
                            elif colorspace == "DeviceGray":
                                new_image_data = new_image_data.reshape((height, width))
                                img = Image.fromarray(new_image_data, "L")
                                img.save(image_path)
                                content.append(
                                    f"![Image {image_count}]({image_path})\n"
                                )
                                continue
                    except Exception as e:
                        print(
                            f"Error extracting image: {e}, falling back to writing the original data"
                        )

                    with open(image_path, "wb") as img_file:
                        img_file.write(image_data)

                    content.append(f"![Image {image_count}]({image_path})\n")

            # Handle text
            if hasattr(lt_obj, "get_text"):
                text = lt_obj.get_text().strip()
                if text:
                    content.append(text)

            # Recursively process nested layouts
            elif hasattr(lt_obj, "_objs"):
                content.extend(
                    self._process_layout(lt_obj._objs, image_output_dir, image_count)
                )

        return content

class DocxConverter(HtmlConverter):
    """
    Converts DOCX files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
    """

    def __init__(self):
        self._image_counter = 0
        super().__init__()

    def _save_image(self, image, output_dir: str) -> str:
        """
        Save an image and return its path, using an incrementing counter to name the file.
        """
        # Derive the file format from the image content type
        image_format = image.content_type.split('/')[-1] if image.content_type else 'png'

        # Increment the counter and generate the filename
        self._image_counter += 1
        image_filename = f"image_{self._image_counter}.{image_format}"

        # Save the image
        image_path = os.path.join(output_dir, image_filename)
        with image.open() as image_content, open(image_path, 'wb') as f:
            f.write(image_content.read())

        return image_path

    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
        # Bail if not a DOCX
        extension = kwargs.get("file_extension", "")
        if extension.lower() != ".docx":
            return None

        # Set up the image output directory
        image_output_dir = None
        if kwargs.get("image_output_dir", None):
            image_output_dir = kwargs.get("image_output_dir")
        else:
            # Create output directory for images if it doesn't exist
            image_output_dir = os.path.join(
                os.path.dirname(local_path), "_images", os.path.basename(local_path)
            )
        os.makedirs(image_output_dir, exist_ok=True)

        result = None
        with open(local_path, "rb") as docx_file:
            # Configure the image converter
            def transform_image(image):
                return {
                    "src": self._save_image(image, image_output_dir),
                    "alt": image.alt_text if image.alt_text else f"Image {self._image_counter}",
                }

            # Perform the conversion
            result = mammoth.convert_to_html(
                docx_file,
                convert_image=mammoth.images.inline(transform_image),
            )
            html_content = result.value
            result = self._convert(html_content)

        return result

class XlsxConverter(HtmlConverter):
    """
    Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
    """

    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
        # Bail if not an XLSX
        extension = kwargs.get("file_extension", "")
        if extension.lower() != ".xlsx":
            return None

        sheets = pd.read_excel(local_path, sheet_name=None)
        md_content = ""
        for s in sheets:
            md_content += f"## {s}\n"
            html_content = sheets[s].to_html(index=False)
            md_content += self._convert(html_content).text_content.strip() + "\n\n"

        return DocumentConverterResult(
            title=None,
            text_content=md_content.strip(),
        )

class PptxConverter(HtmlConverter):
    """
    Converts PPTX files to Markdown. Supports headings, tables and images with alt text.
    """

    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
        # Bail if not a PPTX
        extension = kwargs.get("file_extension", "")
        if extension.lower() != ".pptx":
            return None

        md_content = ""

        presentation = pptx.Presentation(local_path)
        slide_num = 0
        for slide in presentation.slides:
            slide_num += 1

            md_content += f"\n\n<!-- Slide number: {slide_num} -->\n"

            title = slide.shapes.title
            for shape in slide.shapes:
                # Pictures
                if self._is_picture(shape):
                    # https://github.com/scanny/python-pptx/pull/512#issuecomment-1713100069
                    alt_text = ""
                    try:
                        alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "")
                    except Exception:
                        pass

                    # A placeholder name
                    filename = re.sub(r"\W", "", shape.name) + ".jpg"
                    md_content += (
                        "\n!["
                        + (alt_text if alt_text else shape.name)
                        + "]("
                        + filename
                        + ")\n"
                    )

                # Tables
                if self._is_table(shape):
                    html_table = "<html><body><table>"
                    first_row = True
                    for row in shape.table.rows:
                        html_table += "<tr>"
                        for cell in row.cells:
                            if first_row:
                                html_table += "<th>" + html.escape(cell.text) + "</th>"
                            else:
                                html_table += "<td>" + html.escape(cell.text) + "</td>"
                        html_table += "</tr>"
                        first_row = False
                    html_table += "</table></body></html>"
                    md_content += (
                        "\n" + self._convert(html_table).text_content.strip() + "\n"
                    )

                # Text areas
                elif shape.has_text_frame:
                    if shape == title:
                        md_content += "# " + shape.text.lstrip() + "\n"
                    else:
                        md_content += shape.text + "\n"

            md_content = md_content.strip()

            if slide.has_notes_slide:
                md_content += "\n\n### Notes:\n"
                notes_frame = slide.notes_slide.notes_text_frame
                if notes_frame is not None:
                    md_content += notes_frame.text
                md_content = md_content.strip()

        return DocumentConverterResult(
            title=None,
            text_content=md_content.strip(),
        )

    def _is_picture(self, shape):
        if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE:
            return True
        if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PLACEHOLDER:
            if hasattr(shape, "image"):
                return True
        return False

    def _is_table(self, shape):
        if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.TABLE:
            return True
        return False

class MediaConverter(DocumentConverter):
    """
    Abstract class for multi-modal media (e.g., images and audio)
    """

    def _get_metadata(self, local_path):
        exiftool = shutil.which("exiftool")
        if not exiftool:
            return None
        else:
            try:
                result = subprocess.run(
                    [exiftool, "-json", local_path], capture_output=True, text=True
                ).stdout
                return json.loads(result)[0]
            except Exception:
                return None

class WavConverter(MediaConverter):
    """
    Converts WAV files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
    """

    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
        # Bail if not a WAV
        extension = kwargs.get("file_extension", "")
        if extension.lower() != ".wav":
            return None

        md_content = ""

        # Add metadata
        metadata = self._get_metadata(local_path)
        if metadata:
            for f in [
                "Title",
                "Artist",
                "Author",
                "Band",
                "Album",
                "Genre",
                "Track",
                "DateTimeOriginal",
                "CreateDate",
                "Duration",
            ]:
                if f in metadata:
                    md_content += f"{f}: {metadata[f]}\n"

        # Transcribe
        if IS_AUDIO_TRANSCRIPTION_CAPABLE:
            try:
                transcript = self._transcribe_audio(local_path)
                md_content += "\n\n### Audio Transcript:\n" + (
                    "[No speech detected]" if transcript == "" else transcript
                )
            except Exception:
                md_content += (
                    "\n\n### Audio Transcript:\nError. Could not transcribe this audio."
                )

        return DocumentConverterResult(
            title=None,
            text_content=md_content.strip(),
        )

    def _transcribe_audio(self, local_path) -> str:
        recognizer = sr.Recognizer()
        with sr.AudioFile(local_path) as source:
            audio = recognizer.record(source)
            return recognizer.recognize_google(audio).strip()

class Mp3Converter(WavConverter):
    """
    Converts MP3 files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` AND `pydub` are installed).
    """

    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
        # Bail if not an MP3
        extension = kwargs.get("file_extension", "")
        if extension.lower() != ".mp3":
            return None

        md_content = ""

        # Add metadata
        metadata = self._get_metadata(local_path)
        if metadata:
            for f in [
                "Title",
                "Artist",
                "Author",
                "Band",
                "Album",
                "Genre",
                "Track",
                "DateTimeOriginal",
                "CreateDate",
                "Duration",
            ]:
                if f in metadata:
                    md_content += f"{f}: {metadata[f]}\n"

        # Transcribe
        if IS_AUDIO_TRANSCRIPTION_CAPABLE:
            handle, temp_path = tempfile.mkstemp(suffix=".wav")
            os.close(handle)
            try:
                sound = pydub.AudioSegment.from_mp3(local_path)
                sound.export(temp_path, format="wav")

                _args = dict()
                _args.update(kwargs)
                _args["file_extension"] = ".wav"

                try:
                    transcript = super()._transcribe_audio(temp_path).strip()
                    md_content += "\n\n### Audio Transcript:\n" + (
                        "[No speech detected]" if transcript == "" else transcript
                    )
                except Exception:
                    md_content += "\n\n### Audio Transcript:\nError. Could not transcribe this audio."

            finally:
                os.unlink(temp_path)

        # Return the result
        return DocumentConverterResult(
            title=None,
            text_content=md_content.strip(),
        )

class ImageConverter(MediaConverter):
    """
    Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an mlm_client is configured).
    """

    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
        # Bail if not a supported image type
        extension = kwargs.get("file_extension", "")
        if extension.lower() not in [".jpg", ".jpeg", ".png"]:
            return None

        md_content = ""

        # Add metadata
        metadata = self._get_metadata(local_path)
        if metadata:
            for f in [
                "ImageSize",
                "Title",
                "Caption",
                "Description",
                "Keywords",
                "Artist",
                "Author",
                "DateTimeOriginal",
                "CreateDate",
                "GPSPosition",
            ]:
                if f in metadata:
                    md_content += f"{f}: {metadata[f]}\n"

        # Try describing the image with the multimodal LLM
        mlm_client = kwargs.get("mlm_client")
        mlm_model = kwargs.get("mlm_model")
        if mlm_client is not None and mlm_model is not None:
            md_content += (
                "\n# Description:\n"
                + self._get_mlm_description(
                    local_path,
                    extension,
                    mlm_client,
                    mlm_model,
                    prompt=kwargs.get("mlm_prompt"),
                ).strip()
                + "\n"
            )

        return DocumentConverterResult(
            title=None,
            text_content=md_content,
        )

    def _get_mlm_description(self, local_path, extension, client, model, prompt=None):
        if prompt is None or prompt.strip() == "":
            prompt = "Write a detailed caption for this image."

        sys.stderr.write(f"MLM Prompt:\n{prompt}\n")

        data_uri = ""
        with open(local_path, "rb") as image_file:
            content_type, encoding = mimetypes.guess_type("_dummy" + extension)
            if content_type is None:
                content_type = "image/jpeg"
            image_base64 = base64.b64encode(image_file.read()).decode("utf-8")
            data_uri = f"data:{content_type};base64,{image_base64}"

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": data_uri,
                        },
                    },
                ],
            }
        ]

        response = client.chat.completions.create(model=model, messages=messages)
        return response.choices[0].message.content

class FileConversionException(BaseException):
    pass


class UnsupportedFormatException(BaseException):
    pass

class MarkItDown:
    """(In preview) An extremely simple text-based document reader, suitable for LLM use.
    This reader will convert common file-types or webpages to Markdown."""

    def __init__(
        self,
        requests_session: Optional[requests.Session] = None,
        mlm_client: Optional[Any] = None,
        mlm_model: Optional[Any] = None,
    ):
        if requests_session is None:
            self._requests_session = requests.Session()
        else:
            self._requests_session = requests_session

        self._mlm_client = mlm_client
        self._mlm_model = mlm_model

        self._page_converters: List[DocumentConverter] = []

        # Register converters for successful browsing operations
        # Later registrations are tried first / take higher priority than earlier registrations
        # To this end, the most specific converters should appear below the most generic converters
        self.register_page_converter(PlainTextConverter())
        self.register_page_converter(HtmlConverter())
        self.register_page_converter(WikipediaConverter())
        self.register_page_converter(YouTubeConverter())
        self.register_page_converter(BingSerpConverter())
        self.register_page_converter(DocxConverter())
        self.register_page_converter(XlsxConverter())
        self.register_page_converter(PptxConverter())
        self.register_page_converter(WavConverter())
        self.register_page_converter(Mp3Converter())
        self.register_page_converter(ImageConverter())
        self.register_page_converter(PdfConverter())

    def convert(
        self, source: Union[str, requests.Response], **kwargs: Any
    ) -> DocumentConverterResult:  # TODO: deal with kwargs
        """
        Args:
            - source: can be a string representing a path or url, or a requests.response object
            - extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.)
        """

        # Local path or url
        if isinstance(source, str):
            if (
                source.startswith("http://")
                or source.startswith("https://")
                or source.startswith("file://")
            ):
                return self.convert_url(source, **kwargs)
            else:
                return self.convert_local(source, **kwargs)
        # Request response
        elif isinstance(source, requests.Response):
            return self.convert_response(source, **kwargs)

    def convert_local(
        self, path: str, **kwargs: Any
    ) -> DocumentConverterResult:  # TODO: deal with kwargs
        # Prepare a list of extensions to try (in order of priority)
        ext = kwargs.get("file_extension")
        extensions = [ext] if ext is not None else []

        # Get extension alternatives from the path and puremagic
        base, ext = os.path.splitext(path)
        self._append_ext(extensions, ext)

        if not extensions:
            for g in self._guess_ext_magic(path):
                self._append_ext(extensions, g)

        # Convert
        return self._convert(path, extensions, **kwargs)

    # TODO what should stream's type be?
    def convert_stream(
        self, stream: Any, **kwargs: Any
    ) -> DocumentConverterResult:  # TODO: deal with kwargs
        # Prepare a list of extensions to try (in order of priority)
        ext = kwargs.get("file_extension")
        extensions = [ext] if ext is not None else []

        # Save the file locally to a temporary file. It will be deleted before this method exits
        handle, temp_path = tempfile.mkstemp()
        fh = os.fdopen(handle, "wb")
        result = None
        try:
            # Write to the temporary file
            content = stream.read()
            if isinstance(content, str):
                fh.write(content.encode("utf-8"))
            else:
                fh.write(content)
            fh.close()

            # Use puremagic to check for more extension options
            for g in self._guess_ext_magic(temp_path):
                self._append_ext(extensions, g)

            # Convert
            result = self._convert(temp_path, extensions, **kwargs)
        # Clean up
        finally:
            try:
                fh.close()
            except Exception:
                pass
            os.unlink(temp_path)

        return result

    def convert_url(
        self, url: str, **kwargs: Any
    ) -> DocumentConverterResult:  # TODO: fix kwargs type
        # Send a HTTP request to the URL
        response = self._requests_session.get(url, stream=True)
        response.raise_for_status()
        return self.convert_response(response, **kwargs)

    def convert_response(
        self, response: requests.Response, **kwargs: Any
    ) -> DocumentConverterResult:  # TODO fix kwargs type
        # Prepare a list of extensions to try (in order of priority)
        ext = kwargs.get("file_extension")
        extensions = [ext] if ext is not None else []

        # Guess from the mimetype
        content_type = response.headers.get("content-type", "").split(";")[0]
        self._append_ext(extensions, mimetypes.guess_extension(content_type))

        # Read the content disposition if there is one
        content_disposition = response.headers.get("content-disposition", "")
        m = re.search(r"filename=([^;]+)", content_disposition)
        if m:
            base, ext = os.path.splitext(m.group(1).strip("\"'"))
            self._append_ext(extensions, ext)

        # Read the extension from the path
        base, ext = os.path.splitext(urlparse(response.url).path)
        self._append_ext(extensions, ext)

        # Save the file locally to a temporary file. It will be deleted before this method exits
        handle, temp_path = tempfile.mkstemp()
        fh = os.fdopen(handle, "wb")
        result = None
        try:
            # Download the file
            for chunk in response.iter_content(chunk_size=512):
                fh.write(chunk)
            fh.close()

            # Use puremagic to check for more extension options
            for g in self._guess_ext_magic(temp_path):
                self._append_ext(extensions, g)

            # Convert
            result = self._convert(temp_path, extensions, url=response.url)
        # Clean up
        finally:
            try:
                fh.close()
            except Exception:
                pass
            os.unlink(temp_path)

        return result

    def _convert(
        self, local_path: str, extensions: List[Union[str, None]], **kwargs
    ) -> DocumentConverterResult:
        error_trace = ""
        res = None
        for ext in extensions + [None]:  # Try last with no extension
            for converter in self._page_converters:
                _kwargs = copy.deepcopy(kwargs)

                # Overwrite file_extension appropriately
                if ext is None:
                    if "file_extension" in _kwargs:
                        del _kwargs["file_extension"]
                else:
                    _kwargs.update({"file_extension": ext})

                # Copy any additional global options
                if "mlm_client" not in _kwargs and self._mlm_client is not None:
                    _kwargs["mlm_client"] = self._mlm_client

                if "mlm_model" not in _kwargs and self._mlm_model is not None:
                    _kwargs["mlm_model"] = self._mlm_model

                # If we hit an error, log it and keep trying
                try:
                    res = converter.convert(local_path, **_kwargs)
                except Exception:
                    error_trace = ("\n\n" + traceback.format_exc()).strip()

                if res is not None:
                    # Normalize the content
                    res.text_content = "\n".join(
                        [line.rstrip() for line in re.split(r"\r?\n", res.text_content)]
                    )
                    res.text_content = re.sub(r"\n{3,}", "\n\n", res.text_content)

                    # Todo
                    return res

        # If we got this far without success, report any exceptions
        if len(error_trace) > 0:
            raise FileConversionException(
                f"Could not convert '{local_path}' to Markdown. File type was recognized as {extensions}. While converting the file, the following error was encountered:\n\n{error_trace}"
            )

        # Nothing can handle it!
        raise UnsupportedFormatException(
            f"Could not convert '{local_path}' to Markdown. The formats {extensions} are not supported."
        )

    def _append_ext(self, extensions, ext):
        """Append a unique non-None, non-empty extension to a list of extensions."""
        if ext is None:
            return
        ext = ext.strip()
        if ext == "":
            return
        # if ext not in extensions:
        if True:
            extensions.append(ext)

    def _guess_ext_magic(self, path):
        """Use puremagic (a Python implementation of libmagic) to guess a file's extension based on the first few bytes."""
        # Use puremagic to guess
        try:
            guesses = puremagic.magic_file(path)
            extensions = list()
            for g in guesses:
                ext = g.extension.strip()
                if len(ext) > 0:
                    if not ext.startswith("."):
                        ext = "." + ext
                    if ext not in extensions:
                        extensions.append(ext)
            return extensions
        except FileNotFoundError:
            pass
        except IsADirectoryError:
            pass
        except PermissionError:
            pass
        return []

    def register_page_converter(self, converter: DocumentConverter) -> None:
        """Register a page text converter."""
        self._page_converters.insert(0, converter)