auto-coder 0.1.199__py3-none-any.whl → 0.1.201__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -0,0 +1,1298 @@
1
+ # type: ignore
2
+ import base64
3
+ import binascii
4
+ import copy
5
+ import html
6
+ import json
7
+ import mimetypes
8
+ import io
9
+ import os
10
+ import re
11
+ import shutil
12
+ import subprocess
13
+ import sys
14
+ import tempfile
15
+ import traceback
16
+ from typing import Any, Dict, List, Optional, Union
17
+ from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
18
+
19
+ import mammoth
20
+ import markdownify
21
+ import pandas as pd
22
+ import pdfminer
23
+ import pdfminer.high_level
24
+ from pdfminer.converter import PDFPageAggregator
25
+ from pdfminer.layout import LAParams, LTImage, LTFigure
26
+ from pdfminer.pdfdevice import PDFDevice
27
+ from pdfminer.pdfdocument import PDFDocument
28
+ from pdfminer.pdfparser import PDFParser
29
+ from pdfminer.pdfpage import PDFPage
30
+ from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
31
+ import pptx
32
+ from pdfminer.image import ImageWriter
33
+
34
+ import numpy as np
35
+ from PIL import Image
36
+
37
+ # File-format detection
38
+ import puremagic
39
+ import requests
40
+ from bs4 import BeautifulSoup
41
+
42
+ # Optional Transcription support
43
+ try:
44
+ import pydub
45
+ import speech_recognition as sr
46
+
47
+ IS_AUDIO_TRANSCRIPTION_CAPABLE = True
48
+ except ModuleNotFoundError:
49
+ IS_AUDIO_TRANSCRIPTION_CAPABLE = False
50
+
51
+ # Optional YouTube transcription support
52
+ try:
53
+ from youtube_transcript_api import YouTubeTranscriptApi
54
+
55
+ IS_YOUTUBE_TRANSCRIPT_CAPABLE = True
56
+ except ModuleNotFoundError:
57
+ IS_YOUTUBE_TRANSCRIPT_CAPABLE = False
58
+
59
+
60
+ class _CustomMarkdownify(markdownify.MarkdownConverter):
61
+ """
62
+ A custom version of markdownify's MarkdownConverter. Changes include:
63
+
64
+ - Altering the default heading style to use '#', '##', etc.
65
+ - Removing javascript hyperlinks.
66
+ - Truncating images with large data:uri sources.
67
+ - Ensuring URIs are properly escaped, and do not conflict with Markdown syntax
68
+ """
69
+
70
+ def __init__(self, **options: Any):
71
+ options["heading_style"] = options.get("heading_style", markdownify.ATX)
72
+ # Explicitly cast options to the expected type if necessary
73
+ super().__init__(**options)
74
+
75
+ def convert_hn(self, n: int, el: Any, text: str, convert_as_inline: bool) -> str:
76
+ """Same as usual, but be sure to start with a new line"""
77
+ if not convert_as_inline:
78
+ if not re.search(r"^\n", text):
79
+ return "\n" + super().convert_hn(n, el, text, convert_as_inline) # type: ignore
80
+
81
+ return super().convert_hn(n, el, text, convert_as_inline) # type: ignore
82
+
83
+ def convert_a(self, el: Any, text: str, convert_as_inline: bool):
84
+ """Same as usual converter, but removes Javascript links and escapes URIs."""
85
+ prefix, suffix, text = markdownify.chomp(text) # type: ignore
86
+ if not text:
87
+ return ""
88
+ href = el.get("href")
89
+ title = el.get("title")
90
+
91
+ # Escape URIs and skip non-http or file schemes
92
+ if href:
93
+ try:
94
+ parsed_url = urlparse(href) # type: ignore
95
96
+ if parsed_url.scheme and parsed_url.scheme.lower() not in [  # type: ignore
97
+ "http",
98
+ "https",
99
+ "file",
100
+ ]:
101
+ return "%s%s%s" % (prefix, text, suffix)
102
+ href = urlunparse(
103
+ parsed_url._replace(path=quote(unquote(parsed_url.path)))
104
+ ) # type: ignore
105
+ except ValueError: # It's not clear if this ever gets thrown
106
+ return "%s%s%s" % (prefix, text, suffix)
107
+
108
+ # For the replacement see #29: text node underscores are escaped
109
+ if (
110
+ self.options["autolinks"]
111
+ and text.replace(r"\_", "_") == href
112
+ and not title
113
+ and not self.options["default_title"]
114
+ ):
115
+ # Shortcut syntax
116
+ return "<%s>" % href
117
+ if self.options["default_title"] and not title:
118
+ title = href
119
+ title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
120
+ return (
121
+ "%s[%s](%s%s)%s" % (prefix, text, href, title_part, suffix)
122
+ if href
123
+ else text
124
+ )
125
+
126
+ def convert_img(self, el: Any, text: str, convert_as_inline: bool) -> str:
127
+ """Same as usual converter, but removes data URIs"""
128
+
129
+ alt = el.attrs.get("alt", None) or ""
130
+ src = el.attrs.get("src", None) or ""
131
+ title = el.attrs.get("title", None) or ""
132
+ title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
133
+ if (
134
+ convert_as_inline
135
+ and el.parent.name not in self.options["keep_inline_images_in"]
136
+ ):
137
+ return alt
138
+
139
+ # Remove dataURIs
140
+ if src.startswith("data:"):
141
+ src = src.split(",")[0] + "..."
142
+
143
+ return "![%s](%s%s)" % (alt, src, title_part)
144
+
145
+ def convert_soup(self, soup: Any) -> str:
146
+ return super().convert_soup(soup) # type: ignore
147
+
148
+
149
+ class DocumentConverterResult:
150
+ """The result of converting a document to text."""
151
+
152
+ def __init__(self, title: Union[str, None] = None, text_content: str = ""):
153
+ self.title: Union[str, None] = title
154
+ self.text_content: str = text_content
155
+
156
+
157
+ class DocumentConverter:
158
+ """Abstract superclass of all DocumentConverters."""
159
+
160
+ def convert(
161
+ self, local_path: str, **kwargs: Any
162
+ ) -> Union[None, DocumentConverterResult]:
163
+ raise NotImplementedError()
164
+
165
+
166
+ class PlainTextConverter(DocumentConverter):
167
+ """Anything with content type text/plain"""
168
+
169
+ def convert(
170
+ self, local_path: str, **kwargs: Any
171
+ ) -> Union[None, DocumentConverterResult]:
172
+ # Guess the content type from any file extension that might be around
173
+ content_type, _ = mimetypes.guess_type(
174
+ "__placeholder" + kwargs.get("file_extension", "")
175
+ )
176
+
177
+ # Only accept text files
178
+ if content_type is None:
179
+ return None
180
+ elif "text/" not in content_type.lower():
181
+ return None
182
+
183
+ text_content = ""
184
+ with open(local_path, "rt", encoding="utf-8") as fh:
185
+ text_content = fh.read()
186
+ return DocumentConverterResult(
187
+ title=None,
188
+ text_content=text_content,
189
+ )
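+ # Note: the guess above keys off the extension alone, e.g.
+ # mimetypes.guess_type("__placeholder.txt") -> ("text/plain", None),
+ # so any extension that maps to a text/* type is accepted.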
190
+
191
+
192
+ class HtmlConverter(DocumentConverter):
193
+ """Anything with content type text/html"""
194
+
195
+ def convert(
196
+ self, local_path: str, **kwargs: Any
197
+ ) -> Union[None, DocumentConverterResult]:
198
+ # Bail if not html
199
+ extension = kwargs.get("file_extension", "")
200
+ if extension.lower() not in [".html", ".htm"]:
201
+ return None
202
+
203
+ result = None
204
+ with open(local_path, "rt", encoding="utf-8") as fh:
205
+ result = self._convert(fh.read())
206
+
207
+ return result
208
+
209
+ def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]:
210
+ """Helper function that converts and HTML string."""
211
+
212
+ # Parse the string
213
+ soup = BeautifulSoup(html_content, "html.parser")
214
+
215
+ # Remove javascript and style blocks
216
+ for script in soup(["script", "style"]):
217
+ script.extract()
218
+
219
+ # Print only the main content
220
+ body_elm = soup.find("body")
221
+ webpage_text = ""
222
+ if body_elm:
223
+ webpage_text = _CustomMarkdownify().convert_soup(body_elm)
224
+ else:
225
+ webpage_text = _CustomMarkdownify().convert_soup(soup)
226
+
227
+ assert isinstance(webpage_text, str)
228
+
229
+ return DocumentConverterResult(
230
+ title=None if soup.title is None else soup.title.string,
231
+ text_content=webpage_text,
232
+ )
233
+
234
+
235
+ class WikipediaConverter(DocumentConverter):
236
+ """Handle Wikipedia pages separately, focusing only on the main document content."""
237
+
238
+ def convert(
239
+ self, local_path: str, **kwargs: Any
240
+ ) -> Union[None, DocumentConverterResult]:
241
+ # Bail if not Wikipedia
242
+ extension = kwargs.get("file_extension", "")
243
+ if extension.lower() not in [".html", ".htm"]:
244
+ return None
245
+ url = kwargs.get("url", "")
246
+ if not re.search(r"^https?:\/\/[a-zA-Z]{2,3}\.wikipedia.org\/", url):
247
+ return None
248
+
249
+ # Parse the file
250
+ soup = None
251
+ with open(local_path, "rt", encoding="utf-8") as fh:
252
+ soup = BeautifulSoup(fh.read(), "html.parser")
253
+
254
+ # Remove javascript and style blocks
255
+ for script in soup(["script", "style"]):
256
+ script.extract()
257
+
258
+ # Print only the main content
259
+ body_elm = soup.find("div", {"id": "mw-content-text"})
260
+ title_elm = soup.find("span", {"class": "mw-page-title-main"})
261
+
262
+ webpage_text = ""
263
+ main_title = None if soup.title is None else soup.title.string
264
+
265
+ if body_elm:
266
+ # What's the title
267
+ if title_elm and len(title_elm) > 0:
268
+ main_title = title_elm.string # type: ignore
269
+ assert isinstance(main_title, str)
270
+
271
+ # Convert the page
272
+ webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify().convert_soup(
273
+ body_elm
274
+ )
275
+ else:
276
+ webpage_text = _CustomMarkdownify().convert_soup(soup)
277
+
278
+ return DocumentConverterResult(
279
+ title=main_title,
280
+ text_content=webpage_text,
281
+ )
282
+
283
+
284
+ class YouTubeConverter(DocumentConverter):
285
+ """Handle YouTube specially, focusing on the video title, description, and transcript."""
286
+
287
+ def convert(
288
+ self, local_path: str, **kwargs: Any
289
+ ) -> Union[None, DocumentConverterResult]:
290
+ # Bail if not YouTube
291
+ extension = kwargs.get("file_extension", "")
292
+ if extension.lower() not in [".html", ".htm"]:
293
+ return None
294
+ url = kwargs.get("url", "")
295
+ if not url.startswith("https://www.youtube.com/watch?"):
296
+ return None
297
+
298
+ # Parse the file
299
+ soup = None
300
+ with open(local_path, "rt", encoding="utf-8") as fh:
301
+ soup = BeautifulSoup(fh.read(), "html.parser")
302
+
303
+ # Read the meta tags
304
+ assert soup.title is not None and soup.title.string is not None
305
+ metadata: Dict[str, str] = {"title": soup.title.string}
306
+ for meta in soup(["meta"]):
307
+ for a in meta.attrs:
308
+ if a in ["itemprop", "property", "name"]:
309
+ metadata[meta[a]] = meta.get("content", "")
310
+ break
311
+
312
+ # We can also try to read the full description. This is more prone to breaking, since it reaches into the page implementation
313
+ try:
314
+ for script in soup(["script"]):
315
+ content = script.text
316
+ if "ytInitialData" in content:
317
+ lines = re.split(r"\r?\n", content)
318
+ obj_start = lines[0].find("{")
319
+ obj_end = lines[0].rfind("}")
320
+ if obj_start >= 0 and obj_end >= 0:
321
+ data = json.loads(lines[0][obj_start : obj_end + 1])
322
+ attrdesc = self._findKey(
323
+ data, "attributedDescriptionBodyText"
324
+ ) # type: ignore
325
+ if attrdesc:
326
+ metadata["description"] = str(attrdesc["content"])
327
+ break
328
+ except Exception:
329
+ pass
330
+
331
+ # Start preparing the page
332
+ webpage_text = "# YouTube\n"
333
+
334
+ title = self._get(metadata, ["title", "og:title", "name"]) # type: ignore
335
+ assert isinstance(title, str)
336
+
337
+ if title:
338
+ webpage_text += f"\n## {title}\n"
339
+
340
+ stats = ""
341
+ views = self._get(metadata, ["interactionCount"]) # type: ignore
342
+ if views:
343
+ stats += f"- **Views:** {views}\n"
344
+
345
+ keywords = self._get(metadata, ["keywords"]) # type: ignore
346
+ if keywords:
347
+ stats += f"- **Keywords:** {keywords}\n"
348
+
349
+ runtime = self._get(metadata, ["duration"]) # type: ignore
350
+ if runtime:
351
+ stats += f"- **Runtime:** {runtime}\n"
352
+
353
+ if len(stats) > 0:
354
+ webpage_text += f"\n### Video Metadata\n{stats}\n"
355
+
356
+ description = self._get(
357
+ metadata, ["description", "og:description"]
358
+ ) # type: ignore
359
+ if description:
360
+ webpage_text += f"\n### Description\n{description}\n"
361
+
362
+ if IS_YOUTUBE_TRANSCRIPT_CAPABLE:
363
+ transcript_text = ""
364
+ parsed_url = urlparse(url) # type: ignore
365
+ params = parse_qs(parsed_url.query) # type: ignore
366
+ if "v" in params:
367
+ assert isinstance(params["v"][0], str)
368
+ video_id = str(params["v"][0])
369
+ try:
370
+ # Must be a single transcript.
371
+ transcript = YouTubeTranscriptApi.get_transcript(
372
+ video_id
373
+ ) # type: ignore
374
+ transcript_text = " ".join(
375
+ [part["text"] for part in transcript]
376
+ ) # type: ignore
377
+ # Alternative formatting:
378
+ # formatter = TextFormatter()
379
+ # formatter.format_transcript(transcript)
380
+ except Exception:
381
+ pass
382
+ if transcript_text:
383
+ webpage_text += f"\n### Transcript\n{transcript_text}\n"
384
+
385
+ title = title if title else soup.title.string
386
+ assert isinstance(title, str)
387
+
388
+ return DocumentConverterResult(
389
+ title=title,
390
+ text_content=webpage_text,
391
+ )
392
+
393
+ def _get(
394
+ self,
395
+ metadata: Dict[str, str],
396
+ keys: List[str],
397
+ default: Union[str, None] = None,
398
+ ) -> Union[str, None]:
399
+ for k in keys:
400
+ if k in metadata:
401
+ return metadata[k]
402
+ return default
403
+
404
+ # TODO: Fix json type
405
+ def _findKey(self, json: Any, key: str) -> Union[str, None]:
406
+ if isinstance(json, list):
407
+ for elm in json:
408
+ ret = self._findKey(elm, key)
409
+ if ret is not None:
410
+ return ret
411
+ elif isinstance(json, dict):
412
+ for k in json:
413
+ if k == key:
414
+ return json[k]
415
+ else:
416
+ ret = self._findKey(json[k], key)
417
+ if ret is not None:
418
+ return ret
419
+ return None
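+ # e.g. _findKey({"a": [{"attributedDescriptionBodyText": {"content": "d"}}]},
+ # "attributedDescriptionBodyText") returns the inner dict: the search is
+ # depth-first, and the first match wins.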
420
+
421
+
422
+ class BingSerpConverter(DocumentConverter):
423
+ """
424
+ Handle Bing results pages (only the organic search results).
425
+ NOTE: It is better to use the Bing API
426
+ """
427
+
428
+ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
429
+ # Bail if not a Bing SERP
430
+ extension = kwargs.get("file_extension", "")
431
+ if extension.lower() not in [".html", ".htm"]:
432
+ return None
433
+ url = kwargs.get("url", "")
434
+ if not re.search(r"^https://www\.bing\.com/search\?q=", url):
435
+ return None
436
+
437
+ # Parse the query parameters
438
+ parsed_params = parse_qs(urlparse(url).query)
439
+ query = parsed_params.get("q", [""])[0]
440
+
441
+ # Parse the file
442
+ soup = None
443
+ with open(local_path, "rt", encoding="utf-8") as fh:
444
+ soup = BeautifulSoup(fh.read(), "html.parser")
445
+
446
+ # Clean up some formatting
447
+ for tptt in soup.find_all(class_="tptt"):
448
+ if hasattr(tptt, "string") and tptt.string:
449
+ tptt.string += " "
450
+ for slug in soup.find_all(class_="algoSlug_icon"):
451
+ slug.extract()
452
+
453
+ # Parse the algorithmic results
454
+ _markdownify = _CustomMarkdownify()
455
+ results = list()
456
+ for result in soup.find_all(class_="b_algo"):
457
+ # Rewrite redirect urls
458
+ for a in result.find_all("a", href=True):
459
+ parsed_href = urlparse(a["href"])
460
+ qs = parse_qs(parsed_href.query)
461
+
462
+ # The destination is contained in the u parameter,
463
+ # but appears to be base64 encoded, with some prefix
464
+ if "u" in qs:
465
+ u = (
466
+ qs["u"][0][2:].strip() + "=="
467
+ ) # Python 3 doesn't care about extra padding
468
+
469
+ try:
470
+ # RFC 4648 "base64url" variant, which uses "-" and "_"
471
+ a["href"] = base64.b64decode(u, altchars="-_").decode("utf-8")
472
+ except UnicodeDecodeError:
473
+ pass
474
+ except binascii.Error:
475
+ pass
476
+
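+ # Worked example of the decode above (hypothetical href): with
+ # u = "a1aHR0cHM6Ly9leGFtcGxlLmNvbS8", qs["u"][0][2:] strips the two-char
+ # prefix, and base64.b64decode("aHR0cHM6Ly9leGFtcGxlLmNvbS8==",
+ # altchars="-_") yields b"https://example.com/".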
477
+ # Convert to markdown
478
+ md_result = _markdownify.convert_soup(result).strip()
479
+ lines = [line.strip() for line in re.split(r"\n+", md_result)]
480
+ results.append("\n".join([line for line in lines if len(line) > 0]))
481
+
482
+ webpage_text = (
483
+ f"## A Bing search for '{query}' found the following results:\n\n"
484
+ + "\n\n".join(results)
485
+ )
486
+
487
+ return DocumentConverterResult(
488
+ title=None if soup.title is None else soup.title.string,
489
+ text_content=webpage_text,
490
+ )
491
+
492
+
493
+ class PdfConverter(DocumentConverter):
494
+ """
495
+ Converts PDFs to Markdown with support for extracting and including images.
496
+ """
497
+
498
+ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
499
+ # Bail if not a PDF
500
+ extension = kwargs.get("file_extension", "")
501
+ if extension.lower() != ".pdf":
502
+ return None
503
+
504
+ image_output_dir = None
505
+ if kwargs.get("image_output_dir", None):
506
+ image_output_dir = kwargs.get("image_output_dir")
507
+ else:
508
+ # Create output directory for images if it doesn't exist
509
+ image_output_dir = os.path.join(
510
+ os.path.dirname(local_path), "_images", os.path.basename(local_path)
511
+ )
512
+ os.makedirs(image_output_dir, exist_ok=True)
513
+
514
+ text_content = []
515
+ image_count = 0
516
+
517
+ # Open and process PDF
518
+ with open(local_path, "rb") as file:
519
+ # Create PDF parser and document
520
+ parser = PDFParser(file)
521
+ document = PDFDocument(parser)
522
+ rsrcmgr = PDFResourceManager()
523
+ laparams = LAParams()
524
+ device = PDFPageAggregator(rsrcmgr, laparams=laparams)
525
+ interpreter = PDFPageInterpreter(rsrcmgr, device)
526
+
527
+ # Process each page
528
+ for page in PDFPage.create_pages(document):
529
+ interpreter.process_page(page)
530
+ layout = device.get_result()
531
+
532
+ # Extract text and images from the page
533
+ page_content = self._process_layout(
534
+ layout, image_output_dir, image_count
535
+ )
536
+ text_content.extend(page_content)
537
+ image_count += len([c for c in page_content if c.startswith("![Image")])
538
+
539
+ return DocumentConverterResult(
540
+ title=None,
541
+ text_content="\n".join(text_content),
542
+ )
543
+
544
+ def _process_layout(
545
+ self, layout, image_output_dir: str, image_count: int
546
+ ) -> List[str]:
547
+ """Process the layout of a PDF page, extracting both text and images."""
548
+ content = []
549
+ iw = ImageWriter(image_output_dir)
550
+
551
+ for lt_obj in layout:
552
+ # Handle images
553
+ if isinstance(lt_obj, LTImage) or (
554
+ isinstance(lt_obj, LTFigure) and lt_obj.name.startswith("Im")
555
+ ):
556
+ image_count += 1
557
+ image_data = None
558
+ image_meta = {}
559
+ image_path = os.path.join(image_output_dir, f"image_{image_count}.png")
560
+
561
+ if hasattr(lt_obj, "stream"):
562
+ image_data = lt_obj.stream.get_data()
563
+ image_meta = lt_obj.stream.attrs
564
+ elif hasattr(lt_obj, "filter"):
565
+ image_data = lt_obj.filter
566
+
567
+ if image_data:
568
+ if isinstance(lt_obj, LTImage):
569
+ name = iw.export_image(lt_obj)
570
+ suffix = os.path.splitext(name)[1]
571
+ temp_path = os.path.join(image_output_dir, name)
572
+ image_path = os.path.join(image_output_dir, f"image_{image_count}{suffix}")
573
+ os.rename(temp_path, image_path)
574
+ content.append(f"![Image {image_count}]({image_path})")
575
+ continue
576
+ try:
577
+ # Try to handle raw pixel data
578
+ if "BitsPerComponent" in image_meta:
579
+ width = image_meta["Width"]
580
+ height = image_meta["Height"]
581
+ bits = image_meta["BitsPerComponent"]
582
+ colorspace = image_meta["ColorSpace"].name
583
+ new_image_data = np.frombuffer(image_data, dtype=np.uint8)
584
+ # Normalize to 8-bit if necessary
585
+ if bits != 8:
586
+ max_val = (1 << bits) - 1
587
+ new_image_data = (
588
+ new_image_data.astype("float32") * 255 / max_val
589
+ ).astype("uint8")
590
+
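+ # e.g. for bits=4, max_val is 15, so a stored sample of 15 maps to 255
+ # and 8 maps to 136, stretching low-bit-depth samples onto the 8-bit range.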
591
+ if colorspace == "DeviceRGB":
592
+ new_image_data = new_image_data.reshape(
593
+ (height, width, 3)
594
+ )
595
+ img = Image.fromarray(new_image_data, "RGB")
596
+ img.save(image_path)
597
+ content.append(
598
+ f"![Image {image_count}]({image_path})\n"
599
+ )
600
+ continue
601
+ elif colorspace == "DeviceGray":
602
+ new_image_data = new_image_data.reshape((height, width))
603
+ img = Image.fromarray(new_image_data, "L")
604
+ img.save(image_path)
605
+ content.append(
606
+ f"![Image {image_count}]({image_path})\n"
607
+ )
608
+ continue
609
+ except Exception as e:
610
+ print(
611
+ f"Error extracting image: {e} fallback to writing original data"
612
+ )
613
+
614
+ with open(image_path, "wb") as img_file:
615
+ img_file.write(image_data)
616
+
617
+ content.append(f"![Image {image_count}]({image_path})\n")
618
+
619
+ # Handle text
620
+ if hasattr(lt_obj, "get_text"):
621
+ text = lt_obj.get_text().strip()
622
+ if text:
623
+ content.append(text)
624
+
625
+ # Recursively process nested layouts
626
+ elif hasattr(lt_obj, "_objs"):
627
+ content.extend(
628
+ self._process_layout(lt_obj._objs, image_output_dir, image_count)
629
+ )
630
+
631
+ return content
632
+
633
+
634
+ class DocxConverter(HtmlConverter):
635
+ """
636
+ Converts DOCX files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
637
+ """
638
+
639
+ def __init__(self):
640
+ self._image_counter = 0
641
+ super().__init__()
642
+
643
+ def _save_image(self, image, output_dir: str) -> str:
644
+ """
645
+ Save the image and return its path; files are named with an incrementing counter.
646
+ """
647
+ # Determine the image format from the content type
648
+ image_format = image.content_type.split('/')[-1] if image.content_type else 'png'
649
+
650
+ # Increment the counter and build the filename
651
+ self._image_counter += 1
652
+ image_filename = f"image_{self._image_counter}.{image_format}"
653
+
654
+ # Save the image
655
+ image_path = os.path.join(output_dir, image_filename)
656
+ with image.open() as image_content, open(image_path, 'wb') as f:
657
+ f.write(image_content.read())
658
+
659
+ return image_path
660
+
661
+ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
662
+ # Bail if not a DOCX
663
+ extension = kwargs.get("file_extension", "")
664
+ if extension.lower() != ".docx":
665
+ return None
666
+
667
+ # Set up the image output directory
668
+ image_output_dir = None
669
+ if kwargs.get("image_output_dir", None):
670
+ image_output_dir = kwargs.get("image_output_dir")
671
+ else:
672
+ # Create output directory for images if it doesn't exist
673
+ image_output_dir = os.path.join(os.path.dirname(
674
+ local_path), "_images", os.path.basename(local_path))
675
+ os.makedirs(image_output_dir, exist_ok=True)
676
+
677
+ result = None
678
+ with open(local_path, "rb") as docx_file:
679
+ # Configure the image converter
680
+ def transform_image(image):
681
+ return {
682
+ "src": self._save_image(image, image_output_dir),
683
+ "alt": image.alt_text if image.alt_text else f"Image {self._image_counter}"
684
+ }
685
+
686
+ # Perform the conversion
687
+ result = mammoth.convert_to_html(
688
+ docx_file,
689
+ convert_image=mammoth.images.inline(transform_image)
690
+ )
691
+ html_content = result.value
692
+ result = self._convert(html_content)
693
+
694
+ return result
695
+
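+ # Usage note (based on mammoth's documented behavior): mammoth invokes
+ # transform_image() once per embedded image; the dict it returns becomes the
+ # <img> tag's attributes in the intermediate HTML, which HtmlConverter._convert
+ # then renders as ![alt](src).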
696
+
697
+ class XlsxConverter(HtmlConverter):
698
+ """
699
+ Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
700
+ """
701
+
702
+ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
703
+ # Bail if not an XLSX
704
+ extension = kwargs.get("file_extension", "")
705
+ if extension.lower() != ".xlsx":
706
+ return None
707
+
708
+ sheets = pd.read_excel(local_path, sheet_name=None)
709
+ md_content = ""
710
+ for s in sheets:
711
+ md_content += f"## {s}\n"
712
+ html_content = sheets[s].to_html(index=False)
713
+ md_content += self._convert(html_content).text_content.strip() + "\n\n"
714
+
715
+ return DocumentConverterResult(
716
+ title=None,
717
+ text_content=md_content.strip(),
718
+ )
719
+
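+ # Note: pd.read_excel(..., sheet_name=None) returns a dict mapping sheet
+ # name -> DataFrame, so each sheet becomes a "## <name>" section followed
+ # by its table.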
720
+
721
+ class PptxConverter(HtmlConverter):
722
+ """
723
+ Converts PPTX files to Markdown. Supports heading, tables and images with alt text.
724
+ """
725
+
726
+ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
727
+ # Bail if not a PPTX
728
+ extension = kwargs.get("file_extension", "")
729
+ if extension.lower() != ".pptx":
730
+ return None
731
+
732
+ md_content = ""
733
+
734
+ presentation = pptx.Presentation(local_path)
735
+ slide_num = 0
736
+ for slide in presentation.slides:
737
+ slide_num += 1
738
+
739
+ md_content += f"\n\n<!-- Slide number: {slide_num} -->\n"
740
+
741
+ title = slide.shapes.title
742
+ for shape in slide.shapes:
743
+ # Pictures
744
+ if self._is_picture(shape):
745
+ # https://github.com/scanny/python-pptx/pull/512#issuecomment-1713100069
746
+ alt_text = ""
747
+ try:
748
+ alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "")
749
+ except Exception:
750
+ pass
751
+
752
+ # A placeholder name
753
+ filename = re.sub(r"\W", "", shape.name) + ".jpg"
754
+ md_content += (
755
+ "\n!["
756
+ + (alt_text if alt_text else shape.name)
757
+ + "]("
758
+ + filename
759
+ + ")\n"
760
+ )
761
+
762
+ # Tables
763
+ if self._is_table(shape):
764
+ html_table = "<html><body><table>"
765
+ first_row = True
766
+ for row in shape.table.rows:
767
+ html_table += "<tr>"
768
+ for cell in row.cells:
769
+ if first_row:
770
+ html_table += "<th>" + html.escape(cell.text) + "</th>"
771
+ else:
772
+ html_table += "<td>" + html.escape(cell.text) + "</td>"
773
+ html_table += "</tr>"
774
+ first_row = False
775
+ html_table += "</table></body></html>"
776
+ md_content += (
777
+ "\n" + self._convert(html_table).text_content.strip() + "\n"
778
+ )
779
+
780
+ # Text areas
781
+ elif shape.has_text_frame:
782
+ if shape == title:
783
+ md_content += "# " + shape.text.lstrip() + "\n"
784
+ else:
785
+ md_content += shape.text + "\n"
786
+
787
+ md_content = md_content.strip()
788
+
789
+ if slide.has_notes_slide:
790
+ md_content += "\n\n### Notes:\n"
791
+ notes_frame = slide.notes_slide.notes_text_frame
792
+ if notes_frame is not None:
793
+ md_content += notes_frame.text
794
+ md_content = md_content.strip()
795
+
796
+ return DocumentConverterResult(
797
+ title=None,
798
+ text_content=md_content.strip(),
799
+ )
800
+
801
+ def _is_picture(self, shape):
802
+ if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE:
803
+ return True
804
+ if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PLACEHOLDER:
805
+ if hasattr(shape, "image"):
806
+ return True
807
+ return False
808
+
809
+ def _is_table(self, shape):
810
+ if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.TABLE:
811
+ return True
812
+ return False
813
+
814
+
815
+ class MediaConverter(DocumentConverter):
816
+ """
817
+ Abstract class for multi-modal media (e.g., images and audio)
818
+ """
819
+
820
+ def _get_metadata(self, local_path):
821
+ exiftool = shutil.which("exiftool")
822
+ if not exiftool:
823
+ return None
824
+ else:
825
+ try:
826
+ result = subprocess.run(
827
+ [exiftool, "-json", local_path], capture_output=True, text=True
828
+ ).stdout
829
+ return json.loads(result)[0]
830
+ except Exception:
831
+ return None
832
+
833
+
834
+ class WavConverter(MediaConverter):
835
+ """
836
+ Converts WAV files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
837
+ """
838
+
839
+ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
840
+ # Bail if not a WAV
841
+ extension = kwargs.get("file_extension", "")
842
+ if extension.lower() != ".wav":
843
+ return None
844
+
845
+ md_content = ""
846
+
847
+ # Add metadata
848
+ metadata = self._get_metadata(local_path)
849
+ if metadata:
850
+ for f in [
851
+ "Title",
852
+ "Artist",
853
+ "Author",
854
+ "Band",
855
+ "Album",
856
+ "Genre",
857
+ "Track",
858
+ "DateTimeOriginal",
859
+ "CreateDate",
860
+ "Duration",
861
+ ]:
862
+ if f in metadata:
863
+ md_content += f"{f}: {metadata[f]}\n"
864
+
865
+ # Transcribe
866
+ if IS_AUDIO_TRANSCRIPTION_CAPABLE:
867
+ try:
868
+ transcript = self._transcribe_audio(local_path)
869
+ md_content += "\n\n### Audio Transcript:\n" + (
870
+ "[No speech detected]" if transcript == "" else transcript
871
+ )
872
+ except Exception:
873
+ md_content += (
874
+ "\n\n### Audio Transcript:\nError. Could not transcribe this audio."
875
+ )
876
+
877
+ return DocumentConverterResult(
878
+ title=None,
879
+ text_content=md_content.strip(),
880
+ )
881
+
882
+ def _transcribe_audio(self, local_path) -> str:
883
+ recognizer = sr.Recognizer()
884
+ with sr.AudioFile(local_path) as source:
885
+ audio = recognizer.record(source)
886
+ return recognizer.recognize_google(audio).strip()
887
+
888
+
889
+ class Mp3Converter(WavConverter):
890
+ """
891
+ Converts MP3 files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` AND `pydub` are installed).
892
+ """
893
+
894
+ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
895
+ # Bail if not an MP3
896
+ extension = kwargs.get("file_extension", "")
897
+ if extension.lower() != ".mp3":
898
+ return None
899
+
900
+ md_content = ""
901
+
902
+ # Add metadata
903
+ metadata = self._get_metadata(local_path)
904
+ if metadata:
905
+ for f in [
906
+ "Title",
907
+ "Artist",
908
+ "Author",
909
+ "Band",
910
+ "Album",
911
+ "Genre",
912
+ "Track",
913
+ "DateTimeOriginal",
914
+ "CreateDate",
915
+ "Duration",
916
+ ]:
917
+ if f in metadata:
918
+ md_content += f"{f}: {metadata[f]}\n"
919
+
920
+ # Transcribe
921
+ if IS_AUDIO_TRANSCRIPTION_CAPABLE:
922
+ handle, temp_path = tempfile.mkstemp(suffix=".wav")
923
+ os.close(handle)
924
+ try:
925
+ sound = pydub.AudioSegment.from_mp3(local_path)
926
+ sound.export(temp_path, format="wav")
927
+
928
+ _args = dict()
929
+ _args.update(kwargs)
930
+ _args["file_extension"] = ".wav"
931
+
932
+ try:
933
+ transcript = super()._transcribe_audio(temp_path).strip()
934
+ md_content += "\n\n### Audio Transcript:\n" + (
935
+ "[No speech detected]" if transcript == "" else transcript
936
+ )
937
+ except Exception:
938
+ md_content += "\n\n### Audio Transcript:\nError. Could not transcribe this audio."
939
+
940
+ finally:
941
+ os.unlink(temp_path)
942
+
943
+ # Return the result
944
+ return DocumentConverterResult(
945
+ title=None,
946
+ text_content=md_content.strip(),
947
+ )
948
+
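+ # Note: pydub relies on an ffmpeg/libav binary on PATH to decode MP3s; the
+ # temporary WAV is deleted in the finally block even if transcription fails.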
949
+
950
+ class ImageConverter(MediaConverter):
951
+ """
952
+ Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an mlm_client is configured).
953
+ """
954
+
955
+ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
956
+ # Bail if not an image
957
+ extension = kwargs.get("file_extension", "")
958
+ if extension.lower() not in [".jpg", ".jpeg", ".png"]:
959
+ return None
960
+
961
+ md_content = ""
962
+
963
+ # Add metadata
964
+ metadata = self._get_metadata(local_path)
965
+ if metadata:
966
+ for f in [
967
+ "ImageSize",
968
+ "Title",
969
+ "Caption",
970
+ "Description",
971
+ "Keywords",
972
+ "Artist",
973
+ "Author",
974
+ "DateTimeOriginal",
975
+ "CreateDate",
976
+ "GPSPosition",
977
+ ]:
978
+ if f in metadata:
979
+ md_content += f"{f}: {metadata[f]}\n"
980
+
981
+ # Try describing the image with GPTV
982
+ mlm_client = kwargs.get("mlm_client")
983
+ mlm_model = kwargs.get("mlm_model")
984
+ if mlm_client is not None and mlm_model is not None:
985
+ md_content += (
986
+ "\n# Description:\n"
987
+ + self._get_mlm_description(
988
+ local_path,
989
+ extension,
990
+ mlm_client,
991
+ mlm_model,
992
+ prompt=kwargs.get("mlm_prompt"),
993
+ ).strip()
994
+ + "\n"
995
+ )
996
+
997
+ return DocumentConverterResult(
998
+ title=None,
999
+ text_content=md_content,
1000
+ )
1001
+
1002
+ def _get_mlm_description(self, local_path, extension, client, model, prompt=None):
1003
+ if prompt is None or prompt.strip() == "":
1004
+ prompt = "Write a detailed caption for this image."
1005
+
1006
+ sys.stderr.write(f"MLM Prompt:\n{prompt}\n")
1007
+
1008
+ data_uri = ""
1009
+ with open(local_path, "rb") as image_file:
1010
+ content_type, encoding = mimetypes.guess_type("_dummy" + extension)
1011
+ if content_type is None:
1012
+ content_type = "image/jpeg"
1013
+ image_base64 = base64.b64encode(image_file.read()).decode("utf-8")
1014
+ data_uri = f"data:{content_type};base64,{image_base64}"
1015
+
1016
+ messages = [
1017
+ {
1018
+ "role": "user",
1019
+ "content": [
1020
+ {"type": "text", "text": prompt},
1021
+ {
1022
+ "type": "image_url",
1023
+ "image_url": {
1024
+ "url": data_uri,
1025
+ },
1026
+ },
1027
+ ],
1028
+ }
1029
+ ]
1030
+
1031
+ response = client.chat.completions.create(model=model, messages=messages)
1032
+ return response.choices[0].message.content
1033
+
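+ # The payload above follows the OpenAI-style vision chat format: one user
+ # turn mixing {"type": "text"} and {"type": "image_url"} parts, with the
+ # image inlined as a data: URI instead of a hosted URL.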
1034
+
1035
+ class FileConversionException(Exception):
1036
+ pass
1037
+
1038
+
1039
+ class UnsupportedFormatException(Exception):
1040
+ pass
1041
+
1042
+
1043
+ class MarkItDown:
1044
+ """(In preview) An extremely simple text-based document reader, suitable for LLM use.
1045
+ This reader will convert common file-types or webpages to Markdown."""
1046
+
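+ # A minimal usage sketch (assuming only the public methods defined below):
+ #
+ #   md = MarkItDown()
+ #   result = md.convert("example.docx")           # local path
+ #   result = md.convert("https://example.com")    # URL
+ #   print(result.title, result.text_content[:100])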
1047
+ def __init__(
1048
+ self,
1049
+ requests_session: Optional[requests.Session] = None,
1050
+ mlm_client: Optional[Any] = None,
1051
+ mlm_model: Optional[Any] = None,
1052
+ ):
1053
+ if requests_session is None:
1054
+ self._requests_session = requests.Session()
1055
+ else:
1056
+ self._requests_session = requests_session
1057
+
1058
+ self._mlm_client = mlm_client
1059
+ self._mlm_model = mlm_model
1060
+
1061
+ self._page_converters: List[DocumentConverter] = []
1062
+
1063
+ # Register converters for successful browsing operations
1064
+ # Later registrations are tried first / take higher priority than earlier registrations
1065
+ # To this end, the most specific converters should appear below the most generic converters
1066
+ self.register_page_converter(PlainTextConverter())
1067
+ self.register_page_converter(HtmlConverter())
1068
+ self.register_page_converter(WikipediaConverter())
1069
+ self.register_page_converter(YouTubeConverter())
1070
+ self.register_page_converter(BingSerpConverter())
1071
+ self.register_page_converter(DocxConverter())
1072
+ self.register_page_converter(XlsxConverter())
1073
+ self.register_page_converter(PptxConverter())
1074
+ self.register_page_converter(WavConverter())
1075
+ self.register_page_converter(Mp3Converter())
1076
+ self.register_page_converter(ImageConverter())
1077
+ self.register_page_converter(PdfConverter())
1078
+
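+ # Because register_page_converter() inserts at index 0, the effective try
+ # order is the reverse of the registration order above: PdfConverter is
+ # attempted first and PlainTextConverter last.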
1079
+ def convert(
1080
+ self, source: Union[str, requests.Response], **kwargs: Any
1081
+ ) -> DocumentConverterResult: # TODO: deal with kwargs
1082
+ """
1083
+ Args:
1084
+ - source: can be a string representing a path or url, or a requests.response object
1085
+ - extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.)
1086
+ """
1087
+
1088
+ # Local path or url
1089
+ if isinstance(source, str):
1090
+ if (
1091
+ source.startswith("http://")
1092
+ or source.startswith("https://")
1093
+ or source.startswith("file://")
1094
+ ):
1095
+ return self.convert_url(source, **kwargs)
1096
+ else:
1097
+ return self.convert_local(source, **kwargs)
1098
+ # Request response
1099
+ elif isinstance(source, requests.Response):
1100
+ return self.convert_response(source, **kwargs)
1101
+
1102
+ def convert_local(
1103
+ self, path: str, **kwargs: Any
1104
+ ) -> DocumentConverterResult: # TODO: deal with kwargs
1105
+ # Prepare a list of extensions to try (in order of priority)
1106
+ ext = kwargs.get("file_extension")
1107
+ extensions = [ext] if ext is not None else []
1108
+
1109
+ # Get extension alternatives from the path and puremagic
1110
+ base, ext = os.path.splitext(path)
1111
+ self._append_ext(extensions, ext)
1112
+
1113
+ if not extensions:
1114
+ for g in self._guess_ext_magic(path):
1115
+ self._append_ext(extensions, g)
1116
+
1117
+ # Convert
1118
+ return self._convert(path, extensions, **kwargs)
1119
+
1120
+ # TODO what should stream's type be?
1121
+ def convert_stream(
1122
+ self, stream: Any, **kwargs: Any
1123
+ ) -> DocumentConverterResult: # TODO: deal with kwargs
1124
+ # Prepare a list of extensions to try (in order of priority)
1125
+ ext = kwargs.get("file_extension")
1126
+ extensions = [ext] if ext is not None else []
1127
+
1128
+ # Save the file locally to a temporary file. It will be deleted before this method exits
1129
+ handle, temp_path = tempfile.mkstemp()
1130
+ fh = os.fdopen(handle, "wb")
1131
+ result = None
1132
+ try:
1133
+ # Write to the temporary file
1134
+ content = stream.read()
1135
+ if isinstance(content, str):
1136
+ fh.write(content.encode("utf-8"))
1137
+ else:
1138
+ fh.write(content)
1139
+ fh.close()
1140
+
1141
+ # Use puremagic to check for more extension options
1142
+ for g in self._guess_ext_magic(temp_path):
1143
+ self._append_ext(extensions, g)
1144
+
1145
+ # Convert
1146
+ result = self._convert(temp_path, extensions, **kwargs)
1147
+ # Clean up
1148
+ finally:
1149
+ try:
1150
+ fh.close()
1151
+ except Exception:
1152
+ pass
1153
+ os.unlink(temp_path)
1154
+
1155
+ return result
1156
+
1157
+ def convert_url(
1158
+ self, url: str, **kwargs: Any
1159
+ ) -> DocumentConverterResult: # TODO: fix kwargs type
1160
+ # Send a HTTP request to the URL
1161
+ response = self._requests_session.get(url, stream=True)
1162
+ response.raise_for_status()
1163
+ return self.convert_response(response, **kwargs)
1164
+
1165
+ def convert_response(
1166
+ self, response: requests.Response, **kwargs: Any
1167
+ ) -> DocumentConverterResult: # TODO fix kwargs type
1168
+ # Prepare a list of extensions to try (in order of priority)
1169
+ ext = kwargs.get("file_extension")
1170
+ extensions = [ext] if ext is not None else []
1171
+
1172
+ # Guess from the mimetype
1173
+ content_type = response.headers.get("content-type", "").split(";")[0]
1174
+ self._append_ext(extensions, mimetypes.guess_extension(content_type))
1175
+
1176
+ # Read the content disposition if there is one
1177
+ content_disposition = response.headers.get("content-disposition", "")
1178
+ m = re.search(r"filename=([^;]+)", content_disposition)
1179
+ if m:
1180
+ base, ext = os.path.splitext(m.group(1).strip("\"'"))
1181
+ self._append_ext(extensions, ext)
1182
+
1183
+ # Read the extension from the path
1184
+ base, ext = os.path.splitext(urlparse(response.url).path)
1185
+ self._append_ext(extensions, ext)
1186
+
1187
+ # Save the file locally to a temporary file. It will be deleted before this method exits
1188
+ handle, temp_path = tempfile.mkstemp()
1189
+ fh = os.fdopen(handle, "wb")
1190
+ result = None
1191
+ try:
1192
+ # Download the file
1193
+ for chunk in response.iter_content(chunk_size=512):
1194
+ fh.write(chunk)
1195
+ fh.close()
1196
+
1197
+ # Use puremagic to check for more extension options
1198
+ for g in self._guess_ext_magic(temp_path):
1199
+ self._append_ext(extensions, g)
1200
+
1201
+ # Convert
1202
+ result = self._convert(temp_path, extensions, url=response.url)
1203
+ # Clean up
1204
+ finally:
1205
+ try:
1206
+ fh.close()
1207
+ except Exception:
1208
+ pass
1209
+ os.unlink(temp_path)
1210
+
1211
+ return result
1212
+
1213
+ def _convert(
1214
+ self, local_path: str, extensions: List[Union[str, None]], **kwargs
1215
+ ) -> DocumentConverterResult:
1216
+ error_trace = ""
1217
+ res = None
1218
+ for ext in extensions + [None]: # Try last with no extension
1219
+ for converter in self._page_converters:
1220
+ _kwargs = copy.deepcopy(kwargs)
1221
+
1222
+ # Overwrite file_extension appropriately
1223
+ if ext is None:
1224
+ if "file_extension" in _kwargs:
1225
+ del _kwargs["file_extension"]
1226
+ else:
1227
+ _kwargs.update({"file_extension": ext})
1228
+
1229
+ # Copy any additional global options
1230
+ if "mlm_client" not in _kwargs and self._mlm_client is not None:
1231
+ _kwargs["mlm_client"] = self._mlm_client
1232
+
1233
+ if "mlm_model" not in _kwargs and self._mlm_model is not None:
1234
+ _kwargs["mlm_model"] = self._mlm_model
1235
+
1236
+ # If we hit an error log it and keep trying
1237
+ try:
1238
+ res = converter.convert(local_path, **_kwargs)
1239
+ except Exception:
1240
+ error_trace = ("\n\n" + traceback.format_exc()).strip()
1241
+
1242
+ if res is not None:
1243
+ # Normalize the content
1244
+ res.text_content = "\n".join(
1245
+ [line.rstrip() for line in re.split(r"\r?\n", res.text_content)]
1246
+ )
1247
+ res.text_content = re.sub(r"\n{3,}", "\n\n", res.text_content)
1248
+
1249
+ # Todo
1250
+ return res
1251
+
1252
+ # If we got this far without success, report any exceptions
1253
+ if len(error_trace) > 0:
1254
+ raise FileConversionException(
1255
+ f"Could not convert '{local_path}' to Markdown. File type was recognized as {extensions}. While converting the file, the following error was encountered:\n\n{error_trace}"
1256
+ )
1257
+
1258
+ # Nothing can handle it!
1259
+ raise UnsupportedFormatException(
1260
+ f"Could not convert '{local_path}' to Markdown. The formats {extensions} are not supported."
1261
+ )
1262
+
1263
+ def _append_ext(self, extensions, ext):
1264
+ """Append a unique non-None, non-empty extension to a list of extensions."""
1265
+ if ext is None:
1266
+ return
1267
+ ext = ext.strip()
1268
+ if ext == "":
1269
+ return
1270
+ if ext not in extensions:
1271
+ extensions.append(ext)
1273
+
1274
+ def _guess_ext_magic(self, path):
1275
+ """Use puremagic (a Python implementation of libmagic) to guess a file's extension based on the first few bytes."""
1276
+ # Use puremagic to guess
1277
+ try:
1278
+ guesses = puremagic.magic_file(path)
1279
+ extensions = list()
1280
+ for g in guesses:
1281
+ ext = g.extension.strip()
1282
+ if len(ext) > 0:
1283
+ if not ext.startswith("."):
1284
+ ext = "." + ext
1285
+ if ext not in extensions:
1286
+ extensions.append(ext)
1287
+ return extensions
1288
+ except FileNotFoundError:
1289
+ pass
1290
+ except IsADirectoryError:
1291
+ pass
1292
+ except PermissionError:
1293
+ pass
1294
+ return []
1295
+
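+ # e.g. for a file whose bytes start with the JPEG magic number, puremagic
+ # returns guesses whose .extension includes "jpg"; the loop above normalizes
+ # each to a leading "." and dedupes, yielding [".jpg", ...].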
1296
+ def register_page_converter(self, converter: DocumentConverter) -> None:
1297
+ """Register a page text converter."""
1298
+ self._page_converters.insert(0, converter)