auto-coder 0.1.200__py3-none-any.whl → 0.1.202__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


@@ -0,0 +1,1321 @@
+ # type: ignore
+ import base64
+ import binascii
+ import copy
+ import html
+ import json
+ import mimetypes
+ import io
+ import os
+ import re
+ import shutil
+ import subprocess
+ import sys
+ import tempfile
+ import traceback
+ from typing import Any, Dict, List, Optional, Union
+ from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
+
+ import mammoth
+ import markdownify
+ import pandas as pd
+ import pdfminer
+ import pdfminer.high_level
+ from pdfminer.converter import PDFPageAggregator
+ from pdfminer.layout import LAParams, LTImage, LTFigure
+ from pdfminer.pdfdevice import PDFDevice
+ from pdfminer.pdfdocument import PDFDocument
+ from pdfminer.pdfparser import PDFParser
+ from pdfminer.pdfpage import PDFPage
+ from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
+ import pptx
+ from pdfminer.image import ImageWriter
+
+ import numpy as np
+ from PIL import Image
+
+ # File-format detection
+ import puremagic
+ import requests
+ from bs4 import BeautifulSoup
+
+ # Optional transcription support. Default the capability flags to False so
+ # later checks don't raise NameError when the optional imports are missing.
+ IS_AUDIO_TRANSCRIPTION_CAPABLE = False
+ try:
+     import pydub
+     import speech_recognition as sr
+
+     IS_AUDIO_TRANSCRIPTION_CAPABLE = True
+ except ModuleNotFoundError:
+     pass
+
+ # Optional YouTube transcription support
+ IS_YOUTUBE_TRANSCRIPT_CAPABLE = False
+ try:
+     from youtube_transcript_api import YouTubeTranscriptApi
+
+     IS_YOUTUBE_TRANSCRIPT_CAPABLE = True
+ except ModuleNotFoundError:
+     pass
+
+
+ class _CustomMarkdownify(markdownify.MarkdownConverter):
+     """
+     A custom version of markdownify's MarkdownConverter. Changes include:
+
+     - Altering the default heading style to use '#', '##', etc.
+     - Removing javascript hyperlinks.
+     - Truncating images with large data:uri sources.
+     - Ensuring URIs are properly escaped, and do not conflict with Markdown syntax
+     """
+
+     def __init__(self, **options: Any):
+         options["heading_style"] = options.get(
+             "heading_style", markdownify.ATX)
+         # Explicitly cast options to the expected type if necessary
+         super().__init__(**options)
+
+     def convert_hn(self, n: int, el: Any, text: str, convert_as_inline: bool) -> str:
+         """Same as usual, but be sure to start with a new line"""
+         if not convert_as_inline:
+             if not re.search(r"^\n", text):
+                 return "\n" + super().convert_hn(n, el, text, convert_as_inline)  # type: ignore
+
+         return super().convert_hn(n, el, text, convert_as_inline)  # type: ignore
+
+     def convert_a(self, el: Any, text: str, convert_as_inline: bool):
+         """Same as usual converter, but removes Javascript links and escapes URIs."""
+         prefix, suffix, text = markdownify.chomp(text)  # type: ignore
+         if not text:
+             return ""
+         href = el.get("href")
+         title = el.get("title")
+
+         # Escape URIs and skip non-http or file schemes
+         if href:
+             try:
+                 parsed_url = urlparse(href)  # type: ignore
+                 if parsed_url.scheme and parsed_url.scheme.lower() not in [  # type: ignore
+                     "http",
+                     "https",
+                     "file",
+                 ]:
+                     return "%s%s%s" % (prefix, text, suffix)
+                 href = urlunparse(
+                     parsed_url._replace(path=quote(unquote(parsed_url.path)))
+                 )  # type: ignore
+             except ValueError:  # It's not clear if this ever gets thrown
+                 return "%s%s%s" % (prefix, text, suffix)
+
+         # For the replacement see #29: text nodes underscores are escaped
+         if (
+             self.options["autolinks"]
+             and text.replace(r"\_", "_") == href
+             and not title
+             and not self.options["default_title"]
+         ):
+             # Shortcut syntax
+             return "<%s>" % href
+         if self.options["default_title"] and not title:
+             title = href
+         title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
+         return (
+             "%s[%s](%s%s)%s" % (prefix, text, href, title_part, suffix)
+             if href
+             else text
+         )
+
+     def convert_img(self, el: Any, text: str, convert_as_inline: bool) -> str:
+         """Same as usual converter, but removes data URIs"""
+
+         alt = el.attrs.get("alt", None) or ""
+         src = el.attrs.get("src", None) or ""
+         title = el.attrs.get("title", None) or ""
+         title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
+         if (
+             convert_as_inline
+             and el.parent.name not in self.options["keep_inline_images_in"]
+         ):
+             return alt
+
+         # Remove dataURIs
+         if src.startswith("data:"):
+             src = src.split(",")[0] + "..."
+
+         return "![%s](%s%s)" % (alt, src, title_part)
+
+     def convert_soup(self, soup: Any) -> str:
+         return super().convert_soup(soup)  # type: ignore
+
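For a sense of what the subclass changes, here is a minimal sketch (illustrative only, not part of the package) of running it over a small fragment:

    soup = BeautifulSoup(
        "<h1>Title</h1><p><a href='javascript:alert(1)'>click</a></p>",
        "html.parser",
    )
    print(_CustomMarkdownify().convert_soup(soup))
    # Expected shape: an ATX heading ("# Title") followed by the bare link
    # text, since convert_a drops schemes other than http/https/file.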
+
+ class DocumentConverterResult:
+     """The result of converting a document to text."""
+
+     def __init__(self, title: Union[str, None] = None, text_content: str = ""):
+         self.title: Union[str, None] = title
+         self.text_content: str = text_content
+
+
+ class DocumentConverter:
+     """Abstract superclass of all DocumentConverters."""
+
+     def convert(
+         self, local_path: str, **kwargs: Any
+     ) -> Union[None, DocumentConverterResult]:
+         raise NotImplementedError()
+
+
+ class PlainTextConverter(DocumentConverter):
+     """Anything with content type text/plain"""
+
+     def convert(
+         self, local_path: str, **kwargs: Any
+     ) -> Union[None, DocumentConverterResult]:
+         # Guess the content type from any file extension that might be around
+         content_type, _ = mimetypes.guess_type(
+             "__placeholder" + kwargs.get("file_extension", "")
+         )
+
+         # Only accept text files
+         if content_type is None:
+             return None
+         elif "text/" not in content_type.lower():
+             return None
+
+         text_content = ""
+         with open(local_path, "rt", encoding="utf-8") as fh:
+             text_content = fh.read()
+         return DocumentConverterResult(
+             title=None,
+             text_content=text_content,
+         )
+
+
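The `"__placeholder" + extension` idiom above exists because mimetypes.guess_type expects a filename rather than a bare extension. A quick sketch of what it returns (exact results vary with the platform's type registry):

    import mimetypes

    mimetypes.guess_type("__placeholder.txt")   # ('text/plain', None)
    mimetypes.guess_type("__placeholder.html")  # ('text/html', None)
    mimetypes.guess_type("__placeholder.xyz")   # (None, None) -> converter bails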
+ class HtmlConverter(DocumentConverter):
+     """Anything with content type text/html"""
+
+     def convert(
+         self, local_path: str, **kwargs: Any
+     ) -> Union[None, DocumentConverterResult]:
+         # Bail if not html
+         extension = kwargs.get("file_extension", "")
+         if extension.lower() not in [".html", ".htm"]:
+             return None
+
+         result = None
+         with open(local_path, "rt", encoding="utf-8") as fh:
+             result = self._convert(fh.read())
+
+         return result
+
+     def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]:
+         """Helper function that converts an HTML string."""
+
+         # Parse the string
+         soup = BeautifulSoup(html_content, "html.parser")
+
+         # Remove javascript and style blocks
+         for script in soup(["script", "style"]):
+             script.extract()
+
+         # Print only the main content
+         body_elm = soup.find("body")
+         webpage_text = ""
+         if body_elm:
+             webpage_text = _CustomMarkdownify().convert_soup(body_elm)
+         else:
+             webpage_text = _CustomMarkdownify().convert_soup(soup)
+
+         assert isinstance(webpage_text, str)
+
+         return DocumentConverterResult(
+             title=None if soup.title is None else soup.title.string,
+             text_content=webpage_text,
+         )
+
+
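Every converter follows the same contract: return None quickly when the input is not its format, otherwise return a DocumentConverterResult. A hypothetical direct call (file names are illustrative):

    result = HtmlConverter().convert("page.html", file_extension=".html")
    if result is not None:
        print(result.title)
        print(result.text_content[:200])

    # A non-matching extension makes the converter opt out:
    assert HtmlConverter().convert("notes.txt", file_extension=".txt") is None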
236
+ class WikipediaConverter(DocumentConverter):
237
+ """Handle Wikipedia pages separately, focusing only on the main document content."""
238
+
239
+ def convert(
240
+ self, local_path: str, **kwargs: Any
241
+ ) -> Union[None, DocumentConverterResult]:
242
+ # Bail if not Wikipedia
243
+ extension = kwargs.get("file_extension", "")
244
+ if extension.lower() not in [".html", ".htm"]:
245
+ return None
246
+ url = kwargs.get("url", "")
247
+ if not re.search(r"^https?:\/\/[a-zA-Z]{2,3}\.wikipedia.org\/", url):
248
+ return None
249
+
250
+ # Parse the file
251
+ soup = None
252
+ with open(local_path, "rt", encoding="utf-8") as fh:
253
+ soup = BeautifulSoup(fh.read(), "html.parser")
254
+
255
+ # Remove javascript and style blocks
256
+ for script in soup(["script", "style"]):
257
+ script.extract()
258
+
259
+ # Print only the main content
260
+ body_elm = soup.find("div", {"id": "mw-content-text"})
261
+ title_elm = soup.find("span", {"class": "mw-page-title-main"})
262
+
263
+ webpage_text = ""
264
+ main_title = None if soup.title is None else soup.title.string
265
+
266
+ if body_elm:
267
+ # What's the title
268
+ if title_elm and len(title_elm) > 0:
269
+ main_title = title_elm.string # type: ignore
270
+ assert isinstance(main_title, str)
271
+
272
+ # Convert the page
273
+ webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify().convert_soup(
274
+ body_elm
275
+ )
276
+ else:
277
+ webpage_text = _CustomMarkdownify().convert_soup(soup)
278
+
279
+ return DocumentConverterResult(
280
+ title=main_title,
281
+ text_content=webpage_text,
282
+ )
283
+
284
+
+ class YouTubeConverter(DocumentConverter):
+     """Handle YouTube specially, focusing on the video title, description, and transcript."""
+
+     def convert(
+         self, local_path: str, **kwargs: Any
+     ) -> Union[None, DocumentConverterResult]:
+         # Bail if not YouTube
+         extension = kwargs.get("file_extension", "")
+         if extension.lower() not in [".html", ".htm"]:
+             return None
+         url = kwargs.get("url", "")
+         if not url.startswith("https://www.youtube.com/watch?"):
+             return None
+
+         # Parse the file
+         soup = None
+         with open(local_path, "rt", encoding="utf-8") as fh:
+             soup = BeautifulSoup(fh.read(), "html.parser")
+
+         # Read the meta tags
+         assert soup.title is not None and soup.title.string is not None
+         metadata: Dict[str, str] = {"title": soup.title.string}
+         for meta in soup(["meta"]):
+             for a in meta.attrs:
+                 if a in ["itemprop", "property", "name"]:
+                     metadata[meta[a]] = meta.get("content", "")
+                     break
+
+         # We can also try to read the full description. This is more prone to breaking, since it reaches into the page implementation
+         try:
+             for script in soup(["script"]):
+                 content = script.text
+                 if "ytInitialData" in content:
+                     lines = re.split(r"\r?\n", content)
+                     obj_start = lines[0].find("{")
+                     obj_end = lines[0].rfind("}")
+                     if obj_start >= 0 and obj_end >= 0:
+                         data = json.loads(lines[0][obj_start: obj_end + 1])
+                         attrdesc = self._findKey(
+                             data, "attributedDescriptionBodyText"
+                         )  # type: ignore
+                         if attrdesc:
+                             metadata["description"] = str(attrdesc["content"])
+                     break
+         except Exception:
+             pass
+
+         # Start preparing the page
+         webpage_text = "# YouTube\n"
+
+         title = self._get(
+             metadata, ["title", "og:title", "name"])  # type: ignore
+         assert isinstance(title, str)
+
+         if title:
+             webpage_text += f"\n## {title}\n"
+
+         stats = ""
+         views = self._get(metadata, ["interactionCount"])  # type: ignore
+         if views:
+             stats += f"- **Views:** {views}\n"
+
+         keywords = self._get(metadata, ["keywords"])  # type: ignore
+         if keywords:
+             stats += f"- **Keywords:** {keywords}\n"
+
+         runtime = self._get(metadata, ["duration"])  # type: ignore
+         if runtime:
+             stats += f"- **Runtime:** {runtime}\n"
+
+         if len(stats) > 0:
+             webpage_text += f"\n### Video Metadata\n{stats}\n"
+
+         description = self._get(
+             metadata, ["description", "og:description"]
+         )  # type: ignore
+         if description:
+             webpage_text += f"\n### Description\n{description}\n"
+
+         if IS_YOUTUBE_TRANSCRIPT_CAPABLE:
+             transcript_text = ""
+             parsed_url = urlparse(url)  # type: ignore
+             params = parse_qs(parsed_url.query)  # type: ignore
+             if "v" in params:
+                 assert isinstance(params["v"][0], str)
+                 video_id = str(params["v"][0])
+                 try:
+                     # Must be a single transcript.
+                     transcript = YouTubeTranscriptApi.get_transcript(
+                         video_id
+                     )  # type: ignore
+                     transcript_text = " ".join(
+                         [part["text"] for part in transcript]
+                     )  # type: ignore
+                     # Alternative formatting:
+                     # formatter = TextFormatter()
+                     # formatter.format_transcript(transcript)
+                 except Exception:
+                     pass
+             if transcript_text:
+                 webpage_text += f"\n### Transcript\n{transcript_text}\n"
+
+         title = title if title else soup.title.string
+         assert isinstance(title, str)
+
+         return DocumentConverterResult(
+             title=title,
+             text_content=webpage_text,
+         )
+
+     def _get(
+         self,
+         metadata: Dict[str, str],
+         keys: List[str],
+         default: Union[str, None] = None,
+     ) -> Union[str, None]:
+         for k in keys:
+             if k in metadata:
+                 return metadata[k]
+         return default
+
+     # TODO: Fix json type
+     def _findKey(self, json: Any, key: str) -> Union[str, None]:
+         if isinstance(json, list):
+             for elm in json:
+                 ret = self._findKey(elm, key)
+                 if ret is not None:
+                     return ret
+         elif isinstance(json, dict):
+             for k in json:
+                 if k == key:
+                     return json[k]
+                 else:
+                     ret = self._findKey(json[k], key)
+                     if ret is not None:
+                         return ret
+         return None
+
+
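_findKey is a depth-first search over the mixed dict/list structure of ytInitialData. A small sketch of its behavior on made-up data:

    data = {"contents": [
        {"videoDetails": {"attributedDescriptionBodyText": {"content": "hi"}}}
    ]}
    found = YouTubeConverter()._findKey(data, "attributedDescriptionBodyText")
    # found == {"content": "hi"}; the first match wins, and None means absent.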
424
+ class BingSerpConverter(DocumentConverter):
425
+ """
426
+ Handle Bing results pages (only the organic search results).
427
+ NOTE: It is better to use the Bing API
428
+ """
429
+
430
+ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
431
+ # Bail if not a Bing SERP
432
+ extension = kwargs.get("file_extension", "")
433
+ if extension.lower() not in [".html", ".htm"]:
434
+ return None
435
+ url = kwargs.get("url", "")
436
+ if not re.search(r"^https://www\.bing\.com/search\?q=", url):
437
+ return None
438
+
439
+ # Parse the query parameters
440
+ parsed_params = parse_qs(urlparse(url).query)
441
+ query = parsed_params.get("q", [""])[0]
442
+
443
+ # Parse the file
444
+ soup = None
445
+ with open(local_path, "rt", encoding="utf-8") as fh:
446
+ soup = BeautifulSoup(fh.read(), "html.parser")
447
+
448
+ # Clean up some formatting
449
+ for tptt in soup.find_all(class_="tptt"):
450
+ if hasattr(tptt, "string") and tptt.string:
451
+ tptt.string += " "
452
+ for slug in soup.find_all(class_="algoSlug_icon"):
453
+ slug.extract()
454
+
455
+ # Parse the algorithmic results
456
+ _markdownify = _CustomMarkdownify()
457
+ results = list()
458
+ for result in soup.find_all(class_="b_algo"):
459
+ # Rewrite redirect urls
460
+ for a in result.find_all("a", href=True):
461
+ parsed_href = urlparse(a["href"])
462
+ qs = parse_qs(parsed_href.query)
463
+
464
+ # The destination is contained in the u parameter,
465
+ # but appears to be base64 encoded, with some prefix
466
+ if "u" in qs:
467
+ u = (
468
+ qs["u"][0][2:].strip() + "=="
469
+ ) # Python 3 doesn't care about extra padding
470
+
471
+ try:
472
+ # RFC 4648 / Base64URL" variant, which uses "-" and "_"
473
+ a["href"] = base64.b64decode(
474
+ u, altchars="-_").decode("utf-8")
475
+ except UnicodeDecodeError:
476
+ pass
477
+ except binascii.Error:
478
+ pass
479
+
480
+ # Convert to markdown
481
+ md_result = _markdownify.convert_soup(result).strip()
482
+ lines = [line.strip() for line in re.split(r"\n+", md_result)]
483
+ results.append(
484
+ "\n".join([line for line in lines if len(line) > 0]))
485
+
486
+ webpage_text = (
487
+ f"## A Bing search for '{query}' found the following results:\n\n"
488
+ + "\n\n".join(results)
489
+ )
490
+
491
+ return DocumentConverterResult(
492
+ title=None if soup.title is None else soup.title.string,
493
+ text_content=webpage_text,
494
+ )
495
+
496
+
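The redirect rewriting above assumes Bing packs the destination into the u parameter as URL-safe base64 behind a two-character prefix. A sketch with a made-up value:

    import base64

    u = "a1aHR0cHM6Ly9leGFtcGxlLmNvbS8"  # hypothetical "u" query parameter
    print(base64.b64decode(u[2:] + "==", altchars="-_").decode("utf-8"))
    # -> https://example.com/  (the over-padding with "==" is tolerated)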
+ class PdfConverter(DocumentConverter):
+     """
+     Converts PDFs to Markdown with support for extracting and including images.
+     """
+
+     def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+         # Bail if not a PDF
+         extension = kwargs.get("file_extension", "")
+         if extension.lower() != ".pdf":
+             return None
+
+         image_output_dir = None
+         if kwargs.get("image_output_dir", None):
+             image_output_dir = kwargs.get("image_output_dir")
+         else:
+             # Derive a default output directory for images
+             image_output_dir = os.path.join(
+                 os.path.dirname(local_path), "_images", os.path.basename(
+                     local_path).replace(" ", "_")
+             )
+         # Create the output directory for images if it doesn't exist
+         os.makedirs(image_output_dir, exist_ok=True)
+
+         text_content = []
+         image_count = 0
+
+         # Open and process PDF
+         with open(local_path, "rb") as file:
+             # Create PDF parser and document
+             parser = PDFParser(file)
+             document = PDFDocument(parser)
+             rsrcmgr = PDFResourceManager()
+             laparams = LAParams()
+             device = PDFPageAggregator(rsrcmgr, laparams=laparams)
+             interpreter = PDFPageInterpreter(rsrcmgr, device)
+
+             # Process each page
+             for page in PDFPage.create_pages(document):
+                 interpreter.process_page(page)
+                 layout = device.get_result()
+
+                 # Extract text and images from the page
+                 page_content = self._process_layout(
+                     layout, image_output_dir, image_count
+                 )
+                 text_content.extend(page_content)
+                 image_count += len([c for c in page_content if c.startswith("![Image")])
+
+         return DocumentConverterResult(
+             title=None,
+             text_content="\n".join(text_content),
+         )
+
+     def _process_layout(
+         self, layout, image_output_dir: str, image_count: int
+     ) -> List[str]:
+         """Process the layout of a PDF page, extracting both text and images."""
+         content = []
+         local_image_count = image_count
+         for lt_obj in layout:
+             # Handle images
+             if isinstance(lt_obj, LTImage) or (
+                 isinstance(lt_obj, LTFigure) and lt_obj.name.startswith("Im")
+             ):
+                 image_data = None
+                 image_meta = {}
+                 image_path = os.path.join(
+                     image_output_dir, f"image_{local_image_count}.png")
+
+                 if hasattr(lt_obj, "stream"):
+                     image_data = lt_obj.stream.get_data()
+                     image_meta = lt_obj.stream.attrs
+                 elif hasattr(lt_obj, "filter"):
+                     image_data = lt_obj.filter
+
+                 if image_data:
+                     if isinstance(lt_obj, LTImage):
+                         iw = ImageWriter(image_output_dir)
+                         name = iw.export_image(lt_obj)
+                         suffix = os.path.splitext(name)[1]
+                         temp_path = os.path.join(image_output_dir, name)
+                         image_path = os.path.join(
+                             image_output_dir, f"image_{local_image_count}{suffix}")
+                         os.rename(temp_path, image_path)
+                         content.append(f"![Image {local_image_count}]({image_path})")
+                         local_image_count += 1
+                         continue
+                     try:
+                         # Try to handle raw pixel data
+                         if "BitsPerComponent" in image_meta:
+                             width = image_meta["Width"]
+                             height = image_meta["Height"]
+                             bits = image_meta["BitsPerComponent"]
+                             colorspace = image_meta["ColorSpace"].name
+                             new_image_data = np.frombuffer(
+                                 image_data, dtype=np.uint8)
+                             # Normalize to 8-bit if necessary
+                             if bits != 8:
+                                 max_val = (1 << bits) - 1
+                                 new_image_data = (
+                                     new_image_data.astype(
+                                         "float32") * 255 / max_val
+                                 ).astype("uint8")
+
+                             if colorspace == "DeviceRGB":
+                                 new_image_data = new_image_data.reshape(
+                                     (height, width, 3)
+                                 )
+                                 img = Image.fromarray(new_image_data, "RGB")
+                                 img.save(image_path)
+                                 content.append(
+                                     f"![Image {local_image_count}]({image_path})\n"
+                                 )
+                                 local_image_count += 1
+                                 continue
+                             elif colorspace == "DeviceGray":
+                                 new_image_data = new_image_data.reshape(
+                                     (height, width))
+                                 img = Image.fromarray(new_image_data, "L")
+                                 img.save(image_path)
+                                 content.append(
+                                     f"![Image {local_image_count}]({image_path})\n"
+                                 )
+                                 local_image_count += 1
+                                 continue
+                     except Exception as e:
+                         print(
+                             f"Error extracting image: {e}; falling back to writing the original data"
+                         )
+
+                     with open(image_path, "wb") as img_file:
+                         img_file.write(image_data)
+
+                     content.append(f"![Image {local_image_count}]({image_path})\n")
+                     local_image_count += 1
+
+             # Handle text
+             if hasattr(lt_obj, "get_text"):
+                 text = lt_obj.get_text().strip()
+                 if text:
+                     content.append(text)
+
+             # Recursively process nested layouts
+             elif hasattr(lt_obj, "_objs"):
+                 content.extend(
+                     self._process_layout(
+                         lt_obj._objs, image_output_dir, image_count)
+                 )
+
+         return content
+
+
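A hypothetical invocation (paths illustrative): extracted images are written next to the output and referenced as ![Image N](...) links in the Markdown:

    result = PdfConverter().convert(
        "report.pdf",
        file_extension=".pdf",
        image_output_dir="report_images",  # optional; otherwise an _images/ dir is derived
    )
    print(result.text_content)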
+ class DocxConverter(HtmlConverter):
+     """
+     Converts DOCX files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
+     """
+
+     def __init__(self):
+         self._image_counter = 0
+         super().__init__()
+
+     def _save_image(self, image, output_dir: str) -> str:
+         """
+         Save the image and return its path, naming files with an incrementing counter.
+         """
+         # Get the image format from its content type
+         image_format = image.content_type.split(
+             '/')[-1] if image.content_type else 'png'
+
+         # Increment the counter and build the filename
+         self._image_counter += 1
+         image_filename = f"image_{self._image_counter}.{image_format}"
+
+         # Save the image
+         image_path = os.path.join(output_dir, image_filename)
+         with image.open() as image_content, open(image_path, 'wb') as f:
+             f.write(image_content.read())
+
+         return image_path
+
+     def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+         # Bail if not a DOCX
+         extension = kwargs.get("file_extension", "")
+         if extension.lower() != ".docx":
+             return None
+
+         # Set up the image output directory
+         image_output_dir = None
+         if kwargs.get("image_output_dir", None):
+             image_output_dir = kwargs.get("image_output_dir")
+         else:
+             # Derive a default output directory for images
+             image_output_dir = os.path.join(os.path.dirname(
+                 local_path), "_images", os.path.basename(local_path).replace(" ", "_"))
+         # Create the output directory for images if it doesn't exist
+         os.makedirs(image_output_dir, exist_ok=True)
+
+         result = None
+         with open(local_path, "rb") as docx_file:
+             # Configure the image transformer
+             def transform_image(image):
+                 return {
+                     "src": self._save_image(image, image_output_dir),
+                     "alt": image.alt_text if image.alt_text else f"Image {self._image_counter}"
+                 }
+
+             # Perform the conversion
+             result = mammoth.convert_to_html(
+                 docx_file,
+                 convert_image=mammoth.images.inline(transform_image)
+             )
+             html_content = result.value
+             result = self._convert(html_content)
+
+         return result
+
+
+ class XlsxConverter(HtmlConverter):
+     """
+     Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
+     """
+
+     def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+         # Bail if not a XLSX
+         extension = kwargs.get("file_extension", "")
+         if extension.lower() != ".xlsx":
+             return None
+
+         sheets = pd.read_excel(local_path, sheet_name=None)
+         md_content = ""
+         for s in sheets:
+             md_content += f"## {s}\n"
+             html_content = sheets[s].to_html(index=False)
+             md_content += self._convert(
+                 html_content).text_content.strip() + "\n\n"
+
+         return DocumentConverterResult(
+             title=None,
+             text_content=md_content.strip(),
+         )
+
+
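The XLSX path leans on pandas for parsing and reuses the inherited HTML pipeline for rendering, one sheet per heading. Roughly, per sheet (made-up data):

    import pandas as pd

    df = pd.DataFrame({"name": ["ant", "bee"], "qty": [3, 7]})
    html_table = df.to_html(index=False)
    # Feeding html_table through HtmlConverter._convert yields a Markdown
    # pipe table; the exact cell formatting is up to markdownify.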
737
+ class PptxConverter(HtmlConverter):
738
+ """
739
+ Converts PPTX files to Markdown. Supports heading, tables and images with alt text.
740
+ """
741
+
742
+ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
743
+ # Bail if not a PPTX
744
+ extension = kwargs.get("file_extension", "")
745
+ if extension.lower() != ".pptx":
746
+ return None
747
+
748
+ md_content = ""
749
+
750
+ presentation = pptx.Presentation(local_path)
751
+ slide_num = 0
752
+ for slide in presentation.slides:
753
+ slide_num += 1
754
+
755
+ md_content += f"\n\n<!-- Slide number: {slide_num} -->\n"
756
+
757
+ title = slide.shapes.title
758
+ for shape in slide.shapes:
759
+ # Pictures
760
+ if self._is_picture(shape):
761
+ # https://github.com/scanny/python-pptx/pull/512#issuecomment-1713100069
762
+ alt_text = ""
763
+ try:
764
+ alt_text = shape._element._nvXxPr.cNvPr.attrib.get(
765
+ "descr", "")
766
+ except Exception:
767
+ pass
768
+
769
+ # A placeholder name
770
+ filename = re.sub(r"\W", "", shape.name) + ".jpg"
771
+ md_content += (
772
+ "\n!["
773
+ + (alt_text if alt_text else shape.name)
774
+ + "]("
775
+ + filename
776
+ + ")\n"
777
+ )
778
+
779
+ # Tables
780
+ if self._is_table(shape):
781
+ html_table = "<html><body><table>"
782
+ first_row = True
783
+ for row in shape.table.rows:
784
+ html_table += "<tr>"
785
+ for cell in row.cells:
786
+ if first_row:
787
+ html_table += "<th>" + \
788
+ html.escape(cell.text) + "</th>"
789
+ else:
790
+ html_table += "<td>" + \
791
+ html.escape(cell.text) + "</td>"
792
+ html_table += "</tr>"
793
+ first_row = False
794
+ html_table += "</table></body></html>"
795
+ md_content += (
796
+ "\n" +
797
+ self._convert(html_table).text_content.strip() + "\n"
798
+ )
799
+
800
+ # Text areas
801
+ elif shape.has_text_frame:
802
+ if shape == title:
803
+ md_content += "# " + shape.text.lstrip() + "\n"
804
+ else:
805
+ md_content += shape.text + "\n"
806
+
807
+ md_content = md_content.strip()
808
+
809
+ if slide.has_notes_slide:
810
+ md_content += "\n\n### Notes:\n"
811
+ notes_frame = slide.notes_slide.notes_text_frame
812
+ if notes_frame is not None:
813
+ md_content += notes_frame.text
814
+ md_content = md_content.strip()
815
+
816
+ return DocumentConverterResult(
817
+ title=None,
818
+ text_content=md_content.strip(),
819
+ )
820
+
821
+ def _is_picture(self, shape):
822
+ if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE:
823
+ return True
824
+ if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PLACEHOLDER:
825
+ if hasattr(shape, "image"):
826
+ return True
827
+ return False
828
+
829
+ def _is_table(self, shape):
830
+ if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.TABLE:
831
+ return True
832
+ return False
833
+
834
+
+ class MediaConverter(DocumentConverter):
+     """
+     Abstract class for multi-modal media (e.g., images and audio)
+     """
+
+     def _get_metadata(self, local_path):
+         exiftool = shutil.which("exiftool")
+         if not exiftool:
+             return None
+         else:
+             try:
+                 result = subprocess.run(
+                     [exiftool, "-json", local_path], capture_output=True, text=True
+                 ).stdout
+                 return json.loads(result)[0]
+             except Exception:
+                 return None
+
+
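_get_metadata shells out to the external exiftool binary, which prints a JSON array holding one object per file. A standalone sketch of the same call (file name illustrative):

    import json, shutil, subprocess

    exiftool = shutil.which("exiftool")
    if exiftool:
        out = subprocess.run(
            [exiftool, "-json", "clip.wav"], capture_output=True, text=True
        ).stdout
        print(json.loads(out)[0].get("Duration"))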
+ class WavConverter(MediaConverter):
+     """
+     Converts WAV files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
+     """
+
+     def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+         # Bail if not a WAV
+         extension = kwargs.get("file_extension", "")
+         if extension.lower() != ".wav":
+             return None
+
+         md_content = ""
+
+         # Add metadata
+         metadata = self._get_metadata(local_path)
+         if metadata:
+             for f in [
+                 "Title",
+                 "Artist",
+                 "Author",
+                 "Band",
+                 "Album",
+                 "Genre",
+                 "Track",
+                 "DateTimeOriginal",
+                 "CreateDate",
+                 "Duration",
+             ]:
+                 if f in metadata:
+                     md_content += f"{f}: {metadata[f]}\n"
+
+         # Transcribe
+         if IS_AUDIO_TRANSCRIPTION_CAPABLE:
+             try:
+                 transcript = self._transcribe_audio(local_path)
+                 md_content += "\n\n### Audio Transcript:\n" + (
+                     "[No speech detected]" if transcript == "" else transcript
+                 )
+             except Exception:
+                 md_content += (
+                     "\n\n### Audio Transcript:\nError. Could not transcribe this audio."
+                 )
+
+         return DocumentConverterResult(
+             title=None,
+             text_content=md_content.strip(),
+         )
+
+     def _transcribe_audio(self, local_path) -> str:
+         recognizer = sr.Recognizer()
+         with sr.AudioFile(local_path) as source:
+             audio = recognizer.record(source)
+             return recognizer.recognize_google(audio).strip()
+
+
+ class Mp3Converter(WavConverter):
+     """
+     Converts MP3 files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` AND `pydub` are installed).
+     """
+
+     def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+         # Bail if not an MP3
+         extension = kwargs.get("file_extension", "")
+         if extension.lower() != ".mp3":
+             return None
+
+         md_content = ""
+
+         # Add metadata
+         metadata = self._get_metadata(local_path)
+         if metadata:
+             for f in [
+                 "Title",
+                 "Artist",
+                 "Author",
+                 "Band",
+                 "Album",
+                 "Genre",
+                 "Track",
+                 "DateTimeOriginal",
+                 "CreateDate",
+                 "Duration",
+             ]:
+                 if f in metadata:
+                     md_content += f"{f}: {metadata[f]}\n"
+
+         # Transcribe
+         if IS_AUDIO_TRANSCRIPTION_CAPABLE:
+             handle, temp_path = tempfile.mkstemp(suffix=".wav")
+             os.close(handle)
+             try:
+                 sound = pydub.AudioSegment.from_mp3(local_path)
+                 sound.export(temp_path, format="wav")
+
+                 _args = dict()
+                 _args.update(kwargs)
+                 _args["file_extension"] = ".wav"
+
+                 try:
+                     transcript = super()._transcribe_audio(temp_path).strip()
+                     md_content += "\n\n### Audio Transcript:\n" + (
+                         "[No speech detected]" if transcript == "" else transcript
+                     )
+                 except Exception:
+                     md_content += "\n\n### Audio Transcript:\nError. Could not transcribe this audio."
+
+             finally:
+                 os.unlink(temp_path)
+
+         # Return the result
+         return DocumentConverterResult(
+             title=None,
+             text_content=md_content.strip(),
+         )
+
+
+ class ImageConverter(MediaConverter):
+     """
+     Converts images to markdown via extraction of metadata (if `exiftool` is installed) and description via a multimodal LLM (if an mlm_client is configured).
+     """
+
+     def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+         # Bail if not an image
+         extension = kwargs.get("file_extension", "")
+         if extension.lower() not in [".jpg", ".jpeg", ".png"]:
+             return None
+
+         md_content = ""
+
+         # Add metadata
+         metadata = self._get_metadata(local_path)
+         if metadata:
+             for f in [
+                 "ImageSize",
+                 "Title",
+                 "Caption",
+                 "Description",
+                 "Keywords",
+                 "Artist",
+                 "Author",
+                 "DateTimeOriginal",
+                 "CreateDate",
+                 "GPSPosition",
+             ]:
+                 if f in metadata:
+                     md_content += f"{f}: {metadata[f]}\n"
+
+         # Try describing the image with the multimodal LLM
+         mlm_client = kwargs.get("mlm_client")
+         mlm_model = kwargs.get("mlm_model")
+         if mlm_client is not None and mlm_model is not None:
+             md_content += (
+                 "\n# Description:\n"
+                 + self._get_mlm_description(
+                     local_path,
+                     extension,
+                     mlm_client,
+                     mlm_model,
+                     prompt=kwargs.get("mlm_prompt"),
+                 ).strip()
+                 + "\n"
+             )
+
+         return DocumentConverterResult(
+             title=None,
+             text_content=md_content,
+         )
+
+     def _get_mlm_description(self, local_path, extension, client, model, prompt=None):
+         if prompt is None or prompt.strip() == "":
+             prompt = "Write a detailed caption for this image."
+
+         sys.stderr.write(f"MLM Prompt:\n{prompt}\n")
+
+         data_uri = ""
+         with open(local_path, "rb") as image_file:
+             content_type, encoding = mimetypes.guess_type("_dummy" + extension)
+             if content_type is None:
+                 content_type = "image/jpeg"
+             image_base64 = base64.b64encode(image_file.read()).decode("utf-8")
+             data_uri = f"data:{content_type};base64,{image_base64}"
+
+         messages = [
+             {
+                 "role": "user",
+                 "content": [
+                     {"type": "text", "text": prompt},
+                     {
+                         "type": "image_url",
+                         "image_url": {
+                             "url": data_uri,
+                         },
+                     },
+                 ],
+             }
+         ]
+
+         response = client.chat.completions.create(
+             model=model, messages=messages)
+         return response.choices[0].message.content
+
+
+ class FileConversionException(BaseException):
+     pass
+
+
+ class UnsupportedFormatException(BaseException):
+     pass
+
+
+ class MarkItDown:
+     """(In preview) An extremely simple text-based document reader, suitable for LLM use.
+     This reader will convert common file-types or webpages to Markdown."""
+
+     def __init__(
+         self,
+         requests_session: Optional[requests.Session] = None,
+         mlm_client: Optional[Any] = None,
+         mlm_model: Optional[Any] = None,
+     ):
+         if requests_session is None:
+             self._requests_session = requests.Session()
+         else:
+             self._requests_session = requests_session
+
+         self._mlm_client = mlm_client
+         self._mlm_model = mlm_model
+
+         self._page_converters: List[DocumentConverter] = []
+
+         # Register converters for successful browsing operations
+         # Later registrations are tried first / take higher priority than earlier registrations
+         # To this end, the most specific converters should appear below the most generic converters
+         self.register_page_converter(PlainTextConverter())
+         self.register_page_converter(HtmlConverter())
+         self.register_page_converter(WikipediaConverter())
+         self.register_page_converter(YouTubeConverter())
+         self.register_page_converter(BingSerpConverter())
+         self.register_page_converter(DocxConverter())
+         self.register_page_converter(XlsxConverter())
+         self.register_page_converter(PptxConverter())
+         self.register_page_converter(WavConverter())
+         self.register_page_converter(Mp3Converter())
+         self.register_page_converter(ImageConverter())
+         self.register_page_converter(PdfConverter())
+
+     def convert(
+         self, source: Union[str, requests.Response], **kwargs: Any
+     ) -> DocumentConverterResult:  # TODO: deal with kwargs
+         """
+         Args:
+             - source: can be a string representing a path or url, or a requests.response object
+             - extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.)
+         """
+
+         # Local path or url
+         if isinstance(source, str):
+             if (
+                 source.startswith("http://")
+                 or source.startswith("https://")
+                 or source.startswith("file://")
+             ):
+                 return self.convert_url(source, **kwargs)
+             else:
+                 return self.convert_local(source, **kwargs)
+         # Request response
+         elif isinstance(source, requests.Response):
+             return self.convert_response(source, **kwargs)
+
+     def convert_local(
+         self, path: str, **kwargs: Any
+     ) -> DocumentConverterResult:  # TODO: deal with kwargs
+         # Prepare a list of extensions to try (in order of priority)
+         ext = kwargs.get("file_extension")
+         extensions = [ext] if ext is not None else []
+
+         # Get extension alternatives from the path and puremagic
+         base, ext = os.path.splitext(path)
+         self._append_ext(extensions, ext)
+
+         if not extensions:
+             for g in self._guess_ext_magic(path):
+                 self._append_ext(extensions, g)
+
+         # Convert
+         return self._convert(path, extensions, **kwargs)
+
+     # TODO what should stream's type be?
+     def convert_stream(
+         self, stream: Any, **kwargs: Any
+     ) -> DocumentConverterResult:  # TODO: deal with kwargs
+         # Prepare a list of extensions to try (in order of priority)
+         ext = kwargs.get("file_extension")
+         extensions = [ext] if ext is not None else []
+
+         # Save the file locally to a temporary file. It will be deleted before this method exits
+         handle, temp_path = tempfile.mkstemp()
+         fh = os.fdopen(handle, "wb")
+         result = None
+         try:
+             # Write to the temporary file
+             content = stream.read()
+             if isinstance(content, str):
+                 fh.write(content.encode("utf-8"))
+             else:
+                 fh.write(content)
+             fh.close()
+
+             # Use puremagic to check for more extension options
+             for g in self._guess_ext_magic(temp_path):
+                 self._append_ext(extensions, g)
+
+             # Convert
+             result = self._convert(temp_path, extensions, **kwargs)
+         # Clean up
+         finally:
+             try:
+                 fh.close()
+             except Exception:
+                 pass
+             os.unlink(temp_path)
+
+         return result
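Because convert_stream spools everything to a temporary file first, it accepts any readable object; without a file_extension hint, puremagic sniffing is the only signal. A hypothetical call:

    import io

    md = MarkItDown()
    with open("slides.pptx", "rb") as f:
        result = md.convert_stream(io.BytesIO(f.read()), file_extension=".pptx")
    print(result.text_content[:500])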
+
+     def convert_url(
+         self, url: str, **kwargs: Any
+     ) -> DocumentConverterResult:  # TODO: fix kwargs type
+         # Send an HTTP request to the URL
+         response = self._requests_session.get(url, stream=True)
+         response.raise_for_status()
+         return self.convert_response(response, **kwargs)
+
+     def convert_response(
+         self, response: requests.Response, **kwargs: Any
+     ) -> DocumentConverterResult:  # TODO fix kwargs type
+         # Prepare a list of extensions to try (in order of priority)
+         ext = kwargs.get("file_extension")
+         extensions = [ext] if ext is not None else []
+
+         # Guess from the mimetype
+         content_type = response.headers.get("content-type", "").split(";")[0]
+         self._append_ext(extensions, mimetypes.guess_extension(content_type))
+
+         # Read the content disposition if there is one
+         content_disposition = response.headers.get("content-disposition", "")
+         m = re.search(r"filename=([^;]+)", content_disposition)
+         if m:
+             base, ext = os.path.splitext(m.group(1).strip("\"'"))
+             self._append_ext(extensions, ext)
+
+         # Read the extension from the path
+         base, ext = os.path.splitext(urlparse(response.url).path)
+         self._append_ext(extensions, ext)
+
+         # Save the file locally to a temporary file. It will be deleted before this method exits
+         handle, temp_path = tempfile.mkstemp()
+         fh = os.fdopen(handle, "wb")
+         result = None
+         try:
+             # Download the file
+             for chunk in response.iter_content(chunk_size=512):
+                 fh.write(chunk)
+             fh.close()
+
+             # Use puremagic to check for more extension options
+             for g in self._guess_ext_magic(temp_path):
+                 self._append_ext(extensions, g)
+
+             # Convert
+             result = self._convert(temp_path, extensions, url=response.url)
+         # Clean up
+         finally:
+             try:
+                 fh.close()
+             except Exception:
+                 pass
+             os.unlink(temp_path)
+
+         return result
+
+     def _convert(
+         self, local_path: str, extensions: List[Union[str, None]], **kwargs
+     ) -> DocumentConverterResult:
+         error_trace = ""
+         res = None
+         for ext in extensions + [None]:  # Try last with no extension
+             for converter in self._page_converters:
+                 _kwargs = copy.deepcopy(kwargs)
+
+                 # Overwrite file_extension appropriately
+                 if ext is None:
+                     if "file_extension" in _kwargs:
+                         del _kwargs["file_extension"]
+                 else:
+                     _kwargs.update({"file_extension": ext})
+
+                 # Copy any additional global options
+                 if "mlm_client" not in _kwargs and self._mlm_client is not None:
+                     _kwargs["mlm_client"] = self._mlm_client
+
+                 if "mlm_model" not in _kwargs and self._mlm_model is not None:
+                     _kwargs["mlm_model"] = self._mlm_model
+
+                 # If we hit an error log it and keep trying
+                 try:
+                     res = converter.convert(local_path, **_kwargs)
+                 except Exception:
+                     error_trace = ("\n\n" + traceback.format_exc()).strip()
+
+                 if res is not None:
+                     # Normalize the content
+                     res.text_content = "\n".join(
+                         [line.rstrip()
+                          for line in re.split(r"\r?\n", res.text_content)]
+                     )
+                     res.text_content = re.sub(
+                         r"\n{3,}", "\n\n", res.text_content)
+
+                     return res
+
+         # If we got this far without success, report any exceptions
+         if len(error_trace) > 0:
+             raise FileConversionException(
+                 f"Could not convert '{local_path}' to Markdown. File type was recognized as {extensions}. While converting the file, the following error was encountered:\n\n{error_trace}"
+             )
+
+         # Nothing can handle it!
+         raise UnsupportedFormatException(
+             f"Could not convert '{local_path}' to Markdown. The formats {extensions} are not supported."
+         )
+
+     def _append_ext(self, extensions, ext):
+         """Append a non-None, non-empty extension to a list of extensions."""
+         if ext is None:
+             return
+         ext = ext.strip()
+         if ext == "":
+             return
+         extensions.append(ext)
+
+     def _guess_ext_magic(self, path):
+         """Use puremagic (a Python implementation of libmagic) to guess a file's extension based on the first few bytes."""
+         # Use puremagic to guess
+         try:
+             guesses = puremagic.magic_file(path)
+             extensions = list()
+             for g in guesses:
+                 ext = g.extension.strip()
+                 if len(ext) > 0:
+                     if not ext.startswith("."):
+                         ext = "." + ext
+                     if ext not in extensions:
+                         extensions.append(ext)
+             return extensions
+         except FileNotFoundError:
+             pass
+         except IsADirectoryError:
+             pass
+         except PermissionError:
+             pass
+         return []
+
+     def register_page_converter(self, converter: DocumentConverter) -> None:
+         """Register a page text converter."""
+         self._page_converters.insert(0, converter)
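
Taken together, the class is driven through a single entry point that dispatches on the source type. A minimal end-to-end sketch (path and URL are illustrative):

    md = MarkItDown()

    # Local file: the path's extension, then puremagic, drive converter selection.
    print(md.convert("notes.docx").text_content)

    # URL: content-type, content-disposition, the URL path, and byte sniffing
    # all contribute candidate extensions before conversion.
    print(md.convert("https://en.wikipedia.org/wiki/Markdown").title)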