auto-coder 0.1.201__py3-none-any.whl → 0.1.203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {auto_coder-0.1.201.dist-info → auto_coder-0.1.203.dist-info}/METADATA +1 -1
- {auto_coder-0.1.201.dist-info → auto_coder-0.1.203.dist-info}/RECORD +16 -16
- autocoder/agent/planner.py +8 -6
- autocoder/auto_coder.py +16 -12
- autocoder/chat_auto_coder.py +190 -72
- autocoder/chat_auto_coder_lang.py +7 -3
- autocoder/command_args.py +1 -0
- autocoder/common/git_utils.py +434 -0
- autocoder/rag/long_context_rag.py +15 -8
- autocoder/utils/__init__.py +6 -16
- autocoder/utils/_markitdown.py +62 -39
- autocoder/version.py +1 -1
- {auto_coder-0.1.201.dist-info → auto_coder-0.1.203.dist-info}/LICENSE +0 -0
- {auto_coder-0.1.201.dist-info → auto_coder-0.1.203.dist-info}/WHEEL +0 -0
- {auto_coder-0.1.201.dist-info → auto_coder-0.1.203.dist-info}/entry_points.txt +0 -0
- {auto_coder-0.1.201.dist-info → auto_coder-0.1.203.dist-info}/top_level.txt +0 -0
autocoder/utils/_markitdown.py
CHANGED
|
@@ -68,7 +68,8 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
|
|
|
68
68
|
"""
|
|
69
69
|
|
|
70
70
|
def __init__(self, **options: Any):
|
|
71
|
-
options["heading_style"] = options.get("heading_style", markdownify.ATX)
|
|
71
|
+
options["heading_style"] = options.get(
|
|
72
|
+
"heading_style", markdownify.ATX)
|
|
72
73
|
# Explicitly cast options to the expected type if necessary
|
|
73
74
|
super().__init__(**options)
|
|
74
75
|
|
|
@@ -318,7 +319,7 @@ class YouTubeConverter(DocumentConverter):
|
|
|
318
319
|
obj_start = lines[0].find("{")
|
|
319
320
|
obj_end = lines[0].rfind("}")
|
|
320
321
|
if obj_start >= 0 and obj_end >= 0:
|
|
321
|
-
data = json.loads(lines[0][obj_start : obj_end + 1])
|
|
322
|
+
data = json.loads(lines[0][obj_start: obj_end + 1])
|
|
322
323
|
attrdesc = self._findKey(
|
|
323
324
|
data, "attributedDescriptionBodyText"
|
|
324
325
|
) # type: ignore
|
|
@@ -331,7 +332,8 @@ class YouTubeConverter(DocumentConverter):
|
|
|
331
332
|
# Start preparing the page
|
|
332
333
|
webpage_text = "# YouTube\n"
|
|
333
334
|
|
|
334
|
-
title = self._get(
|
|
335
|
+
title = self._get(
|
|
336
|
+
metadata, ["title", "og:title", "name"]) # type: ignore
|
|
335
337
|
assert isinstance(title, str)
|
|
336
338
|
|
|
337
339
|
if title:
|
|
@@ -468,7 +470,8 @@ class BingSerpConverter(DocumentConverter):
|
|
|
468
470
|
|
|
469
471
|
try:
|
|
470
472
|
# RFC 4648 / Base64URL" variant, which uses "-" and "_"
|
|
471
|
-
a["href"] = base64.b64decode(
|
|
473
|
+
a["href"] = base64.b64decode(
|
|
474
|
+
u, altchars="-_").decode("utf-8")
|
|
472
475
|
except UnicodeDecodeError:
|
|
473
476
|
pass
|
|
474
477
|
except binascii.Error:
|
|
@@ -477,7 +480,8 @@ class BingSerpConverter(DocumentConverter):
|
|
|
477
480
|
# Convert to markdown
|
|
478
481
|
md_result = _markdownify.convert_soup(result).strip()
|
|
479
482
|
lines = [line.strip() for line in re.split(r"\n+", md_result)]
|
|
480
|
-
results.append(
|
|
483
|
+
results.append(
|
|
484
|
+
"\n".join([line for line in lines if len(line) > 0]))
|
|
481
485
|
|
|
482
486
|
webpage_text = (
|
|
483
487
|
f"## A Bing search for '{query}' found the following results:\n\n"
|
|
@@ -507,7 +511,8 @@ class PdfConverter(DocumentConverter):
|
|
|
507
511
|
else:
|
|
508
512
|
# Create output directory for images if it doesn't exist
|
|
509
513
|
image_output_dir = os.path.join(
|
|
510
|
-
os.path.dirname(local_path), "_images", os.path.basename(
|
|
514
|
+
os.path.dirname(local_path), "_images", os.path.basename(
|
|
515
|
+
local_path).replace(" ", "_")
|
|
511
516
|
)
|
|
512
517
|
os.makedirs(image_output_dir, exist_ok=True)
|
|
513
518
|
|
|
@@ -545,18 +550,17 @@ class PdfConverter(DocumentConverter):
|
|
|
545
550
|
self, layout, image_output_dir: str, image_count: int
|
|
546
551
|
) -> List[str]:
|
|
547
552
|
"""Process the layout of a PDF page, extracting both text and images."""
|
|
548
|
-
content = []
|
|
549
|
-
|
|
550
|
-
|
|
553
|
+
content = []
|
|
554
|
+
local_image_count = image_count
|
|
551
555
|
for lt_obj in layout:
|
|
552
556
|
# Handle images
|
|
553
557
|
if isinstance(lt_obj, LTImage) or (
|
|
554
558
|
isinstance(lt_obj, LTFigure) and lt_obj.name.startswith("Im")
|
|
555
|
-
):
|
|
556
|
-
image_count += 1
|
|
559
|
+
):
|
|
557
560
|
image_data = None
|
|
558
561
|
image_meta = {}
|
|
559
|
-
image_path = os.path.join(
|
|
562
|
+
image_path = os.path.join(
|
|
563
|
+
image_output_dir, f"image_{local_image_count}.png")
|
|
560
564
|
|
|
561
565
|
if hasattr(lt_obj, "stream"):
|
|
562
566
|
image_data = lt_obj.stream.get_data()
|
|
@@ -566,12 +570,15 @@ class PdfConverter(DocumentConverter):
|
|
|
566
570
|
|
|
567
571
|
if image_data:
|
|
568
572
|
if isinstance(lt_obj, LTImage):
|
|
573
|
+
iw = ImageWriter(image_output_dir)
|
|
569
574
|
name = iw.export_image(lt_obj)
|
|
570
|
-
suffix = os.path.splitext(name)[1]
|
|
575
|
+
suffix = os.path.splitext(name)[1]
|
|
571
576
|
temp_path = os.path.join(image_output_dir, name)
|
|
572
|
-
image_path = os.path.join(
|
|
577
|
+
image_path = os.path.join(
|
|
578
|
+
image_output_dir, f"image_{local_image_count}{suffix}")
|
|
573
579
|
os.rename(temp_path, image_path)
|
|
574
|
-
content.append(f"")
|
|
581
|
+
local_image_count += 1
|
|
575
582
|
continue
|
|
576
583
|
try:
|
|
577
584
|
# Try to handle raw pixel data
|
|
@@ -580,12 +587,14 @@ class PdfConverter(DocumentConverter):
|
|
|
580
587
|
height = image_meta["Height"]
|
|
581
588
|
bits = image_meta["BitsPerComponent"]
|
|
582
589
|
colorspace = image_meta["ColorSpace"].name
|
|
583
|
-
new_image_data = np.frombuffer(
|
|
590
|
+
new_image_data = np.frombuffer(
|
|
591
|
+
image_data, dtype=np.uint8)
|
|
584
592
|
# Normalize to 8-bit if necessary
|
|
585
593
|
if bits != 8:
|
|
586
594
|
max_val = (1 << bits) - 1
|
|
587
595
|
new_image_data = (
|
|
588
|
-
new_image_data.astype(
|
|
596
|
+
new_image_data.astype(
|
|
597
|
+
"float32") * 255 / max_val
|
|
589
598
|
).astype("uint8")
|
|
590
599
|
|
|
591
600
|
if colorspace == "DeviceRGB":
|
|
@@ -595,16 +604,19 @@ class PdfConverter(DocumentConverter):
|
|
|
595
604
|
img = Image.fromarray(new_image_data, "RGB")
|
|
596
605
|
img.save(image_path)
|
|
597
606
|
content.append(
|
|
598
|
-
f"\n"
|
|
599
608
|
)
|
|
609
|
+
local_image_count += 1
|
|
600
610
|
continue
|
|
601
611
|
elif colorspace == "DeviceGray":
|
|
602
|
-
new_image_data = new_image_data.reshape(
|
|
612
|
+
new_image_data = new_image_data.reshape(
|
|
613
|
+
(height, width))
|
|
603
614
|
img = Image.fromarray(new_image_data, "L")
|
|
604
615
|
img.save(image_path)
|
|
605
616
|
content.append(
|
|
606
|
-
f"\n"
|
|
607
618
|
)
|
|
619
|
+
local_image_count += 1
|
|
608
620
|
continue
|
|
609
621
|
except Exception as e:
|
|
610
622
|
print(
|
|
@@ -614,7 +626,8 @@ class PdfConverter(DocumentConverter):
|
|
|
614
626
|
with open(image_path, "wb") as img_file:
|
|
615
627
|
img_file.write(image_data)
|
|
616
628
|
|
|
617
|
-
content.append(f"\n")
|
|
630
|
+
local_image_count += 1
|
|
618
631
|
|
|
619
632
|
# Handle text
|
|
620
633
|
if hasattr(lt_obj, "get_text"):
|
|
@@ -625,7 +638,8 @@ class PdfConverter(DocumentConverter):
|
|
|
625
638
|
# Recursively process nested layouts
|
|
626
639
|
elif hasattr(lt_obj, "_objs"):
|
|
627
640
|
content.extend(
|
|
628
|
-
self._process_layout(
|
|
641
|
+
self._process_layout(
|
|
642
|
+
lt_obj._objs, image_output_dir, image_count)
|
|
629
643
|
)
|
|
630
644
|
|
|
631
645
|
return content
|
|
@@ -635,7 +649,7 @@ class DocxConverter(HtmlConverter):
|
|
|
635
649
|
"""
|
|
636
650
|
Converts DOCX files to Markdown. Style information (e.g.m headings) and tables are preserved where possible.
|
|
637
651
|
"""
|
|
638
|
-
|
|
652
|
+
|
|
639
653
|
def __init__(self):
|
|
640
654
|
self._image_counter = 0
|
|
641
655
|
super().__init__()
|
|
@@ -644,18 +658,19 @@ class DocxConverter(HtmlConverter):
|
|
|
644
658
|
"""
|
|
645
659
|
保存图片并返回相对路径,使用递增的计数器来命名文件
|
|
646
660
|
"""
|
|
647
|
-
# 获取图片内容和格式
|
|
648
|
-
image_format = image.content_type.split(
|
|
649
|
-
|
|
661
|
+
# 获取图片内容和格式
|
|
662
|
+
image_format = image.content_type.split(
|
|
663
|
+
'/')[-1] if image.content_type else 'png'
|
|
664
|
+
|
|
650
665
|
# 增加计数器并生成文件名
|
|
651
666
|
self._image_counter += 1
|
|
652
667
|
image_filename = f"image_{self._image_counter}.{image_format}"
|
|
653
|
-
|
|
668
|
+
|
|
654
669
|
# 保存图片
|
|
655
670
|
image_path = os.path.join(output_dir, image_filename)
|
|
656
671
|
with image.open() as image_content, open(image_path, 'wb') as f:
|
|
657
672
|
f.write(image_content.read())
|
|
658
|
-
|
|
673
|
+
|
|
659
674
|
return image_path
|
|
660
675
|
|
|
661
676
|
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
|
@@ -671,7 +686,7 @@ class DocxConverter(HtmlConverter):
|
|
|
671
686
|
else:
|
|
672
687
|
# Create output directory for images if it doesn't exist
|
|
673
688
|
image_output_dir = os.path.join(os.path.dirname(
|
|
674
|
-
local_path), "_images", os.path.basename(local_path))
|
|
689
|
+
local_path), "_images", os.path.basename(local_path).replace(" ", "_"))
|
|
675
690
|
os.makedirs(image_output_dir, exist_ok=True)
|
|
676
691
|
|
|
677
692
|
result = None
|
|
@@ -682,7 +697,7 @@ class DocxConverter(HtmlConverter):
|
|
|
682
697
|
"src": self._save_image(image, image_output_dir),
|
|
683
698
|
"alt": image.alt_text if image.alt_text else f"Image {self._image_counter}"
|
|
684
699
|
}
|
|
685
|
-
|
|
700
|
+
|
|
686
701
|
# 进行转换
|
|
687
702
|
result = mammoth.convert_to_html(
|
|
688
703
|
docx_file,
|
|
@@ -691,7 +706,7 @@ class DocxConverter(HtmlConverter):
|
|
|
691
706
|
html_content = result.value
|
|
692
707
|
result = self._convert(html_content)
|
|
693
708
|
|
|
694
|
-
return result
|
|
709
|
+
return result
|
|
695
710
|
|
|
696
711
|
|
|
697
712
|
class XlsxConverter(HtmlConverter):
|
|
@@ -710,7 +725,8 @@ class XlsxConverter(HtmlConverter):
|
|
|
710
725
|
for s in sheets:
|
|
711
726
|
md_content += f"## {s}\n"
|
|
712
727
|
html_content = sheets[s].to_html(index=False)
|
|
713
|
-
md_content += self._convert(
|
|
728
|
+
md_content += self._convert(
|
|
729
|
+
html_content).text_content.strip() + "\n\n"
|
|
714
730
|
|
|
715
731
|
return DocumentConverterResult(
|
|
716
732
|
title=None,
|
|
@@ -745,7 +761,8 @@ class PptxConverter(HtmlConverter):
|
|
|
745
761
|
# https://github.com/scanny/python-pptx/pull/512#issuecomment-1713100069
|
|
746
762
|
alt_text = ""
|
|
747
763
|
try:
|
|
748
|
-
alt_text = shape._element._nvXxPr.cNvPr.attrib.get(
|
|
764
|
+
alt_text = shape._element._nvXxPr.cNvPr.attrib.get(
|
|
765
|
+
"descr", "")
|
|
749
766
|
except Exception:
|
|
750
767
|
pass
|
|
751
768
|
|
|
@@ -767,14 +784,17 @@ class PptxConverter(HtmlConverter):
|
|
|
767
784
|
html_table += "<tr>"
|
|
768
785
|
for cell in row.cells:
|
|
769
786
|
if first_row:
|
|
770
|
-
html_table += "<th>" + html.escape(cell.text) + "</th>"
|
|
787
|
+
html_table += "<th>" + \
|
|
788
|
+
html.escape(cell.text) + "</th>"
|
|
771
789
|
else:
|
|
772
|
-
html_table += "<td>" + html.escape(cell.text) + "</td>"
|
|
790
|
+
html_table += "<td>" + \
|
|
791
|
+
html.escape(cell.text) + "</td>"
|
|
773
792
|
html_table += "</tr>"
|
|
774
793
|
first_row = False
|
|
775
794
|
html_table += "</table></body></html>"
|
|
776
795
|
md_content += (
|
|
777
|
-
"\n" +
|
|
796
|
+
"\n" +
|
|
797
|
+
self._convert(html_table).text_content.strip() + "\n"
|
|
778
798
|
)
|
|
779
799
|
|
|
780
800
|
# Text areas
|
|
@@ -1028,7 +1048,8 @@ class ImageConverter(MediaConverter):
|
|
|
1028
1048
|
}
|
|
1029
1049
|
]
|
|
1030
1050
|
|
|
1031
|
-
response = client.chat.completions.create(model=model, messages=messages)
|
|
1051
|
+
response = client.chat.completions.create(
|
|
1052
|
+
model=model, messages=messages)
|
|
1032
1053
|
return response.choices[0].message.content
|
|
1033
1054
|
|
|
1034
1055
|
|
|
@@ -1242,9 +1263,11 @@ class MarkItDown:
|
|
|
1242
1263
|
if res is not None:
|
|
1243
1264
|
# Normalize the content
|
|
1244
1265
|
res.text_content = "\n".join(
|
|
1245
|
-
[line.rstrip()
|
|
1266
|
+
[line.rstrip()
|
|
1267
|
+
for line in re.split(r"\r?\n", res.text_content)]
|
|
1246
1268
|
)
|
|
1247
|
-
res.text_content = re.sub(
|
|
1269
|
+
res.text_content = re.sub(
|
|
1270
|
+
r"\n{3,}", "\n\n", res.text_content)
|
|
1248
1271
|
|
|
1249
1272
|
# Todo
|
|
1250
1273
|
return res
|
autocoder/version.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "0.1.201"
|
|
1
|
+
__version__ = "0.1.203"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|