auto-coder 0.1.353__py3-none-any.whl → 0.1.354__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of auto-coder might be problematic. Click here for more details.

Files changed (41)
  1. {auto_coder-0.1.353.dist-info → auto_coder-0.1.354.dist-info}/METADATA +1 -1
  2. {auto_coder-0.1.353.dist-info → auto_coder-0.1.354.dist-info}/RECORD +41 -29
  3. autocoder/auto_coder_rag.py +37 -1
  4. autocoder/auto_coder_runner.py +8 -0
  5. autocoder/commands/auto_command.py +59 -131
  6. autocoder/commands/tools.py +1 -1
  7. autocoder/common/__init__.py +1 -1
  8. autocoder/common/conversations/__init__.py +52 -0
  9. autocoder/common/conversations/compatibility.py +303 -0
  10. autocoder/common/conversations/conversation_manager.py +502 -0
  11. autocoder/common/conversations/example.py +152 -0
  12. autocoder/common/file_monitor/__init__.py +5 -0
  13. autocoder/common/file_monitor/monitor.py +383 -0
  14. autocoder/common/ignorefiles/__init__.py +4 -0
  15. autocoder/common/ignorefiles/ignore_file_utils.py +103 -0
  16. autocoder/common/ignorefiles/test_ignore_file_utils.py +91 -0
  17. autocoder/common/rulefiles/__init__.py +15 -0
  18. autocoder/common/rulefiles/autocoderrules_utils.py +173 -0
  19. autocoder/common/save_formatted_log.py +54 -0
  20. autocoder/common/v2/agent/agentic_edit.py +7 -36
  21. autocoder/common/v2/agent/agentic_edit_tools/list_files_tool_resolver.py +1 -1
  22. autocoder/common/v2/agent/agentic_edit_tools/search_files_tool_resolver.py +73 -43
  23. autocoder/common/v2/code_editblock_manager.py +20 -8
  24. autocoder/index/index.py +1 -1
  25. autocoder/models.py +22 -9
  26. autocoder/rag/api_server.py +14 -2
  27. autocoder/rag/cache/simple_cache.py +63 -33
  28. autocoder/rag/loaders/docx_loader.py +1 -1
  29. autocoder/rag/loaders/filter_utils.py +133 -76
  30. autocoder/rag/loaders/image_loader.py +15 -3
  31. autocoder/rag/loaders/pdf_loader.py +2 -2
  32. autocoder/rag/long_context_rag.py +11 -0
  33. autocoder/rag/qa_conversation_strategy.py +5 -31
  34. autocoder/rag/utils.py +21 -2
  35. autocoder/utils/_markitdown.py +66 -25
  36. autocoder/utils/auto_coder_utils/chat_stream_out.py +1 -0
  37. autocoder/version.py +1 -1
  38. {auto_coder-0.1.353.dist-info → auto_coder-0.1.354.dist-info}/LICENSE +0 -0
  39. {auto_coder-0.1.353.dist-info → auto_coder-0.1.354.dist-info}/WHEEL +0 -0
  40. {auto_coder-0.1.353.dist-info → auto_coder-0.1.354.dist-info}/entry_points.txt +0 -0
  41. {auto_coder-0.1.353.dist-info → auto_coder-0.1.354.dist-info}/top_level.txt +0 -0
autocoder/utils/_markitdown.py CHANGED
@@ -30,18 +30,20 @@ from pdfminer.pdfpage import PDFPage
30
30
  from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
31
31
  import pptx
32
32
  from pdfminer.image import ImageWriter
33
+ import time
33
34
 
34
35
  import numpy as np
35
36
  from PIL import Image
36
37
 
37
38
  # 新增导入
38
- from autocoder.rag.loaders import filter_utils
39
+ from autocoder.rag.loaders.filter_utils import FilterRuleManager
39
40
  from autocoder.rag.loaders.image_loader import ImageLoader
40
41
 
41
42
  # File-format detection
42
43
  import puremagic
43
44
  import requests
44
45
  from bs4 import BeautifulSoup
46
+ from loguru import logger
45
47
 
46
48
  # Optional Transcription support
47
49
  try:
@@ -503,12 +505,16 @@ class PdfConverter(DocumentConverter):
503
505
  Converts PDFs to Markdown with support for extracting and including images.
504
506
  """
505
507
 
508
+ def __init__(self, llm=None, product_mode="lite"):
509
+ super().__init__()
510
+ self.llm = llm
511
+ self.product_mode = product_mode
512
+
506
513
  def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
507
514
  # Bail if not a PDF
508
515
  extension = kwargs.get("file_extension", "")
509
516
  if extension.lower() != ".pdf":
510
- return None
511
-
517
+ return None
512
518
  image_output_dir = None
513
519
  if kwargs.get("image_output_dir", None):
514
520
  image_output_dir = kwargs.get("image_output_dir")
@@ -531,17 +537,18 @@ class PdfConverter(DocumentConverter):
531
537
  rsrcmgr = PDFResourceManager()
532
538
  laparams = LAParams()
533
539
  device = PDFPageAggregator(rsrcmgr, laparams=laparams)
534
- interpreter = PDFPageInterpreter(rsrcmgr, device)
540
+ interpreter = PDFPageInterpreter(rsrcmgr, device)
535
541
 
536
542
  # Process each page
537
543
  for page in PDFPage.create_pages(document):
538
544
  interpreter.process_page(page)
539
- layout = device.get_result()
545
+ layout = device.get_result()
540
546
 
541
547
  # Extract text and images from the page
542
548
  page_content = self._process_layout(
543
549
  layout, image_output_dir, image_count
544
550
  )
551
+
545
552
  text_content.extend(page_content)
546
553
  image_count += len([c for c in page_content if c.startswith("![Image")])
547
554
 
@@ -582,13 +589,12 @@ class PdfConverter(DocumentConverter):
582
589
  image_output_dir, f"image_{local_image_count}{suffix}")
583
590
  os.rename(temp_path, image_path)
584
591
  content.append(f"![Image {local_image_count}]({image_path})")
585
- # ===== 新增:根据filter_utils判断是否需要解析图片
586
- if filter_utils.should_parse_image(image_path):
587
- try:
588
- _ = ImageLoader.image_to_markdown(image_path, llm=None, engine="paddle")
589
- # image_to_markdown会自动生成md文件
590
- except Exception:
591
- import traceback; traceback.print_exc()
592
+ # ===== 修改:通过FilterRuleManager单例实例判断是否需要解析图片
593
+ v = try_parse_image(image_path,self.llm)
594
+ if v:
595
+ content.append("<image_content>")
596
+ content.append(v)
597
+ content.append("</image_content>")
592
598
  # =====
593
599
  local_image_count += 1
594
600
  continue
@@ -618,7 +624,11 @@ class PdfConverter(DocumentConverter):
618
624
  content.append(
619
625
  f"![Image {local_image_count}]({image_path})\n"
620
626
  )
621
- try_parse_image(image_path)
627
+ v = try_parse_image(image_path,self.llm)
628
+ if v:
629
+ content.append("<image_content>")
630
+ content.append(v)
631
+ content.append("</image_content>")
622
632
  local_image_count += 1
623
633
  continue
624
634
  elif colorspace == "DeviceGray":
@@ -629,7 +639,11 @@ class PdfConverter(DocumentConverter):
629
639
  content.append(
630
640
  f"![Image {local_image_count}]({image_path})\n"
631
641
  )
632
- try_parse_image(image_path)
642
+ v = try_parse_image(image_path,self.llm)
643
+ if v:
644
+ content.append("<image_content>")
645
+ content.append(v)
646
+ content.append("</image_content>")
633
647
  local_image_count += 1
634
648
  continue
635
649
  except Exception as e:
@@ -641,8 +655,12 @@ class PdfConverter(DocumentConverter):
641
655
  img_file.write(image_data)
642
656
 
643
657
  content.append(f"![Image {local_image_count}]({image_path})\n")
644
- # ===== 新增:根据filter_utils判断是否需要解析图片
645
- try_parse_image(image_path)
658
+ # ===== 新增:图片解析
659
+ v = try_parse_image(image_path,self.llm)
660
+ if v:
661
+ content.append("<image_content>")
662
+ content.append(v)
663
+ content.append("</image_content>")
646
664
  local_image_count += 1
647
665
 
648
666
  # Handle text
@@ -1089,6 +1107,8 @@ class MarkItDown:
1089
1107
  llm: Optional[Any] = None,
1090
1108
  product_mode: Optional[str] = None,
1091
1109
  ):
1110
+ # 初始化FilterRuleManager单例实例
1111
+ self._filter_rule_manager = FilterRuleManager.get_instance()
1092
1112
  if requests_session is None:
1093
1113
  self._requests_session = requests.Session()
1094
1114
  else:
@@ -1117,7 +1137,7 @@ class MarkItDown:
1117
1137
  self.register_page_converter(WavConverter())
1118
1138
  self.register_page_converter(Mp3Converter())
1119
1139
  self.register_page_converter(ImageConverter())
1120
- self.register_page_converter(PdfConverter())
1140
+ self.register_page_converter(PdfConverter(llm,product_mode))
1121
1141
 
1122
1142
  def convert(
1123
1143
  self, source: Union[str, requests.Response], **kwargs: Any
@@ -1126,8 +1146,7 @@ class MarkItDown:
1126
1146
  Args:
1127
1147
  - source: can be a string representing a path or url, or a requests.response object
1128
1148
  - extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.)
1129
- """
1130
-
1149
+ """
1131
1150
  # Local path or url
1132
1151
  if isinstance(source, str):
1133
1152
  if (
@@ -1343,14 +1362,36 @@ class MarkItDown:
1343
1362
  self._page_converters.insert(0, converter)
1344
1363
 
1345
1364
 
1346
- def try_parse_image(image_path: str):
1365
+ def try_parse_image(image_path: str, llm=None):
1347
1366
  """
1348
- 根据filter_utils判断是否需要解析图片,如果需要则调用ImageLoader.image_to_markdown。
1367
+ 根据FilterRuleManager单例实例判断是否需要解析图片,如果需要则调用ImageLoader.image_to_markdown。
1349
1368
  解析失败会自动捕获异常。
1350
1369
  """
1351
- if filter_utils.should_parse_image(image_path):
1370
+ import uuid
1371
+ start_time = time.time()
1372
+ req_id = str(uuid.uuid4())[:8]
1373
+ logger.info(f"\n==== [try_parse_image] START | req_id={req_id} ====")
1374
+ logger.info(f"[try_parse_image][{req_id}] image_path: {image_path}, llm: {llm}")
1375
+ if FilterRuleManager.get_instance().should_parse_image(image_path):
1376
+ logger.info(f"[try_parse_image][{req_id}] should_parse_image=True, start parsing...")
1352
1377
  try:
1353
- _ = ImageLoader.image_to_markdown(image_path, llm=None, engine="paddle")
1354
- except Exception:
1355
- import traceback; traceback.print_exc()
1378
+ v = ImageLoader.image_to_markdown(image_path, llm=llm, engine="paddle")
1379
+ logger.info(f"[try_parse_image][{req_id}] image_to_markdown result: {str(v)[:200]}")
1380
+ if llm:
1381
+ v = ImageLoader.format_table_in_content(v, llm)
1382
+ logger.info(f"[try_parse_image][{req_id}] format_table_in_content result: {str(v)[:200]}")
1383
+ elapsed = time.time() - start_time
1384
+ logger.info(f"[try_parse_image][{req_id}] SUCCESS | execution time: {elapsed:.3f} seconds")
1385
+ logger.info(f"==== [try_parse_image] END | req_id={req_id} ====")
1386
+ return v
1387
+ except Exception as e:
1388
+ elapsed = time.time() - start_time
1389
+ logger.error(f"[try_parse_image][{req_id}] EXCEPTION | execution time: {elapsed:.3f} seconds | image_path: {image_path} | llm: {llm}")
1390
+ logger.exception(e)
1391
+ logger.info(f"==== [try_parse_image] END (EXCEPTION) | req_id={req_id} ====")
1392
+ return ""
1393
+ else:
1394
+ logger.info(f"[try_parse_image][{req_id}] should_parse_image=False, skip parsing.")
1395
+ logger.info(f"==== [try_parse_image] END (SKIP) | req_id={req_id} ====")
1396
+ return ""
1356
1397
 
autocoder/utils/auto_coder_utils/chat_stream_out.py CHANGED
@@ -292,6 +292,7 @@ def stream_out(
292
292
  get_event_manager(args.event_file).write_stream(content.to_dict(),
293
293
  metadata=EventMetadata(
294
294
  stream_out_type=extra_meta.get("stream_out_type", ""),
295
+ path=extra_meta.get("path", ""),
295
296
  is_streaming=True,
296
297
  output="delta",
297
298
  action_file=args.file
autocoder/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.1.353"
1
+ __version__ = "0.1.354"