auto-coder 0.1.352__py3-none-any.whl → 0.1.354__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of auto-coder might be problematic. Click here for more details.
- {auto_coder-0.1.352.dist-info → auto_coder-0.1.354.dist-info}/METADATA +1 -1
- {auto_coder-0.1.352.dist-info → auto_coder-0.1.354.dist-info}/RECORD +43 -30
- autocoder/auto_coder_rag.py +37 -1
- autocoder/auto_coder_runner.py +8 -0
- autocoder/commands/auto_command.py +59 -131
- autocoder/commands/tools.py +1 -1
- autocoder/common/__init__.py +1 -1
- autocoder/common/conversations/__init__.py +52 -0
- autocoder/common/conversations/compatibility.py +303 -0
- autocoder/common/conversations/conversation_manager.py +502 -0
- autocoder/common/conversations/example.py +152 -0
- autocoder/common/file_monitor/__init__.py +5 -0
- autocoder/common/file_monitor/monitor.py +383 -0
- autocoder/common/git_utils.py +1 -1
- autocoder/common/ignorefiles/__init__.py +4 -0
- autocoder/common/ignorefiles/ignore_file_utils.py +103 -0
- autocoder/common/ignorefiles/test_ignore_file_utils.py +91 -0
- autocoder/common/rulefiles/__init__.py +15 -0
- autocoder/common/rulefiles/autocoderrules_utils.py +173 -0
- autocoder/common/save_formatted_log.py +54 -0
- autocoder/common/v2/agent/agentic_edit.py +40 -36
- autocoder/common/v2/agent/agentic_edit_tools/list_files_tool_resolver.py +1 -1
- autocoder/common/v2/agent/agentic_edit_tools/search_files_tool_resolver.py +73 -43
- autocoder/common/v2/agent/agentic_edit_tools/test_search_files_tool_resolver.py +163 -0
- autocoder/common/v2/code_editblock_manager.py +20 -8
- autocoder/index/index.py +1 -1
- autocoder/models.py +22 -9
- autocoder/rag/api_server.py +14 -2
- autocoder/rag/cache/simple_cache.py +63 -33
- autocoder/rag/loaders/docx_loader.py +1 -1
- autocoder/rag/loaders/filter_utils.py +133 -76
- autocoder/rag/loaders/image_loader.py +15 -3
- autocoder/rag/loaders/pdf_loader.py +2 -2
- autocoder/rag/long_context_rag.py +11 -0
- autocoder/rag/qa_conversation_strategy.py +5 -31
- autocoder/rag/utils.py +21 -2
- autocoder/utils/_markitdown.py +66 -25
- autocoder/utils/auto_coder_utils/chat_stream_out.py +1 -0
- autocoder/version.py +1 -1
- {auto_coder-0.1.352.dist-info → auto_coder-0.1.354.dist-info}/LICENSE +0 -0
- {auto_coder-0.1.352.dist-info → auto_coder-0.1.354.dist-info}/WHEEL +0 -0
- {auto_coder-0.1.352.dist-info → auto_coder-0.1.354.dist-info}/entry_points.txt +0 -0
- {auto_coder-0.1.352.dist-info → auto_coder-0.1.354.dist-info}/top_level.txt +0 -0
|
@@ -1,106 +1,163 @@
|
|
|
1
|
-
|
|
2
1
|
import os
|
|
3
2
|
import json
|
|
4
|
-
|
|
3
|
+
import threading
|
|
4
|
+
from typing import Dict, Optional, List
|
|
5
5
|
from loguru import logger
|
|
6
|
+
from functools import lru_cache
|
|
6
7
|
|
|
7
8
|
class FilterRuleManager:
|
|
8
9
|
'''
|
|
10
|
+
单例模式的过滤规则管理器。支持按文件类型定义不同的过滤规则。
|
|
11
|
+
|
|
12
|
+
支持的规则格式:
|
|
9
13
|
{
|
|
10
|
-
"
|
|
11
|
-
"
|
|
12
|
-
"
|
|
13
|
-
|
|
14
|
-
"
|
|
15
|
-
"
|
|
16
|
-
"
|
|
17
|
-
|
|
14
|
+
"image": {
|
|
15
|
+
"whitelist": ["*.png", "*.jpg"],
|
|
16
|
+
"blacklist": ["*/private/*"]
|
|
17
|
+
},
|
|
18
|
+
"document": {
|
|
19
|
+
"whitelist": ["*.pdf", "*.docx"],
|
|
20
|
+
"blacklist": ["*/tmp/*"]
|
|
21
|
+
},
|
|
22
|
+
"default": {
|
|
23
|
+
"whitelist": [],
|
|
24
|
+
"blacklist": ["*/node_modules/*", "*/.*"]
|
|
18
25
|
}
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
26
|
+
}
|
|
27
|
+
'''
|
|
28
|
+
_instance = None
|
|
29
|
+
_lock = threading.RLock() # 使用可重入锁避免死锁
|
|
30
|
+
|
|
31
|
+
def __new__(cls, *args, **kwargs):
|
|
32
|
+
if cls._instance is None:
|
|
33
|
+
with cls._lock:
|
|
34
|
+
if cls._instance is None: # 双重检查锁定模式
|
|
35
|
+
cls._instance = super(FilterRuleManager, cls).__new__(cls)
|
|
36
|
+
cls._instance._initialized = False
|
|
37
|
+
return cls._instance
|
|
38
|
+
|
|
39
|
+
@classmethod
|
|
40
|
+
def get_instance(cls):
|
|
41
|
+
return cls() # 直接调用__new__,不需要重复加锁
|
|
42
|
+
|
|
43
|
+
def __init__(self):
|
|
44
|
+
with self._lock:
|
|
45
|
+
if hasattr(self, '_initialized') and self._initialized:
|
|
46
|
+
return
|
|
47
|
+
|
|
48
|
+
self.source_dir = os.getcwd()
|
|
49
|
+
self.filter_rules_path = os.path.join(self.source_dir, ".cache", "filterrules")
|
|
50
|
+
self._cache_rules: Optional[Dict] = None
|
|
51
|
+
self._cache_mtime: Optional[float] = None
|
|
52
|
+
self._rule_lock = threading.RLock() # 单独的锁用于规则访问
|
|
53
|
+
self._initialized = True
|
|
34
54
|
|
|
35
55
|
def load_filter_rules(self) -> Dict:
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
except Exception:
|
|
39
|
-
current_mtime = None
|
|
40
|
-
|
|
56
|
+
# 先检查是否需要重新加载,不持有锁
|
|
57
|
+
current_mtime = self._get_file_mtime()
|
|
41
58
|
need_reload = False
|
|
42
59
|
|
|
43
|
-
|
|
44
|
-
if FilterRuleManager._cache_rules is None:
|
|
60
|
+
if self._cache_rules is None:
|
|
45
61
|
need_reload = True
|
|
46
|
-
elif current_mtime is not None and
|
|
62
|
+
elif current_mtime is not None and self._cache_mtime != current_mtime:
|
|
47
63
|
need_reload = True
|
|
48
64
|
|
|
65
|
+
# 只在需要重新加载时获取锁
|
|
49
66
|
if need_reload:
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
67
|
+
with self._rule_lock:
|
|
68
|
+
# 双重检查,避免多线程重复加载
|
|
69
|
+
current_mtime = self._get_file_mtime()
|
|
70
|
+
if self._cache_rules is None or (current_mtime is not None and self._cache_mtime != current_mtime):
|
|
71
|
+
self._load_rules_from_file(current_mtime)
|
|
72
|
+
|
|
73
|
+
# 返回规则副本,避免外部修改影响缓存
|
|
74
|
+
with self._rule_lock:
|
|
75
|
+
return self._cache_rules.copy() if self._cache_rules else self._get_default_rules()
|
|
76
|
+
|
|
77
|
+
def _get_file_mtime(self) -> Optional[float]:
|
|
78
|
+
"""获取文件修改时间,与IO相关的操作单独提取出来"""
|
|
79
|
+
try:
|
|
80
|
+
return os.path.getmtime(self.filter_rules_path) if os.path.exists(self.filter_rules_path) else None
|
|
81
|
+
except Exception:
|
|
82
|
+
logger.warning(f"Failed to get mtime for {self.filter_rules_path}")
|
|
83
|
+
return None
|
|
84
|
+
|
|
85
|
+
def _get_default_rules(self) -> Dict:
|
|
86
|
+
"""返回默认的规则结构"""
|
|
87
|
+
return {
|
|
88
|
+
"default": {
|
|
89
|
+
"whitelist": [],
|
|
90
|
+
"blacklist": []
|
|
91
|
+
}
|
|
92
|
+
}
|
|
60
93
|
|
|
61
|
-
def
|
|
94
|
+
def _load_rules_from_file(self, current_mtime: Optional[float]) -> None:
|
|
95
|
+
"""从文件加载规则,仅在持有锁时调用"""
|
|
96
|
+
self._cache_rules = self._get_default_rules()
|
|
97
|
+
try:
|
|
98
|
+
if os.path.exists(self.filter_rules_path):
|
|
99
|
+
with open(self.filter_rules_path, "r", encoding="utf-8") as f:
|
|
100
|
+
file_rules = json.load(f)
|
|
101
|
+
|
|
102
|
+
# 转换旧格式规则到新格式(如果需要)
|
|
103
|
+
if "whitelist" in file_rules or "blacklist" in file_rules:
|
|
104
|
+
# 旧格式转换为新格式
|
|
105
|
+
self._cache_rules = {
|
|
106
|
+
"default": {
|
|
107
|
+
"whitelist": file_rules.get("whitelist", []),
|
|
108
|
+
"blacklist": file_rules.get("blacklist", [])
|
|
109
|
+
}
|
|
110
|
+
}
|
|
111
|
+
logger.info("Converted old format rules to new format")
|
|
112
|
+
else:
|
|
113
|
+
# 新格式直接使用
|
|
114
|
+
self._cache_rules = file_rules
|
|
115
|
+
self._cache_mtime = current_mtime
|
|
116
|
+
except Exception as e:
|
|
117
|
+
logger.warning(f"Failed to load filterrules: {e}")
|
|
118
|
+
|
|
119
|
+
@lru_cache(maxsize=1024) # 缓存频繁使用的路径判断结果
|
|
120
|
+
def should_parse_file(self, file_path: str, file_type: str = "default") -> bool:
|
|
62
121
|
"""
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
122
|
+
判断某个文件是否需要进行解析。
|
|
123
|
+
|
|
124
|
+
参数:
|
|
125
|
+
file_path: 文件路径
|
|
126
|
+
file_type: 文件类型(如"image"、"document"等),默认为"default"
|
|
127
|
+
|
|
69
128
|
返回:
|
|
70
129
|
True 表示应该解析
|
|
71
130
|
False 表示不解析
|
|
72
131
|
"""
|
|
73
132
|
import fnmatch
|
|
74
|
-
|
|
75
|
-
|
|
133
|
+
|
|
76
134
|
rules = self.load_filter_rules()
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
return fnmatch.fnmatch(path, pat)
|
|
84
|
-
elif pattern.startswith("regex:"):
|
|
85
|
-
pat = pattern[len("regex:"):]
|
|
86
|
-
try:
|
|
87
|
-
return re.search(pat, path) is not None
|
|
88
|
-
except re.error:
|
|
89
|
-
logger.warning(f"Invalid regex pattern: {pat}")
|
|
90
|
-
return False
|
|
91
|
-
else:
|
|
92
|
-
# 默认按glob处理
|
|
93
|
-
return fnmatch.fnmatch(path, pattern)
|
|
94
|
-
|
|
135
|
+
|
|
136
|
+
# 获取指定类型的规则,如果不存在则使用默认规则
|
|
137
|
+
type_rules = rules.get(file_type, rules.get("default", {"whitelist": [], "blacklist": []}))
|
|
138
|
+
whitelist = type_rules.get("whitelist", [])
|
|
139
|
+
blacklist = type_rules.get("blacklist", [])
|
|
140
|
+
|
|
95
141
|
# 优先匹配黑名单
|
|
96
142
|
for pattern in blacklist:
|
|
97
|
-
if
|
|
143
|
+
if fnmatch.fnmatch(file_path, pattern):
|
|
98
144
|
return False
|
|
99
|
-
|
|
100
|
-
#
|
|
145
|
+
|
|
146
|
+
# 如果白名单为空,则默认所有文件都通过(除非被黑名单过滤)
|
|
147
|
+
if not whitelist:
|
|
148
|
+
return True
|
|
149
|
+
|
|
150
|
+
# 匹配白名单
|
|
101
151
|
for pattern in whitelist:
|
|
102
|
-
if
|
|
152
|
+
if fnmatch.fnmatch(file_path, pattern):
|
|
103
153
|
return True
|
|
104
|
-
|
|
105
|
-
#
|
|
154
|
+
|
|
155
|
+
# 有白名单但不匹配,不通过
|
|
106
156
|
return False
|
|
157
|
+
|
|
158
|
+
# 保持向后兼容
|
|
159
|
+
def should_parse_image(self, file_path: str) -> bool:
|
|
160
|
+
"""
|
|
161
|
+
判断某个图片文件是否需要解析(兼容旧版API)
|
|
162
|
+
"""
|
|
163
|
+
return self.should_parse_file(file_path, "image")
|
|
@@ -538,7 +538,7 @@ class ImageLoader:
|
|
|
538
538
|
def image_to_markdown(
|
|
539
539
|
image_path: str,
|
|
540
540
|
llm,
|
|
541
|
-
engine: str = "
|
|
541
|
+
engine: str = "paddle",
|
|
542
542
|
product_mode: str = "lite",
|
|
543
543
|
paddle_kwargs: dict = None
|
|
544
544
|
) -> str:
|
|
@@ -554,6 +554,13 @@ class ImageLoader:
|
|
|
554
554
|
Returns:
|
|
555
555
|
markdown内容字符串
|
|
556
556
|
"""
|
|
557
|
+
logger.info(f"image_path: {image_path} engine: {engine} product_mode: {product_mode} paddle_kwargs: {paddle_kwargs}")
|
|
558
|
+
|
|
559
|
+
# 新增:如果 engine 为 paddle 且 PaddleOCR 为 None,直接返回空字符串
|
|
560
|
+
if engine == "paddle" and PaddleOCR is None:
|
|
561
|
+
logger.warning("PaddleOCR 未安装,无法识别图片内容,直接返回空字符串。")
|
|
562
|
+
return ""
|
|
563
|
+
|
|
557
564
|
md_content = ImageLoader.extract_text_from_image(
|
|
558
565
|
image_path,
|
|
559
566
|
llm,
|
|
@@ -561,8 +568,13 @@ class ImageLoader:
|
|
|
561
568
|
product_mode=product_mode,
|
|
562
569
|
paddle_kwargs=paddle_kwargs
|
|
563
570
|
)
|
|
564
|
-
|
|
565
|
-
|
|
571
|
+
|
|
572
|
+
# Get directory and filename separately
|
|
573
|
+
dir_name = os.path.dirname(image_path)
|
|
574
|
+
file_name = os.path.basename(image_path)
|
|
575
|
+
base_name = os.path.splitext(file_name)[0]
|
|
576
|
+
# Create new path with dot before filename
|
|
577
|
+
md_path = os.path.join(dir_name, f".{base_name}.md")
|
|
566
578
|
try:
|
|
567
579
|
with open(md_path, "w", encoding="utf-8") as f:
|
|
568
580
|
f.write(md_content)
|
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
from io import BytesIO
|
|
2
2
|
from pypdf import PdfReader
|
|
3
|
-
from autocoder.utils._markitdown import MarkItDown
|
|
4
3
|
import traceback
|
|
5
4
|
|
|
6
5
|
|
|
@@ -15,7 +14,8 @@ def extract_text_from_pdf_old(file_path):
|
|
|
15
14
|
return text
|
|
16
15
|
|
|
17
16
|
def extract_text_from_pdf(file_path, llm=None, product_mode="lite"):
|
|
18
|
-
try:
|
|
17
|
+
try:
|
|
18
|
+
from autocoder.utils._markitdown import MarkItDown
|
|
19
19
|
md_converter = MarkItDown(llm=llm, product_mode=product_mode)
|
|
20
20
|
result = md_converter.convert(file_path)
|
|
21
21
|
return result.text_content
|
|
@@ -41,6 +41,8 @@ from autocoder.rag.qa_conversation_strategy import get_qa_strategy
|
|
|
41
41
|
from autocoder.rag.searchable import SearchableResults
|
|
42
42
|
from autocoder.rag.conversation_to_queries import extract_search_queries
|
|
43
43
|
from autocoder.common import openai_content as OpenAIContentProcessor
|
|
44
|
+
from autocoder.common.save_formatted_log import save_formatted_log
|
|
45
|
+
import json, os
|
|
44
46
|
try:
|
|
45
47
|
from autocoder_pro.rag.llm_compute import LLMComputeEngine
|
|
46
48
|
pro_version = version("auto-coder-pro")
|
|
@@ -849,6 +851,15 @@ class LongContextRAG:
|
|
|
849
851
|
conversations=conversations, local_image_host=self.args.local_image_host
|
|
850
852
|
)
|
|
851
853
|
|
|
854
|
+
# 保存 new_conversations
|
|
855
|
+
try:
|
|
856
|
+
logger.info(f"Saving new_conversations log to {self.args.source_dir}/.cache/logs")
|
|
857
|
+
project_root = self.args.source_dir
|
|
858
|
+
json_text = json.dumps(new_conversations, ensure_ascii=False)
|
|
859
|
+
save_formatted_log(project_root, json_text, "rag_conversation")
|
|
860
|
+
except Exception as e:
|
|
861
|
+
logger.warning(f"Failed to save new_conversations log: {e}")
|
|
862
|
+
|
|
852
863
|
chunks = target_llm.stream_chat_oai(
|
|
853
864
|
conversations=new_conversations,
|
|
854
865
|
model=model,
|
|
@@ -2,6 +2,7 @@ from abc import ABC, abstractmethod
|
|
|
2
2
|
from typing import List, Dict, Any, Generator
|
|
3
3
|
import byzerllm
|
|
4
4
|
from autocoder.common import AutoCoderArgs
|
|
5
|
+
from autocoder.common.rulefiles.autocoderrules_utils import get_rules
|
|
5
6
|
|
|
6
7
|
class QAConversationStrategy(ABC):
|
|
7
8
|
"""
|
|
@@ -124,22 +125,8 @@ class MultiRoundStrategy(QAConversationStrategy):
|
|
|
124
125
|
{% endfor %}
|
|
125
126
|
{% endif %}
|
|
126
127
|
|
|
127
|
-
"""
|
|
128
|
-
|
|
129
|
-
import os
|
|
130
|
-
extra_docs = {}
|
|
131
|
-
rules_dir = os.path.join(self.args.source_dir, ".autocoderrules")
|
|
132
|
-
if os.path.isdir(rules_dir):
|
|
133
|
-
for fname in os.listdir(rules_dir):
|
|
134
|
-
if fname.endswith(".md"):
|
|
135
|
-
fpath = os.path.join(rules_dir, fname)
|
|
136
|
-
try:
|
|
137
|
-
with open(fpath, "r", encoding="utf-8") as f:
|
|
138
|
-
content = f.read()
|
|
139
|
-
key = os.path.splitext(fname)[0]
|
|
140
|
-
extra_docs[key] = content
|
|
141
|
-
except Exception:
|
|
142
|
-
continue
|
|
128
|
+
"""
|
|
129
|
+
extra_docs = get_rules()
|
|
143
130
|
return {"extra_docs": extra_docs}
|
|
144
131
|
|
|
145
132
|
class SingleRoundStrategy(QAConversationStrategy):
|
|
@@ -253,21 +240,8 @@ class SingleRoundStrategy(QAConversationStrategy):
|
|
|
253
240
|
{% endfor %}
|
|
254
241
|
{% endif %}
|
|
255
242
|
|
|
256
|
-
"""
|
|
257
|
-
|
|
258
|
-
extra_docs = {}
|
|
259
|
-
rules_dir = os.path.join(getattr(self, 'args', None).source_dir if getattr(self, 'args', None) else ".", ".autocoderrules")
|
|
260
|
-
if os.path.isdir(rules_dir):
|
|
261
|
-
for fname in os.listdir(rules_dir):
|
|
262
|
-
if fname.endswith(".md"):
|
|
263
|
-
fpath = os.path.join(rules_dir, fname)
|
|
264
|
-
try:
|
|
265
|
-
with open(fpath, "r", encoding="utf-8") as f:
|
|
266
|
-
content = f.read()
|
|
267
|
-
key = os.path.splitext(fname)[0]
|
|
268
|
-
extra_docs[key] = content
|
|
269
|
-
except Exception:
|
|
270
|
-
continue
|
|
243
|
+
"""
|
|
244
|
+
extra_docs = extra_docs = get_rules()
|
|
271
245
|
return {"extra_docs": extra_docs}
|
|
272
246
|
|
|
273
247
|
def get_qa_strategy(args: AutoCoderArgs) -> QAConversationStrategy:
|
autocoder/rag/utils.py
CHANGED
|
@@ -2,8 +2,9 @@ from autocoder.common import SourceCode
|
|
|
2
2
|
from autocoder.rag.token_counter import count_tokens_worker, count_tokens
|
|
3
3
|
from autocoder.rag.loaders.pdf_loader import extract_text_from_pdf
|
|
4
4
|
from autocoder.rag.loaders.docx_loader import extract_text_from_docx
|
|
5
|
-
from autocoder.rag.loaders.excel_loader import extract_text_from_excel
|
|
5
|
+
from autocoder.rag.loaders.excel_loader import extract_text_from_excel
|
|
6
6
|
from autocoder.rag.loaders.ppt_loader import extract_text_from_ppt
|
|
7
|
+
from autocoder.rag.loaders.image_loader import ImageLoader
|
|
7
8
|
from typing import List, Tuple, Optional, Union
|
|
8
9
|
import time
|
|
9
10
|
from loguru import logger
|
|
@@ -21,7 +22,7 @@ def process_file_in_multi_process(
|
|
|
21
22
|
llm = get_single_llm(llm,product_mode)
|
|
22
23
|
|
|
23
24
|
start_time = time.time()
|
|
24
|
-
file_path, relative_path, _, _ = file_info
|
|
25
|
+
file_path, relative_path, _, _ = file_info
|
|
25
26
|
try:
|
|
26
27
|
if file_path.endswith(".pdf"):
|
|
27
28
|
content = extract_text_from_pdf(file_path, llm, product_mode)
|
|
@@ -61,6 +62,15 @@ def process_file_in_multi_process(
|
|
|
61
62
|
tokens=count_tokens_worker(content),
|
|
62
63
|
)
|
|
63
64
|
]
|
|
65
|
+
elif file_path.lower().endswith((".png", ".jpg", ".jpeg", ".bmp", ".gif")):
|
|
66
|
+
content = ImageLoader.image_to_markdown(file_path, llm=llm, product_mode=product_mode)
|
|
67
|
+
v = [
|
|
68
|
+
SourceCode(
|
|
69
|
+
module_name=f"##File: {file_path}",
|
|
70
|
+
source_code=content,
|
|
71
|
+
tokens=count_tokens_worker(content),
|
|
72
|
+
)
|
|
73
|
+
]
|
|
64
74
|
else:
|
|
65
75
|
with open(file_path, "r", encoding="utf-8") as f:
|
|
66
76
|
content = f.read()
|
|
@@ -126,6 +136,15 @@ def process_file_local(
|
|
|
126
136
|
tokens=count_tokens(content),
|
|
127
137
|
)
|
|
128
138
|
]
|
|
139
|
+
elif file_path.lower().endswith((".png", ".jpg", ".jpeg", ".bmp", ".gif")):
|
|
140
|
+
content = ImageLoader.image_to_markdown(file_path, llm=llm, product_mode=product_mode)
|
|
141
|
+
v = [
|
|
142
|
+
SourceCode(
|
|
143
|
+
module_name=f"##File: {file_path}",
|
|
144
|
+
source_code=content,
|
|
145
|
+
tokens=count_tokens(content),
|
|
146
|
+
)
|
|
147
|
+
]
|
|
129
148
|
else:
|
|
130
149
|
with open(file_path, "r", encoding="utf-8") as f:
|
|
131
150
|
content = f.read()
|
autocoder/utils/_markitdown.py
CHANGED
|
@@ -30,18 +30,20 @@ from pdfminer.pdfpage import PDFPage
|
|
|
30
30
|
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
|
|
31
31
|
import pptx
|
|
32
32
|
from pdfminer.image import ImageWriter
|
|
33
|
+
import time
|
|
33
34
|
|
|
34
35
|
import numpy as np
|
|
35
36
|
from PIL import Image
|
|
36
37
|
|
|
37
38
|
# 新增导入
|
|
38
|
-
from autocoder.rag.loaders import
|
|
39
|
+
from autocoder.rag.loaders.filter_utils import FilterRuleManager
|
|
39
40
|
from autocoder.rag.loaders.image_loader import ImageLoader
|
|
40
41
|
|
|
41
42
|
# File-format detection
|
|
42
43
|
import puremagic
|
|
43
44
|
import requests
|
|
44
45
|
from bs4 import BeautifulSoup
|
|
46
|
+
from loguru import logger
|
|
45
47
|
|
|
46
48
|
# Optional Transcription support
|
|
47
49
|
try:
|
|
@@ -503,12 +505,16 @@ class PdfConverter(DocumentConverter):
|
|
|
503
505
|
Converts PDFs to Markdown with support for extracting and including images.
|
|
504
506
|
"""
|
|
505
507
|
|
|
508
|
+
def __init__(self, llm=None, product_mode="lite"):
|
|
509
|
+
super().__init__()
|
|
510
|
+
self.llm = llm
|
|
511
|
+
self.product_mode = product_mode
|
|
512
|
+
|
|
506
513
|
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
|
507
514
|
# Bail if not a PDF
|
|
508
515
|
extension = kwargs.get("file_extension", "")
|
|
509
516
|
if extension.lower() != ".pdf":
|
|
510
|
-
return None
|
|
511
|
-
|
|
517
|
+
return None
|
|
512
518
|
image_output_dir = None
|
|
513
519
|
if kwargs.get("image_output_dir", None):
|
|
514
520
|
image_output_dir = kwargs.get("image_output_dir")
|
|
@@ -531,17 +537,18 @@ class PdfConverter(DocumentConverter):
|
|
|
531
537
|
rsrcmgr = PDFResourceManager()
|
|
532
538
|
laparams = LAParams()
|
|
533
539
|
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
|
|
534
|
-
interpreter = PDFPageInterpreter(rsrcmgr, device)
|
|
540
|
+
interpreter = PDFPageInterpreter(rsrcmgr, device)
|
|
535
541
|
|
|
536
542
|
# Process each page
|
|
537
543
|
for page in PDFPage.create_pages(document):
|
|
538
544
|
interpreter.process_page(page)
|
|
539
|
-
layout = device.get_result()
|
|
545
|
+
layout = device.get_result()
|
|
540
546
|
|
|
541
547
|
# Extract text and images from the page
|
|
542
548
|
page_content = self._process_layout(
|
|
543
549
|
layout, image_output_dir, image_count
|
|
544
550
|
)
|
|
551
|
+
|
|
545
552
|
text_content.extend(page_content)
|
|
546
553
|
image_count += len([c for c in page_content if c.startswith("![Image")])
|
|
547
554
|
|
|
@@ -582,13 +589,12 @@ class PdfConverter(DocumentConverter):
|
|
|
582
589
|
image_output_dir, f"image_{local_image_count}{suffix}")
|
|
583
590
|
os.rename(temp_path, image_path)
|
|
584
591
|
content.append(f"")
|
|
585
|
-
# =====
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
import traceback; traceback.print_exc()
|
|
592
|
+
# ===== 修改:通过FilterRuleManager单例实例判断是否需要解析图片
|
|
593
|
+
v = try_parse_image(image_path,self.llm)
|
|
594
|
+
if v:
|
|
595
|
+
content.append("<image_content>")
|
|
596
|
+
content.append(v)
|
|
597
|
+
content.append("</image_content>")
|
|
592
598
|
# =====
|
|
593
599
|
local_image_count += 1
|
|
594
600
|
continue
|
|
@@ -618,7 +624,11 @@ class PdfConverter(DocumentConverter):
|
|
|
618
624
|
content.append(
|
|
619
625
|
f"\n"
|
|
620
626
|
)
|
|
621
|
-
try_parse_image(image_path)
|
|
627
|
+
v = try_parse_image(image_path,self.llm)
|
|
628
|
+
if v:
|
|
629
|
+
content.append("<image_content>")
|
|
630
|
+
content.append(v)
|
|
631
|
+
content.append("</image_content>")
|
|
622
632
|
local_image_count += 1
|
|
623
633
|
continue
|
|
624
634
|
elif colorspace == "DeviceGray":
|
|
@@ -629,7 +639,11 @@ class PdfConverter(DocumentConverter):
|
|
|
629
639
|
content.append(
|
|
630
640
|
f"\n"
|
|
631
641
|
)
|
|
632
|
-
try_parse_image(image_path)
|
|
642
|
+
v = try_parse_image(image_path,self.llm)
|
|
643
|
+
if v:
|
|
644
|
+
content.append("<image_content>")
|
|
645
|
+
content.append(v)
|
|
646
|
+
content.append("</image_content>")
|
|
633
647
|
local_image_count += 1
|
|
634
648
|
continue
|
|
635
649
|
except Exception as e:
|
|
@@ -641,8 +655,12 @@ class PdfConverter(DocumentConverter):
|
|
|
641
655
|
img_file.write(image_data)
|
|
642
656
|
|
|
643
657
|
content.append(f"\n")
|
|
644
|
-
# =====
|
|
645
|
-
try_parse_image(image_path)
|
|
658
|
+
# ===== 新增:图片解析
|
|
659
|
+
v = try_parse_image(image_path,self.llm)
|
|
660
|
+
if v:
|
|
661
|
+
content.append("<image_content>")
|
|
662
|
+
content.append(v)
|
|
663
|
+
content.append("</image_content>")
|
|
646
664
|
local_image_count += 1
|
|
647
665
|
|
|
648
666
|
# Handle text
|
|
@@ -1089,6 +1107,8 @@ class MarkItDown:
|
|
|
1089
1107
|
llm: Optional[Any] = None,
|
|
1090
1108
|
product_mode: Optional[str] = None,
|
|
1091
1109
|
):
|
|
1110
|
+
# 初始化FilterRuleManager单例实例
|
|
1111
|
+
self._filter_rule_manager = FilterRuleManager.get_instance()
|
|
1092
1112
|
if requests_session is None:
|
|
1093
1113
|
self._requests_session = requests.Session()
|
|
1094
1114
|
else:
|
|
@@ -1117,7 +1137,7 @@ class MarkItDown:
|
|
|
1117
1137
|
self.register_page_converter(WavConverter())
|
|
1118
1138
|
self.register_page_converter(Mp3Converter())
|
|
1119
1139
|
self.register_page_converter(ImageConverter())
|
|
1120
|
-
self.register_page_converter(PdfConverter())
|
|
1140
|
+
self.register_page_converter(PdfConverter(llm,product_mode))
|
|
1121
1141
|
|
|
1122
1142
|
def convert(
|
|
1123
1143
|
self, source: Union[str, requests.Response], **kwargs: Any
|
|
@@ -1126,8 +1146,7 @@ class MarkItDown:
|
|
|
1126
1146
|
Args:
|
|
1127
1147
|
- source: can be a string representing a path or url, or a requests.response object
|
|
1128
1148
|
- extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.)
|
|
1129
|
-
"""
|
|
1130
|
-
|
|
1149
|
+
"""
|
|
1131
1150
|
# Local path or url
|
|
1132
1151
|
if isinstance(source, str):
|
|
1133
1152
|
if (
|
|
@@ -1343,14 +1362,36 @@ class MarkItDown:
|
|
|
1343
1362
|
self._page_converters.insert(0, converter)
|
|
1344
1363
|
|
|
1345
1364
|
|
|
1346
|
-
def try_parse_image(image_path: str):
|
|
1365
|
+
def try_parse_image(image_path: str, llm=None):
|
|
1347
1366
|
"""
|
|
1348
|
-
根据
|
|
1367
|
+
根据FilterRuleManager单例实例判断是否需要解析图片,如果需要则调用ImageLoader.image_to_markdown。
|
|
1349
1368
|
解析失败会自动捕获异常。
|
|
1350
1369
|
"""
|
|
1351
|
-
|
|
1370
|
+
import uuid
|
|
1371
|
+
start_time = time.time()
|
|
1372
|
+
req_id = str(uuid.uuid4())[:8]
|
|
1373
|
+
logger.info(f"\n==== [try_parse_image] START | req_id={req_id} ====")
|
|
1374
|
+
logger.info(f"[try_parse_image][{req_id}] image_path: {image_path}, llm: {llm}")
|
|
1375
|
+
if FilterRuleManager.get_instance().should_parse_image(image_path):
|
|
1376
|
+
logger.info(f"[try_parse_image][{req_id}] should_parse_image=True, start parsing...")
|
|
1352
1377
|
try:
|
|
1353
|
-
|
|
1354
|
-
|
|
1355
|
-
|
|
1378
|
+
v = ImageLoader.image_to_markdown(image_path, llm=llm, engine="paddle")
|
|
1379
|
+
logger.info(f"[try_parse_image][{req_id}] image_to_markdown result: {str(v)[:200]}")
|
|
1380
|
+
if llm:
|
|
1381
|
+
v = ImageLoader.format_table_in_content(v, llm)
|
|
1382
|
+
logger.info(f"[try_parse_image][{req_id}] format_table_in_content result: {str(v)[:200]}")
|
|
1383
|
+
elapsed = time.time() - start_time
|
|
1384
|
+
logger.info(f"[try_parse_image][{req_id}] SUCCESS | execution time: {elapsed:.3f} seconds")
|
|
1385
|
+
logger.info(f"==== [try_parse_image] END | req_id={req_id} ====")
|
|
1386
|
+
return v
|
|
1387
|
+
except Exception as e:
|
|
1388
|
+
elapsed = time.time() - start_time
|
|
1389
|
+
logger.error(f"[try_parse_image][{req_id}] EXCEPTION | execution time: {elapsed:.3f} seconds | image_path: {image_path} | llm: {llm}")
|
|
1390
|
+
logger.exception(e)
|
|
1391
|
+
logger.info(f"==== [try_parse_image] END (EXCEPTION) | req_id={req_id} ====")
|
|
1392
|
+
return ""
|
|
1393
|
+
else:
|
|
1394
|
+
logger.info(f"[try_parse_image][{req_id}] should_parse_image=False, skip parsing.")
|
|
1395
|
+
logger.info(f"==== [try_parse_image] END (SKIP) | req_id={req_id} ====")
|
|
1396
|
+
return ""
|
|
1356
1397
|
|
|
@@ -292,6 +292,7 @@ def stream_out(
|
|
|
292
292
|
get_event_manager(args.event_file).write_stream(content.to_dict(),
|
|
293
293
|
metadata=EventMetadata(
|
|
294
294
|
stream_out_type=extra_meta.get("stream_out_type", ""),
|
|
295
|
+
path=extra_meta.get("path", ""),
|
|
295
296
|
is_streaming=True,
|
|
296
297
|
output="delta",
|
|
297
298
|
action_file=args.file
|
autocoder/version.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "0.1.
|
|
1
|
+
__version__ = "0.1.354"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|