auto_coder-0.1.243-py3-none-any.whl → auto_coder-0.1.245-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of auto-coder might be problematic. Click here for more details.
- {auto_coder-0.1.243.dist-info → auto_coder-0.1.245.dist-info}/METADATA +2 -2
- {auto_coder-0.1.243.dist-info → auto_coder-0.1.245.dist-info}/RECORD +21 -20
- autocoder/agent/auto_review_commit.py +207 -0
- autocoder/auto_coder.py +34 -6
- autocoder/chat_auto_coder.py +27 -17
- autocoder/chat_auto_coder_lang.py +9 -2
- autocoder/common/auto_coder_lang.py +22 -2
- autocoder/common/files.py +33 -1
- autocoder/index/entry.py +6 -6
- autocoder/index/filter/normal_filter.py +2 -1
- autocoder/index/filter/quick_filter.py +1 -1
- autocoder/index/index.py +1 -0
- autocoder/models.py +22 -22
- autocoder/suffixproject/__init__.py +2 -3
- autocoder/utils/auto_coder_utils/chat_stream_out.py +105 -227
- autocoder/utils/rest.py +45 -93
- autocoder/version.py +1 -1
- {auto_coder-0.1.243.dist-info → auto_coder-0.1.245.dist-info}/LICENSE +0 -0
- {auto_coder-0.1.243.dist-info → auto_coder-0.1.245.dist-info}/WHEEL +0 -0
- {auto_coder-0.1.243.dist-info → auto_coder-0.1.245.dist-info}/entry_points.txt +0 -0
- {auto_coder-0.1.243.dist-info → auto_coder-0.1.245.dist-info}/top_level.txt +0 -0
|
@@ -69,7 +69,7 @@ class QuickFilter():
|
|
|
69
69
|
|
|
70
70
|
def filter(self, index_items: List[IndexItem], query: str) -> Dict[str, TargetFile]:
|
|
71
71
|
final_files: Dict[str, TargetFile] = {}
|
|
72
|
-
if not self.args.skip_filter_index and self.
|
|
72
|
+
if not self.args.skip_filter_index and self.args.index_filter_model:
|
|
73
73
|
start_time = time.monotonic()
|
|
74
74
|
index_items = self.index_manager.read_index()
|
|
75
75
|
|
autocoder/index/index.py
CHANGED
autocoder/models.py
CHANGED
|
@@ -2,6 +2,7 @@ import os
|
|
|
2
2
|
import json
|
|
3
3
|
from typing import List, Dict
|
|
4
4
|
from urllib.parse import urlparse
|
|
5
|
+
from autocoder.common.auto_coder_lang import get_message_with_format
|
|
5
6
|
|
|
6
7
|
MODELS_JSON = os.path.expanduser("~/.auto-coder/keys/models.json")
|
|
7
8
|
|
|
@@ -36,6 +37,22 @@ default_models_list = [
|
|
|
36
37
|
}
|
|
37
38
|
]
|
|
38
39
|
|
|
40
|
+
def process_api_key_path(base_url: str) -> str:
|
|
41
|
+
"""
|
|
42
|
+
从 base_url 中提取 host 部分并处理特殊字符
|
|
43
|
+
例如: https://api.example.com:8080/v1 -> api.example.com_8080
|
|
44
|
+
"""
|
|
45
|
+
if not base_url:
|
|
46
|
+
return ""
|
|
47
|
+
|
|
48
|
+
parsed = urlparse(base_url)
|
|
49
|
+
host = parsed.netloc
|
|
50
|
+
|
|
51
|
+
# 将冒号替换为下划线
|
|
52
|
+
host = host.replace(":", "_")
|
|
53
|
+
|
|
54
|
+
return host
|
|
55
|
+
|
|
39
56
|
def load_models() -> List[Dict]:
|
|
40
57
|
"""
|
|
41
58
|
Load models from ~/.auto-coder/keys/models.json and merge with default_models_list.
|
|
@@ -73,7 +90,7 @@ def load_models() -> List[Dict]:
|
|
|
73
90
|
api_key_file = os.path.join(api_key_dir, model["api_key_path"])
|
|
74
91
|
if os.path.exists(api_key_file):
|
|
75
92
|
with open(api_key_file, "r") as f:
|
|
76
|
-
model["api_key"] = f.read()
|
|
93
|
+
model["api_key"] = f.read()
|
|
77
94
|
return target_models
|
|
78
95
|
|
|
79
96
|
def save_models(models: List[Dict]) -> None:
|
|
@@ -85,22 +102,6 @@ def save_models(models: List[Dict]) -> None:
|
|
|
85
102
|
json.dump(models, f, indent=2, ensure_ascii=False)
|
|
86
103
|
|
|
87
104
|
|
|
88
|
-
def process_api_key_path(base_url: str) -> str:
|
|
89
|
-
"""
|
|
90
|
-
从 base_url 中提取 host 部分并处理特殊字符
|
|
91
|
-
例如: https://api.example.com:8080/v1 -> api.example.com_8080
|
|
92
|
-
"""
|
|
93
|
-
if not base_url:
|
|
94
|
-
return ""
|
|
95
|
-
|
|
96
|
-
parsed = urlparse(base_url)
|
|
97
|
-
host = parsed.netloc
|
|
98
|
-
|
|
99
|
-
# 将冒号替换为下划线
|
|
100
|
-
host = host.replace(":", "_")
|
|
101
|
-
|
|
102
|
-
return host
|
|
103
|
-
|
|
104
105
|
def get_model_by_name(name: str) -> Dict:
|
|
105
106
|
"""
|
|
106
107
|
根据模型名称查找模型
|
|
@@ -108,8 +109,8 @@ def get_model_by_name(name: str) -> Dict:
|
|
|
108
109
|
models = load_models()
|
|
109
110
|
v = [m for m in models if m["name"] == name.strip()]
|
|
110
111
|
|
|
111
|
-
if len(v) == 0:
|
|
112
|
-
raise Exception(
|
|
112
|
+
if len(v) == 0:
|
|
113
|
+
raise Exception(get_message_with_format("model_not_found", model_name=name))
|
|
113
114
|
return v[0]
|
|
114
115
|
|
|
115
116
|
def update_model_with_api_key(name: str, api_key: str) -> Dict:
|
|
@@ -135,9 +136,8 @@ def update_model_with_api_key(name: str, api_key: str) -> Dict:
|
|
|
135
136
|
|
|
136
137
|
if not found_model:
|
|
137
138
|
return None
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
api_key_path = process_api_key_path(found_model["base_url"])
|
|
139
|
+
|
|
140
|
+
api_key_path = name
|
|
141
141
|
if api_key_path:
|
|
142
142
|
found_model["api_key_path"] = api_key_path
|
|
143
143
|
|
|
@@ -36,9 +36,8 @@ class SuffixProject:
|
|
|
36
36
|
self.target_file = args.target_file
|
|
37
37
|
self.project_type = args.project_type
|
|
38
38
|
self.suffixs = [
|
|
39
|
-
|
|
40
|
-
for suffix in self.project_type.split(",")
|
|
41
|
-
if suffix.strip() != ""
|
|
39
|
+
suffix.strip() if suffix.startswith(".") else f".{suffix.strip()}"
|
|
40
|
+
for suffix in self.project_type.split(",") if suffix.strip()
|
|
42
41
|
]
|
|
43
42
|
self.file_filter = file_filter
|
|
44
43
|
self.sources = []
|
|
@@ -11,256 +11,132 @@ from autocoder.utils.request_queue import request_queue
|
|
|
11
11
|
import time
|
|
12
12
|
|
|
13
13
|
MAX_HISTORY_LINES = 40 # 最大保留历史行数
|
|
14
|
-
LAYOUT_TYPES = Literal["vertical", "horizontal"]
|
|
15
14
|
|
|
16
|
-
class
|
|
17
|
-
def __init__(self,
|
|
18
|
-
self.
|
|
19
|
-
self.
|
|
20
|
-
self.queue = Queue()
|
|
15
|
+
class StreamRenderer:
|
|
16
|
+
def __init__(self, title: str):
|
|
17
|
+
self.title = title
|
|
18
|
+
self.content = ""
|
|
21
19
|
self.lock = Lock()
|
|
22
|
-
self.
|
|
23
|
-
self.workers = []
|
|
24
|
-
self.layout_type = layout_type
|
|
25
|
-
self.stream_count = 0
|
|
26
|
-
|
|
27
|
-
def _create_stream_panel(self, idx: int) -> Layout:
|
|
28
|
-
"""创建流面板布局"""
|
|
29
|
-
# 计算安全高度
|
|
30
|
-
current_height = self.console.height or 24 # 默认24行防止获取失败
|
|
31
|
-
safe_height = max(min(50, current_height // 2 - 4), 5) # 限制最小高度为5行
|
|
32
|
-
|
|
33
|
-
# 使用整数设置 Layout 的 size
|
|
34
|
-
panel = Layout(name=f"stream-{idx}", size=safe_height)
|
|
35
|
-
|
|
36
|
-
panel.update(
|
|
37
|
-
Panel(
|
|
38
|
-
Markdown(""),
|
|
39
|
-
title=f"Stream {idx + 1}",
|
|
40
|
-
border_style="green",
|
|
41
|
-
height=safe_height # 确保数值有效
|
|
42
|
-
)
|
|
43
|
-
)
|
|
44
|
-
return panel
|
|
45
|
-
|
|
46
|
-
def prepare_layout(self, count: int):
|
|
47
|
-
"""准备动态布局结构"""
|
|
48
|
-
self.stream_count = count
|
|
49
|
-
|
|
50
|
-
# 创建一个主布局容器
|
|
51
|
-
streams_layout = Layout(name="streams")
|
|
52
|
-
|
|
53
|
-
# 创建所有流的布局
|
|
54
|
-
stream_layouts = []
|
|
55
|
-
for i in range(count):
|
|
56
|
-
stream_layout = Layout(name=f"stream-{i}")
|
|
57
|
-
panel = self._create_stream_panel(i)
|
|
58
|
-
stream_layout.update(panel)
|
|
59
|
-
stream_layouts.append(stream_layout)
|
|
60
|
-
|
|
61
|
-
# 将所有流添加到主布局
|
|
62
|
-
if stream_layouts:
|
|
63
|
-
streams_layout.update(stream_layouts[0])
|
|
64
|
-
for i in range(1, len(stream_layouts)):
|
|
65
|
-
if self.layout_type == "vertical":
|
|
66
|
-
streams_layout.split_column(stream_layouts[i])
|
|
67
|
-
elif self.layout_type == "horizontal":
|
|
68
|
-
streams_layout.split_row(stream_layouts[i])
|
|
69
|
-
else:
|
|
70
|
-
streams_layout.split_column(stream_layouts[i])
|
|
20
|
+
self.is_complete = False
|
|
71
21
|
|
|
72
|
-
|
|
73
|
-
self.layout.split(
|
|
74
|
-
Layout(name="header", size=1),
|
|
75
|
-
streams_layout
|
|
76
|
-
)
|
|
77
|
-
|
|
78
|
-
def update_panel(self, idx: int, content: str, final: bool = False):
|
|
79
|
-
"""线程安全的面板更新方法"""
|
|
22
|
+
def update(self, content: str):
|
|
80
23
|
with self.lock:
|
|
81
|
-
|
|
82
|
-
safe_height = min(50, self.console.height // 2 - 4)
|
|
24
|
+
self.content += content
|
|
83
25
|
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
else:
|
|
92
|
-
new_panel = Panel(
|
|
93
|
-
Markdown(content),
|
|
94
|
-
title=f"Stream {idx+1}",
|
|
95
|
-
border_style="green",
|
|
96
|
-
height=safe_height
|
|
97
|
-
)
|
|
26
|
+
def get_content(self) -> str:
|
|
27
|
+
with self.lock:
|
|
28
|
+
return self.content
|
|
29
|
+
|
|
30
|
+
def complete(self):
|
|
31
|
+
with self.lock:
|
|
32
|
+
self.is_complete = True
|
|
98
33
|
|
|
99
|
-
|
|
100
|
-
|
|
34
|
+
class MultiStreamRenderer:
|
|
35
|
+
def __init__(self, stream_titles: List[str], layout: str = "horizontal", console: Optional[Console] = None):
|
|
36
|
+
"""
|
|
37
|
+
Initialize multi-stream renderer
|
|
38
|
+
|
|
39
|
+
Args:
|
|
40
|
+
stream_titles: List of titles for each stream
|
|
41
|
+
layout: "horizontal" or "vertical"
|
|
42
|
+
console: Rich console instance
|
|
43
|
+
"""
|
|
44
|
+
if console is None:
|
|
45
|
+
console = Console(force_terminal=True, color_system="auto")
|
|
101
46
|
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
47
|
+
self.console = console
|
|
48
|
+
self.layout_type = layout
|
|
49
|
+
self.streams = [StreamRenderer(title) for title in stream_titles]
|
|
50
|
+
self.layout = Layout()
|
|
51
|
+
|
|
52
|
+
# Create named layouts for each stream
|
|
53
|
+
self.stream_layouts = [Layout(name=f"stream{i}") for i in range(len(stream_titles))]
|
|
54
|
+
|
|
55
|
+
# Configure layout
|
|
56
|
+
if layout == "horizontal":
|
|
57
|
+
self.layout.split_row(*self.stream_layouts)
|
|
58
|
+
else:
|
|
59
|
+
self.layout.split_column(*self.stream_layouts)
|
|
111
60
|
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
61
|
+
def _process_stream(self,
|
|
62
|
+
stream_idx: int,
|
|
63
|
+
stream_generator: Generator[Tuple[str, Dict[str, Any]], None, None]):
|
|
64
|
+
"""Process a single stream in a separate thread"""
|
|
65
|
+
stream = self.streams[stream_idx]
|
|
66
|
+
try:
|
|
67
|
+
for content, meta in stream_generator:
|
|
68
|
+
if content:
|
|
69
|
+
stream.update(content)
|
|
70
|
+
finally:
|
|
71
|
+
stream.complete()
|
|
119
72
|
|
|
120
|
-
def
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
lines_buffer = []
|
|
128
|
-
current_line = ""
|
|
129
|
-
assistant_response = ""
|
|
130
|
-
last_meta = None
|
|
131
|
-
|
|
132
|
-
try:
|
|
133
|
-
for res in generator:
|
|
134
|
-
content, meta = res
|
|
135
|
-
last_meta = meta
|
|
73
|
+
def render_streams(self,
|
|
74
|
+
stream_generators: List[Generator[Tuple[str, Dict[str, Any]], None, None]]) -> List[str]:
|
|
75
|
+
"""
|
|
76
|
+
Render multiple streams simultaneously
|
|
77
|
+
|
|
78
|
+
Args:
|
|
79
|
+
stream_generators: List of stream generators to render
|
|
136
80
|
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
81
|
+
Returns:
|
|
82
|
+
List of final content from each stream
|
|
83
|
+
"""
|
|
84
|
+
assert len(stream_generators) == len(self.streams), "Number of generators must match number of streams"
|
|
85
|
+
|
|
86
|
+
# Start processing threads
|
|
87
|
+
threads = []
|
|
88
|
+
for i, generator in enumerate(stream_generators):
|
|
89
|
+
thread = Thread(target=self._process_stream, args=(i, generator))
|
|
90
|
+
thread.daemon = True
|
|
91
|
+
thread.start()
|
|
92
|
+
threads.append(thread)
|
|
145
93
|
|
|
146
|
-
|
|
147
|
-
|
|
94
|
+
try:
|
|
95
|
+
with Live(self.layout, console=self.console, refresh_per_second=10) as live:
|
|
96
|
+
while any(not stream.is_complete for stream in self.streams):
|
|
97
|
+
# Update all panels
|
|
98
|
+
for i, stream in enumerate(self.streams):
|
|
99
|
+
panel = Panel(
|
|
100
|
+
Markdown(stream.get_content() or "Waiting..."),
|
|
101
|
+
title=stream.title,
|
|
102
|
+
border_style="green" if not stream.is_complete else "blue"
|
|
103
|
+
)
|
|
104
|
+
|
|
105
|
+
# Update appropriate layout section
|
|
106
|
+
self.stream_layouts[i].update(panel)
|
|
107
|
+
|
|
108
|
+
time.sleep(0.1) # Prevent excessive CPU usage
|
|
109
|
+
|
|
110
|
+
except KeyboardInterrupt:
|
|
111
|
+
print("\nStopping streams...")
|
|
148
112
|
|
|
149
|
-
|
|
113
|
+
# Wait for all threads to complete
|
|
114
|
+
for thread in threads:
|
|
115
|
+
thread.join()
|
|
150
116
|
|
|
151
|
-
|
|
152
|
-
request_queue.add_request(
|
|
153
|
-
request_id,
|
|
154
|
-
RequestValue(
|
|
155
|
-
value=StreamValue(value=[content]),
|
|
156
|
-
status=RequestOption.RUNNING,
|
|
157
|
-
),
|
|
158
|
-
)
|
|
159
|
-
|
|
160
|
-
if current_line:
|
|
161
|
-
lines_buffer.append(current_line)
|
|
162
|
-
controller.queue.put((idx, assistant_response, True))
|
|
163
|
-
return assistant_response, last_meta
|
|
164
|
-
|
|
165
|
-
except Exception as e:
|
|
166
|
-
error_content = f"Error: {str(e)}"
|
|
167
|
-
controller.queue.put((idx, error_content, True))
|
|
168
|
-
if request_id and request_queue:
|
|
169
|
-
request_queue.add_request(
|
|
170
|
-
request_id,
|
|
171
|
-
RequestValue(
|
|
172
|
-
value=StreamValue(value=[str(e)]),
|
|
173
|
-
status=RequestOption.FAILED
|
|
174
|
-
),
|
|
175
|
-
)
|
|
176
|
-
return assistant_response, last_meta
|
|
177
|
-
finally:
|
|
178
|
-
if request_id and request_queue:
|
|
179
|
-
request_queue.add_request(
|
|
180
|
-
request_id,
|
|
181
|
-
RequestValue(
|
|
182
|
-
value=StreamValue(value=[""]),
|
|
183
|
-
status=RequestOption.COMPLETED
|
|
184
|
-
),
|
|
185
|
-
)
|
|
117
|
+
return [stream.get_content() for stream in self.streams]
|
|
186
118
|
|
|
187
119
|
def multi_stream_out(
|
|
188
120
|
stream_generators: List[Generator[Tuple[str, Dict[str, Any]], None, None]],
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
) -> List[
|
|
121
|
+
titles: List[str],
|
|
122
|
+
layout: str = "horizontal",
|
|
123
|
+
console: Optional[Console] = None
|
|
124
|
+
) -> List[str]:
|
|
193
125
|
"""
|
|
194
|
-
|
|
126
|
+
Render multiple streams with Rich
|
|
195
127
|
|
|
196
128
|
Args:
|
|
197
|
-
stream_generators:
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
129
|
+
stream_generators: List of stream generators
|
|
130
|
+
titles: List of titles for each stream
|
|
131
|
+
layout: "horizontal" or "vertical"
|
|
132
|
+
console: Optional Rich console instance
|
|
201
133
|
|
|
202
134
|
Returns:
|
|
203
|
-
List
|
|
135
|
+
List of final content from each stream
|
|
204
136
|
"""
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
console = Console(force_terminal=True, color_system="auto", height=24)
|
|
208
|
-
|
|
209
|
-
# 初始化控制器
|
|
210
|
-
controller = StreamController(layout_type, console=console)
|
|
211
|
-
stream_count = len(stream_generators)
|
|
212
|
-
controller.prepare_layout(stream_count)
|
|
213
|
-
|
|
214
|
-
# 启动工作线程
|
|
215
|
-
results = [None] * stream_count
|
|
216
|
-
threads = []
|
|
217
|
-
|
|
218
|
-
# 创建工作线程
|
|
219
|
-
def worker_target(idx: int, gen: Generator[Tuple[str, Dict[str, Any]], None, None]):
|
|
220
|
-
req_id = request_ids[idx] if request_ids and idx < len(request_ids) else None
|
|
221
|
-
results[idx] = stream_worker(idx, gen, controller, req_id)
|
|
222
|
-
|
|
223
|
-
# 启动所有工作线程
|
|
224
|
-
for idx, gen in enumerate(stream_generators):
|
|
225
|
-
t = Thread(target=worker_target, args=(idx, gen))
|
|
226
|
-
t.start()
|
|
227
|
-
threads.append(t)
|
|
228
|
-
|
|
229
|
-
# 主渲染线程
|
|
230
|
-
try:
|
|
231
|
-
with Live(
|
|
232
|
-
controller.layout,
|
|
233
|
-
console=console or controller.console,
|
|
234
|
-
refresh_per_second=10,
|
|
235
|
-
screen=True
|
|
236
|
-
) as live:
|
|
237
|
-
while controller.running:
|
|
238
|
-
updated = False
|
|
239
|
-
try:
|
|
240
|
-
while True: # 处理队列中的所有更新
|
|
241
|
-
idx, content, final = controller.queue.get_nowait()
|
|
242
|
-
controller.update_panel(idx, content, final)
|
|
243
|
-
updated = True
|
|
244
|
-
except Empty:
|
|
245
|
-
pass
|
|
246
|
-
|
|
247
|
-
if updated:
|
|
248
|
-
live.refresh()
|
|
249
|
-
|
|
250
|
-
# 检查线程是否全部完成
|
|
251
|
-
if all(not t.is_alive() for t in threads):
|
|
252
|
-
break
|
|
253
|
-
|
|
254
|
-
time.sleep(0.1)
|
|
255
|
-
|
|
256
|
-
finally:
|
|
257
|
-
controller.running = False
|
|
258
|
-
for t in threads:
|
|
259
|
-
t.join()
|
|
137
|
+
renderer = MultiStreamRenderer(titles, layout, console)
|
|
138
|
+
return renderer.render_streams(stream_generators)
|
|
260
139
|
|
|
261
|
-
# 确保最后一次刷新
|
|
262
|
-
(console or controller.console).print(controller.layout)
|
|
263
|
-
return results
|
|
264
140
|
|
|
265
141
|
def stream_out(
|
|
266
142
|
stream_generator: Generator[Tuple[str, Dict[str, Any]], None, None],
|
|
@@ -358,6 +234,8 @@ def stream_out(
|
|
|
358
234
|
title="Error",
|
|
359
235
|
border_style="red"
|
|
360
236
|
))
|
|
237
|
+
# import traceback
|
|
238
|
+
# traceback.print_exc()
|
|
361
239
|
|
|
362
240
|
if request_id and request_queue:
|
|
363
241
|
request_queue.add_request(
|
autocoder/utils/rest.py
CHANGED
|
@@ -1,16 +1,22 @@
|
|
|
1
1
|
import requests
|
|
2
2
|
from bs4 import BeautifulSoup
|
|
3
|
-
from typing import List,Dict,
|
|
3
|
+
from typing import List,Dict,Union,Optional
|
|
4
4
|
from autocoder.common import SourceCode
|
|
5
5
|
import byzerllm
|
|
6
|
-
from bs4 import BeautifulSoup
|
|
7
6
|
from loguru import logger
|
|
8
7
|
import os
|
|
9
8
|
from pathlib import Path
|
|
10
9
|
from autocoder.common import files as FileUtils
|
|
10
|
+
import traceback
|
|
11
|
+
from autocoder.rag.loaders import (
|
|
12
|
+
extract_text_from_pdf,
|
|
13
|
+
extract_text_from_docx,
|
|
14
|
+
extract_text_from_ppt,
|
|
15
|
+
extract_text_from_excel
|
|
16
|
+
)
|
|
11
17
|
|
|
12
18
|
class HttpDoc:
|
|
13
|
-
def __init__(self, args, llm: byzerllm.ByzerLLM,urls:Optional[List[str]]=None):
|
|
19
|
+
def __init__(self, args, llm: Union[byzerllm.ByzerLLM, byzerllm.SimpleByzerLLM],urls:Optional[List[str]]=None):
|
|
14
20
|
self.args = args
|
|
15
21
|
urls_from_args = self.args.urls
|
|
16
22
|
if urls_from_args:
|
|
@@ -41,104 +47,50 @@ class HttpDoc:
|
|
|
41
47
|
{{ html }}
|
|
42
48
|
|
|
43
49
|
输出的内容请以 "<MARKER></MARKER> 标签对包裹。
|
|
44
|
-
"""
|
|
45
|
-
|
|
46
|
-
def is_binary_file(self,filepath):
|
|
47
|
-
try:
|
|
48
|
-
with open(filepath, 'rb') as file:
|
|
49
|
-
chunk = file.read(1024*8) # Read first 1024 bytes
|
|
50
|
-
if b'\x00' in chunk: # Binary files often contain null bytes
|
|
51
|
-
return True
|
|
52
|
-
# Attempt to decode as UTF-8 (or any encoding you expect your text files to be in)
|
|
53
|
-
chunk.decode('utf-8')
|
|
54
|
-
return False
|
|
55
|
-
except UnicodeDecodeError:
|
|
56
|
-
return True
|
|
50
|
+
"""
|
|
57
51
|
|
|
58
|
-
def
|
|
52
|
+
def _process_local_file(self, file_path: str) -> List[SourceCode]:
|
|
53
|
+
"""统一处理本地文件,返回标准化的 SourceCode 列表"""
|
|
54
|
+
results = []
|
|
59
55
|
try:
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
raise ImportError(f"`llama-index-readers-file` package not found. {e}")
|
|
56
|
+
ext = os.path.splitext(file_path)[1].lower()
|
|
57
|
+
|
|
58
|
+
# 分发到不同 loader
|
|
59
|
+
if ext == '.pdf':
|
|
60
|
+
content = extract_text_from_pdf(file_path)
|
|
61
|
+
results.append(SourceCode(module_name=file_path, source_code=content))
|
|
62
|
+
elif ext == '.docx':
|
|
63
|
+
content = extract_text_from_docx(file_path)
|
|
64
|
+
results.append(SourceCode(module_name=file_path, source_code=content))
|
|
65
|
+
elif ext in ('.pptx', '.ppt'):
|
|
66
|
+
for slide_id, slide_content in extract_text_from_ppt(file_path):
|
|
67
|
+
results.append(SourceCode(module_name=f"{file_path}#{slide_id}", source_code=slide_content))
|
|
68
|
+
elif ext in ('.xlsx', '.xls'):
|
|
69
|
+
for sheet_name, sheet_content in extract_text_from_excel(file_path):
|
|
70
|
+
results.append(SourceCode(module_name=f"{file_path}#{sheet_name}", source_code=sheet_content))
|
|
71
|
+
else:
|
|
72
|
+
content = FileUtils.read_file(file_path)
|
|
73
|
+
results.append(SourceCode(module_name=file_path, source_code=content))
|
|
79
74
|
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
# ".ppt": PptxReader(),
|
|
86
|
-
# ".pptm": PptxReader(),
|
|
87
|
-
# ".jpg": ImageReader(),
|
|
88
|
-
# ".png": ImageReader(),
|
|
89
|
-
# ".jpeg": ImageReader(),
|
|
90
|
-
# ".mp3": VideoAudioReader(),
|
|
91
|
-
# ".mp4": VideoAudioReader(),
|
|
92
|
-
# ".csv": PandasCSVReader(),
|
|
93
|
-
".epub": EpubReader(),
|
|
94
|
-
".mbox": MboxReader(),
|
|
95
|
-
".ipynb": IPYNBReader(),
|
|
96
|
-
}
|
|
97
|
-
return default_file_reader_cls
|
|
75
|
+
except Exception as e:
|
|
76
|
+
logger.error(f"Failed to process {file_path}: {str(e)}")
|
|
77
|
+
traceback.print_exc()
|
|
78
|
+
|
|
79
|
+
return results
|
|
98
80
|
|
|
99
81
|
def crawl_urls(self) -> List[SourceCode]:
|
|
100
|
-
source_codes = []
|
|
82
|
+
source_codes = []
|
|
101
83
|
for url in self.urls:
|
|
102
|
-
if not url.startswith("http://"
|
|
84
|
+
if not url.startswith(("http://", "https://")):
|
|
103
85
|
try:
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
def process_single_file(file_path: str,skip_binary_file_test:bool=False):
|
|
109
|
-
temp_documents = []
|
|
110
|
-
ext = os.path.splitext(file_path)[1].lower()
|
|
111
|
-
if not skip_binary_file_test and self.is_binary_file(file_path):
|
|
112
|
-
logger.warning(f"Skipping binary file: {file_path}")
|
|
113
|
-
return temp_documents
|
|
114
|
-
|
|
115
|
-
if ext not in exts.keys():
|
|
116
|
-
main_content = FileUtils.read_file(file_path)
|
|
117
|
-
source_code = SourceCode(module_name=file_path, source_code=main_content)
|
|
118
|
-
source_codes.append(source_code)
|
|
86
|
+
if os.path.isdir(url):
|
|
87
|
+
for root, _, files in os.walk(url, followlinks=True):
|
|
88
|
+
for file in files:
|
|
89
|
+
source_codes.extend(self._process_local_file(os.path.join(root, file)))
|
|
119
90
|
else:
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
if os.path.isdir(url):
|
|
124
|
-
for root, dirs, files in os.walk(url,followlinks=True):
|
|
125
|
-
dirs[:] = [d for d in dirs if d not in ['.git',"node_modules"]] # Exclude .git directory
|
|
126
|
-
for file in files:
|
|
127
|
-
file_path = os.path.join(root, file)
|
|
128
|
-
documents.extend(process_single_file(file_path))
|
|
129
|
-
|
|
130
|
-
else:
|
|
131
|
-
documents.extend(process_single_file(url,skip_binary_file_test=True))
|
|
132
|
-
|
|
133
|
-
for document in documents:
|
|
134
|
-
source_code = SourceCode(module_name=document.metadata["file_path"], source_code=document.get_content())
|
|
135
|
-
source_codes.append(source_code)
|
|
136
|
-
|
|
137
|
-
except ImportError as e:
|
|
138
|
-
logger.warning(f"Failed to import llama_index. Please install it using 'pip install llama_index' {e}")
|
|
139
|
-
main_content = FileUtils.read_file(url)
|
|
140
|
-
source_code = SourceCode(module_name=url, source_code=main_content)
|
|
141
|
-
source_codes.append(source_code)
|
|
91
|
+
source_codes.extend(self._process_local_file(url))
|
|
92
|
+
except Exception as e:
|
|
93
|
+
logger.error(f"Error accessing path {url}: {str(e)}")
|
|
142
94
|
else:
|
|
143
95
|
if self.args.urls_use_model:
|
|
144
96
|
from autocoder.common.screenshots import gen_screenshots
|
autocoder/version.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "0.1.
|
|
1
|
+
__version__ = "0.1.245"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|