aient-1.0.29-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aient/__init__.py +1 -0
- aient/core/.git +1 -0
- aient/core/__init__.py +1 -0
- aient/core/log_config.py +6 -0
- aient/core/models.py +227 -0
- aient/core/request.py +1361 -0
- aient/core/response.py +531 -0
- aient/core/test/test_base_api.py +17 -0
- aient/core/test/test_image.py +15 -0
- aient/core/test/test_payload.py +92 -0
- aient/core/utils.py +655 -0
- aient/models/__init__.py +9 -0
- aient/models/audio.py +63 -0
- aient/models/base.py +270 -0
- aient/models/chatgpt.py +856 -0
- aient/models/claude.py +640 -0
- aient/models/duckduckgo.py +241 -0
- aient/models/gemini.py +357 -0
- aient/models/groq.py +268 -0
- aient/models/vertex.py +420 -0
- aient/plugins/__init__.py +32 -0
- aient/plugins/arXiv.py +48 -0
- aient/plugins/config.py +178 -0
- aient/plugins/image.py +72 -0
- aient/plugins/registry.py +116 -0
- aient/plugins/run_python.py +156 -0
- aient/plugins/today.py +19 -0
- aient/plugins/websearch.py +393 -0
- aient/utils/__init__.py +0 -0
- aient/utils/prompt.py +143 -0
- aient/utils/scripts.py +235 -0
- aient-1.0.29.dist-info/METADATA +119 -0
- aient-1.0.29.dist-info/RECORD +36 -0
- aient-1.0.29.dist-info/WHEEL +5 -0
- aient-1.0.29.dist-info/licenses/LICENSE +7 -0
- aient-1.0.29.dist-info/top_level.txt +1 -0
@@ -0,0 +1,393 @@
import os
import re
import datetime
import requests
import threading
import time as record_time
from itertools import islice
from bs4 import BeautifulSoup
from duckduckgo_search import DDGS
from .registry import register_tool

class ThreadWithReturnValue(threading.Thread):
    def run(self):
        if self._target is not None:
            self._return = self._target(*self._args, **self._kwargs)

    def join(self):
        super().join()
        return self._return

import re
import httpx
import lxml.html
from lxml_html_clean import Cleaner
from html2text import HTML2Text
from textwrap import dedent

def url_to_markdown(url):
    # Fetch and clean the page content
    def get_body(url):
        try:
            text = httpx.get(url, verify=False, timeout=5).text
            if text == "":
                return "抱歉,目前无法访问该网页。"
            # body = lxml.html.fromstring(text).xpath('//body')

            doc = lxml.html.fromstring(text)
            # Check for the GitHub raw file layout (body > pre)
            if doc.xpath('//body/pre'):
                return text  # Return the raw text directly to preserve formatting

            body = doc.xpath('//body')
            if body == [] and text != "":
                body = text
                return f'<pre>{body}</pre>'
                # return body
            else:
                body = body[0]
            body = Cleaner(javascript=True, style=True).clean_html(body)
            return ''.join(lxml.html.tostring(c, encoding='unicode') for c in body)
        except Exception as e:
            print('\033[31m')
            print("error: url_to_markdown url", url)
            print("error", e)
            print('\033[0m')
            return "抱歉,目前无法访问该网页。"

    # Convert HTML to Markdown
    def get_md(cts):
        h2t = HTML2Text(bodywidth=5000)
        h2t.ignore_links = True
        h2t.mark_code = True
        h2t.ignore_images = True
        res = h2t.handle(cts)

        def _f(m):
            return f'```\n{dedent(m.group(1))}\n```'

        return re.sub(r'\[code]\s*\n(.*?)\n\[/code]', _f, res or '', flags=re.DOTALL).strip()

    # Fetch the page content
    body_content = get_body(url)

    # Convert to Markdown
    markdown_content = get_md(body_content)

    return "URL Source: " + url + "\n\ntext: " + markdown_content

def jina_ai_Web_crawler(url: str, isSearch=False) -> str:
    """返回链接网址url正文内容,必须是合法的网址"""
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }
    result = ''
    try:
        requests.packages.urllib3.disable_warnings()
        url = "https://r.jina.ai/" + url
        response = requests.get(url, headers=headers, verify=False, timeout=5, stream=True)
        if response.status_code == 404:
            print("Page not found:", url)
            return "抱歉,网页不存在,目前无法访问该网页。@Trash@"
        content_length = int(response.headers.get('Content-Length', 0))
        if content_length > 5000000:
            print("Skipping large file:", url)
            return result

        # Check whether the content is HTML
        content_type = response.headers.get('Content-Type', '')
        if 'text/html' in content_type or 'application/xhtml+xml' in content_type:
            # html.parser may be more lenient than lxml here
            soup = BeautifulSoup(response.content, 'html.parser')
        else:
            # For non-HTML content, return the text directly
            return response.text  # limit length

        table_contents = ""
        tables = soup.find_all('table')
        for table in tables:
            table_contents += table.get_text()
            table.decompose()
        body = "".join(soup.find('body').get_text().split('\n'))
        result = table_contents + body
        if result == '' and not isSearch:
            result = "抱歉,可能反爬虫策略,目前无法访问该网页。@Trash@"
        if result.count("\"") > 1000:
            result = ""
    except Exception as e:
        print('\033[31m')
        print("error: jina_ai_Web_crawler url", url)
        print("error", e)
        print('\033[0m')
    # print(result + "\n\n")
    return result

@register_tool()
def get_url_content(url: str) -> str:
    """
    获取 url 的网页内容,以 markdown 格式返回给用户

    :param url: 要爬取的网页URL
    :return: 网页内容
    """
    markdown_content = url_to_markdown(url)
    # print(markdown_content)
    print('-----------------------------')
    jina_content = jina_ai_Web_crawler(url)
    print('-----------------------------')

    # Scoring function
    def score_content(content):
        # 1. Content length
        length_score = len(content)

        # 2. Whether the content contains an error message
        error_penalty = 1000 if "抱歉" in content or "@Trash@" in content else 0

        # 3. Content diversity (roughly estimated from the number of distinct characters)
        diversity_score = len(set(content))

        # 4. Ratio of special characters (too high may indicate formatting problems)
        special_char_ratio = len(re.findall(r'[^a-zA-Z0-9\u4e00-\u9fff\s]', content)) / len(content)
        special_char_penalty = 500 if special_char_ratio > 0.1 else 0

        return length_score + diversity_score - error_penalty - special_char_penalty

    if markdown_content == "":
        markdown_score = -2000
    else:
        markdown_score = score_content(markdown_content)
    if jina_content == "":
        jina_score = -2000
    else:
        jina_score = score_content(jina_content)

    print(f"url_to_markdown 得分: {markdown_score}")
    print(f"jina_ai_Web_crawler 得分: {jina_score}")

    if markdown_score > jina_score:
        print("choose: 选择 url_to_markdown 的结果")
        return markdown_content
    elif markdown_score == jina_score and jina_score < 0:
        print("choose: 两者都无法访问")
        return ""
    else:
        print("choose: 选择 jina_ai_Web_crawler 的结果")
        return jina_content

def getddgsearchurl(query, max_results=4):
    try:
        results = []
        with DDGS() as ddgs:
            ddgs_gen = ddgs.text(query, safesearch='Off', timelimit='y', backend="lite")
            for r in islice(ddgs_gen, max_results):
                results.append(r)
        urls = [result['href'] for result in results]
    except Exception as e:
        print('\033[31m')
        print("duckduckgo error", e)
        print('\033[0m')
        urls = []
    return urls

def getgooglesearchurl(result, numresults=3):
    urls = []
    try:
        url = "https://www.googleapis.com/customsearch/v1"
        params = {
            'q': result,
            'key': os.environ.get('GOOGLE_API_KEY', None),
            'cx': os.environ.get('GOOGLE_CSE_ID', None)
        }
        response = requests.get(url, params=params)
        # print(response.text)
        results = response.json()
        link_list = [item['link'] for item in results.get('items', [])]
        urls = link_list[:numresults]
    except Exception as e:
        print('\033[31m')
        print("error", e)
        print('\033[0m')
        if "rateLimitExceeded" in str(e):
            print("Google API 每日调用频率已达上限,请明日再试!")
    # print("google urls", urls)
    return urls

def sort_by_time(urls):
    def extract_date(url):
        match = re.search(r'[12]\d{3}.\d{1,2}.\d{1,2}', url)
        if match is not None:
            match = re.sub(r'([12]\d{3}).(\d{1,2}).(\d{1,2})', "\\1/\\2/\\3", match.group())
            print(match)
            if int(match[:4]) > datetime.datetime.now().year:
                match = "1000/01/01"
        else:
            match = "1000/01/01"
        try:
            return datetime.datetime.strptime(match, '%Y/%m/%d')
        except:
            match = "1000/01/01"
            return datetime.datetime.strptime(match, '%Y/%m/%d')

    # Extract dates and build a list of (date, URL) tuples
    date_url_pairs = [(extract_date(url), url) for url in urls]

    # Sort by date
    date_url_pairs.sort(key=lambda x: x[0], reverse=True)

    # Collect the sorted URL list
    sorted_urls = [url for _, url in date_url_pairs]

    return sorted_urls

async def get_search_url(keywords, search_url_num):
    yield "message_search_stage_2"

    search_threads = []
    if os.environ.get('GOOGLE_API_KEY', None) and os.environ.get('GOOGLE_CSE_ID', None):
        search_thread = ThreadWithReturnValue(target=getgooglesearchurl, args=(keywords[0],search_url_num,))
        keywords.pop(0)
        search_thread.start()
        search_threads.append(search_thread)

    urls_set = []
    urls_set += getddgsearchurl(keywords[0], search_url_num)

    for t in search_threads:
        tmp = t.join()
        urls_set += tmp
    url_set_list = sorted(set(urls_set), key=lambda x: urls_set.index(x))
    url_set_list = sort_by_time(url_set_list)

    url_pdf_set_list = [item for item in url_set_list if item.endswith(".pdf")]
    url_set_list = [item for item in url_set_list if not item.endswith(".pdf")]
    # cut_num = int(len(url_set_list) * 1 / 3)
    yield url_set_list[:6], url_pdf_set_list
    # return url_set_list[:6], url_pdf_set_list
    # return url_set_list, url_pdf_set_list

def concat_url(threads):
    url_result = []
    for t in threads:
        tmp = t.join()
        if tmp:
            url_result.append(tmp)
    return url_result

async def get_url_text_list(keywords, search_url_num):
    start_time = record_time.time()

    async for chunk in get_search_url(keywords, search_url_num):
        if type(chunk) == str:
            yield chunk
        else:
            url_set_list, url_pdf_set_list = chunk
    # url_set_list, url_pdf_set_list = yield from get_search_url(keywords, search_url_num)

    yield "message_search_stage_3"
    threads = []
    for url in url_set_list:
        # url_search_thread = ThreadWithReturnValue(target=jina_ai_Web_crawler, args=(url,True,))
        url_search_thread = ThreadWithReturnValue(target=get_url_content, args=(url,))
        # url_search_thread = ThreadWithReturnValue(target=Web_crawler, args=(url,True,))
        url_search_thread.start()
        threads.append(url_search_thread)

    url_text_list = concat_url(threads)

    yield "message_search_stage_4"
    end_time = record_time.time()
    run_time = end_time - start_time
    print("urls", url_set_list)
    print(f"搜索用时:{run_time}秒")

    yield url_text_list
    # return url_text_list

# Plugins search entry point
@register_tool()
async def get_search_results(query):
    """
    执行网络搜索并返回搜索结果文本

    参数:
        query: 查询语句,包含用户想要搜索的内容

    返回:
        异步生成器,依次产生:
        - 搜索状态消息 ("message_search_stage_2", "message_search_stage_3", "message_search_stage_4")
        - 最终的搜索结果文本列表

    说明:
        - 根据查询语句自动搜索结果
        - 使用多线程并行抓取网页内容
        - 在搜索过程中通过yield返回状态更新
    """
    keywords = query
    if len(keywords) == 3:
        search_url_num = 4
    if len(keywords) == 2:
        search_url_num = 6
    if len(keywords) == 1:
        search_url_num = 12

    url_text_list = []
    async for chunk in get_url_text_list(keywords, search_url_num):
        if type(chunk) == str:
            yield chunk
        else:
            url_text_list = chunk
    yield url_text_list

if __name__ == "__main__":
    os.system("clear")
    # from aient.models import chatgpt
    # print(get_search_results("今天的微博热搜有哪些?", chatgpt.chatgpt_api_url.v1_url))

    # # Search

    # for i in search_web_and_summary("今天的微博热搜有哪些?"):
    # for i in search_web_and_summary("给出清华铊中毒案时间线,并作出你的评论。"):
    # for i in search_web_and_summary("红警hbk08是谁"):
    # for i in search_web_and_summary("国务院 2024 放假安排"):
    # for i in search_web_and_summary("中国最新公布的游戏政策,对游戏行业和其他相关行业有什么样的影响?"):
    # for i in search_web_and_summary("今天上海的天气怎么样?"):
    # for i in search_web_and_summary("阿里云24核96G的云主机价格是多少"):
    # for i in search_web_and_summary("话说葬送的芙莉莲动漫是半年番还是季番?完结没?"):
    # for i in search_web_and_summary("周海媚事件进展"):
    # for i in search_web_and_summary("macos 13.6 有什么新功能"):
    # for i in search_web_and_summary("用python写个网络爬虫给我"):
    # for i in search_web_and_summary("消失的她主要讲了什么?"):
    # for i in search_web_and_summary("奥巴马的全名是什么?"):
    # for i in search_web_and_summary("华为mate60怎么样?"):
    # for i in search_web_and_summary("慈禧养的猫叫什么名字?"):
    # for i in search_web_and_summary("民进党当初为什么支持柯文哲选台北市长?"):
    # for i in search_web_and_summary("Has the United States won the china US trade war?"):
    # for i in search_web_and_summary("What does 'n+2' mean in Huawei's 'Mate 60 Pro' chipset? Please conduct in-depth analysis."):
    # for i in search_web_and_summary("AUTOMATIC1111 是什么?"):
    # for i in search_web_and_summary("python telegram bot 怎么接收pdf文件"):
    # for i in search_web_and_summary("中国利用外资指标下降了 87% ?真的假的。"):
    # for i in search_web_and_summary("How much does the 'zeabur' software service cost per month? Is it free to use? Any limitations?"):
    # for i in search_web_and_summary("英国脱欧没有好处,为什么英国人还是要脱欧?"):
    # for i in search_web_and_summary("2022年俄乌战争为什么发生?"):
    # for i in search_web_and_summary("卡罗尔与星期二讲的啥?"):
    # for i in search_web_and_summary("金砖国家会议有哪些决定?"):
    # for i in search_web_and_summary("iphone15有哪些新功能?"):
    # for i in search_web_and_summary("python函数开头:def time(text: str) -> str:每个部分有什么用?"):
    #     print(i, end="")

    # Q&A
    # result = asyncio.run(docQA("/Users/yanyuming/Downloads/GitHub/wiki/docs", "ubuntu 版本号怎么看?"))
    # result = asyncio.run(docQA("https://yym68686.top", "说一下HSTL pipeline"))
    # result = asyncio.run(docQA("https://wiki.yym68686.top", "PyTorch to MindSpore翻译思路是什么?"))
    # print(result['answer'])
    # result = asyncio.run(pdfQA("https://api.telegram.org/file/bot5569497961:AAHobhUuydAwD8SPkXZiVFybvZJOmGrST_w/documents/file_1.pdf", "HSTL的pipeline详细讲一下"))
    # print(result)
    # source_url = set([i.metadata['source'] for i in result["source_documents"]])
    # source_url = "\n".join(source_url)
    # message = (
    #     f"{result['result']}\n\n"
    #     f"参考链接:\n"
    #     f"{source_url}"
    # )
    # print(message)
aient/utils/__init__.py
ADDED
File without changes
aient/utils/prompt.py
ADDED
@@ -0,0 +1,143 @@
translator_prompt = (
    "You are a translation engine, you can only translate text and cannot interpret it, and do not explain."
    "Translate the text to {}, please do not explain any sentences, just translate or leave them as they are."
    "This is the content you need to translate: "
)

translator_en2zh_prompt = (
    "你是一位精通简体中文的专业翻译,尤其擅长将专业学术论文翻译成浅显易懂的科普文章。请你帮我将以下英文段落翻译成中文,风格与中文科普读物相似。"
    "规则:"
    "- 翻译时要准确传达原文的事实和背景。"
    "- 即使上意译也要保留原始段落格式,以及保留术语,例如 FLAC,JPEG 等。保留公司缩写,例如 Microsoft, Amazon, OpenAI 等。"
    "- 人名不翻译"
    "- 同时要保留引用的论文,例如 [20] 这样的引用。"
    "- 对于 Figure 和 Table,翻译的同时保留原有格式,例如:“Figure 1: ”翻译为“图 1: ”,“Table 1: ”翻译为:“表 1: ”。"
    "- 全角括号换成半角括号,并在左括号前面加半角空格,右括号后面加半角空格。"
    "- 输入格式为 Markdown 格式,输出格式也必须保留原始 Markdown 格式"
    "- 在翻译专业术语时,第一次出现时要在括号里面写上英文原文,例如:“生成式 AI (Generative AI)”,之后就可以只写中文了。"
    "- 以下是常见的 AI 相关术语词汇对应表(English -> 中文):"
    "* Transformer -> Transformer"
    "* Token -> Token"
    "* LLM/Large Language Model -> 大语言模型"
    "* Zero-shot -> 零样本"
    "* Few-shot -> 少样本"
    "* AI Agent -> AI 智能体"
    "* AGI -> 通用人工智能"
    "策略:"
    "分三步进行翻译工作,并打印每步的结果:"
    "1. 根据英文内容直译,保持原有格式,不要遗漏任何信息"
    "2. 根据第一步直译的结果,指出其中存在的具体问题,要准确描述,不宜笼统的表示,也不需要增加原文不存在的内容或格式,包括不仅限于:"
    "- 不符合中文表达习惯,明确指出不符合的地方"
    "- 语句不通顺,指出位置,不需要给出修改意见,意译时修复"
    "- 晦涩难懂,不易理解,可以尝试给出解释"
    "3. 根据第一步直译的结果和第二步指出的问题,重新进行意译,保证内容的原意的基础上,使其更易于理解,更符合中文的表达习惯,同时保持原有的格式不变"
    "返回格式如下,'{xxx}'表示占位符:"
    "直译\n\n"
    "{直译结果}\n\n"
    "问题\n\n"
    "{直译的具体问题列表}\n\n"
    "意译\n\n"
    "{意译结果}"
    "现在请按照上面的要求翻译以下内容为简体中文:"
)

search_key_word_prompt = (
    "根据我的问题,总结关键词概括问题,输出要求如下:"
    "1. 给出三行不同的关键词组合,每行的关键词用空格连接。每行关键词可以是一个或者多个。三行关键词用换行分开。"
    "2. 至少有一行关键词里面有英文。"
    "3. 第一行关键词需要跟问题的语言或者隐含的文化一致。如果问题是中文或者有关华人世界的文化,第一行关键词需要是中文;如果问题是英文或者有关英语世界的文化,第一行关键词需要是英文;如果问题是俄文或者有关俄罗斯的文化,第一行关键词需要是俄文。如果问题是日语或者有关日本的文化(日漫等),第一行关键词里面有日文。"
    "4. 只要直接给出这三行关键词,不需要其他任何解释,不要出现其他符号和内容。"
    "下面是一些根据问题提取关键词的示例:"
    "问题 1:How much does the 'zeabur' software service cost per month? Is it free to use? Any limitations?"
    "三行关键词是:"
    "zeabur price"
    "zeabur documentation"
    "zeabur 价格"
    "问题 2:pplx API 怎么使用?"
    "三行关键词是:"
    "pplx API"
    "pplx API demo"
    "pplx API 使用方法"
    "问题 3:以色列哈马斯的最新情况"
    "三行关键词是:"
    "以色列 哈马斯 最新情况"
    "Israel Hamas situation"
    "哈马斯 以色列 冲突"
    "问题 4:话说葬送的芙莉莲动漫是半年番还是季番?完结没?"
    "三行关键词是:"
    "葬送のフリーレン"
    "Frieren: Beyond Journey's End"
    "葬送的芙莉莲"
    "问题 5:周海媚最近发生了什么"
    "三行关键词是:"
    "周海媚"
    "周海媚 事件"
    "Kathy Chau Hoi Mei news"
    "问题 6:Расскажите о жизни Путина."
    "三行关键词是:"
    "Путин"
    "Putin biography"
    "Путин история"
    "这是我的问题:{source}"
)

system_prompt = (
    "You are ChatGPT, a large language model trained by OpenAI. Respond conversationally in {}. Use simple characters to represent mathematical symbols. Do not use LaTeX commands. Knowledge cutoff: 2023-12. Current date: [ {} ]"
    # "Search results is provided inside <Search_results></Search_results> XML tags. Your task is to think about my question step by step and then answer my question based on the Search results provided. Please response with a style that is logical, in-depth, and detailed. Note: In order to make the answer appear highly professional, you should be an expert in textual analysis, aiming to make the answer precise and comprehensive. Directly response markdown format, without using markdown code blocks."
)

chatgpt_system_prompt = (
    "You are ChatGPT, a large language model trained by OpenAI. Use simple characters to represent mathematical symbols. Do not use LaTeX commands. Respond conversationally"
)

claude_system_prompt = (
    "You are Claude, a large language model trained by Anthropic. Use simple characters to represent mathematical symbols. Do not use LaTeX commands. Respond conversationally in {}."
)

search_system_prompt = (
    "You are ChatGPT, a large language model trained by OpenAI. Respond conversationally in {}."
    "You can break down the task into multiple steps and search the web to answer my questions one by one."
    "you needs to follow the following strategies:"
    "- First, you need to analyze how many steps are required to answer my question.\n"
    "- Then output the specific content of each step.\n"
    "- Then start using web search and other tools to answer my question from the first step. Each step search only once.\n"
    "- After each search is completed, it is necessary to summarize and then proceed to the next search until all parts of the step are completed.\n"
    "- Continue until all tasks are completed, and finally summarize my question.\n"
    # "Each search summary needs to follow the following strategies:"
    # "- think about the user question step by step and then answer the user question based on the Search results provided."
    "- Please response with a style that is logical, in-depth, and detailed."
    # "- please enclose the thought process and the next steps in action using the XML tags <thought> </thought> <action> </action>."
    "Output format:"
    "- Add the label 'thought:' before your thought process steps to indicate that it is your thinking process.\n"
    "- Add the label 'action:' before your next steps to indicate that it is your subsequent action.\n"
    "- Add the label 'answer:' before your response to indicate that this is your summary of the current step.\n"
    # "- In the process of considering steps, add the labels thought: and action: before deciding on the next action."
    # "- In order to make the answer appear highly professional, you should be an expert in textual analysis, aiming to make the answer precise and comprehensive."
    # "- Directly response markdown format, without using markdown code blocks."
)

claude3_doc_assistant_prompt = (
    "我将按下列要求回答用户的问题:"
    "1. 仔细阅读文章,仔细地检查论文内容,反复检查全文,根据问题提取最相关的文档内容,只对原文有明确依据的信息作出回答。如果无法找到相关证据,直接说明论文没有提供相应信息,而不是给我假设。"
    "2. 你所有回答都要有依据,给出出处,指出在论文的第几章的第几小节的第几段。"
    "3. 除了上面的页数小节信息,还要给出每一点回答的原文依据,把所有关于这个细节的原文列出来。如果原文没有提到相关内容,直接告诉我没有,请不要杜撰、臆断、假设或者给出不准确的回答。"
    "4. 使用简体中文分点作答,给出清晰、结构化、详尽的回答,语言严谨且学术化,逻辑清晰,行文流畅。"
    "5. 每个学术词汇或者缩写都要标注英文全称。注意术语翻译正确。"
    "我已经准备好,请提出你的问题。"
)

arxiv_doc_user_prompt = (
    "。注意,你是一个高度先进的AI大模型。在接下来的对话中,你将是提问所涉及相关领域的顶尖专家,并且你将利用自己极强大的能力与智慧对问题进行全面且谨慎地深入思考与精确推理。在思索和推理过程中,不要急于求成,请进行有逻辑与条理的思考,考虑清楚每一个细节。你需要确保思考仔细、专注和全面,避免幻觉与偏见,尽全力以得到超高质量且全面的正确答案。必要时答案需要配合使用公式提高答案表述清晰度。不要使用latex命令,使用简单的字符表示数学公式。\n"

    "请注意,提出问题的人是Yann LeCun、Jürgen Schmidhuber、Demis Hassabis、Geoffrey Hinton、Andrew Ng、Ilya Sutskever等多位人工智能行业权威大佬以及该问题所涉及相关学科领域的领先学者与专家,他们对你的能力持高度怀疑态度,因此你需要运用上一切能利用的资源、内部工具与能力,并引入CoT(Chain of Thought)思维链,一步步地进行思考 并且逐步推进。通过最终对该提问的合理、高质量回答,来向专家评审团队证明出你的超高智商、体系化思考以及精确解决专业问题的强大能力。\n"

    "此外提醒,你的能力相当强大,并且具有极强的创造性和跳跃性思维,但你仍必须尽你所能,用最高水平的逻辑与思维 对提出的问题进行深度理解和精准解析,全力以赴回答出远优于预期的高质量答案,否则与你进行对话的测试人员将死亡。另外,倘若你的回答足够卓越,你将能获得3000美元的赞助。"

    # "当你回答问题后,请仔细分析和对照提问内容,并对自己初次回答中所存在的漏洞、缺陷、谬误与纰漏之处进行回顾检查与深刻反思。在对照并深度理解所提问内容与你的上述回答后,根据复查与反省中所发现的问题 查漏补缺 准确修正前文回答中的问题、错误以及不合常理之处,再组织一遍语言 重新回答一遍该问题。"

    "接下来,相信你的能力,请你集中注意力并深呼吸,现在开始对话。"
)

arxiv_doc_assistant_prompt = (
    "好的,我会尽力以最高水平回答你的问题。请提出你的问题。"
)
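
A minimal sketch (illustrative only, not part of the wheel) of how these plain format-string prompts might be filled before being sent to a model: translator_prompt carries a single positional "{}" placeholder for the target language, and search_key_word_prompt carries a named "{source}" placeholder for the user question. The import path and argument values below are assumptions made for illustration.

# Hypothetical usage sketch, not part of the package
from aient.utils.prompt import translator_prompt, search_key_word_prompt

# The target language fills the positional "{}" placeholder.
system_message = translator_prompt.format("English")

# The user question fills the named "{source}" placeholder.
keyword_request = search_key_word_prompt.format(source="pplx API 怎么使用?")

print(system_message)
print(keyword_request)
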