botrun_flow_lang-5.12.263-py3-none-any.whl → botrun_flow_lang-5.12.264-py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
- botrun_flow_lang/api/auth_api.py +39 -39
- botrun_flow_lang/api/auth_utils.py +183 -183
- botrun_flow_lang/api/botrun_back_api.py +65 -65
- botrun_flow_lang/api/flow_api.py +3 -3
- botrun_flow_lang/api/hatch_api.py +508 -508
- botrun_flow_lang/api/langgraph_api.py +811 -811
- botrun_flow_lang/api/line_bot_api.py +1484 -1484
- botrun_flow_lang/api/model_api.py +300 -300
- botrun_flow_lang/api/rate_limit_api.py +32 -32
- botrun_flow_lang/api/routes.py +79 -79
- botrun_flow_lang/api/search_api.py +53 -53
- botrun_flow_lang/api/storage_api.py +395 -395
- botrun_flow_lang/api/subsidy_api.py +290 -290
- botrun_flow_lang/api/subsidy_api_system_prompt.txt +109 -109
- botrun_flow_lang/api/user_setting_api.py +70 -70
- botrun_flow_lang/api/version_api.py +31 -31
- botrun_flow_lang/api/youtube_api.py +26 -26
- botrun_flow_lang/constants.py +13 -13
- botrun_flow_lang/langgraph_agents/agents/agent_runner.py +178 -178
- botrun_flow_lang/langgraph_agents/agents/agent_tools/step_planner.py +77 -77
- botrun_flow_lang/langgraph_agents/agents/checkpointer/firestore_checkpointer.py +666 -666
- botrun_flow_lang/langgraph_agents/agents/gov_researcher/GOV_RESEARCHER_PRD.md +192 -192
- botrun_flow_lang/langgraph_agents/agents/gov_researcher/gemini_subsidy_graph.py +460 -460
- botrun_flow_lang/langgraph_agents/agents/gov_researcher/gov_researcher_2_graph.py +1002 -1002
- botrun_flow_lang/langgraph_agents/agents/gov_researcher/gov_researcher_graph.py +822 -822
- botrun_flow_lang/langgraph_agents/agents/langgraph_react_agent.py +723 -723
- botrun_flow_lang/langgraph_agents/agents/search_agent_graph.py +864 -864
- botrun_flow_lang/langgraph_agents/agents/tools/__init__.py +4 -4
- botrun_flow_lang/langgraph_agents/agents/tools/gemini_code_execution.py +376 -376
- botrun_flow_lang/langgraph_agents/agents/util/gemini_grounding.py +66 -66
- botrun_flow_lang/langgraph_agents/agents/util/html_util.py +316 -316
- botrun_flow_lang/langgraph_agents/agents/util/img_util.py +294 -294
- botrun_flow_lang/langgraph_agents/agents/util/local_files.py +419 -419
- botrun_flow_lang/langgraph_agents/agents/util/mermaid_util.py +86 -86
- botrun_flow_lang/langgraph_agents/agents/util/model_utils.py +143 -143
- botrun_flow_lang/langgraph_agents/agents/util/pdf_analyzer.py +486 -486
- botrun_flow_lang/langgraph_agents/agents/util/pdf_cache.py +250 -250
- botrun_flow_lang/langgraph_agents/agents/util/pdf_processor.py +204 -204
- botrun_flow_lang/langgraph_agents/agents/util/perplexity_search.py +464 -464
- botrun_flow_lang/langgraph_agents/agents/util/plotly_util.py +59 -59
- botrun_flow_lang/langgraph_agents/agents/util/tavily_search.py +199 -199
- botrun_flow_lang/langgraph_agents/agents/util/youtube_util.py +90 -90
- botrun_flow_lang/langgraph_agents/cache/langgraph_botrun_cache.py +197 -197
- botrun_flow_lang/llm_agent/llm_agent.py +19 -19
- botrun_flow_lang/llm_agent/llm_agent_util.py +83 -83
- botrun_flow_lang/log/.gitignore +2 -2
- botrun_flow_lang/main.py +61 -61
- botrun_flow_lang/main_fast.py +51 -51
- botrun_flow_lang/mcp_server/__init__.py +10 -10
- botrun_flow_lang/mcp_server/default_mcp.py +744 -744
- botrun_flow_lang/models/nodes/utils.py +205 -205
- botrun_flow_lang/models/token_usage.py +34 -34
- botrun_flow_lang/requirements.txt +21 -21
- botrun_flow_lang/services/base/firestore_base.py +30 -30
- botrun_flow_lang/services/hatch/hatch_factory.py +11 -11
- botrun_flow_lang/services/hatch/hatch_fs_store.py +419 -419
- botrun_flow_lang/services/storage/storage_cs_store.py +206 -206
- botrun_flow_lang/services/storage/storage_factory.py +12 -12
- botrun_flow_lang/services/storage/storage_store.py +65 -65
- botrun_flow_lang/services/user_setting/user_setting_factory.py +9 -9
- botrun_flow_lang/services/user_setting/user_setting_fs_store.py +66 -66
- botrun_flow_lang/static/docs/tools/index.html +926 -926
- botrun_flow_lang/tests/api_functional_tests.py +1525 -1525
- botrun_flow_lang/tests/api_stress_test.py +357 -357
- botrun_flow_lang/tests/shared_hatch_tests.py +333 -333
- botrun_flow_lang/tests/test_botrun_app.py +46 -46
- botrun_flow_lang/tests/test_html_util.py +31 -31
- botrun_flow_lang/tests/test_img_analyzer.py +190 -190
- botrun_flow_lang/tests/test_img_util.py +39 -39
- botrun_flow_lang/tests/test_local_files.py +114 -114
- botrun_flow_lang/tests/test_mermaid_util.py +103 -103
- botrun_flow_lang/tests/test_pdf_analyzer.py +104 -104
- botrun_flow_lang/tests/test_plotly_util.py +151 -151
- botrun_flow_lang/tests/test_run_workflow_engine.py +65 -65
- botrun_flow_lang/tools/generate_docs.py +133 -133
- botrun_flow_lang/tools/templates/tools.html +153 -153
- botrun_flow_lang/utils/__init__.py +7 -7
- botrun_flow_lang/utils/botrun_logger.py +344 -344
- botrun_flow_lang/utils/clients/rate_limit_client.py +209 -209
- botrun_flow_lang/utils/clients/token_verify_client.py +153 -153
- botrun_flow_lang/utils/google_drive_utils.py +654 -654
- botrun_flow_lang/utils/langchain_utils.py +324 -324
- botrun_flow_lang/utils/yaml_utils.py +9 -9
- {botrun_flow_lang-5.12.263.dist-info → botrun_flow_lang-5.12.264.dist-info}/METADATA +1 -1
- botrun_flow_lang-5.12.264.dist-info/RECORD +102 -0
- botrun_flow_lang-5.12.263.dist-info/RECORD +0 -102
- {botrun_flow_lang-5.12.263.dist-info → botrun_flow_lang-5.12.264.dist-info}/WHEEL +0 -0

botrun_flow_lang/models/nodes/utils.py
@@ -1,205 +1,205 @@
import asyncio
from typing import Any, Dict, List
from urllib.parse import quote

import aiohttp

from yarl import URL


from io import StringIO
from pdfminer.high_level import extract_text_to_fp
from pdfminer.layout import LAParams


async def scrape_single_pdf(url: str) -> Dict[str, Any]:
    """Fetch a single PDF file from a URL and convert it to plain text

    Args:
        url: URL of the PDF file

    Returns:
        Dict[str, Any]: dict containing the URL and the converted content, or error information on failure
    """
    try:
        # Download the PDF file with aiohttp
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as response:
                if response.status != 200:
                    return {
                        "url": url,
                        "status": "error",
                        "error": f"HTTP error {response.status}",
                    }

                # Read the PDF content
                pdf_content = await response.read()

                # Create the output buffer
                output_string = StringIO()

                # Configure extraction parameters
                laparams = LAParams(
                    line_margin=0.5,
                    word_margin=0.1,
                    char_margin=2.0,
                    boxes_flow=0.5,
                    detect_vertical=True,
                )

                # Extract text from the binary content
                from io import BytesIO

                pdf_file = BytesIO(pdf_content)

                # Extract the text
                extract_text_to_fp(
                    pdf_file,
                    output_string,
                    laparams=laparams,
                    output_type="text",
                    codec="utf-8",
                )

                # Get the extracted text
                content = output_string.getvalue().strip()

                return {"url": url, "content": content, "status": "success"}
    except Exception as e:
        import traceback

        traceback.print_exc()
        return {"url": url, "status": "error", "error": str(e)}


async def scrape_pdfs(selected_urls: List[str]) -> List[Dict[str, Any]]:
    """Fetch the content of multiple PDF files in parallel

    Args:
        selected_urls: list of PDF file URLs

    Returns:
        List[Dict[str, Any]]: list of dicts with each PDF's URL and content; only successful results are returned
    """
    # Create a scraping task for every PDF
    scrape_tasks = [scrape_single_url(url, FILE_FORMAT_PDF) for url in selected_urls]

    # Run all scraping tasks concurrently
    scrape_results = await asyncio.gather(*scrape_tasks)

    # Return only the successful results
    return [result for result in scrape_results if result["status"] == "success"]


async def scrape_urls(selected_urls: List[str]) -> List[Dict[str, Any]]:
    """Fetch the content of all URLs in parallel"""
    # Create scraping tasks for all URLs at once
    scrape_tasks = [scrape_single_url(url) for url in selected_urls]

    # Run all scraping tasks concurrently
    scrape_results = await asyncio.gather(*scrape_tasks)
    scrape_results = [
        scrape_result
        for scrape_result in scrape_results
        if scrape_result["status"] == "success"
    ]

    # Convert back to the original output format
    return scrape_results


FILE_FORMAT_PDF = "application/pdf"
FILE_FORMATS = [FILE_FORMAT_PDF]


async def scrape_single_url(url: str, file_format: str = None) -> Dict[str, Any]:
    """Fetch the content of a single URL"""
    try:
        if "%" not in url:
            quoted_url = quote(url, safe="")
        else:
            quoted_url = url
        scrape_url = f"https://botrun-crawler-fastapi-prod-36186877499.asia-east1.run.app/scrape?url={quoted_url}"
        if file_format is not None and file_format in FILE_FORMATS:
            file_format = quote(file_format, safe="")
            scrape_url = f"{scrape_url}&file_format={file_format}"
        scrape_url = URL(scrape_url, encoded=True)
        async with aiohttp.ClientSession() as session:
            async with session.get(scrape_url) as response:
                if response.status == 200:
                    body = await response.json()
                    print(f"[scrape_single_url] url: {url}")
                    print(
                        f"[scrape_single_url] content: {body['data']['markdown'][:100]}"
                    )
                    return {
                        "url": url,
                        "title": body["data"]["metadata"]["title"],
                        "content": body["data"]["markdown"],
                        "status": "success",
                    }
                else:
                    return {
                        "url": url,
                        "status": "error",
                        "error": f"Scraping failed with status {response.status}",
                    }
    except Exception as e:
        return {"url": url, "status": "error", "error": str(e)}


async def scrape_vertexai_search_results(search_results: Dict, limit: int = 5):
    """Process Vertex AI search results and merge the scraped content back into the original results

    Args:
        search_results: result dict returned by Vertex AI Search

    Returns:
        Dict: the updated full results plus documents in other formats
    """
    # Separate regular web pages, PDFs, and other-format documents
    web_urls = []
    pdf_urls = []
    web_results_map = {}  # maps url -> result
    pdf_results_map = {}  # maps PDF url -> result
    other_format_results = []
    updated_results = []

    for result in search_results["results"][:limit]:
        if result["fileFormat"] == "":
            web_urls.append(result["url"])
            web_results_map[result["url"]] = result
        elif result["fileFormat"] == "PDF/Adobe Acrobat":
            pdf_urls.append(result["url"])
            pdf_results_map[result["url"]] = result
        else:
            other_format_results.append(result)
        updated_results.append(result)

    # Scrape web page and PDF content in parallel
    scrape_tasks = []

    if web_urls:
        scrape_tasks.append(scrape_urls(web_urls))
    if pdf_urls:
        scrape_tasks.append(scrape_pdfs(pdf_urls))

    # Run all scraping tasks concurrently
    all_results = await asyncio.gather(*scrape_tasks) if scrape_tasks else []

    # Update the content in the original results
    for results in all_results:
        for scrape_result in results:
            if scrape_result["url"] in web_results_map:
                web_results_map[scrape_result["url"]]["content"] = scrape_result[
                    "content"
                ]
            elif scrape_result["url"] in pdf_results_map:
                pdf_results_map[scrape_result["url"]]["content"] = scrape_result[
                    "content"
                ]

    return {
        "results": updated_results,
        "other_format_results": other_format_results,
    }
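
A minimal usage sketch for these helpers (not part of the package): it assumes a Vertex AI search result shaped like the keys the code reads ("results", "url", "fileFormat"); the sample URLs and values below are illustrative only.

import asyncio

# Hypothetical input: two hits, one plain web page and one PDF.
sample_search_results = {
    "results": [
        {"url": "https://example.gov.tw/page", "fileFormat": ""},
        {"url": "https://example.gov.tw/doc.pdf", "fileFormat": "PDF/Adobe Acrobat"},
    ]
}

async def main():
    # Scrape the top hits and merge the scraped text back into each result dict.
    enriched = await scrape_vertexai_search_results(sample_search_results, limit=5)
    for item in enriched["results"]:
        # "content" is only present when the corresponding scrape succeeded.
        print(item["url"], len(item.get("content", "")))

asyncio.run(main())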

botrun_flow_lang/models/token_usage.py
@@ -1,34 +1,34 @@
from pydantic import BaseModel, Field
from typing import Dict, List, Optional, Any


class ToolUsage(BaseModel):
    """Tool level token usage information"""

    tool_name: str
    input_tokens: int = 0
    output_tokens: int = 0
    total_tokens: int = 0
    metadata: Optional[Dict[str, Any]] = None


class NodeUsage(BaseModel):
    """Node level token usage information"""

    node_name: str
    model_name: Optional[str] = None
    input_tokens: int = 0
    output_tokens: int = 0
    total_tokens: int = 0
    tools: Optional[List[ToolUsage]] = None
    metadata: Optional[Dict[str, Any]] = None


class TokenUsage(BaseModel):
    """Overall token usage information"""

    total_input_tokens: int = 0
    total_output_tokens: int = 0
    total_tokens: int = 0
    nodes: List[NodeUsage]
    metadata: Optional[Dict[str, Any]] = None
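
A small sketch of how these models might be populated (not part of the package); the node, model, and tool names and all token counts are made-up values for illustration.

# Aggregate usage for a run with one node that called one tool.
usage = TokenUsage(
    total_input_tokens=1200,
    total_output_tokens=350,
    total_tokens=1550,
    nodes=[
        NodeUsage(
            node_name="react_agent",       # hypothetical node name
            model_name="claude-sonnet",    # hypothetical model name
            input_tokens=1200,
            output_tokens=350,
            total_tokens=1550,
            tools=[
                ToolUsage(
                    tool_name="web_search",  # hypothetical tool name
                    input_tokens=200,
                    output_tokens=80,
                    total_tokens=280,
                )
            ],
        )
    ],
)
print(usage.model_dump_json(indent=2))  # assumes pydantic v2; use .json() on v1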

botrun_flow_lang/requirements.txt
@@ -1,22 +1,22 @@
langgraph
langchain-core
langchain-community
langchain-openai
langchain-anthropic
langchain-google-genai
copilotkit
wikipedia
litellm
pdfminer-six
google-auth
google-cloud-storage
google-cloud-firestore
google-cloud-discoveryengine
tavily-python
youtube_transcript_api
plotly
google-cloud-aiplatform
trustcall
google-generativeai
langchain-aws
langgraph-supervisor

botrun_flow_lang/services/base/firestore_base.py
@@ -1,30 +1,30 @@
from typing import Union
import os
from dotenv import load_dotenv
from google.oauth2 import service_account
from google.cloud import firestore

load_dotenv()


class FirestoreBase:
    def __init__(self, collection_name: str):
        google_service_account_key_path = os.getenv(
            "GOOGLE_APPLICATION_CREDENTIALS_FOR_FASTAPI",
            "/app/keys/scoop-386004-d22d99a7afd9.json",
        )
        credentials = service_account.Credentials.from_service_account_file(
            google_service_account_key_path,
            scopes=["https://www.googleapis.com/auth/datastore"],
        )

        # Get the project ID directly from the environment variable
        project_id = os.getenv("GOOGLE_CLOUD_PROJECT")

        # Create the Firestore client, specifying the project ID if it is set
        if project_id:
            self.db = firestore.Client(project=project_id, credentials=credentials)
        else:
            self.db = firestore.Client(credentials=credentials)

        self.collection = self.db.collection(collection_name)
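
A minimal sketch of how a concrete store might build on this base class (not part of the package); the collection name, environment values, and document payload are assumptions for illustration.

import os

# Hypothetical environment setup; the key file must exist for the client to initialize.
os.environ.setdefault("GOOGLE_APPLICATION_CREDENTIALS_FOR_FASTAPI", "/path/to/service-account.json")
os.environ.setdefault("GOOGLE_CLOUD_PROJECT", "my-gcp-project")

class ExampleSettingStore(FirestoreBase):
    """Hypothetical store that persists one document per user."""

    def __init__(self):
        super().__init__("example_settings")  # hypothetical collection name

    def save(self, user_id: str, data: dict) -> None:
        # self.collection is the CollectionReference created by FirestoreBase.
        self.collection.document(user_id).set(data)

store = ExampleSettingStore()
store.save("user-123", {"theme": "dark"})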

botrun_flow_lang/services/hatch/hatch_factory.py
@@ -1,11 +1,11 @@
import os
from dotenv import load_dotenv

from botrun_flow_lang.services.hatch.hatch_fs_store import HatchFsStore

load_dotenv()


def hatch_store_factory() -> HatchFsStore:
    env_name = os.getenv("HATCH_ENV_NAME", "botrun-hatch-dev")
    return HatchFsStore(env_name)
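
A usage sketch (not part of the package): the factory simply forwards HATCH_ENV_NAME to HatchFsStore, so the environment name below is a hypothetical override.

import os

# Hypothetical override; without it the factory falls back to "botrun-hatch-dev".
os.environ["HATCH_ENV_NAME"] = "botrun-hatch-prod"

store = hatch_store_factory()  # returns a HatchFsStore bound to that environment name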