mm-qa-mcp 3.0.2__py3-none-any.whl → 3.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- minimax_qa_mcp/conf/conf.ini +12 -0
- minimax_qa_mcp/server.py +175 -3
- minimax_qa_mcp/src/auto_case/__init__.py +0 -0
- minimax_qa_mcp/src/auto_case/case_write.py +182 -0
- minimax_qa_mcp/src/auto_case/pdf_jiexi.py +28 -0
- minimax_qa_mcp/src/gitlab_branch_analyzer/__init__.py +11 -0
- minimax_qa_mcp/src/gitlab_branch_analyzer/gitlab_branch_service.py +1150 -0
- {mm_qa_mcp-3.0.2.dist-info → mm_qa_mcp-3.0.4.dist-info}/METADATA +2 -1
- {mm_qa_mcp-3.0.2.dist-info → mm_qa_mcp-3.0.4.dist-info}/RECORD +12 -7
- {mm_qa_mcp-3.0.2.dist-info → mm_qa_mcp-3.0.4.dist-info}/WHEEL +1 -1
- {mm_qa_mcp-3.0.2.dist-info → mm_qa_mcp-3.0.4.dist-info}/entry_points.txt +0 -0
- {mm_qa_mcp-3.0.2.dist-info → mm_qa_mcp-3.0.4.dist-info}/top_level.txt +0 -0
minimax_qa_mcp/conf/conf.ini
CHANGED
|
@@ -66,6 +66,18 @@ url = http://10.11.8.37
|
|
|
66
66
|
port = 8080
|
|
67
67
|
grpc_port = 50051
|
|
68
68
|
|
|
69
|
+
[git_info]
|
|
70
|
+
# GitLab服务器地址
|
|
71
|
+
gitlab_url = https://gitlab.xaminim.com
|
|
72
|
+
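# GitLab API access token (Private Token). NOTE(review): a literal token value is committed in this published package; rotate it and load it from an environment variable or secret store instead.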
|
|
73
|
+
access_token = SrFXdGjtd3AtRVJCkpDn
|
|
74
|
+
|
|
75
|
+
[apollo_info]
|
|
76
|
+
# Apollo配置获取API基础URL
|
|
77
|
+
apollo_base_url = http://swing-babel-ali-prod.xaminim.com/swing/api/get_apollo_value_by_key
|
|
78
|
+
# 存储产品线与GitLab Group映射的Apollo Key
|
|
79
|
+
# Apollo中配置格式: {"qa_group": [{"group_id": 1117, "group_name": "qa"}]}
|
|
80
|
+
product_line_gitlab_key = product_line_gitlab_projects
|
|
69
81
|
|
|
70
82
|
[generator_case_conf]
|
|
71
83
|
# 模型API URL,用于ModuleClient类调用
|
minimax_qa_mcp/server.py
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
import asyncio
|
|
3
3
|
import sys
|
|
4
4
|
import os
|
|
5
|
+
import json
|
|
5
6
|
|
|
6
7
|
# 将项目根目录添加到Python路径中
|
|
7
8
|
current_dir = os.path.dirname(os.path.abspath(__file__))
|
|
@@ -18,8 +19,11 @@ from minimax_qa_mcp.src.generator_case.generator_case import GeneratorCase
|
|
|
18
19
|
from minimax_qa_mcp.src.get_weaviate_info.get_weaviate_info import GetWeaviateInfo
|
|
19
20
|
from minimax_qa_mcp.src.grafana.service import GetFromGrafana, GetApiFromGrafana
|
|
20
21
|
from minimax_qa_mcp.src.gateway_case.get_case import CaseGrafanaService
|
|
21
|
-
from minimax_qa_mcp.src.query_segments.query_segments import query_main, TYPE_API, TYPE_FUNC, TYPE_CODE, TYPE_ANY,
|
|
22
|
+
from minimax_qa_mcp.src.query_segments.query_segments import query_main, TYPE_API, TYPE_FUNC, TYPE_CODE, TYPE_ANY, \
|
|
23
|
+
TYPE_FUNC_DETAIL
|
|
24
|
+
from minimax_qa_mcp.src.auto_case.case_write import PDFWeaviateInfo
|
|
22
25
|
from minimax_qa_mcp.src.get_full_api_call_chain.get_full_api_call_chain import GetFullApiCallChain
|
|
26
|
+
from minimax_qa_mcp.src.gitlab_branch_analyzer.gitlab_branch_service import GitlabBranchService, GitlabRepoTypeDetector, GitlabBedrockResolver
|
|
23
27
|
|
|
24
28
|
# Initialize FastMCP server
|
|
25
29
|
mcp = FastMCP("mcp")
|
|
@@ -162,6 +166,24 @@ async def get_weaviate_info(input_data: str) -> dict:
|
|
|
162
166
|
return result
|
|
163
167
|
|
|
164
168
|
|
|
169
|
+
@mcp.tool()
async def get_auto_case(file_path: str, ref_case_name: str, use_moxing: bool = True) -> dict:
    """Generate test cases from a requirements file and a reference case name.

    Builds a ``PDFWeaviateInfo`` from the arguments and returns the result of
    its ``get_pdf_and_weaviate_info()`` method. That method is synchronous, so
    it is dispatched to the event loop's default executor instead of being
    called directly inside this coroutine; the other executor-based tools in
    this module follow the same idea.

    Args:
        file_path: Path of the file to process (forwarded as the PRD path).
        ref_case_name: Name of the reference case (forwarded as the lookup
            question).
        use_moxing: Whether to call the model; enabled by default.

    Returns:
        Whatever ``PDFWeaviateInfo.get_pdf_and_weaviate_info()`` returns.
        NOTE(review): the annotation says ``dict`` but the underlying
        ``call_model`` looks like it returns a string — confirm and align.
    """
    prd_case = PDFWeaviateInfo(file_path, ref_case_name, use_moxing=use_moxing)
    loop = asyncio.get_running_loop()
    # Keep the event loop free while the synchronous work runs on a worker thread.
    case_response = await loop.run_in_executor(None, prd_case.get_pdf_and_weaviate_info)
    return case_response
|
|
185
|
+
|
|
186
|
+
|
|
165
187
|
@mcp.tool()
|
|
166
188
|
async def get_full_api_call_chain(api_path: str) -> dict:
|
|
167
189
|
"""
|
|
@@ -171,6 +193,9 @@ async def get_full_api_call_chain(api_path: str) -> dict:
|
|
|
171
193
|
Return:
|
|
172
194
|
API调用链信息
|
|
173
195
|
"""
|
|
196
|
+
# 判断api_path是不是'/'开头 不是的话 拼接
|
|
197
|
+
if not api_path.startswith('/'):
|
|
198
|
+
api_path = '/' + api_path
|
|
174
199
|
logger.info(f"===== The input params is :{api_path}")
|
|
175
200
|
|
|
176
201
|
api_call_chain = GetFullApiCallChain(api_path)
|
|
@@ -178,6 +203,153 @@ async def get_full_api_call_chain(api_path: str) -> dict:
|
|
|
178
203
|
logger.info(f"===== The result of get_full_api_call_chain is :{result}")
|
|
179
204
|
return result
|
|
180
205
|
|
|
206
|
+
@mcp.tool()
async def git_repo_clone(repo_url: str, clone_path: str) -> str:
    """Clone a Git repository into a local directory.

    ``git.Repo.clone_from`` is a blocking call, so it is run on the event
    loop's default executor rather than directly inside this coroutine.

    Args:
        repo_url: URL of the Git repository.
        clone_path: Local path to clone into.

    Returns:
        A human-readable status string: a success message, or a failure
        message containing the text of the exception that was raised.
    """
    import git

    loop = asyncio.get_running_loop()
    try:
        await loop.run_in_executor(None, git.Repo.clone_from, repo_url, clone_path)
    except Exception as e:
        # Tool boundary: report the failure as text instead of propagating it.
        return f"Failed to clone repository: {str(e)}"
    return f"Successfully cloned {repo_url} to {clone_path}"
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
@mcp.tool()
async def get_recent_branches_by_product_line(product_line: str, days: int = 3) -> list:
    """List the branches created for a product line during the last N days.

    Creates a ``GitlabBranchService`` for ``product_line`` and runs its
    ``analyze_all_projects(days)`` method on a worker thread. According to the
    original author's notes (the service implementation is not visible here),
    the service looks up the product line's GitLab projects via Apollo and
    then queries each project for recently created branches.

    Args:
        product_line: Product line name, e.g. xingye, talkie, hailuo.
        days: How many days back to look; defaults to 3.

    Returns:
        The value returned by ``analyze_all_projects``. Documented shape:
        [
            {
                "repo_url": "https://gitlab.xaminim.com/qa/project_name",
                "project_name": "project_name",
                "branch_name": "feature/xxx",
                "created_at": "2026-02-03T10:30:00+08:00",
                "author": "developer_name"
            },
            ...
        ]
    """
    from concurrent.futures import ThreadPoolExecutor

    event_loop = asyncio.get_running_loop()

    logger.info(f"===== 获取产品线 {product_line} 最近 {days} 天的新分支 =====")

    with ThreadPoolExecutor() as executor:
        branch_service = GitlabBranchService(product_line)
        new_branches = await event_loop.run_in_executor(
            executor, branch_service.analyze_all_projects, days
        )

    logger.info(f"===== 产品线 {product_line} 共找到 {len(new_branches)} 个新分支 =====")
    return new_branches
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
@mcp.tool()
async def detect_repo_type(project_id: int, branch: str = None) -> dict:
    """Detect whether a GitLab repository is a server, front-end or mixed project.

    Runs ``GitlabRepoTypeDetector().detect_repo_type(project_id, branch)`` on a
    worker thread and returns its result. The detector itself is not visible
    here; the rules below are carried over from the original author's notes.

    Detection rules (as documented):
        - Server indicators: go.mod, pom.xml, requirements.txt, Cargo.toml,
          cmd/, pkg/, internal/
        - FE indicators: package.json with react/vue/angular dependencies,
          src/components/, pages/

    Args:
        project_id: GitLab project ID (available from the branch query results).
        branch: Optional branch name; when omitted the detector is expected to
            use the repository's default branch.

    Returns:
        The detector's result dict. Documented shape:
        {
            "type": "server" | "fe" | "mixed" | "unknown",
            "language": "go" | "java" | "python" | "react" | "vue" | ...,
            "framework": framework name (returned for FE projects),
            "confidence": 0.0-1.0,
            "indicators": ["go.mod", "cmd/", ...]
        }
        The ``"type"`` key must be present: it is read for the log line below.
    """
    from concurrent.futures import ThreadPoolExecutor

    event_loop = asyncio.get_running_loop()

    logger.info(f"===== 检测项目 {project_id} 的仓库类型 =====")

    with ThreadPoolExecutor() as executor:
        type_detector = GitlabRepoTypeDetector()
        detection = await event_loop.run_in_executor(
            executor, type_detector.detect_repo_type, project_id, branch
        )

    logger.info(f"===== 项目 {project_id} 类型检测结果: {detection['type']} =====")
    return detection
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
@mcp.tool()
async def get_bedrock_info_by_repo(project_id: int, branch: str = None) -> dict:
    """Fetch the Bedrock deployment information associated with a Git repository.

    Runs ``GitlabBedrockResolver().get_bedrock_info(project_id, branch)`` on a
    worker thread, logs whether a configuration was found, and returns the
    resolver's result unchanged. The resolver is not visible here; the
    description below is carried over from the original author's notes.

    Documented resolution order:
        1. .gitlab-ci.yml (most reliable)
        2. Makefile
        3. deploy.yaml / bedrock.yaml
        4. Inference from the project name (fallback)

    Args:
        project_id: GitLab project ID (available from the branch query results).
        branch: Optional branch name; when omitted the resolver is expected to
            use the repository's default branch.

    Returns:
        The resolver's result dict. Documented shape:
        {
            "found": true/false,                               # configuration found?
            "source": "gitlab-ci" | "makefile" | "inferred",   # where it came from
            "psm": "xxx.yyy.zzz",                              # PSM service identifier
            "bedrock_project": "project-name",                 # Bedrock project name
            "bedrock_app": "app-name",                         # Bedrock application name
            "image_repo": "txharbor.xaminim.com/...",          # image repository
            "deploy_stages": [...],                            # deployment stage config
            "confidence": "high" | "medium" | "low",
            "project_info": {
                "name": "project name",
                "path": "group/project",
                "web_url": "https://..."
            }
        }
    """
    from concurrent.futures import ThreadPoolExecutor

    event_loop = asyncio.get_running_loop()

    logger.info(f"===== 获取项目 {project_id} 的 Bedrock 部署信息 =====")

    with ThreadPoolExecutor() as executor:
        bedrock_resolver = GitlabBedrockResolver()
        bedrock_info = await event_loop.run_in_executor(
            executor, bedrock_resolver.get_bedrock_info, project_id, branch
        )

    if not bedrock_info.get('found'):
        logger.warning(f"===== 项目 {project_id} 未找到 Bedrock 配置 =====")
    else:
        logger.info(f"===== 项目 {project_id} 找到 Bedrock 配置: PSM={bedrock_info.get('psm')}, App={bedrock_info.get('bedrock_app')} =====")

    return bedrock_info
|
|
352
|
+
|
|
181
353
|
|
|
182
354
|
def main():
|
|
183
355
|
print("Starting Minimax QA MCP server")
|
|
@@ -190,10 +362,10 @@ def run_server():
|
|
|
190
362
|
# 确保当前工作目录在sys.path中
|
|
191
363
|
if os.getcwd() not in sys.path:
|
|
192
364
|
sys.path.insert(0, os.getcwd())
|
|
193
|
-
|
|
365
|
+
|
|
194
366
|
# 输出启动信息
|
|
195
367
|
print("Starting Minimax QA MCP server from CLI")
|
|
196
|
-
|
|
368
|
+
|
|
197
369
|
# 调用主函数
|
|
198
370
|
main()
|
|
199
371
|
|
|
File without changes
|
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
import functools
|
|
2
|
+
import re
|
|
3
|
+
from minimax_qa_mcp.src.get_weaviate_info.get_weaviate_info import GetWeaviateInfo
|
|
4
|
+
from minimax_qa_mcp.src.auto_case.pdf_jiexi import read_pdf_text
|
|
5
|
+
import logging
|
|
6
|
+
import requests
|
|
7
|
+
import json
|
|
8
|
+
import time
|
|
9
|
+
|
|
10
|
+
# Configure root logging at import time and create this module's logger.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger('生成日志case')
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def retry(max_attempts=3, wait=2):
    """Build a decorator that retries a callable whenever it raises.

    The wrapped callable is invoked up to ``max_attempts`` times. Each failure
    is logged as a warning and followed by a ``wait``-second sleep, except
    after the final attempt. When every attempt fails, the wrapper does not
    raise: it logs an error and returns a message string that includes the
    last exception.

    Args:
        max_attempts: Maximum number of calls to make.
        wait: Seconds to sleep between failed attempts.

    Returns:
        A decorator preserving the wrapped function's metadata.
    """
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            last_exception = None
            attempt = 0
            while attempt < max_attempts:
                attempt += 1
                try:
                    outcome = func(*args, **kwargs)
                except Exception as exc:
                    logger.warning(f"第{attempt}次请求异常:{exc}")
                    last_exception = exc
                    # No point sleeping after the final failed attempt.
                    if attempt < max_attempts:
                        time.sleep(wait)
                else:
                    return outcome
            logger.error(f"请求失败,已重试{max_attempts}次。")
            return f"请求失败,已重试{max_attempts}次,最后异常:{last_exception}"

        return wrapper

    return decorator
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class PDFWeaviateInfo(GetWeaviateInfo):
    """Turn a PRD PDF plus reference cases into generated test cases.

    The PDF text and the knowledge returned by ``get_knowledge()`` are merged
    into one prompt, which is either returned as-is or posted to the model
    endpoint at ``self.api_url``. ``get_knowledge`` and ``api_url`` are not
    defined in this class (presumably provided by ``GetWeaviateInfo`` —
    confirm).
    """

    def __init__(self, prd_path, input_question, use_moxing=True, is_need_module=False):
        """Store the PRD path and model switch on top of the parent's state.

        Args:
            prd_path: Path of the PRD PDF to read.
            input_question: Question forwarded to the parent constructor.
            use_moxing: When true the model is called; when false the prompt
                itself is returned. Defaults to True, the same default the
                server tool uses.
            is_need_module: Forwarded unchanged to the parent constructor.
        """
        super().__init__(input_question, is_need_module)
        self.prd_path = prd_path
        self.timeout = 120  # seconds, used for the HTTP request to the model
        self.use_moxing = use_moxing

    def get_pdf_and_weaviate_info(self):
        """Read the PDF, fetch reference knowledge and run ``call_model`` on both.

        Returns:
            The value returned by ``call_model``.
        """
        pdf_text = read_pdf_text(self.prd_path)
        weaviate_info = self.get_knowledge()
        result = {
            'pdf_text': pdf_text,
            'weaviate_info': weaviate_info
        }

        case_response = self.call_model(result)
        return case_response

    @staticmethod
    def _build_prompt(prd_case, weaviate_info):
        """Compose the single-line prompt from the PRD text and reference cases."""
        prompt = f"需求内容:{prd_case}"
        # Tolerate a missing or non-dict knowledge result instead of crashing.
        cankao_case = []
        if isinstance(weaviate_info, dict):
            cankao_case = weaviate_info.get("results") or []
        for i, content in enumerate(cankao_case):
            case_content = content.get('content', '').strip()
            case_content_clean = re.sub(r'\s+', ' ', case_content)
            prompt += f"参考用例{i + 1}、用例名称:{content.get('title', 'N/A')}、用例内容:{case_content_clean}\n\n"
            # Only the first reference case is included.
            # NOTE(review): confirm this cap is intentional.
            break

        prompt += " 帮我写一份测试用例,参考用例若与本次需求相关可借鉴其设计思路和细节,不相关则忽略,用例设计需覆盖功能、边界、异常、安全、兼容性、性能、数据一致性、UI等多个维度,每个测试用例需包含:用例名称、用例编号、前置条件、测试步骤、预期结果、测试环境,用例内容要具体、细致,测试步骤要可操作,预期结果要明确"
        clean_params = prompt.replace('\\"', "'")
        return clean_params.replace("\n", " ").strip()

    @staticmethod
    def _extract_model_text(response, prd_case):
        """Pull the generated text out of a successful HTTP response.

        Raises ``Exception`` on any parsing problem so the caller retries.
        """
        try:
            resp_json = response.json()
            if 'response' not in resp_json:
                # No wrapped payload: hand back the raw body.
                return response.text
            try:
                model_response = json.loads(resp_json['response'])
                if 'choices' in model_response and isinstance(model_response['choices'], list):
                    # OpenAI-style structure: first non-empty message content wins.
                    for choice in model_response['choices']:
                        content = choice.get('message', {}).get('content')
                        if content:
                            return content + "\n- 需求内容" + f"{prd_case}"
                    # No content found: fall back to the whole choices list.
                    return str(model_response['choices'])
                if 'content' in model_response and isinstance(model_response['content'], list):
                    # Content blocks at the top level: concatenate the text items.
                    text_content = "".join(
                        item.get('text', '')
                        for item in model_response['content']
                        if item.get('type') == 'text'
                    )
                    return text_content + "\n- 需求内容" + f"{prd_case}"
                return str(model_response)
            except json.JSONDecodeError as json_e:
                logger.error(f"解析二层JSON失败: {json_e}")
                raise Exception(f"解析二层JSON失败: {json_e}") from json_e
            except Exception as e:
                logger.error(f"处理二层JSON时发生未知错误: {e}")
                raise Exception(f"处理二层JSON时发生未知错误: {e}") from e
        except json.JSONDecodeError as json_e:
            logger.error(f"解析一层JSON失败: {json_e}")
            raise Exception(f"解析一层JSON失败: {json_e}") from json_e
        except Exception as e:
            logger.error(f"处理响应时发生未知错误: {e}")
            raise Exception(f"处理响应时发生未知错误: {e}") from e

    def call_model(self, prd_content, max_attempts=5, wait=6):
        """Ask the model for test cases based on the PRD and a reference case.

        Args:
            prd_content: Dict with ``pdf_text`` (PRD text) and ``weaviate_info``
                (reference cases under its ``results`` key).
            max_attempts: Maximum number of request attempts.
            wait: Seconds to sleep between failed attempts.

        Returns:
            The prompt itself when ``use_moxing`` is false. Otherwise the
            model's text with the PRD text appended, a stringified fallback of
            the model payload, or the raw response body when it has no
            ``response`` field. If every attempt fails, an error message
            string is returned instead of raising.
        """
        prd_case = prd_content.get("pdf_text", "")
        prd_prompt = self._build_prompt(prd_case, prd_content.get("weaviate_info"))

        if not self.use_moxing:
            return prd_prompt

        payload = {
            "scene": "qa_agent",
            "params": {
                "user_content": prd_prompt
            }
        }

        logger.info(f"==== 发送请求调用模型 ======")
        last_exception = None
        for attempt in range(1, max_attempts + 1):
            try:
                # NOTE(review): verify=False disables TLS certificate checks —
                # confirm this is acceptable for the configured endpoint.
                response = requests.post(
                    self.api_url,
                    json=payload,
                    headers={'Content-Type': 'application/json'},
                    verify=False,
                    timeout=self.timeout
                )

                if response.status_code != 200:
                    logger.error(f"API请求失败,状态码: {response.status_code}")
                    raise Exception(f"API请求失败,状态码: {response.status_code}")

                return self._extract_model_text(response, prd_case)
            except requests.RequestException as e:
                logger.warning(f"第{attempt}次网络请求异常:{e}")
                last_exception = e
            except Exception as e:
                logger.warning(f"第{attempt}次请求处理异常:{e}")
                last_exception = e

            # Reached only after a failed attempt; skip the sleep after the last one.
            if attempt < max_attempts:
                time.sleep(wait)

        logger.error(f"请求失败,已重试{max_attempts}次。最后异常:{last_exception}")
        return f"请求失败,已重试{max_attempts}次,最后异常:{last_exception}"
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
if __name__ == '__main__':
    # Manual smoke run against a local sample PDF. use_moxing is passed
    # explicitly (True = actually call the model endpoint).
    a = PDFWeaviateInfo(
        prd_path="支持搜索用户功能.pdf",
        input_question="星野支持搜索用户功能的相关用例",
        use_moxing=True,
    )
    print(a.get_pdf_and_weaviate_info())
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
import pdfplumber
|
|
2
|
+
import warnings
|
|
3
|
+
import re
|
|
4
|
+
# Suppress every warning for the whole process as soon as this module is imported.
warnings.filterwarnings("ignore")
|
|
5
|
+
def read_pdf_text(pdf_path):
    """Return the text of every page of a PDF as one string.

    Page texts are concatenated without a separator; a page whose
    ``extract_text()`` yields nothing contributes an empty string. All ASCII
    control characters (0x00-0x1F and 0x7F, which includes tabs and line
    breaks) are then removed from the result.

    :param pdf_path: local path of the PDF file
    :return: the cleaned text of the whole document
    """
    with pdfplumber.open(pdf_path) as document:
        page_texts = [page.extract_text() or "" for page in document.pages]
    return re.sub(r'[\x00-\x1F\x7F]', '', "".join(page_texts))
|
|
19
|
+
|
|
20
|
+
# def clean_encoding(text):
|
|
21
|
+
# # 先转换为bytes,再解码回字符串
|
|
22
|
+
# text_bytes = text.encode('utf-8', errors='ignore')
|
|
23
|
+
# cleaned_text = text_bytes.decode('utf-8')
|
|
24
|
+
# return cleaned_text
|
|
25
|
+
if __name__ == "__main__":
    # Manual check: print the extracted text of a local sample PDF.
    print(read_pdf_text("创作输入框改版.pdf"))
|