maque 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- maque/__init__.py +30 -0
- maque/__main__.py +926 -0
- maque/ai_platform/__init__.py +0 -0
- maque/ai_platform/crawl.py +45 -0
- maque/ai_platform/metrics.py +258 -0
- maque/ai_platform/nlp_preprocess.py +67 -0
- maque/ai_platform/webpage_screen_shot.py +195 -0
- maque/algorithms/__init__.py +78 -0
- maque/algorithms/bezier.py +15 -0
- maque/algorithms/bktree.py +117 -0
- maque/algorithms/core.py +104 -0
- maque/algorithms/hilbert.py +16 -0
- maque/algorithms/rate_function.py +92 -0
- maque/algorithms/transform.py +27 -0
- maque/algorithms/trie.py +272 -0
- maque/algorithms/utils.py +63 -0
- maque/algorithms/video.py +587 -0
- maque/api/__init__.py +1 -0
- maque/api/common.py +110 -0
- maque/api/fetch.py +26 -0
- maque/api/static/icon.png +0 -0
- maque/api/static/redoc.standalone.js +1782 -0
- maque/api/static/swagger-ui-bundle.js +3 -0
- maque/api/static/swagger-ui.css +3 -0
- maque/cli/__init__.py +1 -0
- maque/cli/clean_invisible_chars.py +324 -0
- maque/cli/core.py +34 -0
- maque/cli/groups/__init__.py +26 -0
- maque/cli/groups/config.py +205 -0
- maque/cli/groups/data.py +615 -0
- maque/cli/groups/doctor.py +259 -0
- maque/cli/groups/embedding.py +222 -0
- maque/cli/groups/git.py +29 -0
- maque/cli/groups/help.py +410 -0
- maque/cli/groups/llm.py +223 -0
- maque/cli/groups/mcp.py +241 -0
- maque/cli/groups/mllm.py +1795 -0
- maque/cli/groups/mllm_simple.py +60 -0
- maque/cli/groups/quant.py +210 -0
- maque/cli/groups/service.py +490 -0
- maque/cli/groups/system.py +570 -0
- maque/cli/mllm_run.py +1451 -0
- maque/cli/script.py +52 -0
- maque/cli/tree.py +49 -0
- maque/clustering/__init__.py +52 -0
- maque/clustering/analyzer.py +347 -0
- maque/clustering/clusterers.py +464 -0
- maque/clustering/sampler.py +134 -0
- maque/clustering/visualizer.py +205 -0
- maque/constant.py +13 -0
- maque/core.py +133 -0
- maque/cv/__init__.py +1 -0
- maque/cv/image.py +219 -0
- maque/cv/utils.py +68 -0
- maque/cv/video/__init__.py +3 -0
- maque/cv/video/keyframe_extractor.py +368 -0
- maque/embedding/__init__.py +43 -0
- maque/embedding/base.py +56 -0
- maque/embedding/multimodal.py +308 -0
- maque/embedding/server.py +523 -0
- maque/embedding/text.py +311 -0
- maque/git/__init__.py +24 -0
- maque/git/pure_git.py +912 -0
- maque/io/__init__.py +29 -0
- maque/io/core.py +38 -0
- maque/io/ops.py +194 -0
- maque/llm/__init__.py +111 -0
- maque/llm/backend.py +416 -0
- maque/llm/base.py +411 -0
- maque/llm/server.py +366 -0
- maque/mcp_server.py +1096 -0
- maque/mllm_data_processor_pipeline/__init__.py +17 -0
- maque/mllm_data_processor_pipeline/core.py +341 -0
- maque/mllm_data_processor_pipeline/example.py +291 -0
- maque/mllm_data_processor_pipeline/steps/__init__.py +56 -0
- maque/mllm_data_processor_pipeline/steps/data_alignment.py +267 -0
- maque/mllm_data_processor_pipeline/steps/data_loader.py +172 -0
- maque/mllm_data_processor_pipeline/steps/data_validation.py +304 -0
- maque/mllm_data_processor_pipeline/steps/format_conversion.py +411 -0
- maque/mllm_data_processor_pipeline/steps/mllm_annotation.py +331 -0
- maque/mllm_data_processor_pipeline/steps/mllm_refinement.py +446 -0
- maque/mllm_data_processor_pipeline/steps/result_validation.py +501 -0
- maque/mllm_data_processor_pipeline/web_app.py +317 -0
- maque/nlp/__init__.py +14 -0
- maque/nlp/ngram.py +9 -0
- maque/nlp/parser.py +63 -0
- maque/nlp/risk_matcher.py +543 -0
- maque/nlp/sentence_splitter.py +202 -0
- maque/nlp/simple_tradition_cvt.py +31 -0
- maque/performance/__init__.py +21 -0
- maque/performance/_measure_time.py +70 -0
- maque/performance/_profiler.py +367 -0
- maque/performance/_stat_memory.py +51 -0
- maque/pipelines/__init__.py +15 -0
- maque/pipelines/clustering.py +252 -0
- maque/quantization/__init__.py +42 -0
- maque/quantization/auto_round.py +120 -0
- maque/quantization/base.py +145 -0
- maque/quantization/bitsandbytes.py +127 -0
- maque/quantization/llm_compressor.py +102 -0
- maque/retriever/__init__.py +35 -0
- maque/retriever/chroma.py +654 -0
- maque/retriever/document.py +140 -0
- maque/retriever/milvus.py +1140 -0
- maque/table_ops/__init__.py +1 -0
- maque/table_ops/core.py +133 -0
- maque/table_viewer/__init__.py +4 -0
- maque/table_viewer/download_assets.py +57 -0
- maque/table_viewer/server.py +698 -0
- maque/table_viewer/static/element-plus-icons.js +5791 -0
- maque/table_viewer/static/element-plus.css +1 -0
- maque/table_viewer/static/element-plus.js +65236 -0
- maque/table_viewer/static/main.css +268 -0
- maque/table_viewer/static/main.js +669 -0
- maque/table_viewer/static/vue.global.js +18227 -0
- maque/table_viewer/templates/index.html +401 -0
- maque/utils/__init__.py +56 -0
- maque/utils/color.py +68 -0
- maque/utils/color_string.py +45 -0
- maque/utils/compress.py +66 -0
- maque/utils/constant.py +183 -0
- maque/utils/core.py +261 -0
- maque/utils/cursor.py +143 -0
- maque/utils/distance.py +58 -0
- maque/utils/docker.py +96 -0
- maque/utils/downloads.py +51 -0
- maque/utils/excel_helper.py +542 -0
- maque/utils/helper_metrics.py +121 -0
- maque/utils/helper_parser.py +168 -0
- maque/utils/net.py +64 -0
- maque/utils/nvidia_stat.py +140 -0
- maque/utils/ops.py +53 -0
- maque/utils/packages.py +31 -0
- maque/utils/path.py +57 -0
- maque/utils/tar.py +260 -0
- maque/utils/untar.py +129 -0
- maque/web/__init__.py +0 -0
- maque/web/image_downloader.py +1410 -0
- maque-0.2.1.dist-info/METADATA +450 -0
- maque-0.2.1.dist-info/RECORD +143 -0
- maque-0.2.1.dist-info/WHEEL +4 -0
- maque-0.2.1.dist-info/entry_points.txt +3 -0
- maque-0.2.1.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,317 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Web界面应用框架
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import json
|
|
7
|
+
import uuid
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Dict, Any, Optional, List
|
|
10
|
+
from datetime import datetime
|
|
11
|
+
|
|
12
|
+
from fastapi import FastAPI, WebSocket, HTTPException, UploadFile, File, BackgroundTasks
|
|
13
|
+
from fastapi.responses import HTMLResponse, FileResponse
|
|
14
|
+
from fastapi.staticfiles import StaticFiles
|
|
15
|
+
from pydantic import BaseModel
|
|
16
|
+
import uvicorn
|
|
17
|
+
|
|
18
|
+
from .core import DataProcessorPipeline, PipelineConfig, StepResult
|
|
19
|
+
from .steps import get_all_steps, create_step_from_config
|
|
20
|
+
from loguru import logger
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class PipelineRunRequest(BaseModel):
    """Request body accepted by the pipeline-creation endpoint."""

    # Raw pipeline configuration; the create route passes it to
    # ``PipelineConfig.from_dict``.
    config: Dict[str, Any]
    # Optional step name to resume from. The routes in this module take the
    # resume point as a separate parameter and do not read this field.
    resume_from: Optional[str] = None
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class PipelineStatusResponse(BaseModel):
    """Snapshot of a single pipeline's execution state."""

    # Identifier handed out when the pipeline was created.
    pipeline_id: str
    # One of: idle, running, completed, failed.
    status: str
    # Name of the step currently executing, if any.
    current_step: Optional[str] = None
    # Completed fraction of the run.
    progress: float = 0.0
    # Per-step result dictionaries collected once the run finishes.
    results: List[Dict[str, Any]] = []
    # Error text recorded when the run fails.
    error: Optional[str] = None
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class WebSocketManager:
    """Keep track of accepted WebSocket connections, keyed by client id."""

    def __init__(self):
        # client id -> accepted socket
        self.connections: Dict[str, WebSocket] = {}

    async def connect(self, websocket: WebSocket, client_id: str):
        """Accept the handshake and register the socket under ``client_id``.

        A previously registered socket with the same id is replaced.
        """
        await websocket.accept()
        self.connections[client_id] = websocket
        logger.info(f"WebSocket连接建立: {client_id}")

    def disconnect(self, client_id: str):
        """Drop ``client_id`` from the registry; unknown ids are ignored."""
        if client_id not in self.connections:
            return
        del self.connections[client_id]
        logger.info(f"WebSocket连接断开: {client_id}")

    async def send_message(self, client_id: str, message: Dict[str, Any]):
        """Send ``message`` as JSON text to ``client_id`` if it is connected.

        Any failure (serialisation or transport) is logged and the client is
        removed from the registry; nothing is raised to the caller.
        """
        if client_id not in self.connections:
            return
        try:
            socket = self.connections[client_id]
            payload = json.dumps(message)
            await socket.send_text(payload)
        except Exception as e:
            logger.error(f"发送WebSocket消息失败: {e}")
            self.disconnect(client_id)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class PipelineManager:
    """Create, run and track pipelines, pushing status changes to WebSocket clients."""

    def __init__(self):
        # pipeline id -> pipeline instance
        self.pipelines: Dict[str, DataProcessorPipeline] = {}
        # pipeline id -> mutable status dictionary (see ``create_pipeline``)
        self.pipeline_status: Dict[str, Dict[str, Any]] = {}
        self.websocket_manager = WebSocketManager()
        # Strong references to in-flight status-update tasks. The event loop
        # only keeps weak references to tasks, so an unreferenced task may be
        # garbage-collected before it finishes.
        self._status_tasks = set()

    def create_pipeline(self, config: PipelineConfig) -> str:
        """Build a pipeline from ``config`` and register it.

        Returns:
            The freshly generated pipeline id (a UUID4 string).
        """
        pipeline_id = str(uuid.uuid4())
        pipeline = DataProcessorPipeline(config)

        self.pipelines[pipeline_id] = pipeline
        self.pipeline_status[pipeline_id] = {
            "status": "idle",
            "current_step": None,
            "progress": 0.0,
            "results": [],
            "error": None,
            "created_at": datetime.now().isoformat(),
        }

        def _on_status(status, data):
            # Schedule the async status update on the running loop and keep
            # the task alive until it completes.
            task = asyncio.create_task(
                self._update_pipeline_status(pipeline_id, status, data)
            )
            self._status_tasks.add(task)
            task.add_done_callback(self._status_tasks.discard)
            return task

        # Register the status callback on the pipeline.
        pipeline.set_status_callback(_on_status)

        return pipeline_id

    async def _update_pipeline_status(
        self, pipeline_id: str, status: str, data: Dict[str, Any]
    ):
        """Apply a status event to the stored state and broadcast the result.

        Recognised events are ``starting``, ``executing_step``,
        ``step_failed`` and ``completed``; any other event leaves the stored
        state untouched but is still broadcast. Unknown pipeline ids are
        ignored.
        """
        if pipeline_id not in self.pipeline_status:
            return

        pipeline_status = self.pipeline_status[pipeline_id]

        if status == "starting":
            pipeline_status["status"] = "running"
            pipeline_status["progress"] = 0.0
        elif status == "executing_step":
            pipeline_status["current_step"] = data.get("step_name")
            # Guard against a reported total of 0 (or None), which would
            # otherwise raise ZeroDivisionError / TypeError here.
            total_steps = data.get("total_steps", 1) or 1
            pipeline_status["progress"] = (data.get("step_index", 0) + 1) / total_steps
        elif status == "step_failed":
            pipeline_status["status"] = "failed"
            pipeline_status["error"] = data.get("error")
        elif status == "completed":
            pipeline_status["status"] = "completed"
            pipeline_status["progress"] = 1.0
            pipeline_status["current_step"] = None

        # Push the update over WebSocket; the pipeline id doubles as the
        # client id of the socket to notify.
        await self.websocket_manager.send_message(
            pipeline_id,
            {
                "type": "status_update",
                "pipeline_id": pipeline_id,
                "status": pipeline_status,
            },
        )

    async def run_pipeline(self, pipeline_id: str, resume_from: Optional[str] = None):
        """Run a registered pipeline and store its results.

        Raises:
            ValueError: if ``pipeline_id`` is not registered.

        Failures during the run itself are not raised: they are logged and
        recorded in the pipeline's status entry.
        """
        if pipeline_id not in self.pipelines:
            raise ValueError(f"Pipeline {pipeline_id} 不存在")

        pipeline = self.pipelines[pipeline_id]

        try:
            results = await pipeline.run(resume_from=resume_from)
            self.pipeline_status[pipeline_id]["results"] = [
                r.to_dict() for r in results
            ]
            pipeline.save_final_results()
        except Exception as e:
            logger.error(f"Pipeline {pipeline_id} 执行失败: {e}")
            self.pipeline_status[pipeline_id]["status"] = "failed"
            self.pipeline_status[pipeline_id]["error"] = str(e)

    def get_pipeline_status(self, pipeline_id: str) -> Dict[str, Any]:
        """Return the live status dictionary for ``pipeline_id``.

        Raises:
            ValueError: if ``pipeline_id`` is not registered.
        """
        if pipeline_id not in self.pipeline_status:
            raise ValueError(f"Pipeline {pipeline_id} 不存在")

        return self.pipeline_status[pipeline_id]

    def list_pipelines(self) -> List[Dict[str, Any]]:
        """Return one dictionary per pipeline: its status plus ``pipeline_id``."""
        return [
            {"pipeline_id": pid, **status}
            for pid, status in self.pipeline_status.items()
        ]
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
class WebApp:
    """FastAPI application exposing the pipeline manager over HTTP and WebSocket."""

    def __init__(self, static_dir: Optional[str] = None):
        """Create the app, register the routes, then mount static files.

        Args:
            static_dir: Directory holding static assets and an optional
                ``index.html``. Defaults to ``static`` next to this module.
        """
        self.app = FastAPI(title="MLLM Data Processor Pipeline", version="0.1.0")
        self.pipeline_manager = PipelineManager()
        self.static_dir = static_dir or str(Path(__file__).parent / "static")

        self._setup_routes()
        self._setup_static_files()

    def _setup_static_files(self):
        """Mount ``static_dir`` under ``/static`` when the directory exists."""
        static_path = Path(self.static_dir)
        if static_path.exists():
            self.app.mount(
                "/static", StaticFiles(directory=str(static_path)), name="static"
            )

    def _setup_routes(self):
        """Register all HTTP and WebSocket routes on ``self.app``."""
        # Imported locally so the module-level import block stays untouched;
        # fastapi is already a dependency of this module.
        from fastapi import WebSocketDisconnect

        @self.app.get("/", response_class=HTMLResponse)
        async def index():
            """Home page."""
            html_file = Path(self.static_dir) / "index.html"
            if html_file.exists():
                return FileResponse(html_file)
            return self._get_default_html()

        @self.app.post("/api/pipeline/create")
        async def create_pipeline(request: PipelineRunRequest):
            """Create a pipeline."""
            try:
                config = PipelineConfig.from_dict(request.config)
                pipeline_id = self.pipeline_manager.create_pipeline(config)
                return {"pipeline_id": pipeline_id, "status": "created"}
            except Exception as e:
                logger.error(f"创建Pipeline失败: {e}")
                raise HTTPException(status_code=400, detail=str(e))

        @self.app.post("/api/pipeline/{pipeline_id}/run")
        async def run_pipeline(
            pipeline_id: str,
            background_tasks: BackgroundTasks,
            resume_from: Optional[str] = None,
        ):
            """Run a pipeline in the background."""
            # Reject unknown ids up front: the manager would only raise inside
            # the background task, after a success response was already sent.
            if pipeline_id not in self.pipeline_manager.pipelines:
                raise HTTPException(
                    status_code=404, detail=f"Pipeline {pipeline_id} 不存在"
                )
            try:
                background_tasks.add_task(
                    self.pipeline_manager.run_pipeline, pipeline_id, resume_from
                )
                return {"message": "Pipeline开始执行", "pipeline_id": pipeline_id}
            except Exception as e:
                logger.error(f"运行Pipeline失败: {e}")
                raise HTTPException(status_code=400, detail=str(e))

        @self.app.get("/api/pipeline/{pipeline_id}/status")
        async def get_pipeline_status(pipeline_id: str):
            """Get the status of a pipeline."""
            try:
                return self.pipeline_manager.get_pipeline_status(pipeline_id)
            except Exception as e:
                raise HTTPException(status_code=404, detail=str(e))

        @self.app.get("/api/pipelines")
        async def list_pipelines():
            """List all pipelines."""
            return self.pipeline_manager.list_pipelines()

        @self.app.get("/api/steps")
        async def get_available_steps():
            """List the available processing steps."""
            return {
                "steps": [
                    {
                        "name": step_class.__name__,
                        "description": getattr(step_class, "__doc__", ""),
                        "config_schema": getattr(step_class, "CONFIG_SCHEMA", {}),
                    }
                    for step_class in get_all_steps()
                ]
            }

        @self.app.post("/api/upload")
        async def upload_file(file: UploadFile = File(...)):
            """Upload a file into the ``uploads`` directory."""
            try:
                # The filename is client-controlled. Keep only its last path
                # component (treating backslashes as separators too) so that
                # names like "../../x" or "/etc/x" cannot escape ``uploads``.
                raw_name = (file.filename or "").replace("\\", "/")
                safe_name = Path(raw_name).name
                if safe_name in ("", ".", ".."):
                    raise HTTPException(status_code=400, detail="无效的文件名")

                upload_dir = Path("uploads")
                upload_dir.mkdir(exist_ok=True)

                file_path = upload_dir / safe_name
                # Read before opening the target so a failed read does not
                # leave an empty file behind.
                content = await file.read()
                with open(file_path, "wb") as f:
                    f.write(content)

                return {
                    "filename": safe_name,
                    "file_path": str(file_path),
                    "size": len(content),
                }
            except HTTPException:
                # Deliberate client errors must not be rewritten as 500s.
                raise
            except Exception as e:
                logger.error(f"文件上传失败: {e}")
                raise HTTPException(status_code=500, detail=str(e))

        @self.app.websocket("/ws/{client_id}")
        async def websocket_endpoint(websocket: WebSocket, client_id: str):
            """WebSocket endpoint; incoming messages are parsed and logged."""
            await self.pipeline_manager.websocket_manager.connect(websocket, client_id)
            try:
                while True:
                    data = await websocket.receive_text()
                    # Handle the client message (currently: parse and log).
                    message = json.loads(data)
                    logger.info(f"收到WebSocket消息: {message}")
            except WebSocketDisconnect:
                # Normal client close; ``disconnect`` below logs it.
                pass
            except Exception as e:
                logger.error(f"WebSocket错误: {e}")
            finally:
                self.pipeline_manager.websocket_manager.disconnect(client_id)

    def _get_default_html(self) -> str:
        """Return the built-in landing page used when no ``index.html`` exists."""
        return """
        <!DOCTYPE html>
        <html>
        <head>
            <title>MLLM Data Processor Pipeline</title>
            <meta charset="utf-8">
            <style>
                body { font-family: Arial, sans-serif; margin: 40px; }
                .container { max-width: 1200px; margin: 0 auto; }
                h1 { color: #333; }
                .section { margin: 20px 0; padding: 20px; border: 1px solid #ddd; border-radius: 5px; }
            </style>
        </head>
        <body>
            <div class="container">
                <h1>MLLM Data Processor Pipeline</h1>
                <div class="section">
                    <h2>欢迎使用多模态大模型训练数据处理Pipeline</h2>
                    <p>这是一个灵活的、模块化的数据处理流水线,支持Web界面交互和断点续传。</p>
                    <p>请访问 <a href="/docs">/docs</a> 查看API文档</p>
                </div>
            </div>
        </body>
        </html>
        """

    def run(self, host: str = "127.0.0.1", port: int = 8000, **kwargs):
        """Serve the application with uvicorn.

        Args:
            host: Interface to bind.
            port: TCP port to listen on.
            **kwargs: Forwarded unchanged to ``uvicorn.run``.
        """
        logger.info(f"启动Web应用: http://{host}:{port}")
        uvicorn.run(self.app, host=host, port=port, **kwargs)
|
maque/nlp/__init__.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
from .sentence_splitter import SentenceSplitter, Sentence, split_sentences
|
|
2
|
+
from .risk_matcher import RiskMatcher, MatchResult, ExtractResult, match_and_extract
|
|
3
|
+
|
|
4
|
+
# Public API of the package, grouped by the submodule each name comes from.
__all__ = [
    # from .sentence_splitter
    "SentenceSplitter",
    "Sentence",
    "split_sentences",
    # from .risk_matcher
    "RiskMatcher",
    "MatchResult",
    "ExtractResult",
    "match_and_extract",
]
|
maque/nlp/ngram.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
|
|
2
|
+
def ngram(words: str, n: int, reverse=True):
    """Count the n-grams of ``words`` and return them ordered by frequency.

    The input is split into its items (characters for a string) and wrapped
    in the boundary markers ``'<S>'`` and ``'<E>'`` before windows of length
    ``n`` are taken.

    Args:
        words: Sequence to analyse; any iterable of hashable items works.
        n: Window length. Values below 1 produce no n-grams.
        reverse: When true (default), most frequent n-grams come first.

    Returns:
        A list of ``(ngram_tuple, count)`` pairs sorted by count. Ties keep
        the order in which the n-grams were first seen.
    """
    tokens = ["<S>", *words, "<E>"]
    counts = {}
    # Zipping n progressively shifted views yields every length-n window;
    # this replaces the former eval() of a generated "zip(...)" string.
    for gram in zip(*(tokens[offset:] for offset in range(n))):
        counts[gram] = counts.get(gram, 0) + 1
    return sorted(counts.items(), key=lambda item: item[1], reverse=reverse)
|
maque/nlp/parser.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
import json5
|
|
2
|
+
import re
|
|
3
|
+
import ast
|
|
4
|
+
from typing import Optional
|
|
5
|
+
|
|
6
|
+
def extract_code_snippets(text, strict=True):
    """Extract code snippets from ``text``.

    Fenced blocks delimited by triple backticks are always collected. When
    ``strict`` is false, the fenced blocks are removed from the text and
    every outermost balanced ``{ ... }`` group in the remainder is collected
    as well, so nested objects such as ``{"a": {"b": 1}}`` are kept whole.

    Args:
        text: Text to scan.
        strict: If true, only fenced blocks are returned.

    Returns:
        A list of ``{"language": ..., "code": ...}`` dictionaries in order of
        appearance (fenced blocks first). ``language`` is ``"unknown"`` when
        the fence carries no language tag and for brace groups.
    """
    # Handle blocks wrapped in ``` fences first.
    fenced_pattern = r"```(\w+)?\s*([\s\S]*?)```"
    code_snippets = [
        {
            "language": lang.strip() if lang else "unknown",
            "code": code.strip(),
        }
        for lang, code in re.findall(fenced_pattern, text)
    ]

    if not strict:
        # Drop the fenced blocks already handled, then pick up bare
        # brace-delimited groups from whatever is left.
        remainder = re.sub(fenced_pattern, "", text)
        code_snippets.extend(
            {"language": "unknown", "code": span.strip()}
            for span in _outermost_brace_spans(remainder)
        )

    return code_snippets


def _outermost_brace_spans(text):
    """Return every outermost balanced ``{...}`` substring of ``text``.

    Braces are matched purely by nesting depth; quoting is not interpreted.
    Unmatched braces are skipped, so a balanced group nested inside an
    unclosed one is still returned.
    """
    open_positions = []
    pairs = []
    for index, char in enumerate(text):
        if char == "{":
            open_positions.append(index)
        elif char == "}" and open_positions:
            pairs.append((open_positions.pop(), index))

    # Matched pairs are either nested or disjoint; walking them by start
    # position and skipping anything inside the last kept pair leaves only
    # the outermost groups.
    pairs.sort()
    spans = []
    last_end = -1
    for start, end in pairs:
        if start > last_end:
            spans.append(text[start:end + 1])
            last_end = end
    return spans
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def parse_to_obj(text: str, strict=False, raise_error=True):
    """Parse the last code snippet found in ``text`` into a Python object.

    Snippets are located with ``extract_code_snippets``. The last non-empty
    one is parsed first as a Python literal (``ast.literal_eval``) and, if
    that fails, as JSON5.

    Args:
        text: Text containing the snippet.
        strict: Forwarded to ``extract_code_snippets``.
        raise_error: If true, raise when a snippet exists but cannot be
            parsed; otherwise return ``None`` in that case.

    Returns:
        The parsed object, or ``None`` when no snippet is found (regardless
        of ``raise_error``) or parsing fails with ``raise_error`` false.

    Raises:
        ValueError: parsing failed and ``raise_error`` is true.
    """
    candidates = [
        snippet["code"].strip()
        for snippet in extract_code_snippets(text, strict=strict)
    ]
    candidates = [code for code in candidates if code]
    if not candidates:
        return None
    code_str = candidates[-1]

    # ``except Exception`` rather than a bare ``except`` so that
    # KeyboardInterrupt / SystemExit are never swallowed.
    try:
        return ast.literal_eval(code_str)
    except Exception:
        pass

    try:
        return json5.loads(code_str)
    except Exception as exc:
        if raise_error:
            raise ValueError(f"Failed to parse to obj: {text}") from exc
        return None
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def parse_to_code(text: str, strict=False) -> Optional[str]:
    """Return the last non-empty code snippet found in ``text``.

    Snippets are located with ``extract_code_snippets`` (``strict`` is
    forwarded to it) and stripped of surrounding whitespace. ``None`` is
    returned when no non-empty snippet exists.
    """
    stripped = [
        found["code"].strip()
        for found in extract_code_snippets(text, strict=strict)
    ]
    non_empty = [code for code in stripped if code]
    return non_empty[-1] if non_empty else None
|