maque 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143):
  1. maque/__init__.py +30 -0
  2. maque/__main__.py +926 -0
  3. maque/ai_platform/__init__.py +0 -0
  4. maque/ai_platform/crawl.py +45 -0
  5. maque/ai_platform/metrics.py +258 -0
  6. maque/ai_platform/nlp_preprocess.py +67 -0
  7. maque/ai_platform/webpage_screen_shot.py +195 -0
  8. maque/algorithms/__init__.py +78 -0
  9. maque/algorithms/bezier.py +15 -0
  10. maque/algorithms/bktree.py +117 -0
  11. maque/algorithms/core.py +104 -0
  12. maque/algorithms/hilbert.py +16 -0
  13. maque/algorithms/rate_function.py +92 -0
  14. maque/algorithms/transform.py +27 -0
  15. maque/algorithms/trie.py +272 -0
  16. maque/algorithms/utils.py +63 -0
  17. maque/algorithms/video.py +587 -0
  18. maque/api/__init__.py +1 -0
  19. maque/api/common.py +110 -0
  20. maque/api/fetch.py +26 -0
  21. maque/api/static/icon.png +0 -0
  22. maque/api/static/redoc.standalone.js +1782 -0
  23. maque/api/static/swagger-ui-bundle.js +3 -0
  24. maque/api/static/swagger-ui.css +3 -0
  25. maque/cli/__init__.py +1 -0
  26. maque/cli/clean_invisible_chars.py +324 -0
  27. maque/cli/core.py +34 -0
  28. maque/cli/groups/__init__.py +26 -0
  29. maque/cli/groups/config.py +205 -0
  30. maque/cli/groups/data.py +615 -0
  31. maque/cli/groups/doctor.py +259 -0
  32. maque/cli/groups/embedding.py +222 -0
  33. maque/cli/groups/git.py +29 -0
  34. maque/cli/groups/help.py +410 -0
  35. maque/cli/groups/llm.py +223 -0
  36. maque/cli/groups/mcp.py +241 -0
  37. maque/cli/groups/mllm.py +1795 -0
  38. maque/cli/groups/mllm_simple.py +60 -0
  39. maque/cli/groups/quant.py +210 -0
  40. maque/cli/groups/service.py +490 -0
  41. maque/cli/groups/system.py +570 -0
  42. maque/cli/mllm_run.py +1451 -0
  43. maque/cli/script.py +52 -0
  44. maque/cli/tree.py +49 -0
  45. maque/clustering/__init__.py +52 -0
  46. maque/clustering/analyzer.py +347 -0
  47. maque/clustering/clusterers.py +464 -0
  48. maque/clustering/sampler.py +134 -0
  49. maque/clustering/visualizer.py +205 -0
  50. maque/constant.py +13 -0
  51. maque/core.py +133 -0
  52. maque/cv/__init__.py +1 -0
  53. maque/cv/image.py +219 -0
  54. maque/cv/utils.py +68 -0
  55. maque/cv/video/__init__.py +3 -0
  56. maque/cv/video/keyframe_extractor.py +368 -0
  57. maque/embedding/__init__.py +43 -0
  58. maque/embedding/base.py +56 -0
  59. maque/embedding/multimodal.py +308 -0
  60. maque/embedding/server.py +523 -0
  61. maque/embedding/text.py +311 -0
  62. maque/git/__init__.py +24 -0
  63. maque/git/pure_git.py +912 -0
  64. maque/io/__init__.py +29 -0
  65. maque/io/core.py +38 -0
  66. maque/io/ops.py +194 -0
  67. maque/llm/__init__.py +111 -0
  68. maque/llm/backend.py +416 -0
  69. maque/llm/base.py +411 -0
  70. maque/llm/server.py +366 -0
  71. maque/mcp_server.py +1096 -0
  72. maque/mllm_data_processor_pipeline/__init__.py +17 -0
  73. maque/mllm_data_processor_pipeline/core.py +341 -0
  74. maque/mllm_data_processor_pipeline/example.py +291 -0
  75. maque/mllm_data_processor_pipeline/steps/__init__.py +56 -0
  76. maque/mllm_data_processor_pipeline/steps/data_alignment.py +267 -0
  77. maque/mllm_data_processor_pipeline/steps/data_loader.py +172 -0
  78. maque/mllm_data_processor_pipeline/steps/data_validation.py +304 -0
  79. maque/mllm_data_processor_pipeline/steps/format_conversion.py +411 -0
  80. maque/mllm_data_processor_pipeline/steps/mllm_annotation.py +331 -0
  81. maque/mllm_data_processor_pipeline/steps/mllm_refinement.py +446 -0
  82. maque/mllm_data_processor_pipeline/steps/result_validation.py +501 -0
  83. maque/mllm_data_processor_pipeline/web_app.py +317 -0
  84. maque/nlp/__init__.py +14 -0
  85. maque/nlp/ngram.py +9 -0
  86. maque/nlp/parser.py +63 -0
  87. maque/nlp/risk_matcher.py +543 -0
  88. maque/nlp/sentence_splitter.py +202 -0
  89. maque/nlp/simple_tradition_cvt.py +31 -0
  90. maque/performance/__init__.py +21 -0
  91. maque/performance/_measure_time.py +70 -0
  92. maque/performance/_profiler.py +367 -0
  93. maque/performance/_stat_memory.py +51 -0
  94. maque/pipelines/__init__.py +15 -0
  95. maque/pipelines/clustering.py +252 -0
  96. maque/quantization/__init__.py +42 -0
  97. maque/quantization/auto_round.py +120 -0
  98. maque/quantization/base.py +145 -0
  99. maque/quantization/bitsandbytes.py +127 -0
  100. maque/quantization/llm_compressor.py +102 -0
  101. maque/retriever/__init__.py +35 -0
  102. maque/retriever/chroma.py +654 -0
  103. maque/retriever/document.py +140 -0
  104. maque/retriever/milvus.py +1140 -0
  105. maque/table_ops/__init__.py +1 -0
  106. maque/table_ops/core.py +133 -0
  107. maque/table_viewer/__init__.py +4 -0
  108. maque/table_viewer/download_assets.py +57 -0
  109. maque/table_viewer/server.py +698 -0
  110. maque/table_viewer/static/element-plus-icons.js +5791 -0
  111. maque/table_viewer/static/element-plus.css +1 -0
  112. maque/table_viewer/static/element-plus.js +65236 -0
  113. maque/table_viewer/static/main.css +268 -0
  114. maque/table_viewer/static/main.js +669 -0
  115. maque/table_viewer/static/vue.global.js +18227 -0
  116. maque/table_viewer/templates/index.html +401 -0
  117. maque/utils/__init__.py +56 -0
  118. maque/utils/color.py +68 -0
  119. maque/utils/color_string.py +45 -0
  120. maque/utils/compress.py +66 -0
  121. maque/utils/constant.py +183 -0
  122. maque/utils/core.py +261 -0
  123. maque/utils/cursor.py +143 -0
  124. maque/utils/distance.py +58 -0
  125. maque/utils/docker.py +96 -0
  126. maque/utils/downloads.py +51 -0
  127. maque/utils/excel_helper.py +542 -0
  128. maque/utils/helper_metrics.py +121 -0
  129. maque/utils/helper_parser.py +168 -0
  130. maque/utils/net.py +64 -0
  131. maque/utils/nvidia_stat.py +140 -0
  132. maque/utils/ops.py +53 -0
  133. maque/utils/packages.py +31 -0
  134. maque/utils/path.py +57 -0
  135. maque/utils/tar.py +260 -0
  136. maque/utils/untar.py +129 -0
  137. maque/web/__init__.py +0 -0
  138. maque/web/image_downloader.py +1410 -0
  139. maque-0.2.1.dist-info/METADATA +450 -0
  140. maque-0.2.1.dist-info/RECORD +143 -0
  141. maque-0.2.1.dist-info/WHEEL +4 -0
  142. maque-0.2.1.dist-info/entry_points.txt +3 -0
  143. maque-0.2.1.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,698 @@
1
+ """
2
+ 表格查看器后端服务 - 基于FastAPI
3
+ 支持表格展示、图片URL预览、筛选、编辑等功能
4
+ """
5
+
6
+ from fastapi import FastAPI, Request, HTTPException, Query, UploadFile, File
7
+ from fastapi.responses import HTMLResponse, FileResponse
8
+ from fastapi.staticfiles import StaticFiles
9
+ from fastapi.templating import Jinja2Templates
10
+ from fastapi.middleware.cors import CORSMiddleware
11
+ import pandas as pd
12
+ import numpy as np
13
+ from pathlib import Path
14
+ from typing import Optional, Dict, Any, List, Union
15
+ import json
16
+ import uvicorn
17
+ import webbrowser
18
+ import asyncio
19
+ from dataclasses import dataclass
20
+ import re
21
+ import requests
22
+ import aiohttp
23
+ import aiofiles
24
+ from urllib.parse import urlparse
25
+ import hashlib
26
+ import tempfile
27
+ import os
28
+ import random
29
+ import time
30
+ from maque.utils.helper_parser import split_image_paths
31
+
32
+
33
@dataclass
class FilterConfig:
    """Describe a single row filter: which column, how to compare, against what."""

    # Name of the column the filter is applied to.
    column: str
    # Comparison keyword; one of:
    # eq, ne, contains, startswith, endswith, gt, lt, ge, le
    operator: str
    # Right-hand side of the comparison.
    value: Any
40
+
41
+
42
+ class TableViewerServer:
43
+ """表格查看器服务器"""
44
+
45
def __init__(
    self,
    file_path: Optional[str] = None,
    port: int = 8080,
    host: str = "127.0.0.1",
    sheet_name: Union[str, int] = 0,
    image_columns: Optional[List[str]] = None,
    auto_detect_images: bool = True,
):
    """Build the FastAPI application and load the initial table.

    Args:
        file_path: Table file to load at start-up. When omitted the server
            starts with an empty DataFrame.
        port: TCP port passed to uvicorn by ``run``.
        host: Bind address passed to uvicorn by ``run``.
        sheet_name: Sheet selector forwarded to ``pandas.read_excel``.
        image_columns: Columns to treat as image columns. The list is
            copied, so auto-detection never mutates the caller's object.
        auto_detect_images: When true, ``_detect_image_columns`` is run
            after loading and may add further columns.

    Raises:
        FileNotFoundError, ValueError: propagated from ``_load_data`` when
            ``file_path`` is given but cannot be loaded.
    """
    self.file_path = Path(file_path) if file_path else None
    self.port = port
    self.host = host
    self.sheet_name = sheet_name
    # Copy instead of aliasing: _detect_image_columns appends to this list,
    # which previously modified the list object owned by the caller.
    self.image_columns = list(image_columns) if image_columns else []
    self.auto_detect_images = auto_detect_images

    # Initialise the FastAPI application.
    self.app = FastAPI(
        title="Sparrow Table Viewer",
        description="高性能表格查看器,支持图片预览、筛选、编辑",
        version="1.0.0",
    )

    # CORS: every origin, method and header is allowed.
    self.app.add_middleware(
        CORSMiddleware,
        allow_origins=["*"],
        allow_methods=["*"],
        allow_headers=["*"],
    )

    # Mount bundled static assets when the directory ships with the package.
    static_dir = Path(__file__).parent / "static"
    if static_dir.exists():
        self.app.mount("/static", StaticFiles(directory=static_dir), name="static")

    # Template directory used by the index route.
    template_dir = Path(__file__).parent / "templates"
    self.templates = Jinja2Templates(directory=template_dir)

    # Load the data; ``original_df`` is the snapshot used by the reset route.
    if self.file_path:
        self.df = self._load_data()
        self.original_df = self.df.copy()

        # Auto-detect image columns.
        if self.auto_detect_images:
            self._detect_image_columns()
    else:
        # No file yet: start with empty frames.
        self.df = pd.DataFrame()
        self.original_df = pd.DataFrame()

    # Image cache: proxied images are stored in a private temp directory
    # that ``run`` removes on shutdown.
    self._image_cache = {}
    self._temp_dir = tempfile.mkdtemp(prefix="maque_table_viewer_")

    # Uploaded table files live in a sub-directory of the temp directory.
    self._uploads_dir = Path(self._temp_dir) / "uploads"
    self._uploads_dir.mkdir(exist_ok=True)

    # aiohttp session, created lazily by _get_http_session.
    self._http_session = None

    # Pool of browser User-Agent strings; one is picked per image request.
    self._user_agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:120.0) Gecko/20100101 Firefox/120.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:120.0) Gecko/20100101 Firefox/120.0",
    ]

    # Register the HTTP routes.
    self._setup_routes()
121
+
122
async def _get_http_session(self):
    """Return the shared aiohttp session, creating a new one if needed.

    A new session is built when none exists yet or the previous one has
    been closed; otherwise the existing session is reused.
    """
    existing = self._http_session
    if existing is not None and not existing.closed:
        return existing

    # Connection pool: 100 connections overall, 20 per host, DNS answers
    # cached for 300 seconds.
    pool = aiohttp.TCPConnector(
        limit=100,
        limit_per_host=20,
        ttl_dns_cache=300,
        use_dns_cache=True,
    )

    # Time limits in seconds: whole request / connect / socket read.
    limits = aiohttp.ClientTimeout(total=30, connect=10, sock_read=20)

    # Headers sent with every request made through this session.
    default_headers = {
        "Accept": "image/webp,image/apng,image/*,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Cache-Control": "no-cache",
        "Sec-Fetch-Dest": "image",
        "Sec-Fetch-Mode": "no-cors",
        "Sec-Fetch-Site": "cross-site",
    }

    self._http_session = aiohttp.ClientSession(
        connector=pool,
        timeout=limits,
        headers=default_headers,
    )
    return self._http_session
154
+
155
+ def _get_anti_bot_headers(self):
156
+ """获取反爬虫请求头"""
157
+ return {
158
+ "User-Agent": random.choice(self._user_agents),
159
+ "Referer": "https://www.google.com/",
160
+ "Accept": "image/webp,image/apng,image/*,*/*;q=0.8",
161
+ "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
162
+ "Accept-Encoding": "gzip, deflate, br",
163
+ "DNT": "1",
164
+ "Connection": "keep-alive",
165
+ "Upgrade-Insecure-Requests": "1",
166
+ "Sec-Fetch-Dest": "image",
167
+ "Sec-Fetch-Mode": "no-cors",
168
+ "Sec-Fetch-Site": "cross-site",
169
+ "Cache-Control": "no-cache",
170
+ "Pragma": "no-cache",
171
+ }
172
+
173
async def _download_image_async(
    self, url: str, cache_path: Path, max_retries: int = 3
) -> bool:
    """Download ``url`` into ``cache_path``, retrying on failure.

    The body is streamed into a temporary sibling file and moved onto
    ``cache_path`` only after the whole response has been written. A
    download that times out or fails half-way therefore never leaves a
    truncated file at ``cache_path`` (callers treat the mere existence of
    that file as a valid cache hit).

    Args:
        url: Address of the image.
        cache_path: Final location of the downloaded file.
        max_retries: Maximum number of attempts.

    Returns:
        True when the file was downloaded completely, False once every
        attempt has failed.
    """
    session = await self._get_http_session()

    for attempt in range(max_retries):
        has_more_attempts = attempt < max_retries - 1
        # Unique name per attempt so concurrent downloads of the same URL
        # never write into each other's partial file.
        part_path = cache_path.with_name(
            f"{cache_path.name}.{random.getrandbits(32):08x}.part"
        )
        try:
            # Exponential back-off plus random jitter before every retry.
            if attempt > 0:
                delay = (2**attempt) + random.uniform(0.1, 0.5)
                await asyncio.sleep(delay)

            # Fresh browser-like headers (random User-Agent) per attempt.
            headers = self._get_anti_bot_headers()

            async with session.get(url, headers=headers) as response:
                if response.status == 200:
                    # Stream the body to the temporary file.
                    async with aiofiles.open(part_path, "wb") as f:
                        async for chunk in response.content.iter_chunked(8192):
                            await f.write(chunk)
                    # Publish the finished file in one step.
                    os.replace(part_path, cache_path)
                    return True
                elif response.status == 403:
                    # Forbidden: wait a little longer before retrying.
                    print(
                        f"图片下载被拒绝 (403): {url}, 尝试 {attempt + 1}/{max_retries}"
                    )
                    if has_more_attempts:
                        await asyncio.sleep(random.uniform(1.0, 3.0))
                        continue
                elif response.status == 429:
                    # Rate limited: wait even longer before retrying.
                    print(
                        f"请求过于频繁 (429): {url}, 尝试 {attempt + 1}/{max_retries}"
                    )
                    if has_more_attempts:
                        await asyncio.sleep(random.uniform(3.0, 8.0))
                        continue
                else:
                    print(f"图片下载失败,状态码 {response.status}: {url}")
                    if has_more_attempts:
                        continue

        except asyncio.TimeoutError:
            print(f"图片下载超时: {url}, 尝试 {attempt + 1}/{max_retries}")
        except aiohttp.ClientError as e:
            print(f"网络错误: {url}, {e}, 尝试 {attempt + 1}/{max_retries}")
        except Exception as e:
            print(f"图片下载异常: {url}, {e}, 尝试 {attempt + 1}/{max_retries}")
        finally:
            # Drop whatever partial data this attempt left behind. After a
            # successful os.replace the file is already gone, which is fine.
            try:
                part_path.unlink()
            except OSError:
                pass

        # Short pause after an error unless this was the last attempt.
        if has_more_attempts:
            await asyncio.sleep(random.uniform(0.5, 2.0))

    return False
232
+
233
+ def _reload_data(self, new_file_path: Path):
234
+ """重新加载新的数据文件"""
235
+ self.file_path = new_file_path
236
+ self.df = self._load_data_from_path(new_file_path)
237
+ self.original_df = self.df.copy()
238
+
239
+ # 重新检测图片列
240
+ self.image_columns = []
241
+ if self.auto_detect_images:
242
+ self._detect_image_columns()
243
+
244
+ def _load_data(self) -> pd.DataFrame:
245
+ """加载当前文件路径的数据"""
246
+ if not self.file_path:
247
+ return pd.DataFrame()
248
+ return self._load_data_from_path(self.file_path)
249
+
250
+ def _load_data_from_path(self, file_path: Path) -> pd.DataFrame:
251
+ """加载指定路径的表格数据"""
252
+ if not file_path.exists():
253
+ raise FileNotFoundError(f"文件不存在: {file_path}")
254
+
255
+ file_extension = file_path.suffix.lower()
256
+
257
+ if file_extension == ".csv":
258
+ try:
259
+ # 尝试不同编码
260
+ for encoding in ["utf-8", "gbk", "gb2312", "latin1"]:
261
+ try:
262
+ df = pd.read_csv(file_path, encoding=encoding)
263
+ print(f"成功使用 {encoding} 编码加载CSV文件")
264
+ return df
265
+ except UnicodeDecodeError:
266
+ continue
267
+ raise ValueError("无法确定CSV文件编码")
268
+ except Exception as e:
269
+ raise ValueError(f"加载CSV文件失败: {e}")
270
+
271
+ elif file_extension in [".xlsx", ".xls"]:
272
+ try:
273
+ df = pd.read_excel(file_path, sheet_name=self.sheet_name)
274
+ return df
275
+ except Exception as e:
276
+ raise ValueError(f"加载Excel文件失败: {e}")
277
+ else:
278
+ raise ValueError(f"不支持的文件格式: {file_extension}")
279
+
280
+ def _detect_image_columns(self):
281
+ """自动检测包含图片URL的列"""
282
+ for column in self.df.columns:
283
+ # 检查前10行的数据
284
+ sample_data = self.df[column].dropna().head(10)
285
+ image_count = 0
286
+
287
+ for value in sample_data:
288
+ if isinstance(value, str):
289
+ # 使用split_image_paths检测图片路径
290
+ image_paths = split_image_paths(value)
291
+ if image_paths:
292
+ image_count += 1
293
+
294
+ # 如果超过50%的样本包含图片URL,则认为是图片列
295
+ if image_count / len(sample_data) > 0.5 if len(sample_data) > 0 else False:
296
+ if column not in self.image_columns:
297
+ self.image_columns.append(column)
298
+ print(f"自动检测到图片列: {column}")
299
+
300
def _setup_routes(self):
    """Register every HTTP endpoint on ``self.app``.

    Routes:
        GET  /                                   index page
        GET  /api/table/info                     table metadata
        GET  /api/table/data                     filtered, sorted, paged rows
        PUT  /api/table/cell/{row_index}/{column} edit one cell in memory
        POST /api/table/save                     write the table back to its file
        POST /api/table/reset                    discard in-memory edits
        GET  /api/image/proxy                    serve a remote or local image
        POST /api/table/upload                   replace the table by an upload
    """
    # Suffixes accepted for local files served by the image proxy.
    image_suffixes = (".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp")
    # Escaped spellings a client may send for control-character separators.
    escaped_separators = {"\\n": "\n", "\\r": "\r", "\\t": "\t"}

    @self.app.get("/", response_class=HTMLResponse)
    async def get_index(request: Request):
        """Serve the main page."""
        return self.templates.TemplateResponse("index.html", {"request": request})

    @self.app.get("/api/table/info")
    async def get_table_info():
        """Return basic metadata, computed from the pristine snapshot."""
        return {
            "total_rows": len(self.original_df),
            "total_columns": len(self.original_df.columns),
            "columns": list(self.original_df.columns),
            "image_columns": self.image_columns,
            "file_path": str(self.file_path),
            "dtypes": {
                col: str(dtype) for col, dtype in self.original_df.dtypes.items()
            },
        }

    @self.app.get("/api/table/data")
    async def get_table_data(
        page: int = Query(1, ge=1),
        page_size: int = Query(100, ge=10, le=1000),
        sort_by: Optional[str] = None,
        sort_order: str = Query("asc", pattern="^(asc|desc)$"),
        filters: Optional[str] = None,
        visible_columns: Optional[str] = None,
        separator: Optional[str] = Query(None, description="自定义分隔符"),
    ):
        """Return one page of rows.

        ``filters`` and ``visible_columns`` are JSON-encoded strings; a
        value that cannot be parsed or applied yields HTTP 400.
        """
        df = self.df.copy()

        # Row filters.
        if filters:
            try:
                filter_configs = json.loads(filters)
                df = self._apply_filters(df, filter_configs)
            except Exception as e:
                raise HTTPException(status_code=400, detail=f"筛选参数错误: {e}")

        # Column selection; names that do not exist are ignored.
        display_columns = list(df.columns)
        if visible_columns:
            try:
                visible_cols = json.loads(visible_columns)
                if visible_cols and isinstance(visible_cols, list):
                    display_columns = [
                        col for col in visible_cols if col in df.columns
                    ]
                    if display_columns:
                        df = df[display_columns]
            except Exception as e:
                raise HTTPException(status_code=400, detail=f"列筛选参数错误: {e}")

        # Sorting.
        if sort_by and sort_by in df.columns:
            df = df.sort_values(by=sort_by, ascending=(sort_order == "asc"))

        # Paging.
        total_rows = len(df)
        start_idx = (page - 1) * page_size
        end_idx = min(start_idx + page_size, total_rows)
        page_data = df.iloc[start_idx:end_idx]

        # The custom separator is the same for every cell, so it is
        # normalised once here instead of once per image cell.
        separators = None
        if separator:
            separators = [escaped_separators.get(separator, separator)]

        # Convert the page to the structure expected by the front end.
        data = []
        for idx, row in page_data.iterrows():
            row_data = {"_index": idx}
            for col in df.columns:
                value = row[col]
                if pd.isna(value):
                    # Missing values are sent as null.
                    row_data[col] = None
                elif col in self.image_columns and isinstance(value, str):
                    # Image cells carry both the raw string and the split paths.
                    row_data[col] = {
                        "original": value,
                        "paths": split_image_paths(value, separators=separators),
                    }
                else:
                    row_data[col] = value
            data.append(row_data)

        return {
            "data": data,
            "total": total_rows,
            "page": page,
            "page_size": page_size,
            "total_pages": (total_rows + page_size - 1) // page_size,
            "visible_columns": display_columns,
        }

    @self.app.put("/api/table/cell/{row_index}/{column}")
    async def update_cell(row_index: int, column: str, request: Request):
        """Overwrite one cell of the in-memory table with the body's ``value``."""
        if self.df.empty:
            raise HTTPException(
                status_code=400, detail="没有加载任何表格数据,请先上传文件"
            )

        if column not in self.df.columns:
            raise HTTPException(status_code=404, detail="列不存在")

        if row_index < 0 or row_index >= len(self.df):
            raise HTTPException(status_code=404, detail="行索引超出范围")

        body = await request.json()
        new_value = body.get("value")

        try:
            self.df.at[row_index, column] = new_value
            return {"success": True, "message": "更新成功"}
        except Exception as e:
            raise HTTPException(status_code=500, detail=f"更新失败: {str(e)}")

    @self.app.post("/api/table/save")
    async def save_table():
        """Write the in-memory table back to ``self.file_path``."""
        if not self.file_path:
            raise HTTPException(
                status_code=400,
                detail="没有原始文件,无法保存。请使用文件上传功能。",
            )

        if self.df.empty:
            raise HTTPException(status_code=400, detail="没有数据可保存")

        try:
            if self.file_path.suffix.lower() == ".csv":
                self.df.to_csv(self.file_path, index=False, encoding="utf-8")
            else:
                # to_excel needs a sheet *name*. ``self.sheet_name`` may be
                # an integer position (the default is 0), which is only
                # meaningful for reading, so fall back to pandas' default.
                target_sheet = (
                    self.sheet_name if isinstance(self.sheet_name, str) else "Sheet1"
                )
                self.df.to_excel(
                    self.file_path, index=False, sheet_name=target_sheet
                )
            return {"success": True, "message": "保存成功"}
        except Exception as e:
            raise HTTPException(status_code=500, detail=f"保存失败: {str(e)}")

    @self.app.post("/api/table/reset")
    async def reset_table():
        """Restore the in-memory table from the pristine snapshot."""
        if not self.file_path:
            raise HTTPException(
                status_code=400, detail="没有原始文件,无法重置。请重新上传文件。"
            )

        self.df = self.original_df.copy()
        return {"success": True, "message": "重置成功"}

    @self.app.get("/api/image/proxy")
    async def image_proxy(url: str):
        """Serve an image given an http(s) URL or a local file path.

        Remote images are downloaded once into the temp directory and
        served from there afterwards; the cache key is the MD5 of the URL.

        NOTE(security): any local path with an image suffix that the
        server process can read is served, and any http(s) URL is fetched.
        Keep the server bound to a trusted interface.
        """
        if not url:
            raise HTTPException(status_code=400, detail="URL参数缺失")

        # Cache lookup.
        url_hash = hashlib.md5(url.encode()).hexdigest()
        cache_path = Path(self._temp_dir) / f"{url_hash}"

        if cache_path.exists():
            return FileResponse(cache_path)

        try:
            if url.startswith(("http://", "https://")):
                # Remote image: download asynchronously, then serve.
                success = await self._download_image_async(url, cache_path)
                if success:
                    return FileResponse(cache_path)
                else:
                    raise HTTPException(status_code=500, detail="图片下载失败")
            else:
                # Local file.
                try:
                    local_path = Path(url).resolve()

                    if local_path.exists() and local_path.is_file():
                        if local_path.suffix.lower() in image_suffixes:
                            return FileResponse(local_path)
                        else:
                            raise HTTPException(
                                status_code=400,
                                detail=f"不支持的图像格式: {local_path.suffix}",
                            )
                    else:
                        # Tell "missing" apart from "not a regular file".
                        if not local_path.exists():
                            raise HTTPException(
                                status_code=404, detail=f"文件不存在: {local_path}"
                            )
                        else:
                            raise HTTPException(
                                status_code=400, detail=f"不是文件: {local_path}"
                            )
                except Exception as e:
                    if isinstance(e, HTTPException):
                        raise
                    raise HTTPException(
                        status_code=500, detail=f"处理本地文件时出错: {str(e)}"
                    )

        except HTTPException:
            raise
        except Exception as e:
            raise HTTPException(status_code=500, detail=f"加载图片失败: {e}")

    @self.app.post("/api/table/upload")
    async def upload_file(file: UploadFile = File(...)):
        """Store an uploaded CSV/Excel file and make it the current table."""
        try:
            if not file.filename:
                raise HTTPException(status_code=400, detail="未提供文件名")

            # The client controls the filename. Keep only its last path
            # component (for both "/" and "\\" separators) so the upload
            # cannot be written outside the uploads directory.
            safe_name = Path(file.filename.replace("\\", "/")).name
            if not safe_name:
                raise HTTPException(status_code=400, detail="未提供文件名")

            file_extension = Path(safe_name).suffix.lower()
            if file_extension not in [".csv", ".xlsx", ".xls"]:
                raise HTTPException(
                    status_code=400, detail=f"不支持的文件格式: {file_extension}"
                )

            # Persist the upload.
            upload_path = self._uploads_dir / safe_name
            content = await file.read()
            with open(upload_path, "wb") as buffer:
                buffer.write(content)

            # Make it the current table.
            self._reload_data(upload_path)

            return {
                "success": True,
                "message": "文件上传成功",
                "filename": safe_name,
                "total_rows": len(self.df),
                "total_columns": len(self.df.columns),
                "columns": list(self.df.columns),
                "image_columns": self.image_columns,
            }

        except HTTPException:
            # Validation errors keep their own status code instead of
            # being rewrapped as HTTP 500 below.
            raise
        except Exception as e:
            raise HTTPException(status_code=500, detail=f"文件上传失败: {str(e)}")
571
+
572
+ def _apply_filters(
573
+ self, df: pd.DataFrame, filter_configs: List[Dict]
574
+ ) -> pd.DataFrame:
575
+ """应用筛选条件"""
576
+ for filter_config in filter_configs:
577
+ column = filter_config.get("column")
578
+ operator = filter_config.get("operator", "contains")
579
+ value = filter_config.get("value", "")
580
+
581
+ if not column or column not in df.columns:
582
+ continue
583
+
584
+ if operator == "contains":
585
+ mask = (
586
+ df[column]
587
+ .astype(str)
588
+ .str.contains(str(value), case=False, na=False)
589
+ )
590
+ elif operator == "eq":
591
+ mask = df[column] == value
592
+ elif operator == "ne":
593
+ mask = df[column] != value
594
+ elif operator == "startswith":
595
+ mask = df[column].astype(str).str.startswith(str(value), na=False)
596
+ elif operator == "endswith":
597
+ mask = df[column].astype(str).str.endswith(str(value), na=False)
598
+ elif operator == "gt":
599
+ mask = pd.to_numeric(df[column], errors="coerce") > float(value)
600
+ elif operator == "lt":
601
+ mask = pd.to_numeric(df[column], errors="coerce") < float(value)
602
+ elif operator == "ge":
603
+ mask = pd.to_numeric(df[column], errors="coerce") >= float(value)
604
+ elif operator == "le":
605
+ mask = pd.to_numeric(df[column], errors="coerce") <= float(value)
606
+ else:
607
+ continue
608
+
609
+ df = df[mask]
610
+
611
+ return df
612
+
613
def run(self, auto_open: bool = True):
    """Start the uvicorn server and block until it stops.

    A start-up summary is printed first. When ``auto_open`` is true a
    daemon thread opens the default browser about 1.5 seconds later. On
    exit the aiohttp session is closed and the temp directory removed.

    Args:
        auto_open: Open the viewer in the default web browser.
    """
    print("启动表格查看器服务...")
    print(f"文件: {self.file_path}")
    print(f"地址: http://{self.host}:{self.port}")
    print(f"数据: {len(self.df)} 行 x {len(self.df.columns)} 列")
    if self.image_columns:
        # Column labels are not necessarily strings, hence map(str, ...).
        print(f"图片列: {', '.join(map(str, self.image_columns))}")
        # Show one example per image column (debugging aid).
        for col in self.image_columns:
            non_null = self.df[col].dropna()
            sample_value = non_null.iloc[0] if not non_null.empty else None
            if sample_value:
                sample_paths = split_image_paths(str(sample_value))
                print(
                    f" {col}: 示例路径 -> {sample_paths[:2]}{'...' if len(sample_paths) > 2 else ''}"
                )
    else:
        print("未检测到图片列")
    print("提示: 双击单元格可编辑,Ctrl+C 停止服务")

    if auto_open:
        import threading

        # A wildcard bind address is not something a browser can visit;
        # use loopback instead. IPv6 literals need brackets inside a URL.
        browse_host = self.host
        if browse_host in ("0.0.0.0", "::", ""):
            browse_host = "127.0.0.1"
        elif ":" in browse_host and not browse_host.startswith("["):
            browse_host = f"[{browse_host}]"
        browse_url = f"http://{browse_host}:{self.port}"

        def open_browser():
            # Give uvicorn a moment to start listening.
            time.sleep(1.5)
            webbrowser.open(browse_url)

        threading.Thread(target=open_browser, daemon=True).start()

    try:
        uvicorn.run(
            self.app,
            host=self.host,
            port=self.port,
            log_level="warning",  # keep the log output small
        )
    except KeyboardInterrupt:
        print("\n服务器已停止")
    finally:
        import shutil

        try:
            # Close the HTTP session if one was ever created.
            if self._http_session and not self._http_session.closed:
                asyncio.run(self._http_session.close())
        finally:
            # Always remove the temp directory, even when closing the
            # session raised.
            if Path(self._temp_dir).exists():
                shutil.rmtree(self._temp_dir, ignore_errors=True)
668
+
669
+
670
def start_table_viewer(
    file_path: str,
    port: int = 8080,
    host: str = "0.0.0.0",
    sheet_name: Union[str, int] = 0,
    image_columns: Optional[List[str]] = None,
    auto_detect_images: bool = True,
    auto_open: bool = True,
):
    """Create a ``TableViewerServer`` for ``file_path`` and run it (blocking).

    All arguments except ``auto_open`` are handed to the server
    constructor unchanged; ``auto_open`` is handed to ``run``.

    NOTE(review): ``host`` defaults to "0.0.0.0" here, so the server
    listens on every interface unless the caller passes another address.
    """
    server_options = {
        "file_path": file_path,
        "port": port,
        "host": host,
        "sheet_name": sheet_name,
        "image_columns": image_columns,
        "auto_detect_images": auto_detect_images,
    }
    TableViewerServer(**server_options).run(auto_open=auto_open)
689
+
690
+
691
if __name__ == "__main__":
    import sys

    # Script entry point: exactly one argument is consumed, the table file.
    cli_args = sys.argv[1:]
    if not cli_args:
        print("用法: python server.py <文件路径>")
        sys.exit(1)

    start_table_viewer(cli_args[0])