memect-ppx 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- memect/__init__.py +4 -0
- memect/app.py +303 -0
- memect/base/__init__.py +0 -0
- memect/base/api.py +75 -0
- memect/base/bbox.py +795 -0
- memect/base/config.py +394 -0
- memect/base/debug.py +290 -0
- memect/base/images.py +301 -0
- memect/base/job.py +398 -0
- memect/base/lists.py +288 -0
- memect/base/matrix.py +309 -0
- memect/base/pdfs.py +103 -0
- memect/base/sdk.py +288 -0
- memect/base/task.py +732 -0
- memect/base/test.py +51 -0
- memect/base/utils.py +330 -0
- memect/base/zip.py +171 -0
- memect/cli.py +244 -0
- memect/conf/__init__.py +0 -0
- memect/conf/log.default.py +154 -0
- memect/conf/settings.custom.py +4 -0
- memect/conf/settings.default.py +600 -0
- memect/nvidia_path.py +56 -0
- memect/pdf/__init__.py +0 -0
- memect/pdf/backup/formula_model.py +95 -0
- memect/pdf/backup/layout.py +65 -0
- memect/pdf/backup/otsl_ai.py +281 -0
- memect/pdf/backup/watermark.py +481 -0
- memect/pdf/base.py +2486 -0
- memect/pdf/chars.py +1416 -0
- memect/pdf/commons.py +86 -0
- memect/pdf/default/footer.py +10 -0
- memect/pdf/default/footnote.py +11 -0
- memect/pdf/default/header.py +11 -0
- memect/pdf/default/parser.py +601 -0
- memect/pdf/default/pdf.py +269 -0
- memect/pdf/default/pdf_pdfoxide.py +130 -0
- memect/pdf/default/pdf_pymupdf.py +1193 -0
- memect/pdf/default/table/_wbk.py +52 -0
- memect/pdf/default/table/_wbk_ai.py +906 -0
- memect/pdf/default/table/_wbk_default.py +2640 -0
- memect/pdf/default/table/_wbk_default2.py +2429 -0
- memect/pdf/default/table/_wbk_locator.py +2526 -0
- memect/pdf/default/table/_wtable.py +2364 -0
- memect/pdf/default/table/auto.py +56 -0
- memect/pdf/default/table/line.py +637 -0
- memect/pdf/default/table/llm.py +142 -0
- memect/pdf/default/table/parser.py +88 -0
- memect/pdf/default/table/wbk.py +36 -0
- memect/pdf/default/table/ybk.py +111 -0
- memect/pdf/default/tree/parser.py +10 -0
- memect/pdf/docx.py +38 -0
- memect/pdf/font.py +508 -0
- memect/pdf/fonts/__init__.py +9 -0
- memect/pdf/fonts/tool.py +103 -0
- memect/pdf/grid.py +590 -0
- memect/pdf/html/__init__.py +4 -0
- memect/pdf/html/renderer.py +2020 -0
- memect/pdf/llm/deepseek.py +241 -0
- memect/pdf/llm/glm.py +421 -0
- memect/pdf/llm/llm.py +223 -0
- memect/pdf/llm/mineru.py +0 -0
- memect/pdf/llm/paddle.py +408 -0
- memect/pdf/model.py +885 -0
- memect/pdf/otsl.py +223 -0
- memect/pdf/parser.py +332 -0
- memect/pdf/patch_table_cls.py +48 -0
- memect/pdf/pdf2image.py +561 -0
- memect/pdf/plane.py +185 -0
- memect/pdf/pptx.py +663 -0
- memect/pdf/service.py +456 -0
- memect/pdf/sort.py +92 -0
- memect/pdf/vline.py +1072 -0
- memect/pdf/watermark.py +481 -0
- memect/pdf/wingdings.py +907 -0
- memect/web/__init__.py +0 -0
- memect_ppx-0.0.1.dist-info/METADATA +301 -0
- memect_ppx-0.0.1.dist-info/RECORD +83 -0
- memect_ppx-0.0.1.dist-info/WHEEL +5 -0
- memect_ppx-0.0.1.dist-info/entry_points.txt +2 -0
- memect_ppx-0.0.1.dist-info/licenses/LICENSE +165 -0
- memect_ppx-0.0.1.dist-info/top_level.txt +2 -0
- ppx.py +6 -0
memect/__init__.py
ADDED
memect/app.py
ADDED
|
@@ -0,0 +1,303 @@
|
|
|
1
|
+
# coding=utf-8
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import datetime
|
|
5
|
+
import logging
|
|
6
|
+
import time
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from types import TracebackType
|
|
9
|
+
from typing import Any, Final, Mapping, Protocol, Self, Sequence
|
|
10
|
+
|
|
11
|
+
# ============================
|
|
12
|
+
from fastapi import FastAPI, Request
|
|
13
|
+
from fastapi.exceptions import RequestValidationError
|
|
14
|
+
from fastapi.middleware.cors import CORSMiddleware
|
|
15
|
+
from fastapi.middleware.gzip import GZipMiddleware
|
|
16
|
+
from fastapi.responses import FileResponse, JSONResponse
|
|
17
|
+
from fastapi.staticfiles import StaticFiles
|
|
18
|
+
from pydantic import BaseModel
|
|
19
|
+
|
|
20
|
+
from memect.pdf.service import PdfService
|
|
21
|
+
|
|
22
|
+
from memect.base.api import ApiError, ApiInfo
|
|
23
|
+
from memect.base.config import get_settings
|
|
24
|
+
|
|
25
|
+
# ==========================
|
|
26
|
+
from memect.base.utils import Timer
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class ServerSettings(BaseModel):
    # Shape of the "server" section of the application settings.
    # NOTE(review): nothing visible in this file instantiates this model; it
    # presumably documents/validates the mapping read by ``App.serve`` - confirm.

    # Name of the HTTP server implementation to run.
    provider: str = "uvicorn"
    # Address to bind to.
    host: str = ""
    # TCP port to listen on.
    port: int = 9527
    # Keyword arguments for the CORS middleware; ``None`` leaves CORS unset.
    cors: Mapping[str, Any] | None = None
    # Provider-specific option blocks.
    uvicorn: Any = None
    hypercorn: Any = None
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class ApiService(Protocol):
    """Structural interface of a service that can be plugged into the app."""

    def setup(self, app: FastAPI):
        """Attach the service to ``app`` (e.g. register its routes)."""
        ...

    def get_info(self) -> ApiInfo:
        """Return the ``ApiInfo`` description of the service."""
        ...
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class App:
    """ASGI callable that lazily builds and serves the FastAPI application.

    The instance itself is handed to the HTTP server as the ASGI app. The
    ``PdfService`` and the FastAPI object are created on first use (first
    request or ``async with``), i.e. inside a running event loop.
    """

    _logger: Final = logging.getLogger(f"{__module__}.{__qualname__}")

    def __init__(self, settings: Mapping[str, Any] | None = None):
        """Store the settings; heavy initialization is deferred to ``_init``.

        Args:
            settings: Application settings mapping. Falls back to
                ``get_settings()`` when omitted or empty.
        """
        super().__init__()

        self._settings: Final = settings or get_settings()
        self._timer = Timer.start()
        self._fastapi: FastAPI | None = None
        self._pdf_service: PdfService | None = None

        self._inited: bool = False

    async def _init(self):
        """Create the PDF service and the FastAPI app once; later calls are no-ops."""
        if self._inited:
            return
        self._inited = True
        self._logger.info("start init app...")
        self._pdf_service = PdfService(self._settings["pdf_service"])
        self._fastapi = self._create_fastapi([self._pdf_service])
        self._logger.info("end init app,elapsed=%.3f", self._timer.elapsed())

    async def _exit(self):
        """Close the PDF service, if one was created."""
        self._logger.info("start exit app")
        if self._pdf_service:
            await self._pdf_service.close()
        self._logger.info("end exit app uptime=%s", self._timer.uptime())

    def _create_fastapi(self, services: Sequence[ApiService] | None = None):
        """Build the FastAPI app: middleware, error handlers and routes.

        Args:
            services: API services. Each one gets ``setup(app)`` called and
                its ``get_info()`` result is exposed through ``GET /apis``.

        Returns:
            The configured ``FastAPI`` instance.
        """
        self._logger.info("start create fastapi")
        cors_cfg = self._settings["server"].get("cors")

        async def on_startup():
            # If the services must share the server's event loop, they have to
            # be initialized here.
            # await api.init()
            self._logger.info("fastapi startup")

        async def on_shutdown():
            # await api.close()
            self._logger.info("fastapi shutdown")

        app = FastAPI(on_startup=[on_startup], on_shutdown=[on_shutdown])
        if isinstance(cors_cfg, dict):
            self._logger.info("setup cors")
            app.add_middleware(CORSMiddleware, **cors_cfg)

        app.add_middleware(GZipMiddleware, minimum_size=1000)

        error_headers = {"x-api-status": "error"}

        def error_response(error: ApiError) -> JSONResponse:
            # Errors are reported with HTTP 200; the failure is signalled by
            # the ``x-api-status`` header and the ``error`` payload.
            return JSONResponse(
                status_code=200,
                headers=error_headers,
                content={"error": error.jsonify()},
            )

        @app.exception_handler(RequestValidationError)
        async def validation_exception_handler(
            request: Request, exc: RequestValidationError
        ):
            # Parameter errors only merit debug-level logging, otherwise the
            # log could be flooded (e.g. by a malicious client).
            # Nothing is logged by default; log here if it is ever needed.
            # str(exc) is too detailed: it even exposes the failing file and
            # location, so only "field -> message" pairs are returned.
            fields = {err["loc"][-1]: err["msg"] for err in exc.errors()}
            return error_response(
                ApiError(ApiError.PARAMETER, "参数错误", details=fields)
            )

        @app.exception_handler(ApiError)
        async def api_exception_handler(request: Request, exc: ApiError):
            # FastAPI does not log this exception.
            return error_response(exc)

        @app.exception_handler(Exception)
        async def system_exception_handler(request: Request, exc: Exception):
            # No details are returned to the client; FastAPI logs this one.
            return error_response(ApiError(ApiError.SYSTEM, "系统异常"))

        # ====api====
        api_infos: list[ApiInfo] = []
        for service in services or ():
            service.setup(app)
            api_infos.append(service.get_info())

        @app.get("/admin/gc")
        def gc():
            # TODO: require a token later.
            import gc  # local import; shadows this function's name inside the body

            started = time.monotonic()
            gc.collect()
            return {"elapsed": time.monotonic() - started}

        @app.get("/admin/state.html")
        async def state():
            # NOTE(review): resolved against the process working directory,
            # unlike the static mount below which uses the package directory.
            return FileResponse("./web/state.html")

        @app.get("/apis")
        async def apis():
            return api_infos

        # ===k8s=====
        @app.get("/health")
        async def health():
            # Kubernetes check: the container is restarted unless this
            # returns 200.
            return {
                "status": "ok",
                "datetime": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f"),
            }

        @app.get("/ready")
        async def ready():
            # Kubernetes readiness check, e.g. "are the databases connected".
            return {}

        @app.get("/echo")
        def echo():
            return {
                "message": "echo",
                "datetime": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f"),
            }

        @app.get("/version.json")
        async def version():
            return {}

        @app.get("/changelog.md")
        async def changelog():
            return FileResponse("./changelog.md")

        import memect.web

        # Mounted last so the explicit routes above take precedence.
        app.mount(
            "/",
            StaticFiles(
                directory=Path(memect.web.__file__).parent.absolute(), html=True
            ),
            name="web",
        )
        self._logger.info("end create fastapi")
        return app

    async def __call__(self, scope: Any, receive: Any, send: Any) -> Any:
        """ASGI entry point: initialize on first use, then delegate to FastAPI."""
        # Initialization must happen here because it needs an async context.
        await self._init()
        assert self._fastapi is not None
        # Requests could be handled directly per the ASGI spec; for now they
        # are delegated to FastAPI.
        return await self._fastapi(scope, receive, send)

    async def serve(self):
        """Run the HTTP server selected by ``settings["server"]["provider"]``.

        Supported providers are ``uvicorn``, ``granian`` and ``hypercorn``;
        each is imported only when selected. ``host`` defaults to
        ``0.0.0.0`` and ``port`` to ``9527``.

        Raises:
            ValueError: If the configured provider is not supported.
        """
        server_cfg = self._settings["server"]
        provider = server_cfg["provider"]
        self._logger.info("start httpserver,provider=%s", provider)
        host = server_cfg.get("host", "0.0.0.0")
        port = server_cfg.get("port", 9527)
        try:
            if provider == "uvicorn":
                # uvicorn only supports HTTP/1.1, not HTTP/2.
                # uvicorn's own logging setup is disabled in favour of the
                # global configuration.
                # Drawback of uvicorn's access log: it shows neither response
                # length nor duration, because the entry is written when the
                # headers are sent rather than when the request completes.
                # See https://github.com/Kludex/asgi-logger if that is needed.
                # uvloop + httptools reaches 3500-3700 requests/second.
                # pypy does not support uvloop, but pypy is not needed for IO
                # work anyway since it is slower than cpython there.
                # Passing an object (not an import string) does not support
                # reload/workers etc.; that is not needed and can be provided
                # by nginx, gunicorn or similar.
                # ab -n 1000 -c 10
                # pypy 2200/s(asyncio)
                # python3.11 2200/s(asyncio) 3000/s(uvloop)
                import uvicorn

                config = uvicorn.Config(
                    self, **(server_cfg.get("uvicorn") or {}), host=host, port=port
                )
                server = uvicorn.Server(config)
                await server.serve()
            elif provider == "granian":
                from granian.constants import Interfaces
                from granian.server.embed import Server

                server = Server(
                    self,
                    address=host,
                    port=port,
                    interface=Interfaces.ASGI,
                    # ``or {}`` also covers an explicit ``granian: None``
                    # entry, which ``**`` cannot unpack.
                    **(server_cfg.get("granian") or {}),
                )
                await server.serve()
            elif provider == "hypercorn":
                # Supports HTTP/1.1, HTTP/2 and HTTP/3.
                # pip install hypercorn
                # ab -n 1000 -c 10
                # pypy: 1200/s
                # python3.11: 1200/s,1400/s(uvloop)
                # So with the default asyncio loop pypy and cpython are on
                # par; uvloop is 10%-20% faster.
                from hypercorn.asyncio import serve  # type:ignore
                from hypercorn.config import Config

                # from hypercorn.statsd import StatsdLogger
                # config.statsd_host=None
                # config.set_statsd_logger_class(StatsdLogger)
                config = Config.from_mapping(
                    server_cfg.get("hypercorn"), bind=[f"{host}:{port}"]
                )
                await serve(self, config)
            else:
                raise ValueError(f"不支持的provider={server_cfg['provider']}")

        finally:
            self._logger.info("exit httpserver uptime=%s", self._timer.uptime())

    # TODO: if async usage is needed
    async def __aenter__(self) -> Self:
        """Initialize the app and return it."""
        await self._init()
        return self

    async def __aexit__(
        self, et: type | None, ev: BaseException | None, tb: TracebackType | None
    ) -> bool | None:
        """Shut the app down; exceptions from the ``async with`` body propagate."""
        await self._exit()

    @classmethod
    def run_async(cls, c: Any):
        """Run coroutine ``c`` to completion, on a uvloop event loop if available."""
        try:
            import uvloop
        except ModuleNotFoundError:
            # uvloop is not installed (and is deliberately not install()-ed
            # globally): use the default event loop.
            asyncio.run(c)
            return

        cls._logger.info("use uv loop")
        # Running through a Runner with a loop factory does not modify any
        # global event-loop state.
        with asyncio.Runner(loop_factory=uvloop.new_event_loop) as runner:
            runner.run(c)

    @classmethod
    def run(cls):
        """Blocking entry point: build the app and serve until the server exits."""
        cls.run_async(cls.arun())

    @classmethod
    async def arun(cls):
        """Create an instance of ``cls`` with default settings and serve it."""
        # ``cls()`` rather than ``App()`` so subclasses run themselves.
        async with cls() as app:
            await app.serve()
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
async def create_app():
    """Run the global configuration setup, then return a new ``App``."""
    # Imported lazily, only when the app is actually created.
    from memect.base import config as base_config

    base_config.setup()
    return App()
|
memect/base/__init__.py
ADDED
|
File without changes
|
memect/base/api.py
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
|
|
2
|
+
from typing import Any, Final, Mapping, NotRequired, Sequence, TypedDict
|
|
3
|
+
|
|
4
|
+
from pydantic import BaseModel
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class ErrorModel(BaseModel):
    # Pydantic model with the same three fields ``ApiError`` carries.
    # NOTE(review): presumably the serialized form of an error - confirm usage.

    # Error code.
    code: int | None = 0
    # Error message.
    message: str | None = None
    # Additional free-form fields.
    extras: Mapping[str, Any] | None = None
|
|
11
|
+
|
|
12
|
+
class ApiError(Exception):
    """Application error carrying a code, a message and free-form extras.

    ``args`` is ``(code, message, extras)``. Instances can be copied and
    pickled (see ``__reduce__``).
    """

    # In most cases callers do not need to care about the error code, unless
    # they are automated programs (e.g. agents) that react to specific codes.

    # Any error that does not need to be distinguished.
    ANY = 10000
    # System error.
    SYSTEM = 10001
    # Parameter error.
    PARAMETER = 10002

    def __init__(self, code: int, message: str, **extras: Any):
        """Create the error.

        Args:
            code: Error code, e.g. one of the class constants.
            message: Error message.
            **extras: Additional fields merged into the ``jsonify()`` output.
        """
        super().__init__(code, message, extras)
        self.code: Final = code
        self.message: Final = message
        self.extras: Final = dict(extras)

    def __reduce__(self):
        """Support ``copy``/``pickle`` by rebuilding through ``from_dict``.

        The default exception reduction replays ``args`` positionally, i.e.
        ``cls(code, message, extras)``, which ``__init__`` rejects because
        the extras are keyword-only.
        """
        payload = {"code": self.code, "message": self.message, **self.extras}
        return (type(self).from_dict, (payload,))

    def jsonify(self) -> dict[str, Any]:
        """Return the error as a JSON-serializable dict.

        The result holds ``code`` and ``message`` plus every extra field.
        """
        data: dict[str, Any] = {"code": self.code, "message": self.message}

        # Compatibility with the old API, which reported code='running';
        # by current conventions this would be a numeric code plus
        # status='running'.
        if self.extras.get("status") in ("running", "waiting"):
            data["code"] = "running"
        data.update(self.extras)
        return data

    @classmethod
    def from_dict(cls, error: Mapping[str, Any]):
        """Build an error from a mapping with ``code``, ``message`` and extras."""
        return cls(**error)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class FileType(TypedDict):
|
|
46
|
+
name: str
|
|
47
|
+
"""类型的名字"""
|
|
48
|
+
exts: Sequence[str]
|
|
49
|
+
"""文件的扩展名"""
|
|
50
|
+
max_length: int
|
|
51
|
+
"""文件的字节数"""
|
|
52
|
+
max_size: NotRequired[tuple[int, int] | None]
|
|
53
|
+
"""图片的width/height"""
|
|
54
|
+
max_page_count: NotRequired[int | None]
|
|
55
|
+
"""pdf文件允许的页数"""
|
|
56
|
+
max_file_count: NotRequired[int | None]
|
|
57
|
+
"""zip文件允许的文件数"""
|
|
58
|
+
|
|
59
|
+
class ApiInfo(TypedDict):
|
|
60
|
+
name: str
|
|
61
|
+
url: str
|
|
62
|
+
|
|
63
|
+
allow_async: bool
|
|
64
|
+
allow_timeout: bool
|
|
65
|
+
allow_form: bool
|
|
66
|
+
allow_task_id: bool
|
|
67
|
+
|
|
68
|
+
file: Mapping[str, Any]
|
|
69
|
+
schema: Mapping[str, Any]
|
|
70
|
+
defaults: Mapping[str, Any] | None
|
|
71
|
+
|
|
72
|
+
def usage():
    """Show how ``ApiError`` is constructed; the instances are discarded."""
    # Plain error: code and message only.
    ApiError(ApiError.SYSTEM, '')
    # Error with an extra ``status`` field passed as a keyword argument.
    ApiError(ApiError.ANY, 'xxx', status='running')
|
|
75
|
+
|