memect-ppx 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83) hide show
  1. memect/__init__.py +4 -0
  2. memect/app.py +303 -0
  3. memect/base/__init__.py +0 -0
  4. memect/base/api.py +75 -0
  5. memect/base/bbox.py +795 -0
  6. memect/base/config.py +394 -0
  7. memect/base/debug.py +290 -0
  8. memect/base/images.py +301 -0
  9. memect/base/job.py +398 -0
  10. memect/base/lists.py +288 -0
  11. memect/base/matrix.py +309 -0
  12. memect/base/pdfs.py +103 -0
  13. memect/base/sdk.py +288 -0
  14. memect/base/task.py +732 -0
  15. memect/base/test.py +51 -0
  16. memect/base/utils.py +330 -0
  17. memect/base/zip.py +171 -0
  18. memect/cli.py +244 -0
  19. memect/conf/__init__.py +0 -0
  20. memect/conf/log.default.py +154 -0
  21. memect/conf/settings.custom.py +4 -0
  22. memect/conf/settings.default.py +600 -0
  23. memect/nvidia_path.py +56 -0
  24. memect/pdf/__init__.py +0 -0
  25. memect/pdf/backup/formula_model.py +95 -0
  26. memect/pdf/backup/layout.py +65 -0
  27. memect/pdf/backup/otsl_ai.py +281 -0
  28. memect/pdf/backup/watermark.py +481 -0
  29. memect/pdf/base.py +2486 -0
  30. memect/pdf/chars.py +1416 -0
  31. memect/pdf/commons.py +86 -0
  32. memect/pdf/default/footer.py +10 -0
  33. memect/pdf/default/footnote.py +11 -0
  34. memect/pdf/default/header.py +11 -0
  35. memect/pdf/default/parser.py +601 -0
  36. memect/pdf/default/pdf.py +269 -0
  37. memect/pdf/default/pdf_pdfoxide.py +130 -0
  38. memect/pdf/default/pdf_pymupdf.py +1193 -0
  39. memect/pdf/default/table/_wbk.py +52 -0
  40. memect/pdf/default/table/_wbk_ai.py +906 -0
  41. memect/pdf/default/table/_wbk_default.py +2640 -0
  42. memect/pdf/default/table/_wbk_default2.py +2429 -0
  43. memect/pdf/default/table/_wbk_locator.py +2526 -0
  44. memect/pdf/default/table/_wtable.py +2364 -0
  45. memect/pdf/default/table/auto.py +56 -0
  46. memect/pdf/default/table/line.py +637 -0
  47. memect/pdf/default/table/llm.py +142 -0
  48. memect/pdf/default/table/parser.py +88 -0
  49. memect/pdf/default/table/wbk.py +36 -0
  50. memect/pdf/default/table/ybk.py +111 -0
  51. memect/pdf/default/tree/parser.py +10 -0
  52. memect/pdf/docx.py +38 -0
  53. memect/pdf/font.py +508 -0
  54. memect/pdf/fonts/__init__.py +9 -0
  55. memect/pdf/fonts/tool.py +103 -0
  56. memect/pdf/grid.py +590 -0
  57. memect/pdf/html/__init__.py +4 -0
  58. memect/pdf/html/renderer.py +2020 -0
  59. memect/pdf/llm/deepseek.py +241 -0
  60. memect/pdf/llm/glm.py +421 -0
  61. memect/pdf/llm/llm.py +223 -0
  62. memect/pdf/llm/mineru.py +0 -0
  63. memect/pdf/llm/paddle.py +408 -0
  64. memect/pdf/model.py +885 -0
  65. memect/pdf/otsl.py +223 -0
  66. memect/pdf/parser.py +332 -0
  67. memect/pdf/patch_table_cls.py +48 -0
  68. memect/pdf/pdf2image.py +561 -0
  69. memect/pdf/plane.py +185 -0
  70. memect/pdf/pptx.py +663 -0
  71. memect/pdf/service.py +456 -0
  72. memect/pdf/sort.py +92 -0
  73. memect/pdf/vline.py +1072 -0
  74. memect/pdf/watermark.py +481 -0
  75. memect/pdf/wingdings.py +907 -0
  76. memect/web/__init__.py +0 -0
  77. memect_ppx-0.0.1.dist-info/METADATA +301 -0
  78. memect_ppx-0.0.1.dist-info/RECORD +83 -0
  79. memect_ppx-0.0.1.dist-info/WHEEL +5 -0
  80. memect_ppx-0.0.1.dist-info/entry_points.txt +2 -0
  81. memect_ppx-0.0.1.dist-info/licenses/LICENSE +165 -0
  82. memect_ppx-0.0.1.dist-info/top_level.txt +2 -0
  83. ppx.py +6 -0
memect/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+
2
+
3
+
4
+
memect/app.py ADDED
@@ -0,0 +1,303 @@
1
+ # coding=utf-8
2
+
3
+ import asyncio
4
+ import datetime
5
+ import logging
6
+ import time
7
+ from pathlib import Path
8
+ from types import TracebackType
9
+ from typing import Any, Final, Mapping, Protocol, Self, Sequence
10
+
11
+ # ============================
12
+ from fastapi import FastAPI, Request
13
+ from fastapi.exceptions import RequestValidationError
14
+ from fastapi.middleware.cors import CORSMiddleware
15
+ from fastapi.middleware.gzip import GZipMiddleware
16
+ from fastapi.responses import FileResponse, JSONResponse
17
+ from fastapi.staticfiles import StaticFiles
18
+ from pydantic import BaseModel
19
+
20
+ from memect.pdf.service import PdfService
21
+
22
+ from memect.base.api import ApiError, ApiInfo
23
+ from memect.base.config import get_settings
24
+
25
+ # ==========================
26
+ from memect.base.utils import Timer
27
+
28
+
29
class ServerSettings(BaseModel):
    """Pydantic model describing the HTTP server section of the settings.

    ``uvicorn``, ``granian`` and ``hypercorn`` carry provider-specific
    options; their shape is not constrained here (``Any``).
    NOTE(review): presumably this mirrors the ``server`` mapping consumed by
    ``App.serve`` -- confirm where the model is actually applied.
    """

    # Name of the ASGI server implementation to start.
    provider: str = "uvicorn"
    host: str = ""
    port: int = 9527
    # Keyword arguments for the CORS middleware; None means no CORS setup.
    cors: Mapping[str, Any] | None = None
    uvicorn: Any = None
    # Added for parity with the other providers: ``App.serve`` reads a
    # ``granian`` options entry as well.
    granian: Any = None
    hypercorn: Any = None
36
+
37
+
38
class ApiService(Protocol):
    """Structural interface of a service that can be plugged into the app."""

    def setup(self, app: FastAPI):
        """Attach the service to the given FastAPI application."""

    def get_info(self) -> ApiInfo:
        """Return the ``ApiInfo`` description of this service."""
        ...
43
+
44
+
45
class App:
    """ASGI application that lazily builds a FastAPI app around the PDF service.

    An instance is itself an ASGI callable (``__call__``), so ``serve`` can
    hand it directly to uvicorn, granian or hypercorn. It is also an async
    context manager: entering runs ``_init``, leaving runs ``_exit``.
    """

    _logger: Final = logging.getLogger(f"{__module__}.{__qualname__}")

    def __init__(self, settings: Mapping[str, Any] | None = None):
        """Remember the settings; services are created later by ``_init``.

        Args:
            settings: Settings mapping. When omitted or empty, the result of
                ``get_settings()`` is used instead.
        """
        super().__init__()

        self._settings: Final = settings or get_settings()
        self._timer = Timer.start()
        self._fastapi: FastAPI | None = None
        self._pdf_service: PdfService | None = None

        self._inited: bool = False

    async def _init(self):
        """Create the PDF service and the FastAPI app once.

        The ``_inited`` flag is only set after everything was built, so a
        failed attempt is retried on the next call instead of leaving the
        instance permanently without a FastAPI app.
        """
        if self._inited:
            return
        self._logger.info("start init app...")
        # Reuse a service created by an earlier, partially failed attempt so
        # it is neither duplicated nor lost for ``_exit``.
        if self._pdf_service is None:
            self._pdf_service = PdfService(self._settings["pdf_service"])
        self._fastapi = self._create_fastapi([self._pdf_service])
        self._inited = True
        self._logger.info("end init app,elapsed=%.3f", self._timer.elapsed())

    async def _exit(self):
        """Close the PDF service if one was created."""
        self._logger.info("start exit app")
        if self._pdf_service:
            await self._pdf_service.close()
        self._logger.info("end exit app uptime=%s", self._timer.uptime())

    def _create_fastapi(self, services: Sequence[ApiService] | None = None):
        """Build the FastAPI application.

        Installs CORS (when configured as a dict) and gzip middleware, the
        exception handlers, the given services, the admin/k8s/utility routes
        and finally the static files of the ``memect.web`` package.

        Args:
            services: Services to set up on the app; their ``get_info()``
                results are served by ``/apis``.

        Returns:
            The configured ``FastAPI`` instance.
        """
        self._logger.info("start create fastapi")
        cors_cfg = self._settings["server"].get("cors")

        async def on_startup():
            # If the services must share the server's event loop, this is the
            # place to initialise them.
            self._logger.info("fastapi startup")

        async def on_shutdown():
            self._logger.info("fastapi shutdown")

        app = FastAPI(on_startup=[on_startup], on_shutdown=[on_shutdown])
        if isinstance(cors_cfg, dict):
            self._logger.info("setup cors")
            app.add_middleware(CORSMiddleware, **cors_cfg)

        app.add_middleware(GZipMiddleware, minimum_size=1000)

        # Errors are reported with HTTP 200 plus this marker header; the
        # actual error is in the JSON body.
        error_headers = {"x-api-status": "error"}

        @app.exception_handler(RequestValidationError)
        async def validation_exception_handler(
            request: Request, exc: RequestValidationError
        ):
            # Parameter errors are not logged here: under malicious traffic
            # they could flood the log. Add (debug-level) logging here if it
            # is ever needed.
            # str(exc) is too detailed (it even exposes file and position),
            # so only the last location element and the message are returned.
            fields = {err["loc"][-1]: err["msg"] for err in exc.errors()}
            error = ApiError(ApiError.PARAMETER, "参数错误", details=fields)
            return JSONResponse(
                status_code=200,
                headers=error_headers,
                content={"error": error.jsonify()},
            )

        @app.exception_handler(ApiError)
        async def api_exception_handler(request: Request, exc: ApiError):
            # FastAPI does not log this exception.
            return JSONResponse(
                status_code=200, headers=error_headers, content={"error": exc.jsonify()}
            )

        @app.exception_handler(Exception)
        async def system_exception_handler(request: Request, exc: Exception):
            # No details are returned to the client; FastAPI logs this one.
            error = ApiError(ApiError.SYSTEM, "系统异常")
            return JSONResponse(
                status_code=200,
                headers=error_headers,
                content={"error": error.jsonify()},
            )

        # ====api====
        api_infos: list[ApiInfo] = []
        for service in services or ():
            service.setup(app)
            api_infos.append(service.get_info())

        @app.get("/admin/gc")
        def gc():
            # TODO: require a token for this endpoint later.
            # The local import deliberately shadows this handler's own name.
            import gc

            started = time.monotonic()
            gc.collect()
            return {"elapsed": time.monotonic() - started}

        @app.get("/admin/state.html")
        async def state():
            # NOTE(review): this path is relative to the working directory,
            # whereas the static mount below uses the memect.web package
            # directory -- confirm this is intended.
            return FileResponse("./web/state.html")

        @app.get("/apis")
        async def apis():
            return api_infos

        # ===k8s=====
        @app.get("/health")
        async def health():
            # k8s liveness probe: the container is restarted unless this
            # returns 200.
            return {
                "status": "ok",
                "datetime": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f"),
            }

        @app.get("/ready")
        async def ready():
            # k8s readiness probe (e.g. "are the databases connected?");
            # currently always ready.
            return {}

        @app.get("/echo")
        def echo():
            return {
                "message": "echo",
                "datetime": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f"),
            }

        @app.get("/version.json")
        async def version():
            return {}

        @app.get("/changelog.md")
        async def changelog():
            return FileResponse("./changelog.md")

        import memect.web

        # Mounted last so the explicit routes above take precedence over the
        # catch-all static mount at "/".
        app.mount(
            "/",
            StaticFiles(
                directory=Path(memect.web.__file__).parent.absolute(), html=True
            ),
            name="web",
        )
        self._logger.info("end create fastapi")
        return app

    async def __call__(self, scope: Any, receive: Any, send: Any) -> Any:
        """ASGI entry point: initialise on first use, then delegate to FastAPI."""
        # Initialisation has to happen here because it needs an async context.
        await self._init()
        # Invariant after a successful _init(); also narrows the type.
        assert self._fastapi is not None
        # Requests could be handled directly at the ASGI level; for now they
        # are all delegated to FastAPI.
        return await self._fastapi(scope, receive, send)

    async def serve(self):
        """Run this app under the configured server until the server stops.

        The ``server`` settings select the provider (``uvicorn``, ``granian``
        or ``hypercorn``) and may carry provider options under the key of the
        same name.

        Raises:
            ValueError: If the configured provider is not supported.
        """
        server_cfg = self._settings["server"]
        provider = server_cfg["provider"]
        self._logger.info("start httpserver,provider=%s", provider)
        host = server_cfg.get("host", "0.0.0.0")
        port = server_cfg.get("port", 9527)
        try:
            if provider == "uvicorn":
                # uvicorn only supports HTTP/1.1, not HTTP/2.
                # (original note) uvicorn's own logging setup is disabled in
                # favour of the global one -- presumably through the
                # configured options; verify.
                # Drawback of uvicorn's access log: it is written when the
                # headers are sent, not when the request has finished, so it
                # lacks response length and elapsed time. If needed, see
                # https://github.com/Kludex/asgi-logger
                # uvloop + httptools reaches 3500-3700 requests/s.
                # PyPy does not support uvloop, and is slower than CPython
                # for I/O anyway.
                # Passing an object (instead of an import string) rules out
                # reload/workers; nginx or gunicorn can provide that.
                # ab -n 1000 -c 10
                #   pypy        2200/s (asyncio)
                #   python3.11  2200/s (asyncio)  3000/s (uvloop)
                import uvicorn

                config = uvicorn.Config(
                    self, **(server_cfg.get("uvicorn") or {}), host=host, port=port
                )
                server = uvicorn.Server(config)
                await server.serve()
            elif provider == "granian":
                from granian.constants import Interfaces
                from granian.server.embed import Server

                server = Server(
                    self,
                    address=host,
                    port=port,
                    interface=Interfaces.ASGI,
                    # ``or {}`` (as in the uvicorn branch) also covers a key
                    # that is present but set to None.
                    **(server_cfg.get("granian") or {}),
                )
                await server.serve()
            elif provider == "hypercorn":
                # Supports HTTP/1.1, HTTP/2 and HTTP/3.
                # pip install hypercorn
                # ab -n 1000 -c 10
                #   pypy:       1200/s
                #   python3.11: 1200/s, 1400/s (uvloop)
                # So with the default asyncio loop PyPy and CPython are on
                # par; uvloop is 10%-20% faster.
                from hypercorn.asyncio import serve as hypercorn_serve  # type:ignore
                from hypercorn.config import Config

                config = Config.from_mapping(
                    server_cfg.get("hypercorn"), bind=[f"{host}:{port}"]
                )
                await hypercorn_serve(self, config)
            else:
                raise ValueError(f"不支持的provider={provider}")

        finally:
            self._logger.info("exit httpserver uptime=%s", self._timer.uptime())

    # TODO: if async is needed
    async def __aenter__(self) -> Self:
        """Initialise the app and return it."""
        await self._init()
        return self

    async def __aexit__(
        self, et: type | None, ev: BaseException | None, tb: TracebackType | None
    ) -> bool | None:
        """Shut the app down; exceptions from the body are not suppressed."""
        await self._exit()
        return None

    @classmethod
    def run_async(cls, c: Any):
        """Run the coroutine ``c`` to completion, on uvloop when installed."""
        try:
            import uvloop
        except ModuleNotFoundError:
            uvloop = None

        if uvloop is not None:
            cls._logger.info("use uv loop")
            # Running through a Runner avoids changing the global event-loop
            # policy.
            with asyncio.Runner(loop_factory=uvloop.new_event_loop) as runner:
                runner.run(c)
        else:
            # uvloop is not installed: use the default event loop.
            asyncio.run(c)

    @classmethod
    def run(cls):
        """Blocking entry point: create an app and serve it."""
        cls.run_async(cls.arun())

    @classmethod
    async def arun(cls):
        """Create an app with default settings and serve it until it stops."""
        async with App() as app:
            await app.serve()
297
+
298
+
299
async def create_app():
    """Run the configuration ``setup()`` and return a new ``App``."""
    from memect.base import config

    config.setup()
    return App()
File without changes
memect/base/api.py ADDED
@@ -0,0 +1,75 @@
1
+
2
+ from typing import Any, Final, Mapping, NotRequired, Sequence, TypedDict
3
+
4
+ from pydantic import BaseModel
5
+
6
+
7
class ErrorModel(BaseModel):
    """Pydantic model of an error payload: code, message and extra fields."""

    code: int | None = 0
    message: str | None = None
    extras: Mapping[str, Any] | None = None
11
+
12
class ApiError(Exception):
    """Exception carrying an error code, a message and arbitrary extra fields.

    ``jsonify()`` turns the error into the dict that is sent to clients.
    Instances survive ``pickle`` and ``copy`` round-trips (see ``__reduce__``).
    """

    # In most cases callers need not care about the error code; it matters
    # for automated callers (e.g. agents) that react to specific codes.
    ANY = 10000  # any error that needs no further distinction
    SYSTEM = 10001  # system error
    PARAMETER = 10002  # parameter error

    def __init__(self, code: int, message: str, **extras: Any):
        """Create the error.

        Args:
            code: Error code, e.g. one of the class constants.
            message: Human-readable message.
            **extras: Additional fields merged into the ``jsonify()`` result.
        """
        super().__init__(code, message, extras)
        self.code: Final = code
        self.message: Final = message
        self.extras: Final = dict(extras)

    def __reduce__(self):
        """Rebuild through ``from_dict`` when pickled or copied.

        The default ``Exception`` reduction would call
        ``cls(code, message, extras)``, which fails because ``__init__``
        accepts the extras only as keyword arguments.
        """
        payload = {"code": self.code, "message": self.message, **self.extras}
        return (type(self).from_dict, (payload,), dict(self.__dict__))

    def jsonify(self) -> dict[str, Any]:
        """Return the error as a JSON-serialisable dict.

        The dict holds ``code`` and ``message`` plus every entry of
        ``extras``.
        """
        data: dict[str, Any] = {
            "code": self.code,
            "message": self.message,
        }

        # Compatibility with the old API, which reported code='running'; by
        # the current convention this would be a numeric code plus
        # status='running'.
        if self.extras.get("status") in ("running", "waiting"):
            data["code"] = "running"
        data.update(self.extras)
        return data

    @classmethod
    def from_dict(cls, error: Mapping[str, Any]):
        """Build an error from a mapping with ``code``, ``message`` and extras."""
        return cls(**error)
43
+
44
+
45
+ class FileType(TypedDict):
46
+ name: str
47
+ """类型的名字"""
48
+ exts: Sequence[str]
49
+ """文件的扩展名"""
50
+ max_length: int
51
+ """文件的字节数"""
52
+ max_size: NotRequired[tuple[int, int] | None]
53
+ """图片的width/height"""
54
+ max_page_count: NotRequired[int | None]
55
+ """pdf文件允许的页数"""
56
+ max_file_count: NotRequired[int | None]
57
+ """zip文件允许的文件数"""
58
+
59
+ class ApiInfo(TypedDict):
60
+ name: str
61
+ url: str
62
+
63
+ allow_async: bool
64
+ allow_timeout: bool
65
+ allow_form: bool
66
+ allow_task_id: bool
67
+
68
+ file: Mapping[str, Any]
69
+ schema: Mapping[str, Any]
70
+ defaults: Mapping[str, Any] | None
71
+
72
def usage():
    """Show how ``ApiError`` is constructed; the instances are discarded."""
    ApiError(ApiError.SYSTEM, "")
    ApiError(ApiError.ANY, "xxx", status="running")
75
+