MeUtils 2025.3.3.18.41.24__py3-none-any.whl → 2025.3.5.19.55.22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {MeUtils-2025.3.3.18.41.24.dist-info → MeUtils-2025.3.5.19.55.22.dist-info}/METADATA +264 -264
- {MeUtils-2025.3.3.18.41.24.dist-info → MeUtils-2025.3.5.19.55.22.dist-info}/RECORD +61 -33
- examples/_openaisdk/open_router.py +2 -1
- examples/_openaisdk/openai_files.py +16 -5
- examples/_openaisdk/openai_images.py +1 -0
- examples/_openaisdk/openai_moon.py +22 -19
- examples/sh/__init__.py +11 -0
- meutils/apis/baidu/bdaitpzs.py +9 -17
- meutils/apis/chatglm/glm_video_api.py +2 -2
- meutils/apis/images/edits.py +7 -2
- meutils/apis/jimeng/common.py +1 -1
- meutils/apis/oneapi/common.py +4 -4
- meutils/apis/proxy/ips.py +2 -0
- meutils/caches/common.py +4 -0
- meutils/data/VERSION +1 -1
- meutils/data/oneapi/NOTICE.html +12 -0
- meutils/data/oneapi/__init__.py +1 -1
- meutils/data/oneapi/index.html +275 -0
- meutils/io/_openai_files.py +31 -0
- meutils/io/openai_files.py +138 -0
- meutils/io/parsers/__init__.py +10 -0
- meutils/io/parsers/fileparser/PDF/346/212/275/345/217/226.py +58 -0
- meutils/io/parsers/fileparser/__init__.py +11 -0
- meutils/io/parsers/fileparser/common.py +91 -0
- meutils/io/parsers/fileparser/demo.py +41 -0
- meutils/io/parsers/fileparser/filetype/__init__.py +10 -0
- meutils/io/parsers/fileparser/filetype/__main__.py +37 -0
- meutils/io/parsers/fileparser/filetype/filetype.py +98 -0
- meutils/io/parsers/fileparser/filetype/helpers.py +140 -0
- meutils/io/parsers/fileparser/filetype/match.py +155 -0
- meutils/io/parsers/fileparser/filetype/types/__init__.py +118 -0
- meutils/io/parsers/fileparser/filetype/types/application.py +22 -0
- meutils/io/parsers/fileparser/filetype/types/archive.py +687 -0
- meutils/io/parsers/fileparser/filetype/types/audio.py +212 -0
- meutils/io/parsers/fileparser/filetype/types/base.py +29 -0
- meutils/io/parsers/fileparser/filetype/types/document.py +256 -0
- meutils/io/parsers/fileparser/filetype/types/font.py +115 -0
- meutils/io/parsers/fileparser/filetype/types/image.py +383 -0
- meutils/io/parsers/fileparser/filetype/types/isobmff.py +33 -0
- meutils/io/parsers/fileparser/filetype/types/video.py +223 -0
- meutils/io/parsers/fileparser/filetype/utils.py +84 -0
- meutils/io/parsers/fileparser/filetype.py +41 -0
- meutils/io/parsers/fileparser/mineru.py +48 -0
- meutils/io/parsers/fileparser/pdf.py +30 -0
- meutils/io/parsers/fileparser//350/241/250/346/240/274/346/212/275/345/217/226.py +118 -0
- meutils/llm/check_utils.py +33 -2
- meutils/llm/clients.py +1 -0
- meutils/llm/completions/chat_gemini.py +72 -0
- meutils/llm/completions/chat_plus.py +78 -0
- meutils/llm/completions/{agents/file.py → chat_spark.py} +46 -26
- meutils/llm/completions/qwenllm.py +57 -16
- meutils/llm/completions/yuanbao.py +29 -3
- meutils/llm/openai_utils/common.py +2 -2
- meutils/schemas/oneapi/common.py +22 -19
- meutils/schemas/openai_types.py +65 -29
- meutils/schemas/yuanbao_types.py +6 -7
- meutils/types.py +2 -0
- meutils/data/oneapi/NOTICE.md +0 -1
- meutils/data/oneapi/_NOTICE.md +0 -140
- meutils/llm/completions/gemini.py +0 -69
- {MeUtils-2025.3.3.18.41.24.dist-info → MeUtils-2025.3.5.19.55.22.dist-info}/LICENSE +0 -0
- {MeUtils-2025.3.3.18.41.24.dist-info → MeUtils-2025.3.5.19.55.22.dist-info}/WHEEL +0 -0
- {MeUtils-2025.3.3.18.41.24.dist-info → MeUtils-2025.3.5.19.55.22.dist-info}/entry_points.txt +0 -0
- {MeUtils-2025.3.3.18.41.24.dist-info → MeUtils-2025.3.5.19.55.22.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,275 @@
|
|
1
|
+
<!DOCTYPE html>
|
2
|
+
<html>
|
3
|
+
|
4
|
+
<head>
|
5
|
+
<meta charset="utf-8" />
|
6
|
+
<script src="https://unpkg.com/vue@3"></script>
|
7
|
+
<style>
|
8
|
+
ul {
|
9
|
+
padding: 0;
|
10
|
+
}
|
11
|
+
|
12
|
+
li {
|
13
|
+
list-style: none;
|
14
|
+
}
|
15
|
+
|
16
|
+
body {
|
17
|
+
font-family: Arial, sans-serif;
|
18
|
+
background-color: #f4f4f4;
|
19
|
+
margin: 0;
|
20
|
+
padding: 0;
|
21
|
+
}
|
22
|
+
|
23
|
+
.container {
|
24
|
+
width: 100%;
|
25
|
+
/* margin: 50px auto; */
|
26
|
+
background: white;
|
27
|
+
box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
|
28
|
+
border-radius: 8px;
|
29
|
+
overflow: hidden;
|
30
|
+
font-size: 14px;
|
31
|
+
}
|
32
|
+
|
33
|
+
.tabs {
|
34
|
+
display: flex;
|
35
|
+
list-style: none;
|
36
|
+
margin: 0;
|
37
|
+
padding: 0;
|
38
|
+
color: #000000;
|
39
|
+
background-color: #e5e5e5;
|
40
|
+
}
|
41
|
+
|
42
|
+
.tabs li {
|
43
|
+
flex-grow: 1;
|
44
|
+
text-align: center;
|
45
|
+
cursor: pointer;
|
46
|
+
padding: 15px 0;
|
47
|
+
/* color: white; */
|
48
|
+
transition: background 0.3s ease;
|
49
|
+
}
|
50
|
+
|
51
|
+
.tabs li.active {
|
52
|
+
color: #18a058;
|
53
|
+
background-color: #daf0e4;
|
54
|
+
}
|
55
|
+
|
56
|
+
.tab-content {
|
57
|
+
padding: 20px;
|
58
|
+
display: none;
|
59
|
+
}
|
60
|
+
|
61
|
+
.tab-content.active {
|
62
|
+
display: block;
|
63
|
+
}
|
64
|
+
|
65
|
+
/* 公告样式 */
|
66
|
+
.announcement-container .announcement-item {
  /* Positioned so the ::before accent bar can be placed relative to this row. */
  position: relative;
  /* The rule previously declared `display: flex` twice and `align-items: start`
     followed by `align-items: center`; only the last declaration of each
     property takes effect, so only the effective values are kept. */
  display: flex;
  align-items: center;
  gap: 5px;
  /* margin-bottom: 20px; */
  padding-left: 10px;
  font-size: 18px;
  color: #191919;
}
|
79
|
+
|
80
|
+
.announcement-container .announcement-item::before {
|
81
|
+
width: 4px;
|
82
|
+
height: 100%;
|
83
|
+
content: '';
|
84
|
+
position: absolute;
|
85
|
+
left: 0px;
|
86
|
+
background-color: #ccc9d0;
|
87
|
+
}
|
88
|
+
|
89
|
+
.announcement-container .announcement-item .tag {
|
90
|
+
padding: 2px 5px;
|
91
|
+
border-radius: 99px;
|
92
|
+
color: #fff;
|
93
|
+
background-color: #24a76e;
|
94
|
+
font-size: 14px;
|
95
|
+
}
|
96
|
+
|
97
|
+
/* 更新记录样式 */
|
98
|
+
.change-container .change-item {
|
99
|
+
display: flex;
|
100
|
+
align-items: start;
|
101
|
+
position: relative;
|
102
|
+
/* 添加定位以便伪元素可以相对其定位 */
|
103
|
+
/* margin-bottom: 20px; */
|
104
|
+
}
|
105
|
+
|
106
|
+
.change-container .change-item::before {
|
107
|
+
content: '';
|
108
|
+
position: absolute;
|
109
|
+
left: 90px;
|
110
|
+
/* 与 .point 的 margin-left 对齐 */
|
111
|
+
top: 20px;
|
112
|
+
/* 调整以适应点的位置 */
|
113
|
+
height: calc(100% - 20px);
|
114
|
+
/* 连线的高度 */
|
115
|
+
border-left: 1px dashed #24a76e;
|
116
|
+
/* 连线样式 */
|
117
|
+
}
|
118
|
+
|
119
|
+
.change-container .change-item:last-child::before {
|
120
|
+
display: none;
|
121
|
+
/* 最后一个项目不需要连线 */
|
122
|
+
}
|
123
|
+
|
124
|
+
.change-container .point {
|
125
|
+
width: 10px;
|
126
|
+
height: 10px;
|
127
|
+
border-radius: 50%;
|
128
|
+
background-color: #24a76e;
|
129
|
+
margin: 5px;
|
130
|
+
}
|
131
|
+
|
132
|
+
.change-container .left {
|
133
|
+
display: flex;
|
134
|
+
align-items: center;
|
135
|
+
}
|
136
|
+
|
137
|
+
.change-container .left .time {
|
138
|
+
width: 80px;
|
139
|
+
margin: 0;
|
140
|
+
text-align: right;
|
141
|
+
}
|
142
|
+
|
143
|
+
.change-container .right {
|
144
|
+
/* padding-left: 80px; */
|
145
|
+
}
|
146
|
+
|
147
|
+
.content-item {
|
148
|
+
font-size: 12px;
|
149
|
+
color: #3c3c3c;
|
150
|
+
}
|
151
|
+
|
152
|
+
.right li{
|
153
|
+
list-style: disc;
|
154
|
+
margin-left: 12px;
|
155
|
+
}
|
156
|
+
</style>
|
157
|
+
</head>
|
158
|
+
|
159
|
+
<body>
|
160
|
+
<div id="app">
|
161
|
+
<div class="container">
|
162
|
+
<ul class="tabs">
|
163
|
+
<li class="tab" :class="{ active: tabIndex === 1 }" data-tab="announcement" @click="tabIndex=1">
|
164
|
+
最新通知
|
165
|
+
</li>
|
166
|
+
<li class="tab" :class="{ active: tabIndex === 2 }" data-tab="update-log" @click="tabIndex=2">
|
167
|
+
更新记录
|
168
|
+
</li>
|
169
|
+
</ul>
|
170
|
+
<div id="announcement" class="tab-content announcement-container" :class="{ active: tabIndex === 1 }">
|
171
|
+
<!-- <h2>最新通知</h2>-->
|
172
|
+
<!-- <div v-for="item in announcements" class="announcement-item">-->
|
173
|
+
<!-- <span>{{item.content}}</span>-->
|
174
|
+
<!-- <span v-for="tag in item.tags" class="tag">{{ tag }}</span>-->
|
175
|
+
<!-- </div>-->
|
176
|
+
<!-- -->
|
177
|
+
<!-- <h2>更新记录</h2>-->
|
178
|
+
|
179
|
+
<div class="change-container">
|
180
|
+
<!-- One row per change-log entry: date and timeline dot on the left, title and detail lines on the right. -->
<div v-for="(log, logIndex) in changeLogs" :key="logIndex" class="change-item">
  <div class="left">
    <p class="time">{{ log.date }}</p>
    <div class="point"></div>
  </div>
  <div class="right">
    <span>{{ log.title }}</span>
    <!-- The inner loop uses its own variable name instead of shadowing the outer entry. -->
    <ul v-for="(line, lineIndex) in log.content" :key="lineIndex">
      <li class="content-item">{{ line }}</li>
    </ul>
  </div>
</div>
|
192
|
+
</div>
|
193
|
+
|
194
|
+
</div>
|
195
|
+
<div id="update-log" class="tab-content" :class="{ active: tabIndex === 2 }">
|
196
|
+
<h2>更新记录</h2>
|
197
|
+
<div class="change-container">
|
198
|
+
<!-- One row per change-log entry: date and timeline dot on the left, title and detail lines on the right. -->
<div v-for="(log, logIndex) in changeLogs" :key="logIndex" class="change-item">
  <div class="left">
    <p class="time">{{ log.date }}</p>
    <div class="point"></div>
  </div>
  <div class="right">
    <span>{{ log.title }}</span>
    <!-- The inner loop uses its own variable name instead of shadowing the outer entry. -->
    <ul v-for="(line, lineIndex) in log.content" :key="lineIndex">
      <li class="content-item">{{ line }}</li>
    </ul>
  </div>
</div>
|
210
|
+
</div>
|
211
|
+
</div>
|
212
|
+
</div>
|
213
|
+
</div>
|
214
|
+
<script>
|
215
|
+
// import { ref, reactive } from './vue.esm-browser.js' //模块化开发方式
|
216
|
+
// Root component: exposes the active tab index plus the static announcement
// and change-log data rendered by the template.
const App = {
  setup() {
    // 1 selects the first tab, 2 the second (see the @click handlers in the template).
    const tabIndex = Vue.ref(1)

    const announcements = [
      {
        content: '支持超多模型:',
        tags: [
          '对话',
          '推理',
          '图片',
          '视频',
          '音频',
          '变清晰',
          '去水印',
          '文档解析',
        ],
      },
    ]

    const changeLogs = [
      {
        date: '2025.03.04',
        title: '🎉 最新通知',
        content: [
          'qwq-max/qwq-max-search:阿里 Qwen 首个推理模型亮相!擅长数学和编程,支持联网搜索',
          'qwen-max-search:qwen2.5-max支持联网搜索',
          'qwen2.5-max:支持文件问答',

          '因近期Midjourney封号严重,3月5日起, Relax基础价调整到0.05 Fast基础价调整到0.1 Turbo基础价调整到0.3,我们会保证全天稳定有号。',
        ],
      },
      // Disabled entry, kept verbatim from the original:
      // {
      //   date: '2025.02.25',
      //   title:
      //     '🚀 已修复异常扣费,今日claude-3-7-sonnet-thinking模型,有异常扣费,请联系客服退回额度',
      //   content: [
      //     '已修复异常扣费,今日claude-3-7-sonnet-thinking模型,有异常扣费,请联系客服退回额度',
      //     '新增模型claude-3-7-sonnet-20250219,',
      //   ],
      // },
      {
        date: '2025.02.25',
        title:
          '🎉 新增模型claude-3-7-sonnet-20250219,Claude最新模型,',
        content: [
          '新增模型claude-3-7-sonnet-20250219,Claude最新模型,已兼容think格claude-3-7-sonnet-thinking模型',
        ],
      },
    ]

    return { tabIndex, announcements, changeLogs }
  },
}
const app = Vue.createApp(App)
app.mount('#app')
|
272
|
+
</script>
|
273
|
+
</body>
|
274
|
+
|
275
|
+
</html>
|
@@ -0,0 +1,31 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
# @Project : AI. @by PyCharm
|
4
|
+
# @File : openai_files
|
5
|
+
# @Time : 2025/3/4 18:20
|
6
|
+
# @Author : betterme
|
7
|
+
# @WeChat : meutils
|
8
|
+
# @Software : PyCharm
|
9
|
+
# @Description :
|
10
|
+
|
11
|
+
from meutils.pipe import *
|
12
|
+
from meutils.io.files_utils import to_bytes
|
13
|
+
from meutils.llm.clients import moonshot_client, zhipuai_client
|
14
|
+
|
15
|
+
|
16
|
+
async def file_extract(file):  # e.g. "https://oss.ffire.cc/files/招标文件备案表(第二次).pdf"
    """Upload ``file`` through ``moonshot_client`` and return the extracted text.

    TODO: delete uploaded files on a schedule.

    :param file: source handed to ``Path(...)`` for naming and to ``to_bytes``
        for loading (the inline example is an https URL string).
    :return: the ``text`` attribute of the content response.
    """
    upload_name = Path(file).name
    # guess_type yields (type, encoding); only the type is forwarded, and it
    # may be None for unknown extensions.
    content_type = mimetypes.guess_type(upload_name)[0]
    payload: bytes = await to_bytes(file)

    uploaded = await moonshot_client.files.create(
        file=(upload_name, payload, content_type),
        purpose="file-extract",
    )
    logger.debug(uploaded)

    extracted = await moonshot_client.files.content(file_id=uploaded.id)
    return extracted.text
|
@@ -0,0 +1,138 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
# @Project : AI. @by PyCharm
|
4
|
+
# @File : fileparser
|
5
|
+
# @Time : 2025/1/7 17:48
|
6
|
+
# @Author : betterme
|
7
|
+
# @WeChat : meutils
|
8
|
+
# @Software : PyCharm
|
9
|
+
# @Description : https://bigmodel.cn/dev/activities/freebie/fileextracion
|
10
|
+
import asyncio
|
11
|
+
|
12
|
+
import httpx
|
13
|
+
import shortuuid
|
14
|
+
|
15
|
+
from meutils.pipe import *
|
16
|
+
from meutils.io.files_utils import to_bytes, guess_mime_type
|
17
|
+
from meutils.llm.clients import moonshot_client, zhipuai_client, APIStatusError
|
18
|
+
from meutils.notice.feishu import send_message as _send_message, FILES
|
19
|
+
from meutils.caches import cache, rcache
|
20
|
+
|
21
|
+
# from openai.types.file_object import FileObject
|
22
|
+
|
23
|
+
send_message = partial(
|
24
|
+
_send_message,
|
25
|
+
title=__name__,
|
26
|
+
url=FILES
|
27
|
+
)
|
28
|
+
"""
|
29
|
+
|
30
|
+
# 智谱
|
31
|
+
# 格式限制:.PDF .DOCX .DOC .XLS .XLSX .PPT .PPTX .PNG .JPG .JPEG .CSV .PY .TXT .MD .BMP .GIF
|
32
|
+
|
33
|
+
# kimi todo: 定期删除文件
|
34
|
+
文件接口与 Kimi 智能助手中上传文件功能所使用的相同,支持相同的文件格式,它们包括
|
35
|
+
.pdf .txt .csv .doc .docx .xls .xlsx .ppt .pptx .md .jpeg .png .bmp .gif .svg .svgz .webp .ico .xbm .dib .pjp .tif
|
36
|
+
.pjpeg .avif .dot .apng .epub .tiff .jfif .html .json .mobi .log .go .h .c .cpp .cxx .cc .cs .java .js .css .jsp .php
|
37
|
+
.py .py3 .asp .yaml .yml .ini .conf .ts .tsx 等格式。
|
38
|
+
|
39
|
+
# todo:
|
40
|
+
+ .sh
|
41
|
+
"""
|
42
|
+
|
43
|
+
|
44
|
+
async def delete_files(client, threshold: int = 666):
    """Delete every file listed by ``client`` once the count exceeds ``threshold``.

    :param client: object exposing async ``files.list()`` and ``files.delete(id)``.
    :param threshold: nothing is deleted while the listed count is at or below this.
    """
    listing = await client.files.list()
    stored = listing.data

    logger.debug(len(stored))

    # Guard clause: below or at the threshold there is nothing to clean up.
    if len(stored) <= threshold:
        return

    # All deletions are issued concurrently.
    await asyncio.gather(*[client.files.delete(entry.id) for entry in stored])
|
53
|
+
|
54
|
+
|
55
|
+
@rcache(ttl=1 * 3600)
async def _file_extract(file):
    """Extract the content of a single file, trying each provider in turn.

    The bytes are uploaded with ``purpose="file-extract"`` to ``moonshot_client``
    first and ``zhipuai_client`` second; the parsed JSON of the first successful
    content response is returned. When every provider fails, a fallback record
    is built locally whose ``content`` is the UTF-8 decoding of the bytes (or
    ``''`` when they do not decode).

    :param file: url / bytes / path
    :return: provider JSON on success, otherwise the fallback ``dict`` with the
        keys ``filename``, ``type``, ``file_type`` and ``content``.
    """

    # Path objects carry a usable name too; only nameless inputs (e.g. raw
    # bytes) fall back to 'untitled'.
    filename = Path(file).name if isinstance(file, (str, Path)) else 'untitled'
    mime_type = guess_mime_type(file)

    payload: bytes = await to_bytes(file)

    clients = [moonshot_client, zhipuai_client]
    for i, client in enumerate(clients):
        try:
            file_object = await client.files.create(
                file=(filename, payload, mime_type),
                purpose="file-extract"
            )
            logger.debug(file_object)

            response = await client.files.content(file_id=file_object.id)

            return response.json()

        except Exception as e:
            logger.debug(e)
            if i == len(clients) - 1:
                # Every provider failed: try to free space on the Moonshot
                # account. This is best effort - a failure here must not
                # prevent the local fallback below from being returned.
                try:
                    await delete_files(moonshot_client)
                except Exception as cleanup_error:
                    logger.debug(cleanup_error)

    # Fallback: return the raw text when no provider could parse the file.
    data = {
        'filename': filename,

        'type': 'file',
        'file_type': mime_type,
        'content': '',
    }
    try:
        data['content'] = payload.decode('utf-8')

    except Exception as e:
        logger.debug(e)
    return data
|
101
|
+
|
102
|
+
|
103
|
+
async def file_extract(files):
    """Extract one file or a collection of files.

    :param files: a single url / bytes / path, or an iterable of them.
    :return: the ``_file_extract`` result for a single input, otherwise a list
        of results in input order (empty list for an empty iterable).
    """
    # bytes and Path are single inputs as well: iterating bytes would yield
    # integers, and a Path is not iterable at all.
    if isinstance(files, (str, bytes, Path)):
        return await _file_extract(files)

    tasks = [_file_extract(file) for file in files]
    return await asyncio.gather(*tasks)
|
109
|
+
|
110
|
+
|
111
|
+
# FileObject(id='1741136989_8dd96cbee6274251b7e4c9568779bd6a', bytes=82947, created_at=1741136989, filename='kling_watermark.png', object='file', status=None, status_details=None)
|
112
|
+
|
113
|
+
if __name__ == '__main__':
    # Sample inputs from manual testing, kept for reference. The original
    # script assigned them to ``file`` one after another, so only the final
    # assignment below ever took effect.
    #
    # file = "https://oss.ffire.cc/files/招标文件备案表(第二次).pdf"
    # file = "https://oss.ffire.cc/files/%E6%8B%9B%E6%A0%87%E6%96%87%E4%BB%B6%E5%A4%87%E6%A1%88%E8%A1%A8%EF%BC%88%E7%AC%AC%E4%BA%8C%E6%AC%A1%EF%BC%89.pdf"
    # Sample prompt:
    # "https://oss.ffire.cc/files/%E6%8B%9B%E6%A0%87%E6%96%87%E4%BB%B6%E5%A4%87%E6%A1%88%E8%A1%A8%EF%BC%88%E7%AC%AC%E4%BA%8C%E6%AC%A1%EF%BC%89.pdf 这个文件讲了什么?"
    # file = "https://oss.ffire.cc/files/百炼系列手机产品介绍.docx"
    # file = Path("/Users/betterme/PycharmProjects/AI/MeUtils/meutils/llm/completions/rag/百炼系列手机产品介绍.docx")
    # file = "/Users/betterme/PycharmProjects/AI/MeUtils/meutils/io/img_1.png"
    #
    # Recorded upstream failure for the image below:
    # openai.BadRequestError: Error code: 400 - {'error': {'message': 'text extract error: 没有解析出内容', 'type': 'invalid_request_error'}}
    # file = "https://oss.ffire.cc/files/kling_watermark.png"
    # file = "/Users/betterme/PycharmProjects/AI/xx.sh"
    # file = [file] * 10

    file = []

    # print(Path(file).read_text())

    with timer():
        arun(file_extract(file))
|
@@ -0,0 +1,58 @@
|
|
1
|
+
import pdfplumber
|
2
|
+
from pandas import DataFrame
|
3
|
+
import tabulate
|
4
|
+
|
5
|
+
"""
|
6
|
+
表格
|
7
|
+
"""
|
8
|
+
|
9
|
+
|
10
|
+
def curves_to_edges(cs):
    """Concatenate ``pdfplumber.utils.rect_to_edges(c)`` for every ``c`` in ``cs``.

    :param cs: iterable of objects accepted by ``rect_to_edges``.
    :return: one flat list holding all resulting edges, in input order.
    """
    collected = []
    for curve in cs:
        collected.extend(pdfplumber.utils.rect_to_edges(curve))
    return collected
|
15
|
+
|
16
|
+
|
17
|
+
def clean_cell_text(text):
    """Normalise one table cell: remove newlines and trim surrounding whitespace.

    ``None`` and NaN (both represent a missing cell) become ``""``. Any other
    non-string value is converted with ``str()`` first, so applying this
    function over a whole DataFrame no longer raises ``AttributeError`` on
    numeric cells.

    :param text: cell value.
    :return: cleaned string.
    """
    if text is None:
        return ""
    # NaN is the only float that is not equal to itself.
    if isinstance(text, float) and text != text:
        return ""
    if not isinstance(text, str):
        text = str(text)
    # Drop line breaks inside the cell, then strip leading/trailing whitespace.
    return text.replace("\n", "").strip()
|
27
|
+
|
28
|
+
|
29
|
+
def extract_tables_with_text(self, pdf) -> "List[str]":
    """Extract page text with tables embedded in reading order.

    For every page, tables are located with the "lines" strategy, words that
    lie strictly inside a table bounding box are dropped, and the remaining
    words plus the tables are clustered by their ``top`` coordinate. A cluster
    that starts with a word becomes one space-joined line; a cluster that
    starts with a table becomes a GitHub-style text table.

    The return annotation is quoted because this module does not import
    ``List``; an unquoted ``List[str]`` is evaluated when the function is
    defined and raises ``NameError``.

    :param self: kept for backward compatibility. If it provides
        ``curves_to_edges`` / ``clean_cell_text`` those are used; otherwise the
        module-level functions of the same name are used (so ``None`` works).
    :param pdf: object exposing ``pages`` (e.g. the result of ``pdfplumber.open``).
    :return: list of text lines and rendered tables.
    """
    edge_fn = getattr(self, 'curves_to_edges', None)
    cell_fn = getattr(self, 'clean_cell_text', None)

    def check_bboxes(word, table_bbox):
        """Return True when ``word`` lies strictly inside ``table_bbox``."""
        return (
            word['x0'] > table_bbox[0]
            and word['top'] > table_bbox[1]
            and word['x1'] < table_bbox[2]
            and word['bottom'] < table_bbox[3]
        )

    lines = []
    for page in pdf.pages:
        # The same explicit lines feed both directions; compute them once.
        explicit_lines = (edge_fn or curves_to_edges)(page.curves) + page.edges
        tables = page.find_tables(
            table_settings={
                "vertical_strategy": "lines",
                "horizontal_strategy": "lines",
                "explicit_vertical_lines": explicit_lines,
                "explicit_horizontal_lines": list(explicit_lines),
            }
        )
        bboxes = [table.bbox for table in tables]
        tables = [{'table': table.extract(), 'top': table.bbox[1]} for table in tables]
        non_table_words = [
            word for word in page.extract_words()
            if not any(check_bboxes(word, table_bbox) for table_bbox in bboxes)
        ]

        for cluster in pdfplumber.utils.cluster_objects(non_table_words + tables, 'top', tolerance=5):
            if 'text' in cluster[0]:
                lines.append(' '.join([i['text'] for i in cluster]))
            elif 'table' in cluster[0]:
                frame = DataFrame(cluster[0]['table'])
                # DataFrame.applymap is deprecated in favour of DataFrame.map
                # (pandas 2.1+); use whichever this pandas version provides.
                apply_cells = frame.map if hasattr(DataFrame, 'map') else frame.applymap
                lines.append(tabulate.tabulate(apply_cells(cell_fn or clean_cell_text), tablefmt="github"))
    return lines
|
@@ -0,0 +1,11 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
# @Project : AI. @by PyCharm
|
4
|
+
# @File : __init__.py
|
5
|
+
# @Time : 2023/5/18 16:31
|
6
|
+
# @Author : betterme
|
7
|
+
# @WeChat : meutils
|
8
|
+
# @Software : PyCharm
|
9
|
+
# @Description :
|
10
|
+
|
11
|
+
from meutils.fileparser.common import *
|
@@ -0,0 +1,91 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
# @Project : AI. @by PyCharm
|
4
|
+
# @File : common
|
5
|
+
# @Time : 2023/5/18 16:39
|
6
|
+
# @Author : betterme
|
7
|
+
# @WeChat : meutils
|
8
|
+
# @Software : PyCharm
|
9
|
+
# @Description :
|
10
|
+
|
11
|
+
from meutils.pipe import *
|
12
|
+
|
13
|
+
|
14
|
+
def doc2docx(doc_paths, outdir='.', max_workers=1):
    """Convert one or more ``.doc`` files to ``.docx`` via ``_doc2docx``.

    TODO: multiprocessing blocks.

    :param doc_paths: a single path (``str`` or ``os.PathLike``) or an iterable of paths.
    :param outdir: output directory forwarded to ``_doc2docx``.
    :param max_workers: upper bound for the process pool size; capped at the
        number of input paths.
    :return: the per-file results collected by the pipeline; ``[]`` when there
        is nothing to convert.
    """
    # A single path of either kind is wrapped; ``len()`` would fail on a Path.
    if isinstance(doc_paths, (str, os.PathLike)):
        doc_paths = [doc_paths]
    doc_paths = list(doc_paths)

    # Nothing to do - and a pool with ``max_workers=0`` would raise ValueError.
    if not doc_paths:
        return []

    max_workers = min(max_workers, len(doc_paths))
    func = partial(_doc2docx, outdir=outdir)
    return doc_paths | xProcessPoolExecutor(func, max_workers) | xlist
|
21
|
+
|
22
|
+
|
23
|
+
def _doc2docx(doc_path, outdir='.'):
|
24
|
+
if Path(doc_path).is_file():
|
25
|
+
cmd = 'libreoffice --headless --convert-to docx'.split() + [doc_path, '--outdir', outdir]
|
26
|
+
p = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
|
27
|
+
p.wait(timeout=16)
|
28
|
+
stdout, stderr = p.communicate()
|
29
|
+
if stderr:
|
30
|
+
raise subprocess.SubprocessError(stderr)
|
31
|
+
return stdout.decode()
|
32
|
+
return False
|
33
|
+
|
34
|
+
|
35
|
+
def stream2tempfile4process(
        stream: Union[str, bytes] = b"temp",
        process_fn: Callable[[os.PathLike], Any] = lambda p: p.read_text(),
        delete=True
):
    """Write ``stream`` to a temporary file and run ``process_fn`` on its path.

    :param stream: text or bytes to store in the temporary file.
    :param process_fn: callable receiving the ``Path`` of the temporary file;
        its return value is returned unchanged.
    :param delete: remove the temporary file once ``process_fn`` has finished
        (also when it raises).
    :return: whatever ``process_fn`` returns.
    """
    import tempfile

    # Create the file, then close our handle before anyone reopens it by name:
    # reopening a NamedTemporaryFile that is still open is not portable (it
    # fails on Windows). Deletion is therefore handled manually below.
    temp_file = tempfile.NamedTemporaryFile(delete=False)
    temp_file.close()
    p = Path(temp_file.name)
    try:
        if isinstance(stream, str):  # write
            p.write_text(stream)
        else:
            p.write_bytes(stream)
        return process_fn(p)  # read
    finally:
        if delete:
            try:
                p.unlink()
            except FileNotFoundError:
                # process_fn already removed the file.
                pass
|
50
|
+
|
51
|
+
|
52
|
+
def stream_parser(file_stream):
    """Normalise an upload-like input into ``(filename, content)``.

    Usage example::

        from fastapi import FastAPI, File, UploadFile

        file_stream = UploadFile(open(''))

        filename, file_stream = stream_parser(file_stream)

    Supported inputs, checked in this order:

    * objects with a ``file`` attribute (e.g. fastapi ``UploadFile``): the
      wrapped file is read as bytes;
    * objects with ``read`` (e.g. ``st.file_uploader`` results, open files):
      read as bytes;
    * ``str`` / ``os.PathLike`` shorter than 256 characters naming an existing
      file: read in text mode, so the content is ``str``;
    * anything else (including ``bytes`` / ``bytearray`` and strings that are
      not a file path): returned unchanged.

    :param file_stream: see above.
    :return: ``(filename, content)``; ``filename`` is ``''`` when none is available.
    """
    filename = ''

    # from fastapi import FastAPI, File, UploadFile
    if hasattr(file_stream, 'file'):
        raw = file_stream.file
        # In-memory files (e.g. BytesIO) have no ``name``; fall back to the
        # wrapper's ``filename`` instead of raising AttributeError.
        filename = getattr(raw, 'name', None) or getattr(file_stream, 'filename', None) or ''
        if isinstance(raw, io.TextIOWrapper):  # convert to bytes
            raw = raw.buffer
        file_stream = raw.read()

    # st.file_uploader
    elif hasattr(file_stream, 'read'):
        filename = getattr(file_stream, 'name', '')
        if isinstance(file_stream, io.TextIOWrapper):  # convert to bytes
            file_stream = file_stream.buffer
        file_stream = file_stream.read()

        # ValueError: I/O operation on closed file.
        # with file_stream:
        #     file_stream = file_stream.buffer.read()

    elif isinstance(file_stream, (str, os.PathLike)):
        # ``len()`` is undefined for path objects, so measure the string form.
        path_str = os.fspath(file_stream)
        if len(path_str) < 256 and Path(path_str).is_file():
            filename = str(file_stream)
            # NOTE(review): text-mode read is kept for backward compatibility;
            # binary files will raise UnicodeDecodeError here. ``read_text``
            # closes the handle, unlike a bare ``open(...).read()``.
            file_stream = Path(path_str).read_text()

    return filename, file_stream
|
@@ -0,0 +1,41 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
# @Project : AI. @by PyCharm
|
4
|
+
# @File : demo
|
5
|
+
# @Time : 2024/6/5 09:00
|
6
|
+
# @Author : betterme
|
7
|
+
# @WeChat : meutils
|
8
|
+
# @Software : PyCharm
|
9
|
+
# @Description :
|
10
|
+
import pandas as pd
|
11
|
+
|
12
|
+
from meutils.pipe import *
|
13
|
+
import pandas as pd
|
14
|
+
|
15
|
+
|
16
|
+
import pdfplumber
|
17
|
+
# Demo: read a table from a PDF and rebuild it as a DataFrame with a
# two-level column header.
# NOTE(review): indentation was lost in this listing; the loop is assumed to
# contain only the ``extract_table`` call, so the last page's table is used.
data = None
with pdfplumber.open('银行间市场债券交易结算情况(按投资者).pdf') as pdf:
    for page in pdf.pages:
        data = page.extract_table()

# ``data`` stays None for a PDF without pages, and ``extract_table`` returns
# None for a page without a table; slicing None below would raise TypeError.
if data:
    # The first two extracted rows are zipped into (level-0, level-1) column tuples.
    cols = pd.MultiIndex.from_tuples(zip(*data[:2]))
    print(cols)
    df = pd.DataFrame(data[2:], columns=cols)

    # ``fillna(method='ffill')`` is deprecated; ``ffill`` is the direct equivalent.
    print(df.ffill(axis=1))
|
27
|
+
|
28
|
+
|
29
|
+
#
|
30
|
+
# import camelot
|
31
|
+
# import pandas as pd
|
32
|
+
# # 使用Camelot读取PDF文件中的表格
|
33
|
+
# tables = camelot.read_pdf('银行间市场债券交易结算情况(按投资者).pdf', pages='all', flavor='lattice')
|
34
|
+
#
|
35
|
+
# # # 将所有表格转换为 DataFrame 并合并
|
36
|
+
# # all_data = pd.concat([table.df for table in tables], ignore_index=True)
|
37
|
+
# #
|
38
|
+
# # all_data.to_excel('all_data.xlsx',index=False)
|
39
|
+
#
|
40
|
+
#
|
41
|
+
# print(tables[0].df)
|