MeUtils 2025.3.3.18.41.24__py3-none-any.whl → 2025.3.5.19.55.22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. {MeUtils-2025.3.3.18.41.24.dist-info → MeUtils-2025.3.5.19.55.22.dist-info}/METADATA +264 -264
  2. {MeUtils-2025.3.3.18.41.24.dist-info → MeUtils-2025.3.5.19.55.22.dist-info}/RECORD +61 -33
  3. examples/_openaisdk/open_router.py +2 -1
  4. examples/_openaisdk/openai_files.py +16 -5
  5. examples/_openaisdk/openai_images.py +1 -0
  6. examples/_openaisdk/openai_moon.py +22 -19
  7. examples/sh/__init__.py +11 -0
  8. meutils/apis/baidu/bdaitpzs.py +9 -17
  9. meutils/apis/chatglm/glm_video_api.py +2 -2
  10. meutils/apis/images/edits.py +7 -2
  11. meutils/apis/jimeng/common.py +1 -1
  12. meutils/apis/oneapi/common.py +4 -4
  13. meutils/apis/proxy/ips.py +2 -0
  14. meutils/caches/common.py +4 -0
  15. meutils/data/VERSION +1 -1
  16. meutils/data/oneapi/NOTICE.html +12 -0
  17. meutils/data/oneapi/__init__.py +1 -1
  18. meutils/data/oneapi/index.html +275 -0
  19. meutils/io/_openai_files.py +31 -0
  20. meutils/io/openai_files.py +138 -0
  21. meutils/io/parsers/__init__.py +10 -0
  22. meutils/io/parsers/fileparser/PDF抽取.py +58 -0
  23. meutils/io/parsers/fileparser/__init__.py +11 -0
  24. meutils/io/parsers/fileparser/common.py +91 -0
  25. meutils/io/parsers/fileparser/demo.py +41 -0
  26. meutils/io/parsers/fileparser/filetype/__init__.py +10 -0
  27. meutils/io/parsers/fileparser/filetype/__main__.py +37 -0
  28. meutils/io/parsers/fileparser/filetype/filetype.py +98 -0
  29. meutils/io/parsers/fileparser/filetype/helpers.py +140 -0
  30. meutils/io/parsers/fileparser/filetype/match.py +155 -0
  31. meutils/io/parsers/fileparser/filetype/types/__init__.py +118 -0
  32. meutils/io/parsers/fileparser/filetype/types/application.py +22 -0
  33. meutils/io/parsers/fileparser/filetype/types/archive.py +687 -0
  34. meutils/io/parsers/fileparser/filetype/types/audio.py +212 -0
  35. meutils/io/parsers/fileparser/filetype/types/base.py +29 -0
  36. meutils/io/parsers/fileparser/filetype/types/document.py +256 -0
  37. meutils/io/parsers/fileparser/filetype/types/font.py +115 -0
  38. meutils/io/parsers/fileparser/filetype/types/image.py +383 -0
  39. meutils/io/parsers/fileparser/filetype/types/isobmff.py +33 -0
  40. meutils/io/parsers/fileparser/filetype/types/video.py +223 -0
  41. meutils/io/parsers/fileparser/filetype/utils.py +84 -0
  42. meutils/io/parsers/fileparser/filetype.py +41 -0
  43. meutils/io/parsers/fileparser/mineru.py +48 -0
  44. meutils/io/parsers/fileparser/pdf.py +30 -0
  45. meutils/io/parsers/fileparser/表格抽取.py +118 -0
  46. meutils/llm/check_utils.py +33 -2
  47. meutils/llm/clients.py +1 -0
  48. meutils/llm/completions/chat_gemini.py +72 -0
  49. meutils/llm/completions/chat_plus.py +78 -0
  50. meutils/llm/completions/{agents/file.py → chat_spark.py} +46 -26
  51. meutils/llm/completions/qwenllm.py +57 -16
  52. meutils/llm/completions/yuanbao.py +29 -3
  53. meutils/llm/openai_utils/common.py +2 -2
  54. meutils/schemas/oneapi/common.py +22 -19
  55. meutils/schemas/openai_types.py +65 -29
  56. meutils/schemas/yuanbao_types.py +6 -7
  57. meutils/types.py +2 -0
  58. meutils/data/oneapi/NOTICE.md +0 -1
  59. meutils/data/oneapi/_NOTICE.md +0 -140
  60. meutils/llm/completions/gemini.py +0 -69
  61. {MeUtils-2025.3.3.18.41.24.dist-info → MeUtils-2025.3.5.19.55.22.dist-info}/LICENSE +0 -0
  62. {MeUtils-2025.3.3.18.41.24.dist-info → MeUtils-2025.3.5.19.55.22.dist-info}/WHEEL +0 -0
  63. {MeUtils-2025.3.3.18.41.24.dist-info → MeUtils-2025.3.5.19.55.22.dist-info}/entry_points.txt +0 -0
  64. {MeUtils-2025.3.3.18.41.24.dist-info → MeUtils-2025.3.5.19.55.22.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,275 @@
1
+ <!DOCTYPE html>
2
+ <html>
3
+
4
+ <head>
5
+ <meta charset="utf-8" />
6
+ <script src="https://unpkg.com/vue@3"></script>
7
+ <style>
8
+ ul {
9
+ padding: 0;
10
+ }
11
+
12
+ li {
13
+ list-style: none;
14
+ }
15
+
16
+ body {
17
+ font-family: Arial, sans-serif;
18
+ background-color: #f4f4f4;
19
+ margin: 0;
20
+ padding: 0;
21
+ }
22
+
23
+ .container {
24
+ width: 100%;
25
+ /* margin: 50px auto; */
26
+ background: white;
27
+ box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
28
+ border-radius: 8px;
29
+ overflow: hidden;
30
+ font-size: 14px;
31
+ }
32
+
33
+ .tabs {
34
+ display: flex;
35
+ list-style: none;
36
+ margin: 0;
37
+ padding: 0;
38
+ color: #000000;
39
+ background-color: #e5e5e5;
40
+ }
41
+
42
+ .tabs li {
43
+ flex-grow: 1;
44
+ text-align: center;
45
+ cursor: pointer;
46
+ padding: 15px 0;
47
+ /* color: white; */
48
+ transition: background 0.3s ease;
49
+ }
50
+
51
+ .tabs li.active {
52
+ color: #18a058;
53
+ background-color: #daf0e4;
54
+ }
55
+
56
+ .tab-content {
57
+ padding: 20px;
58
+ display: none;
59
+ }
60
+
61
+ .tab-content.active {
62
+ display: block;
63
+ }
64
+
65
+ /* 公告样式 */
66
+ .announcement-container .announcement-item {
67
+ display: flex;
68
+ align-items: start;
69
+ position: relative;
70
+ /* 添加定位以便伪元素可以相对其定位 */
71
+ /* margin-bottom: 20px; */
72
+ padding-left: 10px;
73
+ font-size: 18px;
74
+ color: #191919;
75
+ display: flex;
76
+ align-items: center;
77
+ gap: 5px;
78
+ }
79
+
80
+ .announcement-container .announcement-item::before {
81
+ width: 4px;
82
+ height: 100%;
83
+ content: '';
84
+ position: absolute;
85
+ left: 0px;
86
+ background-color: #ccc9d0;
87
+ }
88
+
89
+ .announcement-container .announcement-item .tag {
90
+ padding: 2px 5px;
91
+ border-radius: 99px;
92
+ color: #fff;
93
+ background-color: #24a76e;
94
+ font-size: 14px;
95
+ }
96
+
97
+ /* 更新记录样式 */
98
+ .change-container .change-item {
99
+ display: flex;
100
+ align-items: start;
101
+ position: relative;
102
+ /* 添加定位以便伪元素可以相对其定位 */
103
+ /* margin-bottom: 20px; */
104
+ }
105
+
106
+ .change-container .change-item::before {
107
+ content: '';
108
+ position: absolute;
109
+ left: 90px;
110
+ /* 与 .point 的 margin-left 对齐 */
111
+ top: 20px;
112
+ /* 调整以适应点的位置 */
113
+ height: calc(100% - 20px);
114
+ /* 连线的高度 */
115
+ border-left: 1px dashed #24a76e;
116
+ /* 连线样式 */
117
+ }
118
+
119
+ .change-container .change-item:last-child::before {
120
+ display: none;
121
+ /* 最后一个项目不需要连线 */
122
+ }
123
+
124
+ .change-container .point {
125
+ width: 10px;
126
+ height: 10px;
127
+ border-radius: 50%;
128
+ background-color: #24a76e;
129
+ margin: 5px;
130
+ }
131
+
132
+ .change-container .left {
133
+ display: flex;
134
+ align-items: center;
135
+ }
136
+
137
+ .change-container .left .time {
138
+ width: 80px;
139
+ margin: 0;
140
+ text-align: right;
141
+ }
142
+
143
+ .change-container .right {
144
+ /* padding-left: 80px; */
145
+ }
146
+
147
+ .content-item {
148
+ font-size: 12px;
149
+ color: #3c3c3c;
150
+ }
151
+
152
+ .right li{
153
+ list-style: disc;
154
+ margin-left: 12px;
155
+ }
156
+ </style>
157
+ </head>
158
+
159
+ <body>
160
<div id="app">
  <div class="container">
    <!-- Tab headers: clicking a tab stores its number in tabIndex -->
    <ul class="tabs">
      <li class="tab" :class="{ active: tabIndex === 1 }" data-tab="announcement" @click="tabIndex=1">
        最新通知
      </li>
      <li class="tab" :class="{ active: tabIndex === 2 }" data-tab="update-log" @click="tabIndex=2">
        更新记录
      </li>
    </ul>
    <!-- Tab 1: currently renders the change log; the announcement markup below is disabled -->
    <div id="announcement" class="tab-content announcement-container" :class="{ active: tabIndex === 1 }">
      <!-- <h2>最新通知</h2>-->
      <!-- <div v-for="item in announcements" class="announcement-item">-->
      <!-- <span>{{item.content}}</span>-->
      <!-- <span v-for="tag in item.tags" class="tag">{{ tag }}</span>-->
      <!-- </div>-->
      <!-- -->
      <!-- <h2>更新记录</h2>-->

      <div class="change-container">
        <div v-for="(item, index) in changeLogs" :key="index" class="change-item">
          <div class="left">
            <p class="time">{{ item.date }}</p>
            <div class="point"></div>
          </div>
          <div class="right">
            <span>{{ item.title }}</span>
            <!-- One list per entry; the loop variable no longer shadows the outer `item` -->
            <ul>
              <li v-for="(line, lineIndex) in item.content" :key="lineIndex" class="content-item">{{ line }}</li>
            </ul>
          </div>
        </div>
      </div>

    </div>
    <!-- Tab 2: change log -->
    <div id="update-log" class="tab-content" :class="{ active: tabIndex === 2 }">
      <h2>更新记录</h2>
      <div class="change-container">
        <div v-for="(item, index) in changeLogs" :key="index" class="change-item">
          <div class="left">
            <p class="time">{{ item.date }}</p>
            <div class="point"></div>
          </div>
          <div class="right">
            <span>{{ item.title }}</span>
            <!-- One list per entry; the loop variable no longer shadows the outer `item` -->
            <ul>
              <li v-for="(line, lineIndex) in item.content" :key="lineIndex" class="content-item">{{ line }}</li>
            </ul>
          </div>
        </div>
      </div>
    </div>
  </div>
</div>
214
+ <script>
215
+ // import { ref, reactive } from './vue.esm-browser.js' //模块化开发方式
216
// Root Vue component: exposes the active tab index and the static
// announcement / change-log data that the template renders.
const App = {
  setup() {
    // Active tab number; the template compares it against 1 and 2.
    let tabIndex = Vue.ref(1)

    return {
      tabIndex,
      // Announcement entries (text plus tag labels). The only markup that
      // iterates over this list is currently commented out in the template.
      announcements: [
        {
          content: '支持超多模型:',
          tags: [
            '对话',
            '推理',
            '图片',
            '视频',
            '音频',
            '变清晰',
            '去水印',
            '文档解析',
          ],
        },
      ],
      // Change-log entries: each has a date, a title and a list of content lines.
      changeLogs: [
        {
          date: '2025.03.04',
          title: '🎉 最新通知',
          content: [
            'qwq-max/qwq-max-search:阿里 Qwen 首个推理模型亮相!擅长数学和编程,支持联网搜索',
            'qwen-max-search:qwen2.5-max支持联网搜索',
            'qwen2.5-max:支持文件问答',

            '因近期Midjourney封号严重,3月5日起, Relax基础价调整到0.05 Fast基础价调整到0.1 Turbo基础价调整到0.3,我们会保证全天稳定有号。',
          ],
        },
        // Disabled entry, kept verbatim:
        // {
        // date: '2025.02.25',
        // title:
        // '🚀 已修复异常扣费,今日claude-3-7-sonnet-thinking模型,有异常扣费,请联系客服退回额度',
        // content: [
        // '已修复异常扣费,今日claude-3-7-sonnet-thinking模型,有异常扣费,请联系客服退回额度',
        // '新增模型claude-3-7-sonnet-20250219,',
        // ],
        // },
        {
          date: '2025.02.25',
          title:
            '🎉 新增模型claude-3-7-sonnet-20250219,Claude最新模型,',
          content: [
            '新增模型claude-3-7-sonnet-20250219,Claude最新模型,已兼容think格claude-3-7-sonnet-thinking模型',
          ],
        },
      ],
    }
  },
}
// Create the application and attach it to the #app element.
const app = Vue.createApp(App)
app.mount('#app')
272
+ </script>
273
+ </body>
274
+
275
+ </html>
@@ -0,0 +1,31 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ # @Project : AI. @by PyCharm
4
+ # @File : openai_files
5
+ # @Time : 2025/3/4 18:20
6
+ # @Author : betterme
7
+ # @WeChat : meutils
8
+ # @Software : PyCharm
9
+ # @Description :
10
+
11
+ from meutils.pipe import *
12
+ from meutils.io.files_utils import to_bytes
13
+ from meutils.llm.clients import moonshot_client, zhipuai_client
14
+
15
+
16
async def file_extract(file):
    """Upload ``file`` to Moonshot with purpose ``file-extract`` and return the extracted text.

    Example input: "https://oss.ffire.cc/files/招标文件备案表(第二次).pdf"

    :param file: passed to ``Path`` for its name and to ``to_bytes`` for its content
        (presumably a URL or a local path -- confirm against ``to_bytes``).
    :return: the ``text`` attribute of the provider's ``files.content`` response.

    TODO: delete uploaded files on a schedule.
    """
    name = Path(file).name
    # guess_type returns (type, encoding); the type may be None for unknown suffixes.
    content_type = mimetypes.guess_type(name)[0]
    payload: bytes = await to_bytes(file)

    uploaded = await moonshot_client.files.create(
        file=(name, payload, content_type),
        purpose="file-extract",
    )
    logger.debug(uploaded)

    extracted = await moonshot_client.files.content(file_id=uploaded.id)
    return extracted.text
@@ -0,0 +1,138 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ # @Project : AI. @by PyCharm
4
+ # @File : fileparser
5
+ # @Time : 2025/1/7 17:48
6
+ # @Author : betterme
7
+ # @WeChat : meutils
8
+ # @Software : PyCharm
9
+ # @Description : https://bigmodel.cn/dev/activities/freebie/fileextracion
10
+ import asyncio
11
+
12
+ import httpx
13
+ import shortuuid
14
+
15
+ from meutils.pipe import *
16
+ from meutils.io.files_utils import to_bytes, guess_mime_type
17
+ from meutils.llm.clients import moonshot_client, zhipuai_client, APIStatusError
18
+ from meutils.notice.feishu import send_message as _send_message, FILES
19
+ from meutils.caches import cache, rcache
20
+
21
+ # from openai.types.file_object import FileObject
22
+
23
+ send_message = partial(
24
+ _send_message,
25
+ title=__name__,
26
+ url=FILES
27
+ )
28
+ """
29
+
30
+ # 智谱
31
+ # 格式限制:.PDF .DOCX .DOC .XLS .XLSX .PPT .PPTX .PNG .JPG .JPEG .CSV .PY .TXT .MD .BMP .GIF
32
+
33
+ # kimi todo: 定期删除文件
34
+ 文件接口与 Kimi 智能助手中上传文件功能所使用的相同,支持相同的文件格式,它们包括
35
+ .pdf .txt .csv .doc .docx .xls .xlsx .ppt .pptx .md .jpeg .png .bmp .gif .svg .svgz .webp .ico .xbm .dib .pjp .tif
36
+ .pjpeg .avif .dot .apng .epub .tiff .jfif .html .json .mobi .log .go .h .c .cpp .cxx .cc .cs .java .js .css .jsp .php
37
+ .py .py3 .asp .yaml .yml .ini .conf .ts .tsx 等格式。
38
+
39
+ # todo:
40
+ + .sh
41
+ """
42
+
43
+
44
async def delete_files(client, threshold: int = 666):
    """Delete every file listed by ``client`` once there are more than ``threshold``.

    :param client: object exposing async ``files.list()`` and ``files.delete(id)``.
    :param threshold: nothing is deleted while the listed count is at or below this.
    """
    listing = await client.files.list()
    stored = listing.data

    logger.debug(len(stored))

    if len(stored) <= threshold:
        return

    # All deletions are issued concurrently.
    await asyncio.gather(*[client.files.delete(entry.id) for entry in stored])
53
+
54
+
55
@rcache(ttl=1 * 3600)
async def _file_extract(file):
    """Extract the content of one file, trying Moonshot first and Zhipu second.

    :param file: URL, local path (``str`` or ``Path``) or raw bytes -- anything
        ``guess_mime_type`` and ``to_bytes`` accept.
    :return: the JSON payload of the first provider whose upload and content
        request succeed; otherwise a fallback dict with the keys ``filename``,
        ``type``, ``file_type`` and ``content`` (the UTF-8 decoded bytes, or ``''``
        when they cannot be decoded).
    """
    # Path objects carry a usable name as well; only anonymous payloads are 'untitled'.
    filename = Path(file).name if isinstance(file, (str, Path)) else 'untitled'
    mime_type = guess_mime_type(file)

    payload: bytes = await to_bytes(file)

    for client in (moonshot_client, zhipuai_client):
        try:
            file_object = await client.files.create(
                file=(filename, payload, mime_type),
                purpose="file-extract"
            )
            logger.debug(file_object)

            response = await client.files.content(file_id=file_object.id)
            return response.json()

        except Exception as e:  # a failing provider just hands over to the next one
            logger.debug(e)

    # Every provider failed: give the Moonshot cleanup a chance to run. It is best
    # effort -- a failing cleanup must not prevent the fallback below.
    try:
        await delete_files(moonshot_client)
    except Exception as e:
        logger.debug(e)

    # Fallback: return the raw content decoded as text when possible.
    data = {
        'filename': filename,

        'type': 'file',
        'file_type': mime_type,
        'content': '',
    }
    try:
        data['content'] = payload.decode('utf-8')
    except Exception as e:
        logger.debug(e)
    return data
101
+
102
+
103
async def file_extract(files):
    """Extract one file or many files concurrently.

    :param files: a single file (``str``, ``bytes`` or ``Path``) or an iterable of
        such files.
    :return: the ``_file_extract`` result for a single file, otherwise the list of
        results in input order.
    """
    # bytes and Path are single inputs too; iterating bytes would yield integers.
    if isinstance(files, (str, bytes, Path)):
        return await _file_extract(files)

    tasks = [_file_extract(file) for file in files]
    return await asyncio.gather(*tasks)
109
+
110
+
111
+ # FileObject(id='1741136989_8dd96cbee6274251b7e4c9568779bd6a', bytes=82947, created_at=1741136989, filename='kling_watermark.png', object='file', status=None, status_details=None)
112
+
113
if __name__ == '__main__':
    # Manual smoke test. Every `file = ...` below overrides the previous one, so only
    # the last assignment (the empty list) reaches file_extract.
    # file = "https://oss.ffire.cc/files/招标文件备案表(第二次).pdf"
    file = "https://oss.ffire.cc/files/%E6%8B%9B%E6%A0%87%E6%96%87%E4%BB%B6%E5%A4%87%E6%A1%88%E8%A1%A8%EF%BC%88%E7%AC%AC%E4%BA%8C%E6%AC%A1%EF%BC%89.pdf"
    # Bare string expression: it is evaluated and discarded.
    "https://oss.ffire.cc/files/%E6%8B%9B%E6%A0%87%E6%96%87%E4%BB%B6%E5%A4%87%E6%A1%88%E8%A1%A8%EF%BC%88%E7%AC%AC%E4%BA%8C%E6%AC%A1%EF%BC%89.pdf 这个文件讲了什么?"
    file = "https://oss.ffire.cc/files/百炼系列手机产品介绍.docx"
    # file = Path("/Users/betterme/PycharmProjects/AI/MeUtils/meutils/llm/completions/rag/百炼系列手机产品介绍.docx")

    # file = "/Users/betterme/PycharmProjects/AI/MeUtils/meutils/io/img_1.png"

    # openai.BadRequestError: Error code: 400 - {'error': {'message': 'text extract error: 没有解析出内容', 'type': 'invalid_request_error'}}
    # file = "https://oss.ffire.cc/files/kling_watermark.png"
    file = "/Users/betterme/PycharmProjects/AI/xx.sh"

    file = [file] * 10
    file = []

    # print(Path(file).read_text())

    # with timer():
    #     r = arun(file_extract(file, moonshot_client))

    # with timer():
    #     r = arun(file_extract(file, provider='kimi'))

    with timer():
        arun(file_extract(file))
@@ -0,0 +1,10 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ # @Project : AI. @by PyCharm
4
+ # @File : __init__.py
5
+ # @Time : 2025/1/10 15:45
6
+ # @Author : betterme
7
+ # @WeChat : meutils
8
+ # @Software : PyCharm
9
+ # @Description :
10
+
@@ -0,0 +1,58 @@
1
from typing import List

import pdfplumber
import tabulate
from pandas import DataFrame
4
+
5
+ """
6
+ 表格
7
+ """
8
+
9
+
10
def curves_to_edges(cs):
    """Return one flat list with the edges of every item in ``cs``.

    Each item is converted with ``pdfplumber.utils.rect_to_edges``.
    """
    return [edge for curve in cs for edge in pdfplumber.utils.rect_to_edges(curve)]
15
+
16
+
17
def clean_cell_text(text):
    """Remove newlines from a cell's text and trim surrounding whitespace.

    ``None`` is mapped to the empty string.
    """
    return "" if text is None else text.replace("\n", "").strip()
27
+
28
+
29
def extract_tables_with_text(self, pdf) -> List[str]:
    """Extract text lines and tables from every page, ordered by vertical position.

    :param self: object that may provide ``curves_to_edges`` / ``clean_cell_text``;
        when it does not (for example ``None``), the module-level helpers are used.
    :param pdf: an opened pdfplumber document (anything exposing ``pages``).
    :return: one string per text line, and one GitHub-style table string per table.
    """
    # This function lives at module level, so ``self`` may not carry the helpers.
    to_edges = getattr(self, 'curves_to_edges', curves_to_edges)
    clean_cell = getattr(self, 'clean_cell_text', clean_cell_text)

    def inside(word, bbox):
        # True when the word's box lies strictly within the table's box.
        return (word['x0'] > bbox[0] and word['top'] > bbox[1]
                and word['x1'] < bbox[2] and word['bottom'] < bbox[3])

    def render(table):
        frame = DataFrame(table)
        # DataFrame.map replaces the deprecated applymap in newer pandas releases.
        cellwise = getattr(frame, 'map', None) or frame.applymap
        return tabulate.tabulate(cellwise(clean_cell), tablefmt="github")

    lines = []
    for page in pdf.pages:
        # The same explicit lines are used for both directions; compute them once.
        explicit_lines = to_edges(page.curves) + page.edges
        found = page.find_tables(
            table_settings={
                "vertical_strategy": "lines",
                "horizontal_strategy": "lines",
                "explicit_vertical_lines": explicit_lines,
                "explicit_horizontal_lines": explicit_lines,
            }
        )
        bboxes = [table.bbox for table in found]
        tables = [{'table': table.extract(), 'top': table.bbox[1]} for table in found]
        non_table_words = [
            word for word in page.extract_words()
            if not any(inside(word, bbox) for bbox in bboxes)
        ]

        for cluster in pdfplumber.utils.cluster_objects(non_table_words + tables, 'top', tolerance=5):
            # A cluster can mix words and tables sharing nearly the same ``top``;
            # handle each kind separately instead of trusting the first item.
            words = [item['text'] for item in cluster if 'text' in item]
            if words:
                lines.append(' '.join(words))
            for item in cluster:
                if 'table' in item:
                    lines.append(render(item['table']))
    return lines
@@ -0,0 +1,11 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ # @Project : AI. @by PyCharm
4
+ # @File : __init__.py
5
+ # @Time : 2023/5/18 16:31
6
+ # @Author : betterme
7
+ # @WeChat : meutils
8
+ # @Software : PyCharm
9
+ # @Description :
10
+
11
+ from meutils.fileparser.common import *
@@ -0,0 +1,91 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ # @Project : AI. @by PyCharm
4
+ # @File : common
5
+ # @Time : 2023/5/18 16:39
6
+ # @Author : betterme
7
+ # @WeChat : meutils
8
+ # @Software : PyCharm
9
+ # @Description :
10
+
11
+ from meutils.pipe import *
12
+
13
+
14
def doc2docx(doc_paths, outdir='.', max_workers=1):
    """Convert one or several ``.doc`` files to ``.docx`` via ``_doc2docx``.

    TODO: multi-process execution blocks.

    :param doc_paths: a single path (``str`` or ``os.PathLike``) or a sequence of paths.
    :param outdir: output directory handed to ``_doc2docx``.
    :param max_workers: upper bound for the process pool size.
    :return: list with one ``_doc2docx`` result per input; ``[]`` for empty input.
    """
    if isinstance(doc_paths, (str, os.PathLike)):
        doc_paths = [doc_paths]
    if not doc_paths:
        # Nothing to convert; a pool with zero workers would be rejected anyway.
        return []
    # Never more workers than files, and never fewer than one.
    max_workers = max(1, min(max_workers, len(doc_paths)))
    func = partial(_doc2docx, outdir=outdir)
    return doc_paths | xProcessPoolExecutor(func, max_workers) | xlist
21
+
22
+
23
+ def _doc2docx(doc_path, outdir='.'):
24
+ if Path(doc_path).is_file():
25
+ cmd = 'libreoffice --headless --convert-to docx'.split() + [doc_path, '--outdir', outdir]
26
+ p = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
27
+ p.wait(timeout=16)
28
+ stdout, stderr = p.communicate()
29
+ if stderr:
30
+ raise subprocess.SubprocessError(stderr)
31
+ return stdout.decode()
32
+ return False
33
+
34
+
35
def stream2tempfile4process(
        stream: Union[str, bytes] = b"temp",
        process_fn: Callable[[os.PathLike], Any] = lambda p: p.read_text(),
        delete=True
):
    """Write ``stream`` to a named temporary file and return ``process_fn(path)``.

    :param stream: text is written with ``write_text``, anything else with ``write_bytes``.
    :param process_fn: called with the temporary file's ``Path`` while the file exists.
    :param delete: forwarded to ``NamedTemporaryFile``.
    """
    from tempfile import NamedTemporaryFile

    with NamedTemporaryFile(delete=delete) as handle:
        target = Path(handle.name)
        write = target.write_text if isinstance(stream, str) else target.write_bytes
        write(stream)
        return process_fn(target)
50
+
51
+
52
def stream_parser(file_stream):
    """Normalise an upload-like input into ``(filename, content)``.

    Usage sketch::

        from fastapi import FastAPI, File, UploadFile

        file_stream = UploadFile(open(''))

        filename, file_stream = stream_parser(file_stream)

    Handled inputs:

    * objects with a ``file`` attribute (e.g. ``UploadFile``): the wrapped stream is read;
    * objects with ``read`` (e.g. ``st.file_uploader`` results, open files, ``BytesIO``);
    * a ``str`` / ``os.PathLike`` shorter than 256 characters naming an existing
      file: the file is read in text mode;
    * anything else (``bytes``, ``bytearray``, plain text) is returned unchanged.

    :return: ``(filename, content)``; ``filename`` is ``''`` when none is known.
    """

    def read_all(stream):
        # Text wrappers are read through their underlying binary buffer.
        if isinstance(stream, io.TextIOWrapper):
            stream = stream.buffer
        return stream.read()

    filename = ''

    if hasattr(file_stream, 'file'):
        inner = file_stream.file
        # Either name may be missing or None; never fail just because of that.
        filename = getattr(inner, 'name', None) or getattr(file_stream, 'filename', None) or ''
        file_stream = read_all(inner)

    elif hasattr(file_stream, 'read'):
        # In-memory streams such as io.BytesIO have no ``name``.
        filename = getattr(file_stream, 'name', '')
        file_stream = read_all(file_stream)

    elif isinstance(file_stream, (str, os.PathLike)):
        path = str(file_stream)  # len() is not defined for PathLike objects
        if len(path) < 256 and Path(path).is_file():
            filename = path
            with open(filename) as fh:  # close the handle deterministically
                file_stream = fh.read()

    return filename, file_stream
@@ -0,0 +1,41 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ # @Project : AI. @by PyCharm
4
+ # @File : demo
5
+ # @Time : 2024/6/5 09:00
6
+ # @Author : betterme
7
+ # @WeChat : meutils
8
+ # @Software : PyCharm
9
+ # @Description :
10
+ import pandas as pd
11
+
12
+ from meutils.pipe import *
13
+ import pandas as pd
14
+
15
+
16
import pdfplumber

# Demo: extract the table of each page; only the last page's table is kept in ``data``.
with pdfplumber.open('银行间市场债券交易结算情况(按投资者).pdf') as pdf:
    for page in pdf.pages:
        data = page.extract_table()


# The first two rows form a two-level column header; the rest are data rows.
cols = pd.MultiIndex.from_tuples(zip(*data[:2]))
print(cols)
df = pd.DataFrame(data[2:], columns=cols)

# Fill missing cells from the left; ffill() replaces the deprecated fillna(method='ffill').
print(df.ffill(axis=1))
27
+
28
+
29
+ #
30
+ # import camelot
31
+ # import pandas as pd
32
+ # # 使用Camelot读取PDF文件中的表格
33
+ # tables = camelot.read_pdf('银行间市场债券交易结算情况(按投资者).pdf', pages='all', flavor='lattice')
34
+ #
35
+ # # # 将所有表格转换为 DataFrame 并合并
36
+ # # all_data = pd.concat([table.df for table in tables], ignore_index=True)
37
+ # #
38
+ # # all_data.to_excel('all_data.xlsx',index=False)
39
+ #
40
+ #
41
+ # print(tables[0].df)
@@ -0,0 +1,10 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ from __future__ import absolute_import
4
+
5
+ from .filetype import * # noqa
6
+ from .helpers import * # noqa
7
+ from .match import * # noqa
8
+
9
+ # Current package semver version
10
+ __version__ = version = '1.2.0'