pyxllib 0.0.43__py3-none-any.whl → 0.3.197__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyxllib/__init__.py +9 -2
- pyxllib/algo/__init__.py +8 -0
- pyxllib/algo/disjoint.py +54 -0
- pyxllib/algo/geo.py +541 -0
- pyxllib/{util/mathlib.py → algo/intervals.py} +172 -36
- pyxllib/algo/matcher.py +389 -0
- pyxllib/algo/newbie.py +166 -0
- pyxllib/algo/pupil.py +629 -0
- pyxllib/algo/shapelylib.py +67 -0
- pyxllib/algo/specialist.py +241 -0
- pyxllib/algo/stat.py +494 -0
- pyxllib/algo/treelib.py +149 -0
- pyxllib/algo/unitlib.py +66 -0
- pyxllib/autogui/__init__.py +5 -0
- pyxllib/autogui/activewin.py +246 -0
- pyxllib/autogui/all.py +9 -0
- pyxllib/autogui/autogui.py +852 -0
- pyxllib/autogui/uiautolib.py +362 -0
- pyxllib/autogui/virtualkey.py +102 -0
- pyxllib/autogui/wechat.py +827 -0
- pyxllib/autogui/wechat_msg.py +421 -0
- pyxllib/autogui/wxautolib.py +84 -0
- pyxllib/cv/__init__.py +1 -11
- pyxllib/cv/expert.py +267 -0
- pyxllib/cv/{imlib.py → imfile.py} +18 -83
- pyxllib/cv/imhash.py +39 -0
- pyxllib/cv/pupil.py +9 -0
- pyxllib/cv/rgbfmt.py +1525 -0
- pyxllib/cv/slidercaptcha.py +137 -0
- pyxllib/cv/trackbartools.py +163 -49
- pyxllib/cv/xlcvlib.py +1040 -0
- pyxllib/cv/xlpillib.py +423 -0
- pyxllib/data/__init__.py +0 -0
- pyxllib/data/echarts.py +240 -0
- pyxllib/data/jsonlib.py +89 -0
- pyxllib/{util/oss2_.py → data/oss.py} +11 -9
- pyxllib/data/pglib.py +1127 -0
- pyxllib/data/sqlite.py +568 -0
- pyxllib/{util → data}/sqllib.py +13 -31
- pyxllib/ext/JLineViewer.py +505 -0
- pyxllib/ext/__init__.py +6 -0
- pyxllib/{util → ext}/demolib.py +119 -35
- pyxllib/ext/drissionlib.py +277 -0
- pyxllib/ext/kq5034lib.py +12 -0
- pyxllib/{util/main.py → ext/old.py} +122 -284
- pyxllib/ext/qt.py +449 -0
- pyxllib/ext/robustprocfile.py +497 -0
- pyxllib/ext/seleniumlib.py +76 -0
- pyxllib/{util/tklib.py → ext/tk.py} +10 -11
- pyxllib/ext/unixlib.py +827 -0
- pyxllib/ext/utools.py +351 -0
- pyxllib/{util/webhooklib.py → ext/webhook.py} +45 -17
- pyxllib/ext/win32lib.py +40 -0
- pyxllib/ext/wjxlib.py +88 -0
- pyxllib/ext/wpsapi.py +124 -0
- pyxllib/ext/xlwork.py +9 -0
- pyxllib/ext/yuquelib.py +1105 -0
- pyxllib/file/__init__.py +17 -0
- pyxllib/file/docxlib.py +761 -0
- pyxllib/{util → file}/gitlib.py +40 -27
- pyxllib/file/libreoffice.py +165 -0
- pyxllib/file/movielib.py +148 -0
- pyxllib/file/newbie.py +10 -0
- pyxllib/file/onenotelib.py +1469 -0
- pyxllib/file/packlib/__init__.py +330 -0
- pyxllib/{util → file/packlib}/zipfile.py +598 -195
- pyxllib/file/pdflib.py +426 -0
- pyxllib/file/pupil.py +185 -0
- pyxllib/file/specialist/__init__.py +685 -0
- pyxllib/{basic/_5_dirlib.py → file/specialist/dirlib.py} +364 -93
- pyxllib/file/specialist/download.py +193 -0
- pyxllib/file/specialist/filelib.py +2829 -0
- pyxllib/file/xlsxlib.py +3131 -0
- pyxllib/file/xlsyncfile.py +341 -0
- pyxllib/prog/__init__.py +5 -0
- pyxllib/prog/cachetools.py +64 -0
- pyxllib/prog/deprecatedlib.py +233 -0
- pyxllib/prog/filelock.py +42 -0
- pyxllib/prog/ipyexec.py +253 -0
- pyxllib/prog/multiprogs.py +940 -0
- pyxllib/prog/newbie.py +451 -0
- pyxllib/prog/pupil.py +1197 -0
- pyxllib/{sitepackages.py → prog/sitepackages.py} +5 -3
- pyxllib/prog/specialist/__init__.py +391 -0
- pyxllib/prog/specialist/bc.py +203 -0
- pyxllib/prog/specialist/browser.py +497 -0
- pyxllib/prog/specialist/common.py +347 -0
- pyxllib/prog/specialist/datetime.py +199 -0
- pyxllib/prog/specialist/tictoc.py +240 -0
- pyxllib/prog/specialist/xllog.py +180 -0
- pyxllib/prog/xlosenv.py +108 -0
- pyxllib/stdlib/__init__.py +17 -0
- pyxllib/{util → stdlib}/tablepyxl/__init__.py +1 -3
- pyxllib/{util → stdlib}/tablepyxl/style.py +1 -1
- pyxllib/{util → stdlib}/tablepyxl/tablepyxl.py +2 -4
- pyxllib/text/__init__.py +8 -0
- pyxllib/text/ahocorasick.py +39 -0
- pyxllib/text/airscript.js +744 -0
- pyxllib/text/charclasslib.py +121 -0
- pyxllib/text/jiebalib.py +267 -0
- pyxllib/text/jinjalib.py +32 -0
- pyxllib/text/jsa_ai_prompt.md +271 -0
- pyxllib/text/jscode.py +922 -0
- pyxllib/text/latex/__init__.py +158 -0
- pyxllib/text/levenshtein.py +303 -0
- pyxllib/text/nestenv.py +1215 -0
- pyxllib/text/newbie.py +300 -0
- pyxllib/text/pupil/__init__.py +8 -0
- pyxllib/text/pupil/common.py +1121 -0
- pyxllib/text/pupil/xlalign.py +326 -0
- pyxllib/text/pycode.py +47 -0
- pyxllib/text/specialist/__init__.py +8 -0
- pyxllib/text/specialist/common.py +112 -0
- pyxllib/text/specialist/ptag.py +186 -0
- pyxllib/text/spellchecker.py +172 -0
- pyxllib/text/templates/echart_base.html +11 -0
- pyxllib/text/templates/highlight_code.html +17 -0
- pyxllib/text/templates/latex_editor.html +103 -0
- pyxllib/text/vbacode.py +17 -0
- pyxllib/text/xmllib.py +747 -0
- pyxllib/xl.py +39 -0
- pyxllib/xlcv.py +17 -0
- pyxllib-0.3.197.dist-info/METADATA +48 -0
- pyxllib-0.3.197.dist-info/RECORD +126 -0
- {pyxllib-0.0.43.dist-info → pyxllib-0.3.197.dist-info}/WHEEL +4 -5
- pyxllib/basic/_1_strlib.py +0 -945
- pyxllib/basic/_2_timelib.py +0 -488
- pyxllib/basic/_3_pathlib.py +0 -916
- pyxllib/basic/_4_loglib.py +0 -419
- pyxllib/basic/__init__.py +0 -54
- pyxllib/basic/arrow_.py +0 -250
- pyxllib/basic/chardet_.py +0 -66
- pyxllib/basic/dirlib.py +0 -529
- pyxllib/basic/dprint.py +0 -202
- pyxllib/basic/extension.py +0 -12
- pyxllib/basic/judge.py +0 -31
- pyxllib/basic/log.py +0 -204
- pyxllib/basic/pathlib_.py +0 -705
- pyxllib/basic/pytictoc.py +0 -102
- pyxllib/basic/qiniu_.py +0 -61
- pyxllib/basic/strlib.py +0 -761
- pyxllib/basic/timer.py +0 -132
- pyxllib/cv/cv.py +0 -834
- pyxllib/cv/cvlib/_1_geo.py +0 -543
- pyxllib/cv/cvlib/_2_cvprcs.py +0 -309
- pyxllib/cv/cvlib/_2_imgproc.py +0 -594
- pyxllib/cv/cvlib/_3_pilprcs.py +0 -80
- pyxllib/cv/cvlib/_4_cvimg.py +0 -211
- pyxllib/cv/cvlib/__init__.py +0 -10
- pyxllib/cv/debugtools.py +0 -82
- pyxllib/cv/fitz_.py +0 -300
- pyxllib/cv/installer.py +0 -42
- pyxllib/debug/_0_installer.py +0 -38
- pyxllib/debug/_1_typelib.py +0 -277
- pyxllib/debug/_2_chrome.py +0 -198
- pyxllib/debug/_3_showdir.py +0 -161
- pyxllib/debug/_4_bcompare.py +0 -140
- pyxllib/debug/__init__.py +0 -49
- pyxllib/debug/bcompare.py +0 -132
- pyxllib/debug/chrome.py +0 -198
- pyxllib/debug/installer.py +0 -38
- pyxllib/debug/showdir.py +0 -158
- pyxllib/debug/typelib.py +0 -278
- pyxllib/image/__init__.py +0 -12
- pyxllib/torch/__init__.py +0 -20
- pyxllib/torch/modellib.py +0 -37
- pyxllib/torch/trainlib.py +0 -344
- pyxllib/util/__init__.py +0 -20
- pyxllib/util/aip_.py +0 -141
- pyxllib/util/casiadb.py +0 -59
- pyxllib/util/excellib.py +0 -495
- pyxllib/util/filelib.py +0 -612
- pyxllib/util/jsondata.py +0 -27
- pyxllib/util/jsondata2.py +0 -92
- pyxllib/util/labelmelib.py +0 -139
- pyxllib/util/onepy/__init__.py +0 -29
- pyxllib/util/onepy/onepy.py +0 -574
- pyxllib/util/onepy/onmanager.py +0 -170
- pyxllib/util/pyautogui_.py +0 -219
- pyxllib/util/textlib.py +0 -1305
- pyxllib/util/unorder.py +0 -22
- pyxllib/util/xmllib.py +0 -639
- pyxllib-0.0.43.dist-info/METADATA +0 -39
- pyxllib-0.0.43.dist-info/RECORD +0 -80
- pyxllib-0.0.43.dist-info/top_level.txt +0 -1
- {pyxllib-0.0.43.dist-info → pyxllib-0.3.197.dist-info/licenses}/LICENSE +0 -0
pyxllib/file/xlsxlib.py
ADDED
@@ -0,0 +1,3131 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
# @Author : 陈坤泽
|
4
|
+
# @Email : 877362867@qq.com
|
5
|
+
# @Date : 2024/01/07
|
6
|
+
|
7
|
+
"""
|
8
|
+
扩展了些自己的openpyxl工具
|
9
|
+
"""
|
10
|
+
import copy
|
11
|
+
|
12
|
+
import time
|
13
|
+
|
14
|
+
from pyxllib.prog.pupil import check_install_package, run_once
|
15
|
+
|
16
|
+
check_install_package('openpyxl')
|
17
|
+
# check_install_package('premailer')
|
18
|
+
# check_install_package('xlrd2')
|
19
|
+
check_install_package('yattag')
|
20
|
+
check_install_package('jsonpickle')
|
21
|
+
|
22
|
+
from collections import Counter, OrderedDict, defaultdict
|
23
|
+
import csv
|
24
|
+
import datetime
|
25
|
+
from itertools import islice
|
26
|
+
import json
|
27
|
+
import math
|
28
|
+
from pathlib import Path
|
29
|
+
import random
|
30
|
+
import re
|
31
|
+
import io
|
32
|
+
|
33
|
+
import xlrd
|
34
|
+
|
35
|
+
import filetype
|
36
|
+
import openpyxl
|
37
|
+
from openpyxl import Workbook
|
38
|
+
from openpyxl.cell.cell import MergedCell
|
39
|
+
from openpyxl.styles import Font, Alignment
|
40
|
+
from openpyxl.utils.cell import get_column_letter, column_index_from_string
|
41
|
+
import openpyxl.worksheet.formula
|
42
|
+
import pandas as pd
|
43
|
+
|
44
|
+
try:
|
45
|
+
import jsonpickle
|
46
|
+
except ModuleNotFoundError:
|
47
|
+
pass
|
48
|
+
|
49
|
+
from pyxllib.prog.newbie import human_readable_number
|
50
|
+
from pyxllib.prog.pupil import (inject_members, dprint, xlmd5, shuffle_dict_keys, Timeout,
|
51
|
+
safe_div, format_exception, DictTool)
|
52
|
+
from pyxllib.prog.specialist import browser, TicToc
|
53
|
+
from pyxllib.algo.specialist import product
|
54
|
+
from pyxllib.text.pupil import calc_chinese_ratio
|
55
|
+
from pyxllib.file.specialist import XlPath
|
56
|
+
|
57
|
+
|
58
|
+
def __1_basic():
    """ Section marker: basic building blocks for spreadsheet handling """
|
60
|
+
|
61
|
+
|
62
|
+
def excel_addr(n, m) -> str:
    r"""Convert a (row, column) pair of numeric indices into an Excel address.

    :param n: row number; a numeric string is also accepted
    :param m: column number; likewise accepts a numeric string
    :return: address string such as 'C2'

    >>> excel_addr(2, 3)
    'C2'
    """
    col_letter = get_column_letter(int(m))
    return '{}{}'.format(col_letter, n)
|
73
|
+
|
74
|
+
|
75
|
+
def excel_addr2(n1, m1, n2, m2) -> str:
    r""" Extended excel_addr: address of a rectangular region.

    >>> excel_addr2(2, 3, 4, 4)
    'C2:D4'
    """
    top_left = f'{get_column_letter(int(m1))}{n1}'
    bottom_right = f'{get_column_letter(int(m2))}{n2}'
    return top_left + ':' + bottom_right
|
82
|
+
|
83
|
+
|
84
|
+
def is_valid_excel_cell(cell):
    """ Check whether a string is a legal single Excel cell address.

    :param str cell: input string
    :return bool: True for addresses like 'A1' or 'AB12', False otherwise
    """
    # uppercase column letters followed by a row number without a leading zero
    return re.fullmatch(r'[A-Z]+[1-9][0-9]*', cell) is not None
|
92
|
+
|
93
|
+
|
94
|
+
def is_valid_excel_range(range):
    """ Check whether a string is a legal Excel cell range.

    Accepted endpoint forms: a cell ('A1'), a bare row number ('3'),
    or bare column letters ('B'). For a two-endpoint range, the endpoints
    must be ordered: start column <= end column and start row <= end row
    whenever both sides define that component.

    Fixes the original implementation, which compared endpoints as plain
    strings: 'A2' <= 'A10' is lexicographically False, so valid ranges such
    as 'A2:A10' were rejected, while reversed ones like 'AA1:Z1' passed.
    Malformed input with more than one ':' now returns False instead of
    raising ValueError.

    :param str range: input string
    :return bool: True if the range address is valid
    """

    def parse_endpoint(endpoint):
        """Return (column_letters_or_None, row_or_None); None when invalid."""
        m = re.fullmatch(r'([A-Z]+)([1-9][0-9]*)', endpoint)
        if m:
            return m.group(1), int(m.group(2))
        if re.fullmatch(r'[A-Z]+', endpoint):
            return endpoint, None
        if endpoint.isdigit():
            return None, int(endpoint)
        return None

    if ':' not in range:
        # single cell
        return re.fullmatch(r'[A-Z]+[1-9][0-9]*', range) is not None

    parts = range.split(':')
    if len(parts) != 2:
        return False
    start, end = (parse_endpoint(p) for p in parts)
    if start is None or end is None:
        return False
    (col1, row1), (col2, row2) = start, end
    # column letters order by length first ('Z' < 'AA'), then lexicographically
    if col1 is not None and col2 is not None and (len(col1), col1) > (len(col2), col2):
        return False
    if row1 is not None and row2 is not None and row1 > row2:
        return False
    return True
|
107
|
+
|
108
|
+
|
109
|
+
def is_valid_excel_address(address):
    """ Check whether a string is a legal Excel address (single cell or range).

    :param str address: input string
    :return bool: True when the address is valid

    Note: strictly speaking, multi-selections like 'A1,A3' are also valid in
    Excel, but this function does not handle them; write a separate
    is_valid_excel_address2 if that is ever needed.
    """
    checker = is_valid_excel_range if ':' in address else is_valid_excel_cell
    return checker(address)
|
122
|
+
|
123
|
+
|
124
|
+
@run_once('str')
def xlfmt2pyfmt_date(xl_fmt):
    """ Map an Excel date number-format string to a Python strftime pattern.

    Results are cached by @run_once('str') — presumably keyed on the argument's
    string form; confirm against pyxllib.prog.pupil.run_once.

    :param str xl_fmt: Excel number format, e.g. 'yyyy/m/d'
    :return str: strftime pattern; single-digit month/day come back as the
        '{month}'/'{day}' placeholders (see note below)

    >>> xlfmt2pyfmt_date('yyyy/m/d')
    '%Y/{month}/{day}'
    >>> xlfmt2pyfmt_date('yyyy-mm-dd')
    '%Y-%m-%d'
    >>> xlfmt2pyfmt_date('yyyy年mm月dd日')
    '%Y年%m月%d日'
    >>> xlfmt2pyfmt_date('yyyy年m月d日')
    '%Y年{month}月{day}日'

    # Note: the following patterns are not specially supported; formats that
    # don't match the y-m-d regex fall back to '%Y/%-m/%-d'
    >>> xlfmt2pyfmt_date('yy-m-d')
    '%y-{month}-{day}'
    >>> xlfmt2pyfmt_date('m/d/yy')
    '{month}/{day}/%y'
    >>> xlfmt2pyfmt_date('dddd, mmmm dd, yyyy')
    '%A, %B {day}, %Y'
    >>> xlfmt2pyfmt_date('yy/mm/dd')
    '%y/%m/%d'
    >>> xlfmt2pyfmt_date('m-d-yy')
    '{month}-{day}-%y'
    """
    # repeat-count -> strftime directive, per Excel's y/m/d conventions
    mappings = {
        'y': {2: '%y', 4: '%Y'},
        'm': {1: '%-m', 2: '%m', 3: '%b', 4: '%B'},
        'd': {1: '%-d', 2: '%d', 3: '%a', 4: '%A'}
    }

    # capture the y/m/d runs and the separators between them; quotes are noise
    m = re.search(r'(y+)(.+?)(m+)(.+?)(d+)(日?)', xl_fmt.replace('"', ''))
    if m:
        # NOTE: `m` is rebound here from the match object to the month group
        y, sep1, m, sep2, d, sep3 = m.groups()
        year_pattern = mappings['y'].get(len(y), '%Y')
        month_pattern = mappings['m'].get(len(m), '%m')
        day_pattern = mappings['d'].get(len(d), '%d')
        fmt = f'{year_pattern}{sep1}{month_pattern}{sep2}{day_pattern}{sep3}'
    else:
        fmt = '%Y/%-m/%-d'

    # '%-m'/'%-d' raise on Windows, so they are replaced by '{month}'/'{day}'
    # placeholders for the caller to .format() with the actual values
    fmt = fmt.replace('%-m', '{month}')
    fmt = fmt.replace('%-d', '{day}')
    return fmt
|
169
|
+
|
170
|
+
|
171
|
+
@run_once('str')
def xlfmt2pyfmt_time(xl_fmt):
    """ Map an Excel time number-format string to a Python strftime pattern.

    :param str xl_fmt: Excel number format, possibly containing a date part
    :return str: colon-joined strftime components

    >>> xlfmt2pyfmt_time('h:mm:ss')
    '%I:%M:%S'
    >>> xlfmt2pyfmt_time('hh:mm:ss')
    '%H:%M:%S'
    >>> xlfmt2pyfmt_time('mm:ss')
    '%M:%S'
    >>> xlfmt2pyfmt_time('h:mm')
    '%I:%M'
    >>> xlfmt2pyfmt_time('hh:mm')
    '%H:%M'
    >>> xlfmt2pyfmt_time('m:ss')
    '%M:%S'
    >>> xlfmt2pyfmt_time('h:mm:ss AM/PM')
    '%I:%M:%S %p'
    >>> xlfmt2pyfmt_time('hh:mm:ss AM/PM')
    '%H:%M:%S %p'
    """
    # strip any date portion first so its 'm'/'d' letters don't confuse the
    # minute/second checks below; quotes are noise
    xl_fmt = re.sub(r'(y+)(.+?)(m+)(.+?)(d+)(日?)', '', xl_fmt.replace('"', ''))

    components = []

    # 24-hour ('hh') vs 12-hour ('h') clock
    if 'hh' in xl_fmt:
        components.append('%H')
    elif 'h' in xl_fmt:
        components.append('%I')

    # minutes present?
    if 'mm' in xl_fmt or 'm' in xl_fmt:
        components.append('%M')

    # seconds present?
    if 'ss' in xl_fmt or 's' in xl_fmt:
        components.append('%S')

    # AM/PM marker appended to the last component (space-separated)
    if 'AM/PM' in xl_fmt:
        if components:
            components[-1] += ' %p'
        else:
            components.append('%p')

    return ':'.join(components)
|
218
|
+
|
219
|
+
|
220
|
+
# @run_once('str')
def xlfmt2pyfmt_datetime(xl_fmt):
    """ Render mapping for combined date + time Excel formats.

    >>> xlfmt2pyfmt_datetime('yyyy-mm-dd h:mm:ss')
    '%Y-%m-%d %I:%M:%S'
    """
    fmt = xlfmt2pyfmt_date(xl_fmt)
    # a ':' indicates the format also carries a time portion
    if ':' in xl_fmt:
        fmt = fmt + ' ' + xlfmt2pyfmt_time(xl_fmt)
    return fmt
|
231
|
+
|
232
|
+
|
233
|
+
def xl_render_value(x, xl_fmt):
    """ Best-effort rendering of a cell value under an Excel number format.

    Python cannot cover every Excel format; only common cases are handled.
    Formulas are essentially impossible to evaluate here, so they keep their
    original textual form. Dates are the most frequent case needing a real
    rendered value, hence this dedicated helper. In JSA-like environments,
    Cell.Text already gives the rendered value and none of this is needed.

    >>> xl_render_value(datetime.datetime(2020, 1, 1), 'yyyy-mm-dd')
    '2020-01-01'
    """
    # order matters: datetime.datetime is a subclass of datetime.date
    if isinstance(x, datetime.datetime):
        return x.strftime(xlfmt2pyfmt_datetime(xl_fmt)).format(month=x.month, day=x.day)
    if isinstance(x, datetime.date):
        return x.strftime(xlfmt2pyfmt_date(xl_fmt)).format(month=x.month, day=x.day)
    if isinstance(x, datetime.time):
        return x.strftime(xlfmt2pyfmt_time(xl_fmt))
    if isinstance(x, datetime.timedelta):
        return str(x)
    if isinstance(x, (str, int, float, bool)):  # JSON-serializable scalars pass through
        return x
    # ArrayFormula, DataTableFormula, etc. are not JSON-serializable: stringify
    return str(x)
|
259
|
+
|
260
|
+
|
261
|
+
def sort_excel_files(file_paths):
    """ Put Excel-type files first in a list of file paths.

    :param file_paths: iterable of file path strings
    :return list: new sorted list — .xlsx first, then other .xl* suffixes
        (xls/xlsm/...), then everything else; ties break alphabetically

    Fix: the inner sort_key was annotated ``-> int`` and its doctests claimed
    plain-int results, but it has always returned (weight, filename) tuples;
    the docs/doctests are corrected here, behavior is unchanged.
    """

    def sort_key(filename: str) -> tuple:
        """ Weight tuple for sorting (smaller weight first; name breaks ties).

        >>> sort_key('test.xlsx')
        (1, 'test.xlsx')
        >>> sort_key('demo.xls')
        (2, 'demo.xls')
        >>> sort_key('other.txt')
        (3, 'other.txt')
        """
        if re.search(r'\.xlsx$', filename):
            return 1, filename
        elif re.search(r'\.xl[^.]*$', filename):
            return 2, filename
        else:
            return 3, filename

    return sorted(file_paths, key=sort_key)
|
286
|
+
|
287
|
+
|
288
|
+
def excel2md5(file, reduction_degree=1):
    """ Content fingerprint (md5) of a workbook file.

    :param file: path to the spreadsheet file
    :param reduction_degree: passed through to Workbook.to_md5 — to_md5 is a
        method injected into openpyxl elsewhere in this module family, not a
        native openpyxl API
    :return str: md5 string, or '' when the file cannot be opened at all
    """
    try:
        wb = openpyxl.load_workbook(file)
    except (ValueError, TypeError) as e:
        # Some workbooks fail a normal load but can still be opened read-only.
        # NOTE(review): this retry may itself raise and propagate — confirm intended.
        wb = openpyxl.load_workbook(file, read_only=True)
    except Exception as e:  # other errors such as zipfile.BadZipFile
        print(file, str(e))
        return ''

    return wb.to_md5(reduction_degree)
|
299
|
+
|
300
|
+
|
301
|
+
def convert_csv_text_to_xlsx(csv_text):
    """ Build an openpyxl workbook from a block of CSV text (values only). """
    wb = Workbook()
    ws = wb.active

    # wrap the text in a StringIO so csv.reader can consume it like a file
    with io.StringIO(csv_text) as stream:
        for r, record in enumerate(csv.reader(stream), start=1):
            for c, field in enumerate(record, start=1):
                ws.cell(row=r, column=c).value = field

    return wb
|
315
|
+
|
316
|
+
|
317
|
+
def convert_csv_to_xlsx(csv_file):
    """ Load a CSV file into a new openpyxl workbook (values only, as strings). """
    wb = Workbook()
    ws = wb.active

    with open(csv_file, encoding='utf-8') as stream:
        for r, record in enumerate(csv.reader(stream), start=1):
            for c, field in enumerate(record, start=1):
                ws.cell(row=r, column=c).value = field

    return wb
|
329
|
+
|
330
|
+
|
331
|
+
def convert_xls_to_xlsx(xls_file):
    """ Convert a legacy .xls workbook into an openpyxl (.xlsx) workbook.

    Note: this is only a simplified, values-only conversion; for a faithful
    upgrade of an xls file, Microsoft 365 is still required.
    """
    src_book = xlrd.open_workbook(xls_file)
    dst_book = Workbook()

    for idx in range(src_book.nsheets):
        src_sheet = src_book.sheet_by_index(idx)
        if idx == 0:
            # reuse the default sheet for the first worksheet, just rename it
            dst_sheet = dst_book.active
            dst_sheet.title = src_sheet.name
        else:
            dst_sheet = dst_book.create_sheet(title=src_sheet.name)
        # copy cell values from the xlrd grid into openpyxl (1-based indices)
        for r in range(src_sheet.nrows):
            for c in range(src_sheet.ncols):
                dst_sheet.cell(row=r + 1, column=c + 1).value = src_sheet.cell_value(r, c)

    return dst_book
|
357
|
+
|
358
|
+
|
359
|
+
def load_as_xlsx_file(file_path, keep_links=False, keep_vba=False):
    """ Load a spreadsheet as an openpyxl workbook, not trusting the extension.

    The real type is probed in three passes: the user-supplied suffix, then
    the type sniffed by `filetype`, then a brute-force attempt of each reader.

    :param file_path: path to the file
    :param keep_links: passed through to openpyxl.load_workbook
    :param keep_vba: passed through to openpyxl.load_workbook
    :return: (workbook, suffix) on success, or (None, error_message) when no
        reader can open the file
    """

    # 0 helpers — each reader is wrapped in run_once so repeated probing
    #   below does not re-parse the same file
    @run_once()
    def read_xlsx():
        file = file_path
        # openpyxl refuses files whose suffix is not xlsx/xlsm,
        # so feed it the raw bytes via BytesIO instead
        if file.suffix[1:] not in ('xlsx', 'xlsm'):
            with open(file_path, 'rb') as f2:
                data = f2.read()
            file = io.BytesIO(data)
        try:
            return openpyxl.load_workbook(file,
                                          keep_links=keep_links,
                                          keep_vba=keep_vba), ''
        except Exception as e:
            if isinstance(e, TimeoutError):  # raised by an overall timeout configured outside
                raise e
            return None, format_exception(e, 2)

    @run_once()
    def read_xls():
        try:
            return convert_xls_to_xlsx(file_path), ''
        except Exception as e:
            return None, format_exception(e, 2)

    @run_once()
    def read_csv():
        try:
            return convert_csv_to_xlsx(file_path), ''
        except Exception as e:
            return None, format_exception(e, 2)

    def read_test(suffix):
        # dispatch a suffix guess to the matching cached reader
        if suffix in ('xlsx', 'xlsm', 'zip'):
            wb, error = read_xlsx()
        elif suffix == 'xls':
            wb, error = read_xls()
        elif suffix == 'csv':
            wb, error = read_csv()
        else:
            wb, error = None, f'不支持的文件类型:{suffix}'
        return wb, error

    # 1 first trust the user-provided extension
    file_path = Path(file_path)
    suffix = file_path.suffix.lower()[1:]
    wb, error = read_test(suffix)
    if wb is not None:
        return wb, suffix

    # 2 otherwise try the type sniffed by filetype
    suffix2 = filetype.guess(file_path)
    suffix2 = suffix2.extension if suffix2 else ''
    wb, _ = read_test(suffix2)
    if wb is not None:
        return wb, suffix2

    # 3 still failing: brute-force the remaining candidates
    for suffix in ('xlsx', 'xls', 'csv'):
        wb, _ = read_test(suffix)
        if wb is not None:
            return wb, suffix

    # 4 truly unreadable: return the error message from pass 1
    return None, error
|
427
|
+
|
428
|
+
|
429
|
+
def parse_range_address(address):
    """ Parse a cell-range address into its rectangle components.

    :param str address: range address, e.g. 'A1', 'A1:B3', '1:3', 'A:B'
    :return dict: {'left', 'top', 'right', 'bottom'} with 1-based indices;
        components absent from the address stay None (e.g. '1:3' has no columns)
    """
    # defaults: every edge unknown
    left, right, top, bottom = None, None, None, None

    # split into start / end endpoints; a single cell is its own end
    parts = address.split(":")
    start_cell = parts[0]
    end_cell = parts[1] if len(parts) > 1 else start_cell

    # start endpoint is a bare row number
    if start_cell.isdigit():
        top = int(start_cell)
    else:
        # try to extract column letters (and an optional row) from the start endpoint
        try:
            left = column_index_from_string(start_cell.rstrip('1234567890'))
            top = int(''.join(filter(str.isdigit, start_cell))) if any(
                char.isdigit() for char in start_cell) else None
        except ValueError:
            left = None

    # end endpoint is a bare row number
    if end_cell.isdigit():
        bottom = int(end_cell)
    else:
        # try to extract column letters (and an optional row) from the end endpoint
        try:
            right = column_index_from_string(end_cell.rstrip('1234567890'))
            bottom = int(''.join(filter(str.isdigit, end_cell))) if any(char.isdigit() for char in end_cell) else None
        except ValueError:
            right = None

    # single-endpoint address (e.g. '1' or 'A'): mirror the min edge into the max edge
    if len(parts) == 1:
        right = left if left is not None else right
        bottom = top if top is not None else bottom

    return {"left": left, "top": top, "right": right, "bottom": bottom}
|
472
|
+
|
473
|
+
|
474
|
+
def get_addr_area(addr):
    """ Number of cells covered by a range address (1 for a single cell). """
    if ':' not in addr:
        return 1
    box = parse_range_address(addr)
    width = box['right'] - box['left'] + 1
    height = box['bottom'] - box['top'] + 1
    return width * height
|
481
|
+
|
482
|
+
|
483
|
+
def build_range_address(left=None, top=None, right=None, bottom=None):
    """ Build a range address string.

    :return str: address such as 'A1', 'A1:B3', '1:3', 'A:B'
    """
    first = '{}{}'.format(get_column_letter(left) if left else '', top if top else '')
    second = '{}{}'.format(get_column_letter(right) if right else '', bottom if bottom else '')

    # identical endpoints collapse to a single cell address
    if first == second:
        return first
    # one empty endpoint: return whichever is present
    if not first or not second:
        return first or second
    return ':'.join((first, second))
|
499
|
+
|
500
|
+
|
501
|
+
def combine_addresses(*addrs):
    """ Bounding box of several range addresses, as a single 'X1:Y2' address. """
    # running extremes of the bounding rectangle
    left = top = float('inf')
    right = bottom = 0

    for addr in addrs:
        box = parse_range_address(addr)
        if box['left'] is not None:
            left = min(left, box['left'])
            right = max(right, box['right'] if box['right'] is not None else box['left'])
        if box['top'] is not None:
            top = min(top, box['top'])
            bottom = max(bottom, box['bottom'] if box['bottom'] is not None else box['top'])

    return f"{get_column_letter(left)}{top}:{get_column_letter(right)}{bottom}"
|
521
|
+
|
522
|
+
|
523
|
+
def is_string_type(value):
    """ True when the value is textual, i.e. neither date-like nor numeric. """
    # date-like values are not treated as strings
    try:
        pd.to_datetime(value, errors='raise')
        return False
    except (ValueError, TypeError, OverflowError, AttributeError):
        pass

    # numeric (float-convertible) values are not treated as strings either
    try:
        float(value)
    except (ValueError, TypeError):
        return True
    return False
|
538
|
+
|
539
|
+
|
540
|
+
def __2_openpyxl_class():
    """ Section marker: enhancements injected into existing openpyxl classes """
|
542
|
+
|
543
|
+
|
544
|
+
class XlCell(openpyxl.cell.cell.Cell): # 适用于 openpyxl.cell.cell.MergedCell,但这里不能多重继承
|
545
|
+
|
546
|
+
def in_range(self):
|
547
|
+
""" 判断一个单元格所在的合并单元格
|
548
|
+
|
549
|
+
>> ws['C1'].in_range()
|
550
|
+
<openpyxl.worksheet.cell_range.CellRange> A1:D3
|
551
|
+
"""
|
552
|
+
ws = self.parent
|
553
|
+
for rng in ws.merged_cells.ranges:
|
554
|
+
if self.coordinate in rng:
|
555
|
+
break
|
556
|
+
else: # 如果找不到则返回原值
|
557
|
+
rng = self
|
558
|
+
return rng
|
559
|
+
|
560
|
+
def mcell(self):
|
561
|
+
"""返回“有效单元格”,即如果输入的是一个合并单元格,会返回该合并单元格左上角的单元格
|
562
|
+
修改左上角单元格的值才是可行、有意义的
|
563
|
+
|
564
|
+
因为跟合并单元格有关,所以 以m前缀 merge
|
565
|
+
"""
|
566
|
+
if isinstance(self, MergedCell):
|
567
|
+
ws = self.parent
|
568
|
+
x, y = self.in_range().top[0]
|
569
|
+
return ws.cell(x, y)
|
570
|
+
else:
|
571
|
+
return self
|
572
|
+
|
573
|
+
def celltype(self, *, return_mode=False):
|
574
|
+
"""
|
575
|
+
:param return_mode: 是否返回运算的中间结果信息
|
576
|
+
主要是在type=2的情景,有时候需要使用rng变量,可以这里直接返回,避免外部重复计算
|
577
|
+
:return: 单元格类型
|
578
|
+
0:普通单元格
|
579
|
+
1:合并单元格其他衍生位置
|
580
|
+
2:合并单元格的左上角的位置
|
581
|
+
|
582
|
+
TODO 这个函数还是可以看看能不能有更好的实现、提速
|
583
|
+
"""
|
584
|
+
|
585
|
+
def try_offset(x, y):
|
586
|
+
try:
|
587
|
+
return isinstance(self.offset(x, y), MergedCell)
|
588
|
+
except ValueError:
|
589
|
+
# 有可能会越界:ValueError: Row numbers must be between 1 and 1048576
|
590
|
+
return False
|
591
|
+
|
592
|
+
_type, status = 0, {}
|
593
|
+
if isinstance(self, MergedCell):
|
594
|
+
_type = 1
|
595
|
+
elif try_offset(1, 0) or try_offset(0, 1):
|
596
|
+
# 这里只能判断可能是合并单元格,具体是不是合并单元格,还要
|
597
|
+
rng = self.in_range()
|
598
|
+
status['rng'] = rng
|
599
|
+
_type = 2 if hasattr(rng, 'size') else 0
|
600
|
+
|
601
|
+
if return_mode:
|
602
|
+
return _type, status
|
603
|
+
else:
|
604
|
+
return _type
|
605
|
+
|
606
|
+
def isnone(self):
|
607
|
+
""" 是普通单元格且值为None
|
608
|
+
|
609
|
+
注意合并单元格的衍生单元格不为None
|
610
|
+
"""
|
611
|
+
celltype = self.celltype()
|
612
|
+
return celltype == 0 and self.value is None
|
613
|
+
|
614
|
+
def clear(self):
|
615
|
+
""" 清除数值、格式、合并单元格
|
616
|
+
|
617
|
+
注意,如果self是合并单元格,分两种清空
|
618
|
+
母格(左上角),会撤销合并到和母格数值、格式
|
619
|
+
衍生格,只会撤销合并单元格,但不会清除母格的数值、格式
|
620
|
+
|
621
|
+
:return: 涉及到合并单元格的情况,新单元格和原单元格已经不一样了,需要重新获取对象
|
622
|
+
"""
|
623
|
+
ct, mid_result = self.celltype(return_mode=True)
|
624
|
+
x = self
|
625
|
+
if ct: # 如果是合并单元格,取消该区域的合并单元格
|
626
|
+
rng = mid_result['rng'] if ('rng' in mid_result) else self.in_range()
|
627
|
+
self.parent.unmerge_cells(rng.coord)
|
628
|
+
x = self.parent[self.coordinate]
|
629
|
+
x.value = None
|
630
|
+
x.style = 'Normal'
|
631
|
+
return x
|
632
|
+
|
633
|
+
def copy_cell_format(self, dst_cell):
|
634
|
+
""" 单元格全格式复制,需要事先指定好新旧单元格的物理位置
|
635
|
+
参考:https://stackoverflow.com/questions/23332259/copy-cell-style-openpyxl
|
636
|
+
"""
|
637
|
+
from copy import copy
|
638
|
+
if self.has_style:
|
639
|
+
dst_cell.font = copy(self.font) # 字体
|
640
|
+
dst_cell.border = copy(self.border) # 表格线
|
641
|
+
dst_cell.fill = copy(self.fill) # 填充色
|
642
|
+
dst_cell.number_format = copy(self.number_format) # 数字格式
|
643
|
+
dst_cell.protection = copy(self.protection) # 保护?
|
644
|
+
dst_cell.alignment = copy(self.alignment) # 对齐格式
|
645
|
+
# dst_cell.style = self.style
|
646
|
+
# if self.comment:
|
647
|
+
# 这个会引发AttributeError。。。
|
648
|
+
# vml = fromstring(self.workbook.vba_archive.read(ws.legacy_drawing))
|
649
|
+
# AttributeError: 'NoneType' object has no attribute 'read'
|
650
|
+
# dst_cell.comment = copy(cell.comment)
|
651
|
+
# 就算开了keep_vba可以强制写入了,打开的时候文件可能还是会错
|
652
|
+
|
653
|
+
def copy_cell(self, dst_cell):
|
654
|
+
""" 单元格全格式、包括值的整体复制
|
655
|
+
|
656
|
+
注意合并单元格比较复杂,比如要把 'A1:C3' 复制到 'A2:D4',是会出现问题的
|
657
|
+
在预先清空A2:D4数据的时候,会把
|
658
|
+
一般这种清空,推荐先将数据库复制到一个临时sheet,再复制回原sheet更安全
|
659
|
+
"""
|
660
|
+
from itertools import product
|
661
|
+
ct, mid_result = self.celltype(return_mode=True)
|
662
|
+
|
663
|
+
if ct == 0: # 普通单元格,只复制值和格式
|
664
|
+
dst_cell = dst_cell.clear()
|
665
|
+
dst_cell.value = self.value
|
666
|
+
self.copy_cell_format(dst_cell)
|
667
|
+
elif ct == 2: # 合并单元格,除了值和格式,要考虑单元格整体性的复制替换
|
668
|
+
dst_cell = dst_cell.clear()
|
669
|
+
rng = mid_result['rng'] if ('rng' in mid_result) else self.in_range() # CellRange类型
|
670
|
+
n, m = rng.size['rows'], rng.size['columns'] # 几行几列
|
671
|
+
# 先把目标位置里的区域清空
|
672
|
+
ws2 = dst_cell.parent
|
673
|
+
x2, y2 = dst_cell.row, dst_cell.column
|
674
|
+
for i, j in product(range(n), range(m)):
|
675
|
+
ws2.cell(x2 + i, y2 + j).clear()
|
676
|
+
# 拷贝数据
|
677
|
+
dst_cell.value = self.value
|
678
|
+
self.copy_cell_format(dst_cell)
|
679
|
+
ws2.merge_cells(start_row=x2, start_column=y2, end_row=x2 + n - 1, end_column=y2 + m - 1)
|
680
|
+
else: # 合并单元格的衍生单元格复制时,不做任何处理
|
681
|
+
return
|
682
|
+
|
683
|
+
def down(self, count=1):
|
684
|
+
""" 输入一个单元格,向下移动一格
|
685
|
+
注意其跟offset的区别,如果cell是合并单元格,会跳过自身的衍生单元格
|
686
|
+
|
687
|
+
:param count: 重复操作次数
|
688
|
+
|
689
|
+
注意这里移动跟excel中操作也不太一样,设计的更加"原子化",可以多配合cell.mcell功能使用。
|
690
|
+
详见:【腾讯文档】cell移动机制说明 https://docs.qq.com/doc/DUkRUaFhlb3l4UG1P
|
691
|
+
"""
|
692
|
+
|
693
|
+
def _func(cell):
|
694
|
+
r, c = cell.row, cell.column
|
695
|
+
if cell.celltype():
|
696
|
+
rng = cell.in_range()
|
697
|
+
r = rng.max_row
|
698
|
+
return cell.parent.cell(r + 1, c)
|
699
|
+
|
700
|
+
cell = self
|
701
|
+
for _ in range(count):
|
702
|
+
cell = _func(cell)
|
703
|
+
return cell
|
704
|
+
|
705
|
+
def right(self, count=1):
|
706
|
+
def _func(cell):
|
707
|
+
r, c = cell.row, cell.column
|
708
|
+
if cell.celltype():
|
709
|
+
rng = cell.in_range()
|
710
|
+
c = rng.max_col
|
711
|
+
return cell.parent.cell(r, c + 1)
|
712
|
+
|
713
|
+
cell = self
|
714
|
+
for _ in range(count):
|
715
|
+
cell = _func(cell)
|
716
|
+
return cell
|
717
|
+
|
718
|
+
def up(self, count=1):
|
719
|
+
def _func(cell):
|
720
|
+
r, c = cell.row, cell.column
|
721
|
+
if cell.celltype():
|
722
|
+
rng = cell.in_range()
|
723
|
+
r = rng.min_row
|
724
|
+
return cell.parent.cell(max(r - 1, 1), c)
|
725
|
+
|
726
|
+
cell = self
|
727
|
+
for _ in range(count):
|
728
|
+
cell = _func(cell)
|
729
|
+
return cell
|
730
|
+
|
731
|
+
def left(self, count=1):
|
732
|
+
def _func(cell):
|
733
|
+
r, c = cell.row, cell.column
|
734
|
+
if cell.celltype():
|
735
|
+
rng = cell.in_range()
|
736
|
+
r = rng.min_row
|
737
|
+
return cell.parent.cell(r, max(c - 1, 1))
|
738
|
+
|
739
|
+
cell = self
|
740
|
+
for _ in range(count):
|
741
|
+
cell = _func(cell)
|
742
|
+
return cell
|
743
|
+
|
744
|
+
def fill_color(self, color, fill_type="solid", **kwargs):
|
745
|
+
""" 封装一些我自己常用的填色方案 """
|
746
|
+
from openpyxl.styles import PatternFill
|
747
|
+
from pyxllib.cv.rgbfmt import RgbFormatter
|
748
|
+
if isinstance(color, str):
|
749
|
+
color = RgbFormatter.from_name(color)
|
750
|
+
elif isinstance(color, (list, tuple)):
|
751
|
+
color = RgbFormatter(*color)
|
752
|
+
self.fill = PatternFill(fgColor=color.to_hex()[1:], fill_type=fill_type, **kwargs)
|
753
|
+
|
754
|
+
def set_rich_value(self, value, color=None):
    """ Set the cell value and, optionally, its background color in one call.

    For a plain assignment use ``cell.value`` directly; this helper exists
    because text and background color are frequently set together.
    """
    self.value = value
    if not color:
        return
    self.fill_color(color)
    # todo: more rich-text styles could be centralized here
|
762
|
+
|
763
|
+
def get_number_format(self):
    """ Like the native ``number_format`` attribute, with a couple of fix-ups.

    openpyxl reports 'mm-dd-yy' for date cells with no explicit format; in a
    Chinese locale the effective default display is 'yyyy/m/d'. The escaped
    variant r'yyyy\-mm\-dd' also shows up occasionally (cause unknown) and is
    bluntly normalized.
    """
    fixups = {'mm-dd-yy': 'yyyy/m/d', 'yyyy\\-mm\\-dd': 'yyyy-mm-dd'}
    raw = self.number_format
    return fixups.get(raw, raw)
|
772
|
+
|
773
|
+
def get_render_value(self):
    """ Approximate the value as Excel would display it.

    Covering every Excel format in python is impossible; only common formats
    are handled (dates in particular, which is why this helper exists).
    Formulas are generally returned as-is — evaluating them here is
    impractical.
    """
    raw = self.value
    # Array formulas carry their text separately and need special rendering.
    if isinstance(raw, openpyxl.worksheet.formula.ArrayFormula):
        return raw.text
    return xl_render_value(raw, self.get_number_format())
|
786
|
+
|
787
|
+
def address(self):
    """ The cell's address string, e.g. 'B3'.

    A plain Cell exposes ``coordinate``; anything else is assumed to be a
    merged cell, whose built-in ``str()`` already yields the address.
    """
    if not isinstance(self, openpyxl.cell.cell.Cell):
        return str(self)
    return self.coordinate
|
792
|
+
|
793
|
+
|
794
|
+
# Inject only the members that neither Cell nor MergedCell already defines,
# so no openpyxl builtin is ever shadowed on either class.
__members = set(dir(XlCell)) - set(dir(openpyxl.cell.cell.Cell)) - \
            set(dir(openpyxl.cell.cell.MergedCell)) - {'__dict__'}
inject_members(XlCell, openpyxl.cell.cell.Cell, __members)
inject_members(XlCell, openpyxl.cell.cell.MergedCell, __members)
|
799
|
+
|
800
|
+
|
801
|
+
class XlWorksheet(openpyxl.worksheet.worksheet.Worksheet):
    """ Extensions to the standard openpyxl Worksheet. """
|
803
|
+
|
804
|
+
def get_raw_usedrange(self):
    """ The used range exactly as openpyxl reports it, without trimming blank edges. """
    return build_range_address(left=self.min_column, top=self.min_row,
                               right=self.max_column, bottom=self.max_row)
|
808
|
+
|
809
|
+
def is_empty_row(self, row, start_col, end_col):
    """ Sampled emptiness test for one row over [start_col, end_col] (cached per sheet).

    The row is not scanned exhaustively: the first 100 columns are probed one
    by one, after which the step grows with the distance covered (step =
    covered // 10), so even Excel's 16384-column limit costs only a few dozen
    extra probes. A non-empty cell in the sparse tail can therefore be missed
    — a deliberate speed/precision trade-off.
    """
    cache = getattr(self, 'is_empty_row_cache', None)
    if cache is None:
        cache = self.is_empty_row_cache = {}
    key = (row, start_col, end_col)
    if key in cache:
        return cache[key]

    def scan():
        # Probe the rightmost cell up front: tables often have data there.
        if self.cell(row, end_col).value is not None:
            return False
        col = start_col
        while col <= end_col:
            if self.cell(row, col).value is not None:
                return False
            covered = col - start_col + 1
            # Dense sampling early, accelerating steps later: heavy weight on
            # the front of the row, very light on the tail (at most ~54 probes
            # beyond column 100).
            col += 1 if covered <= 100 else covered // 10
        return True

    cache[key] = scan()
    return cache[key]
|
838
|
+
|
839
|
+
def is_empty_column(self, col, start_row, end_row):
    """ Sampled emptiness test for one column over [start_row, end_row] (cached per sheet).

    Rows are probed with an accelerating step (step = covered // 1000 + 1), so
    even Excel's 1048576-row limit costs at most a few thousand probes. Sparse
    content far down the column can be missed — a deliberate speed/precision
    trade-off.
    """
    cache = getattr(self, 'is_empty_column_cache', None)
    if cache is None:
        cache = self.is_empty_column_cache = {}
    key = (col, start_row, end_row)
    if key in cache:
        return cache[key]

    def scan():
        # Probe the bottom cell up front: tables often have data there.
        if self.cell(end_row, col).value is not None:
            return False
        row = start_row
        while row <= end_row:
            if self.cell(row, col).value is not None:
                return False
            covered = row - start_row + 1
            row += covered // 1000 + 1
        return True

    cache[key] = scan()
    return cache[key]
|
861
|
+
|
862
|
+
def find_last_non_empty_row(self, start_row, end_row, start_col, end_col, m=30):
    """ Locate the last non-empty row in [start_row, end_row] by sampled probing.

    m roughly-equidistant probe rows are tested; when a non-empty probe is
    found the search recurses into the sub-interval above the next probes.
    Because only samples are inspected, isolated content between two empty
    probes can be missed — an accepted trade-off (see get_usedrange).

    :return: the row index, or -1 when every probed row is empty
    """
    # 1 Few enough rows left (<= m): scan them directly, bottom-up.
    if end_row - start_row <= m:  # also tolerates start_row > end_row
        for row in range(end_row, start_row - 1, -1):
            if not self.is_empty_row(row, start_col, end_col):
                return row
        return -1  # all empty

    # 2 Compute the probe rows (m-1 equidistant points plus end_row).
    intervals = [(end_row - start_row) // (m - 1) * i + start_row for i in range(m - 1)] + [end_row]

    # 3 Walk the probes backwards looking for the first non-empty one.
    for i in reversed(range(len(intervals))):
        # Even if every probe is empty we cannot conclude the range is empty:
        # the leading half-interval (i == 0) must still be examined.
        if i == 0 or not self.is_empty_row(intervals[i], start_col, end_col):
            # The very last probe is non-empty: it is the answer.
            if i == m - 1:
                return intervals[i]
            # Otherwise recurse between this probe and one half the probe
            # list further on (a bisection-like refinement), with one more
            # sample point for extra precision.
            return self.find_last_non_empty_row(intervals[i],
                                                intervals[min(i + m // 2, m - 1)],
                                                start_col,
                                                end_col,
                                                m + 1)

    # All probes empty: report not found.
    return -1
|
889
|
+
|
890
|
+
def find_last_non_empty_column(self, start_col, end_col, start_row, end_row, m=30):
    """ Locate the last non-empty column in [start_col, end_col] by sampled probing.

    Column-wise twin of find_last_non_empty_row; see that method and
    get_usedrange for the sampling rationale.

    :return: the column index, or -1 when every probed column is empty
    """
    # 1 Few enough columns left (<= m): scan them directly, right-to-left.
    if end_col - start_col <= m:
        for col in range(end_col, start_col - 1, -1):
            if not self.is_empty_column(col, start_row, end_row):
                return col
        return -1  # all empty

    # 2 Compute the probe columns.
    intervals = [(end_col - start_col) // (m - 1) * i + start_col for i in range(m - 1)] + [end_col]

    # 3 Walk the probes backwards looking for the first non-empty one.
    for i in reversed(range(len(intervals))):
        if i == 0 or not self.is_empty_column(intervals[i], start_row, end_row):
            # The very last probe is non-empty: it is the answer.
            if i == m - 1:
                return intervals[i]
            # Otherwise recurse between this probe and one half the probe
            # list further on, with one more sample point.
            return self.find_last_non_empty_column(intervals[i],
                                                   intervals[min(i + m // 2, m - 1)],
                                                   start_row,
                                                   end_row,
                                                   m + 1)
    # All probes empty: report not found.
    return -1
|
917
|
+
|
918
|
+
def find_first_non_empty_row(self, start_row, end_row, start_col, end_col, m=30):
    """ Locate the first non-empty row in [start_row, end_row] by sampled probing.

    Forward twin of find_last_non_empty_row; see that method and
    get_usedrange for the sampling rationale.

    :return: the row index, or -1 when every probed row is empty
    """
    # 1 Few enough rows left (<= m): scan them directly, top-down.
    if end_row - start_row <= m:
        for row in range(start_row, end_row + 1):
            if not self.is_empty_row(row, start_col, end_col):
                return row
        return -1  # all empty

    # 2 Compute the probe rows.
    intervals = [(end_row - start_row) // (m - 1) * i + start_row for i in range(m - 1)] + [end_row]

    # 3 Walk the probes forward looking for the first non-empty one.
    for i in range(len(intervals)):
        if i == m - 1 or not self.is_empty_row(intervals[i], start_col, end_col):
            # The very first probe is non-empty: it is the answer.
            if i == 0:
                return intervals[i]
            # Otherwise recurse between one half the probe list back and
            # this probe, with one more sample point.
            return self.find_first_non_empty_row(intervals[max(i - m // 2, 0)],
                                                 intervals[i],
                                                 start_col,
                                                 end_col,
                                                 m + 1)
    # All probes empty: report not found.
    return -1
|
943
|
+
|
944
|
+
def find_first_non_empty_column(self, start_col, end_col, start_row, end_row, m=30):
    """ Locate the first non-empty column in [start_col, end_col] by sampled probing.

    Forward, column-wise twin of find_last_non_empty_row; see that method and
    get_usedrange for the sampling rationale.

    :return: the column index, or -1 when every probed column is empty
    """
    # 1 Few enough columns left (<= m): scan them directly, left-to-right.
    if end_col - start_col <= m:
        for col in range(start_col, end_col + 1):
            if not self.is_empty_column(col, start_row, end_row):
                return col
        return -1  # all empty

    # 2 Compute the probe columns.
    intervals = [(end_col - start_col) // (m - 1) * i + start_col for i in range(m - 1)] + [end_col]

    # 3 Walk the probes forward looking for the first non-empty one.
    for i in range(len(intervals)):
        if i == m - 1 or not self.is_empty_column(intervals[i], start_row, end_row):
            # The very first probe is non-empty: it is the answer.
            if i == 0:
                return intervals[i]
            # Otherwise recurse between one half the probe list back and
            # this probe, with one more sample point.
            return self.find_first_non_empty_column(intervals[max(i - m // 2, 0)],
                                                    intervals[i],
                                                    start_row,
                                                    end_row,
                                                    m + 1)
    # All probes empty: report not found.
    return -1
|
969
|
+
|
970
|
+
def get_usedrange(self):
    """ Locate the rectangle that actually contains data.

    Background:
        A worksheet may nominally span a huge area (up to XFD1048576) while
        real data occupies only a corner; brute-force scanning would time out,
        so each boundary is found with sampled probing plus a binary-search
        style refinement (the find_*_non_empty_* helpers). Pure bisection
        alone would be unsound — e.g. rows 1..80 can hold data while probe
        rows 50 and 100 happen to be blank — so m equidistant sample points
        back up the bisection; tuning m trades speed for precision.
        find_last_non_empty_row is by far the slowest of the four, so that is
        the one to tune.

        The four boundary searches could share one implementation, but the
        extra branching would slow them down, hence four specialized copies.
        The order (last row, last column, first row, first column) is
        deliberate: data tends to be tall and narrow, blank leading areas are
        rare, and openpyxl iterates rows much faster than columns.

    The result is cached on the instance, so repeated calls are free.
    Bugfix: the original tested ``hasattr(self, 'usedrange_cache')`` but never
    assigned the attribute, so the whole scan re-ran on every call (and, had
    the attribute existed, ``self.used_range`` could have been unset). The
    cache is now actually populated, including on the empty-sheet paths.

    :return: a range address string such as 'B2:E10' ('A1' for an empty sheet)
    """
    if not hasattr(self, 'usedrange_cache'):
        # 1 Start from the raw bounds openpyxl reports, then shrink each edge.
        left, right, top, bottom = self.min_column, self.max_column, self.min_row, self.max_row

        bottom = self.find_last_non_empty_row(top, bottom, left, right)
        if bottom == -1:
            self.usedrange_cache = 'A1'  # empty sheet: A1 as a placeholder
            return self.usedrange_cache
        right = self.find_last_non_empty_column(left, right, top, bottom)
        if right == -1:
            self.usedrange_cache = 'A1'
            return self.usedrange_cache
        top = self.find_first_non_empty_row(top, bottom, left, right)
        if top == -1:
            self.usedrange_cache = 'A1'
            return self.usedrange_cache
        left = self.find_first_non_empty_column(left, right, top, bottom)
        if left == -1:
            self.usedrange_cache = 'A1'
            return self.usedrange_cache

        # 2 Expand the rectangle so no merged range is cut in half.
        top0, bottom0, left0, right0 = top, bottom, left, right
        for merged_range in self.merged_cells.ranges:
            l, t, r, b = merged_range.bounds
            if top0 <= b <= bottom0 or top0 <= t <= bottom0:
                if left0 <= r and l < left:
                    left = l
                if l <= right0 and r > right:
                    right = r
            if left0 <= r <= right0 or left0 <= l <= right0:
                if top0 <= b and t < top:
                    top = t
                if t <= bottom0 and b > bottom:
                    bottom = b

        self.used_range = build_range_address(left=left, top=top, right=right, bottom=bottom)
        self.usedrange_cache = self.used_range

    return self.usedrange_cache
|
1038
|
+
|
1039
|
+
def copy_worksheet(self, dst_ws):
    """ Copy this sheet's content into ``dst_ws``, possibly in another workbook.

    openpyxl's own ``Workbook.copy_worksheet`` cannot copy across workbooks
    (annoyingly), hence this helper. Usage: ``src_ws.copy_worksheet(dst_ws)``.
    """
    # 1 Copy every cell's value/format. AttributeError is swallowed —
    # NOTE(review): presumably raised for merged-cell derivatives that lack
    # the attributes copy_cell needs; confirm against XlCell.copy_cell.
    for row_cells in self:
        for src_cell in row_cells:
            try:
                src_cell.copy_cell(dst_ws[src_cell.coordinate])
            except AttributeError:
                pass
    # 2 Recreate the merged ranges.
    for merged in self.merged_cells.ranges:
        dst_ws.merge_cells(merged.ref)
    # 3 Other sheet attributes: freeze_panes as read from file is often wrong
    # (e.g. D3 drifting to D103 from the saved window position), so it is
    # intentionally not copied.
    # dprint(origin_ws.freeze_panes)
    # target_ws.freeze_panes = origin_ws.freeze_panes
|
1059
|
+
|
1060
|
+
def _cells_by_row(self, min_col, min_row, max_col, max_row, values_only=False):
    """ Row-wise cell iterator, redesigned to tolerate merged cells.

    openpyxl's own iterator misbehaves on merged cells; here the values_only
    branch uses getattr so a cell without a ``value`` attribute yields None
    instead of raising.
    """
    col_span = range(min_col, max_col + 1)
    for r in range(min_row, max_row + 1):
        row_cells = tuple(self.cell(row=r, column=c) for c in col_span)
        if values_only:
            # yield tuple(cell.value for cell in cells)  # original openpyxl code
            yield tuple(getattr(cell, 'value', None) for cell in row_cells)
        else:
            yield row_cells
|
1071
|
+
|
1072
|
+
def search(self, pattern, min_row=None, max_row=None, min_col=None, max_col=None, order=None, direction=0):
    """ Find the first cell whose value matches the regex ``pattern`` (results cached).

    :param pattern: a regex string or a re.compile object
        Each cell value is converted with str() and matched via search().
        Since dates are stored as numbers, a datetime.date pattern is
        automatically converted to the corresponding Excel serial value.
        Nested lookups are supported via a list, e.g. ['模块一', '属性1'].
    :param direction: only meaningful when pattern is a list/tuple
        With several patterns the search nests: after each hit, the next
        pattern is searched in the sub-area below the hit cell (axis=0,
        the default); 1 searches to its right; the unusual above/left cases
        are 2 and 3 respectively.
    :param order: default None, equivalent to [1, 2]; see the ``product`` helper
    :return: the first matching cell, or None

    >> wb = openpyxl.load_workbook(filename='2020寒假教材各地区数量统计最新2020.1.1.xlsx')
    >> ws = Worksheet(wb['预算总表'])
    >> ws.search('年段')
    <Cell '预算总表'.B2>
    """
    if not hasattr(self, 'search_cache'):
        self.search_cache = {}

    # Lists are unhashable; normalize to a tuple so it can serve as a cache key.
    if isinstance(pattern, list):
        pattern = tuple(pattern)

    key = (pattern, min_row, max_row, min_col, max_col, order, direction)

    def get_search_core():
        nonlocal pattern
        # 1 Clamp the search window to the sheet's actual extent.
        x1, x2 = max(min_row or 1, 1), min(max_row or self.max_row, self.max_row)
        y1, y2 = max(min_col or 1, 1), min(max_col or self.max_column, self.max_column)

        # 2 Traverse. Dates are matched by their Excel serial number.
        if isinstance(pattern, datetime.date):
            pattern = f'^{(pattern - datetime.date(1899, 12, 30)).days}$'

        if isinstance(pattern, tuple):
            cel = None
            for p in pattern:
                cel = self.search(p, x1, x2, y1, y2, order)
                if cel:
                    # l, u, r, d: the hit's four boundaries (merged-range aware)
                    l, u, r, d = getattr(cel.in_range(), 'bounds', (cel.column, cel.row, cel.column, cel.row))
                    if direction == 0:
                        x1, x2, y1, y2 = max(x1, d + 1), x2, max(y1, l), min(y2, r)
                    elif direction == 1:
                        x1, x2, y1, y2 = max(x1, u), min(x2, d), max(y1, r + 1), y2
                    elif direction == 2:
                        x1, x2, y1, y2 = x1, min(x2, u - 1), max(y1, l), min(y2, r)
                    elif direction == 3:
                        x1, x2, y1, y2 = max(x1, u), min(x2, d), y1, min(y2, l - 1)
                    else:
                        raise ValueError(f'direction参数值错误{direction}')
                else:
                    return None
            return cel
        else:
            if isinstance(pattern, str): pattern = re.compile(pattern)
            for x, y in product(range(x1, x2 + 1), range(y1, y2 + 1), order=order):
                cell = self.cell(x, y)
                if cell.celltype() == 1: continue  # skip derived cells of merged ranges
                if pattern.search(str(cell.value)): return cell  # first match wins

    if key not in self.search_cache:
        self.search_cache[key] = get_search_core()

    return self.search_cache[key]

findcel = search  # legacy alias
|
1142
|
+
|
1143
|
+
def findrow(self, pattern, *args, **kwargs):
    """ Row index of the first cell matching ``pattern`` (0 when not found). """
    hit = self.findcel(pattern, *args, **kwargs)
    return hit.row if hit else 0
|
1146
|
+
|
1147
|
+
def findcol(self, pattern, *args, **kwargs):
    """ Column index of the first cell matching ``pattern`` (0 when not found). """
    hit = self.findcel(pattern, *args, **kwargs)
    return hit.column if hit else 0
|
1150
|
+
|
1151
|
+
def browser(self):
    """ Open the sheet's raw values in the external viewer.

    Note: merged cells are flattened away by ``self.values``.
    """
    browser(pd.DataFrame(self.values))
|
1154
|
+
|
1155
|
+
def select_columns(self, columns, column_name='searchkey'):
    r""" Extract the given attribute columns into a pandas DataFrame.

    :param columns: header names, matched with regex re.search
        a single name: 'attr1' — after the header is located, values are taken
            downwards
        or several: ['attr1', 'attr2', 'attr3'] — headers are located jointly
            and data starts from the largest header row
            (TODO stop at the last non-empty row; for now openpyxl's max_row
            is used, which can overshoot)
        merged cells are resolved through their master cell
    :param column_name: how the returned DataFrame's columns are named
        origin: the original header text
        searchkey: the search key used to find the column
    """
    if not isinstance(columns, (list, tuple)):
        columns = [columns]

    # 1 Locate every header cell and the first data row.
    cels, names, start_line = [], [], -1
    for search_name in columns:
        cel = self.findcel(search_name)
        if cel:
            cels.append(cel)
            if column_name == 'searchkey':
                names.append(str(search_name))
            elif column_name == 'origin':
                if isinstance(search_name, (list, tuple)) and len(search_name) > 1:
                    names.append('/'.join(list(search_name[:-1]) + [str(cel.value)]))
                else:
                    names.append(str(cel.value))
            else:
                raise ValueError(f'{column_name}')
            start_line = max(start_line, cel.down().row)
        else:
            dprint(search_name)  # requested column not found (logged only)

    # 2 Collect each column's data.
    datas = {}
    for k, cel in enumerate(cels):
        if cel:
            col = cel.column
            li = []
            for i in range(start_line, self.max_row + 1):
                v = self.cell(i, col).mcell().value  # merged cells: read the master cell
                li.append(v)
            datas[names[k]] = li
        else:
            # NOTE(review): unreachable — `cels` only ever holds found
            # (truthy) cells, so missing columns are skipped entirely rather
            # than emitted as empty columns. Confirm whether that was intended.
            datas[names[k]] = [None] * (self.max_row + 1 - start_line)
    df = pd.DataFrame(datas)

    # 3 Drop rows that are entirely empty.
    df.dropna(how='all', inplace=True)

    return df
|
1209
|
+
|
1210
|
+
def copy_range(self, src_addr, dst_cell, *, temp_sheet=False, return_mode=False):
    """ Copy the content and formatting of ``src_addr`` (on this sheet) to ``dst_cell``.

    :param str src_addr: a range on this sheet, given as a string (e.g. 'A1:C3')
        plain cells and merged cells are both supported
    :param dst_cell: the target top-left cell
        Within one sheet the assignment order matters so that source cells are
        not overwritten before they are read; that is handled below. Circular
        formula references across sheets could in theory still go wrong, but
        that case is far too complex to handle here.
    :param temp_sheet: when the copy involves overlapping merged cells, enable
        this to stage the data through a temporary sheet first
    :param return_mode: return the intermediate metadata dict (n/m sizes)

    The main difficulty is merged cells; otherwise the logic is simple.

    >> ws1.copy_range('A1:C3', ws2.cell('C2'))  # copy ws1's A1:C3 into ws2 at C2
    """
    from itertools import product

    # 0 Optional staging sheet.
    mid_result = {}
    if temp_sheet:
        ws3 = self.parent.create_sheet('__copy_range')
        mid_result = self.copy_range(src_addr, ws3['A1'], return_mode=True)
        ws1 = ws3
        src_addr = f'A1:{excel_addr(mid_result["n"], mid_result["m"])}'
    else:
        ws1 = self
    ws2 = dst_cell.parent

    # 1 Coordinates.
    # ws1[...] is more precise than CellRange and also resolves open-ended
    # forms like "1:3" against max_column.
    src_cells = ws1[src_addr]
    # Normalize to an n*m nested tuple.
    if not isinstance(src_cells, tuple):
        src_cells = (src_cells,)
    if not isinstance(src_cells[0], tuple):
        src_cells = (src_cells,)
    # Key metrics.
    n, m = len(src_cells), len(src_cells[0])  # rows/cols to copy
    src_cell = src_cells[0][0]
    x1, y1 = src_cell.row, src_cell.column  # source origin (x=row, y=col)
    x2, y2 = dst_cell.row, dst_cell.column
    bias_rows, bias_cols = x2 - x1, y2 - y1
    mid_result['n'] = n
    mid_result['m'] = m

    # 2 Copy the cells.
    # Iterate against the shift direction so sources are read before being
    # overwritten with new values.
    r = sorted(range(n), reverse=bias_rows > 0)  # row visiting order
    c = sorted(range(m), reverse=bias_cols > 0)
    for i, j in product(r, c):  # openpyxl seems to have no built-in range copy
        ws1.cell(x1 + i, y1 + j).copy_cell(ws2.cell(x2 + i, y2 + j))

    # 3 Cleanup.
    if temp_sheet:
        self.parent.remove(ws1)

    if return_mode:
        return mid_result
|
1269
|
+
|
1270
|
+
def reindex_columns(self, orders):
    """ Rearrange the sheet's columns into the given order.

    Each entry of ``orders`` is a source column letter; an empty entry leaves
    a blank column at that position. The selected columns are first copied to
    the right of the existing data, then the original columns are deleted so
    the copies shift into place.

    >> ws.reindex_columns('I,J,A,,,G,B,C,D,F,E,H,,,K'.split(','))

    TODO support moving merged cells as a whole?

    (Cleanup: the original imported ``column_index_from_string`` here but
    never used it; the dead import is removed.)
    """
    max_row, max_column = self.max_row, self.max_column
    for j, col in enumerate(orders, 1):
        if not col:
            continue  # empty entry: leave this target column blank
        self.copy_range(f'{col}1:{col}{max_row}', self[excel_addr(1, max_column + j)])
    self.delete_cols(1, max_column)
|
1283
|
+
|
1284
|
+
def to_html(self, *, border=1,
            style='border-collapse:collapse; text-indent:0; margin:0;') -> str:
    r""" Render the sheet as an HTML table (merged cells become row/colspan).

    .from_latex(r'''\begin{tabular}{|c|c|c|c|}
    \hline
    1 & 2 & & 4\\
    \hline
    \end{tabular}''').to_html())

    ==>

    <table border="1" style="border-collapse: collapse;">
      <tr>
        <td style="text-align:center">
          1
        </td>
        <td style="text-align:center">
          2
        </td>
        <td style="text-align:center"></td>
        <td style="text-align:center">
          4
        </td>
      </tr>
    </table>
    """
    from yattag import Doc

    doc, tag, text = Doc().tagtext()
    tag_attrs = [('border', border), ('style', style)]
    # if self.data_tex:  # leftover attribute from the original latex-based version
    #     tag_attrs.append(('data-tex', self.data_tex))

    ws = self
    with tag('table', *tag_attrs):
        cols = ws.max_column
        for i in range(1, ws.max_row + 1):
            # TODO skipping blank rows is debatable; sometimes an empty table is wanted
            for j in range(1, cols + 1):
                if not ws.cell(i, j).isnone():
                    break
            else:  # the whole row is empty: skip it
                continue

            with tag('tr'):
                for j in range(1, cols + 1):
                    # ① classify the cell
                    cell = ws.cell(i, j)
                    celltype = cell.celltype()
                    if celltype == 1:  # derived cell of a merged range
                        continue
                    elif cell.isnone():  # an ordinary empty cell
                        with tag('td'):
                            doc.asis('')
                        continue
                    # ② alignment and span attributes
                    params = {}
                    if celltype == 2:  # top-left corner of a merged range
                        rng = cell.in_range()
                        params['rowspan'] = rng.size['rows']
                        params['colspan'] = rng.size['columns']
                    if cell.alignment.horizontal:
                        params['style'] = 'text-align:' + cell.alignment.horizontal
                    # if cell.alignment.vertical:
                    #     params['valign'] = cell.alignment.vertical
                    with tag('td', **params):
                        v = str(cell.value)
                        # if not v: v = ' '  # 200424: empty cells collapsed on the platform, hence the space
                        doc.asis(v)  # not text(): html entities must not be escaped
    # res = indent(doc.getvalue(), indent_text=True)  # pretty mode; buggy in some scenarios
    res = doc.getvalue()  # compact mode

    return res
|
1358
|
+
|
1359
|
+
def init_from_latex(self, content):
    """ Populate this worksheet from LaTeX tabular source.

    Not named ``from_latex`` because the ws object already exists; this only
    initializes its content from the given LaTeX code.
    """
    from openpyxl.styles import Border, Alignment, Side

    from pyxllib.text.pupil import grp_bracket
    from pyxllib.text.latex import TexTabular

    BRACE2 = grp_bracket(2, inner=True)
    BRACE5 = grp_bracket(5, inner=True)

    # Border style is unified for now; no fine-grained parsing.
    double = Side(border_style='thin', color='000000')

    ws = self

    # Parse the tabular header: the per-column alignment spec.
    data_tex = re.search(r'\\begin{tabular}\s*(?:\[.*?\])?\s*' + BRACE5, content).group(1)
    col_pos = TexTabular.parse_align(data_tex)  # per-column format control
    total_col = len(col_pos)
    # Strip the begin/end markers.
    s = re.sub(r'\\begin{tabular}(?:\[.*?\])?' + BRACE5, '', re.sub(r'\\end{tabular}', '', content))
    row, col = 1, 1

    # A simple, non-rigorous rule could decide full-grid vs no-grid here:
    # if '\\hline' not in s and '\\midrule' not in s:
    #     border = 0

    # Split on \\ and process row by row.
    for line in re.split(r'\\\\(?!{)', s)[:-1]:
        # 1 Handle every column element of the current row.
        cur_line = line
        # Remove special formatting commands first.
        cur_line = re.sub(r'\\cmidrule' + BRACE2, '', cur_line)
        cur_line = re.sub(r'\\cline' + BRACE2, '', cur_line)
        for t in (r'\midrule', r'\toprule', r'\bottomrule', r'\hline', '\n'):
            cur_line = cur_line.replace(t, '')

        # Iterate the columns.
        for item in cur_line.strip().split('&'):
            item = item.strip()
            cur_loc = excel_addr(row, col)

            if 'multicolumn' in item:
                size, align, text = TexTabular.parse_multicolumn(item)
                align = TexTabular.parse_align(align) if align else col_pos[col - 1]  # fall back to the column default
                n, m = size
                # Horizontal alignment, defaulting to left.
                align = {'l': 'left', 'c': 'center', 'r': 'right'}.get(align, 'left')
                cell = ws[cur_loc].mcell()
                if cell.value:
                    cell.value += '\n' + text
                else:
                    cell.value = text
                ws[cur_loc].alignment = Alignment(horizontal=align, vertical='center')
                merge_loc = excel_addr(row + n - 1, col + m - 1)
                ws.merge_cells(f'{cur_loc}:{merge_loc}')
                col += m
            elif 'multirow' in item:
                n, bigstructs, width, fixup, text = TexTabular.parse_multirow(item, brace_text_only=False)
                try:
                    ws[cur_loc] = text
                except AttributeError:
                    # Overlapping merge: shrink the old merged range, then write.
                    # e.g. if A1:A3 exists and A3 must now stand alone, the
                    # old range is reset to A1:A2 first.
                    rng = ws[cur_loc].in_range()
                    ws.unmerge_cells(rng.coord)  # dissolve the old merge
                    ws.merge_cells(re.sub(r'\d+$', f'{row - 1}', rng.coord))
                    ws[cur_loc] = text
                align = {'l': 'left', 'c': 'center', 'r': 'right'}.get(col_pos[col - 1], 'left')
                ws[cur_loc].alignment = Alignment(horizontal=align, vertical='center')
                merge_loc = excel_addr(row + n - 1, col)
                ws.merge_cells(f'{cur_loc}:{merge_loc}')
                col += 1
            else:
                if ws[cur_loc].celltype() == 0:
                    ws[cur_loc].value = item
                align = {'l': 'left', 'c': 'center', 'r': 'right'}.get(col_pos[col - 1], 'left')
                ws[cur_loc].alignment = Alignment(horizontal=align, vertical='center')
                col += 1

        # 2 Borders and other row-level formatting.
        if r'\midrule' in line or r'\toprule' in line or r'\bottomrule' in line or r'\hline' in line:
            # Draw a full-width top border on this row.
            loc_1 = excel_addr(row, 1)
            loc_2 = excel_addr(row, total_col)
            comb_loc = f'{loc_1}:{loc_2}'
            for cell in ws[comb_loc][0]:
                cell.border = Border(top=double)
        if r'\cmidrule' in line:
            for match in re.findall(r'\\cmidrule{([0-9]+)-([0-9]+)}', line):
                loc_1 = excel_addr(row, match[0])
                loc_2 = excel_addr(row, match[1])
                comb_loc = f'{loc_1}:{loc_2}'
                for cell in ws[comb_loc][0]:
                    cell.border = Border(top=double)
        if r'\cline' in line:
            for match in re.findall(r'\\cline{([0-9]+)-([0-9]+)}', line):
                loc_1 = excel_addr(row, match[0])
                loc_2 = excel_addr(row, match[1])
                comb_loc = f'{loc_1}:{loc_2}'
                for cell in ws[comb_loc][0]:
                    cell.border = Border(top=double)
        row, col = row + 1, 1
|
1470
|
+
|
1471
|
+
def to_latex(self):
    """ Serialize the sheet as a LaTeX tabular (merged cells become multirow/multicolumn). """
    from pyxllib.text.latex import TexTabular

    ws = self

    li = []
    n, m = ws.max_row, ws.max_column
    format_count = [''] * m  # per column: alignment letters seen (used to build the header spec)
    merge_count = [0] * m  # per column: rows still covered by a vertical merge, drives \cline

    li.append('\\hline')
    for i in range(1, n + 1):
        if ws.cell(i, 1).isnone(): continue
        line = []
        j = 1
        while j < m + 1:
            cell = ws.cell(i, j)
            celltype = cell.celltype()
            if celltype == 0:  # ordinary cell
                line.append(str(cell.value))
            elif celltype == 1:  # derived cell of a merged range
                mcell = cell.mcell()  # its master cell
                if mcell.column == cell.column:
                    columns = mcell.in_range().size['columns']
                    if columns > 1:
                        line.append(f'\\multicolumn{{{columns}}}{{|c|}}{{}}')  # empty multicolumn placeholder
                    else:
                        line.append('')  # empty slot
            elif celltype == 2:  # top-left corner of a merged range
                rng = cell.in_range()
                v = cell.value
                rows, columns = rng.size['rows'], rng.size['columns']
                if rows > 1:  # vertically merged
                    v = f'\\multirow{{{rows}}}*{{{v}}}'
                    for k in range(j, j + columns): merge_count[k - 1] = rows - 1
                if columns > 1:  # horizontally merged
                    # horizontal may be any of {'center', 'centerContinuous',
                    # 'fill', 'left', 'justify', 'distributed', 'right',
                    # 'general'}; anything other than l/c/r falls back to c.
                    align = cell.alignment.horizontal[0]
                    if align not in 'lcr': align = 'c'
                    v = f'\\multicolumn{{{columns}}}{{|{align}|}}{{{v}}}'
                line.append(str(v))
                j += columns - 1
            if cell.alignment.horizontal:
                format_count[j - 1] += cell.alignment.horizontal[0]
            j += 1
        li.append(' & '.join(line) + r'\\ ' + TexTabular.create_cline(merge_count))
        merge_count = [(x - 1 if x > 0 else x) for x in merge_count]
    li.append('\\end{tabular}\n')
    head = '\\begin{tabular}' + TexTabular.create_formats(format_count)
    li = [head] + li  # the header could also be appended last; column formats are tallied during the walk
    return '\n'.join(li)
|
1525
|
+
|
1526
|
+
def cell2(self, row, column, value=None):
    """ Like ``cell`` but with smart row/column addressing.

    :param row:
        int, 1-based row number
        dict, {k: v}: find the row where the column headed by k holds value v.
            The result may not be unique; the first match wins. Several keys
            may be given for a combined lookup. For now v must be a plain
            value; richer matching could be added later.
    :param column: column located by field name
        int, 1-based column number
        str:
            if all uppercase letters, parsed as a column label ('P', 'AZ', ...)
            otherwise resolved through findcol
        List[str]: like str, resolved through findcol
    :param value: when given, assigned to the located cell
    :return: the located cell, or None when the column cannot be resolved
    :raises ValueError: when a dict-based row lookup finds no matching row
    """
    # 1 smart row
    if isinstance(row, dict):
        idx_name = tuple(row.keys())[0]
        cols = {self.findcol(k): v for k, v in row.items()}
        for i in self.iterrows(idx_name):
            logo = True
            for c, v in cols.items():
                if self.cell(i, c).value != v:
                    logo = False
                    break
            if logo:
                row = i
                break
        else:
            raise ValueError('Not find cell')

    # 2 smart column
    if isinstance(column, int):
        pass
    elif isinstance(column, str) and re.match(r'[A-Z]+$', column):
        column = column_index_from_string(column)
    else:
        column = self.findcol(column)
        if not column:
            return None

    # 3 the cell itself
    # cell = self.cell(row, column, value)  # that form seems buggy: long text loses its tail
    cell = self.cell(row, column)
    if value is not None:
        cell.value = value
    return cell
|
1573
|
+
|
1574
|
+
def iterrows(self, key_column_name, mode='auto', *, to_dict=None):
    """ Locate the data rows of the sheet using one column as the key.

    Use ``iterrows`` for regular row traversal; use ``cell2`` for scattered
    single-cell look-ups.

    :param key_column_name: name of the main reference field, used to find
        the first data row (the cell right below that header)
    :param mode: how the last data row is determined
        default: walk up from ws.max_row to the first row whose key cell
            is non-empty
        auto: rely on the smarter usedrange detection
        any_content: walk up from ws.max_row to the first row that has any
            non-empty cell
        ... more modes to be added as needed
    :param list[str] to_dict: when given, return for each row both the row
        number and a dict of these fields extracted via ``cell2``
    :return: a ``range`` of row numbers usable directly in a for loop, or a
        list of ``[row, dict]`` pairs when ``to_dict`` is used
    :raises NotImplementedError: for an unknown ``mode``
    """
    # 1 first data row: the cell below the key header
    cel = self.findcel(key_column_name).down()
    min_row = cel.row

    # 2 last data row
    max_row = self.max_row

    if mode == 'default':
        col = cel.column
        while max_row > min_row:
            if self.cell(max_row, col).value:
                break
            max_row -= 1
    elif mode == 'any_content':
        max_column = self.max_column
        while max_row > min_row:
            empty_line = True
            for j in range(1, max_column + 1):
                if self.cell(max_row, j).value:
                    empty_line = False
                    break
            if not empty_line:
                break
            # bugfix: the row was empty, so move one row up; without this
            # decrement the loop re-checked the same empty row forever
            max_row -= 1
    elif mode == 'auto':
        rng = parse_range_address(self.get_usedrange())
        max_row = rng['bottom']
    else:
        raise NotImplementedError(f'{mode}')

    if to_dict:
        data = []
        for i in range(min_row, max_row + 1):
            msg = {}
            for k in to_dict:
                msg[k] = self.cell2(i, k).value
            data.append([i, msg])
        return data
    else:
        return range(min_row, max_row + 1)
|
1627
|
+
|
1628
|
+
def find_head_data_range(self, ref_col_name):
    """ Split the sheet into its header block and data block.

    The sheet is viewed as two stacked rectangles — the header (head) and
    the data area — each described by its ltrb boundaries.

    :param ref_col_name: name of a reference column (for a compound header,
        a field name on the header's last row); this is what tells the
        header/data boundary apart

    TODO right/bottom may include trailing empty rows/columns — optimize?
    """
    anchor = self.findcel(ref_col_name)
    first_data_row = anchor.down().row

    info = {
        # 1 where the key field sits
        'cel': anchor,
        'row': anchor.row,
        'col': anchor.column,
    }
    # 2 horizontal extent shared by header and data
    info['left'] = self.min_column
    info['right'] = self.max_column
    # 3 vertical split between header and data
    info['head_top'] = self.min_row
    info['head_bottom'] = first_data_row - 1
    info['data_top'] = first_data_row
    info['data_bottom'] = self.max_row
    return info
|
1656
|
+
|
1657
|
+
def autofit(self):
    """ Auto-adjust the width of every column on the sheet.

    This is not Excel's native autofit but an approximation: CJK characters
    are assumed to be twice as wide as latin ones.
    """

    def adjusted_column_width(cell_value):
        """ Estimate the display width needed for one cell value. """
        width_constant = 1.2  # tune this constant to adjust overall width
        try:
            chinese_characters = sum(1 for char in cell_value if '\u4e00' <= char <= '\u9fff')
            latin_characters = len(cell_value) - chinese_characters
            return (chinese_characters * 2 + latin_characters) * width_constant
        except TypeError:
            # cell has no value or a non-string value: fall back to a default
            return 10 * width_constant

    for col in self.columns:
        max_width = 0
        column = [cell for cell in col]
        for cell in column:
            adjusted_width = adjusted_column_width(cell.value)
            if adjusted_width > max_width:
                max_width = adjusted_width
        # find the first non-merged cell of the column to get its letter
        first_non_merged_cell = next((cell for cell in column if not isinstance(cell, MergedCell)), None)
        if first_non_merged_cell:
            self.column_dimensions[first_non_merged_cell.column_letter].width = min(max_width, 100)
        # cap the width at 100; wider content is handled by wrapping instead
        if max_width > 100:
            for cell in column:
                # bugfix: copy the style dict instead of mutating the
                # (possibly shared) Alignment object's __dict__ in place,
                # which could silently change other cells' alignment
                current_alignment_dict = dict(getattr(cell, 'alignment', Alignment()).__dict__)
                # drop wrapText from the copy to avoid a duplicate keyword
                current_alignment_dict.pop('wrapText', None)
                cell.alignment = Alignment(wrapText=True, **current_alignment_dict)
|
1693
|
+
|
1694
|
+
def get_sorted_merged_cells(self):
    """ Return the sheet's merged ranges ordered by (row, column), cached on the instance. """
    cached = getattr(self, 'sorted_merged_cells', None)
    if cached is None:
        cached = sorted(self.merged_cells.ranges, key=lambda rng: (rng.min_row, rng.min_col))
        self.sorted_merged_cells = cached
    return cached
|
1700
|
+
|
1701
|
+
|
1702
|
+
# Graft the XlWorksheet helpers onto openpyxl's Worksheet class
# (_cells_by_row is white-listed so that override is allowed to replace the original).
inject_members(XlWorksheet, openpyxl.worksheet.worksheet.Worksheet, white_list=['_cells_by_row'])
|
1703
|
+
|
1704
|
+
|
1705
|
+
class XlWorkbook(openpyxl.Workbook):
    """ Workbook extensions grafted onto openpyxl.Workbook (see inject_members below). """

    def adjust_sheets(self, new_sheetnames):
        """ Rearrange the workbook's sheets to match ``new_sheetnames``.

        Sheets in the list are kept, in the given order; sheets not listed
        are removed. ``new_sheetnames`` must not contain names the workbook
        does not already have.
        """
        for name in set(self.sheetnames) - set(new_sheetnames):
            # Use the official remove() API to delete a sheet: other ways
            # appear to work on the surface but leave stale namespace
            # entries behind.
            self.remove(self[name])
        self._sheets = [self[name] for name in new_sheetnames]
        return self
|
1719
|
+
|
1720
|
+
def merge_sheets_by_keycol(self, sheets, keycol, new_name=None, *, cmp_func=None):
    """ Merge several worksheets by a key column (primary key).

    :param sheets: the worksheets to merge (possibly from different
        workbooks); order matters — sheets[0] is the master table
    :param keycol: name of the key field
    :param new_name: name of the merged sheet
        todo make new_name optional; merge into the first sheet by default
    :param cmp_func: custom matching rule
        def cmp_func(master key as str, slave key as str):
            return True on a match
            return False otherwise

    A full implementation is fairly involved; this is the incremental,
    simple first cut.

    Telling columns from different source tables apart:
        renaming heads with a "table 1"/"table 2" prefix felt redundant, so
        instead an extra column __keycol0__, __keycol1__, ... is appended
        after each source table — it both separates the tables and helps
        later computations.

    todo or mark the table boundaries with a merged cell on top?
    todo align by the tallest header among the sheets, so data is not
        copied short when ws1's header is lower than a later sheet's
    """
    if cmp_func is None:
        def cmp_func(k1, k2):
            return k1 == k2

    # 1 create the target table as a copy of sheets[0]
    if new_name:
        ws1 = self.copy_worksheet(sheets[0])
        ws1.title = new_name
    else:
        ws1 = sheets[0]

    # 2 append the __keycol0__ helper column for the master table
    msg1 = ws1.find_head_data_range(keycol)
    last_right = msg1['right'] + 1
    ws1.cell(msg1['head_bottom'], last_right).value = '__keycol0__'

    exists_key = set()

    def write_new_key(row, column, value):
        # write a key value; duplicates get a light-red fill as a warning
        ws1.cell(row, column).value = value
        if value in exists_key:
            ws1.cell(row, column).fill_color([252, 157, 154])
        else:
            exists_key.add(value)

    for i in range(msg1['data_top'], msg1['data_bottom'] + 1):
        write_new_key(i, last_right, ws1.cell(i, msg1['col']).value)

    # 3 append every other sheet to ws1 in turn
    last_data_bottom = msg1['data_bottom']
    for ws2_id, ws2 in enumerate(sheets[1:], start=1):
        # 3.1 key information of ws2
        msg2 = ws2.find_head_data_range(keycol)
        data2 = []
        for i2 in range(msg2['data_top'], msg2['data_bottom'] + 1):
            data2.append([i2, str(ws2.cell(i2, msg2['col']).value)])

        # 3.2 copy the header (compound / merged / multi-row headers supported)
        msg3 = {}  # positions of ws2's copy inside ws1
        row_bias = msg2['head_bottom'] - msg1['head_bottom']  # shift so the header bottoms align
        msg3['head_top'] = msg2['head_top'] - row_bias
        msg3['left'] = last_right + 1
        if msg3['head_top'] < 1:  # whole header doesn't fit: trim where ws2's header starts
            msg2['head_top'] += msg3['head_top'] + 1
            msg3['head_top'] = 1
        ws2.copy_range(excel_addr2(msg2['head_top'], msg2['left'], msg2['head_bottom'], msg2['right']),
                       ws1[excel_addr(msg3['head_top'], last_right + 1)])

        new_right = last_right + msg2['right'] - msg1['left'] + 2
        ws1.cell(msg1['head_bottom'], new_right).value = f'__keycol{ws2_id}__'

        # 3.4 fill the data following ws1's existing row order first
        exists_key = set()

        # trick: precompute the repetitive part used by the format call below
        ws2_row_tag = excel_addr2(1, msg2['left'], 1, msg2['right']).replace('1', '{0}')

        # duplicate keys are possible, hence the brute-force double loop
        for i1 in range(msg1['data_top'], last_data_bottom + 1):
            k1 = str(ws1.cell(i1, last_right).value)
            for _i, x in enumerate(data2):
                if cmp_func(k1, x[1]):  # todo pluggable custom matching rules
                    ws2.copy_range(ws2_row_tag.format(x[0]), ws1[excel_addr(i1, msg3['left'])])
                    del data2[_i]
                    break
            else:  # rows ws2 has but ws1 doesn't
                pass
            write_new_key(i1, new_right, k1)

        # 3.5 append whatever remains of data2 at the bottom
        for x in data2:
            last_data_bottom += 1
            ws2.copy_range(ws2_row_tag.format(x[0]), ws1[excel_addr(last_data_bottom, msg3['left'])])
            write_new_key(last_data_bottom, new_right, x[1])

        # 3.6 advance the running right edge
        last_right = new_right
|
1820
|
+
|
1821
|
+
@classmethod
def from_html(cls, content) -> 'XlWorkbook':
    """ Build a workbook from HTML text; multiple <table> elements are supported. """
    from pyxllib.stdlib.tablepyxl.tablepyxl import document_to_workbook
    return document_to_workbook(content)
|
1826
|
+
|
1827
|
+
@classmethod
def from_latex(cls, content) -> 'XlWorkbook':
    """ Build a workbook from LaTeX text, one sheet per tabular environment.

    Reference: kun0zhou, https://github.com/kun-zhou/latex2excel/blob/master/latex2excel.py
    """
    from openpyxl import Workbook

    # one workbook can hold several tables
    wb = Workbook()
    tables = re.findall(r'(\\begin{tabular}.*?\\end{tabular})', content, flags=re.DOTALL)
    for idx, tex_table in enumerate(tables, start=1):
        if idx == 1:
            # reuse the sheet openpyxl creates by default
            ws = wb.active
            ws.title = 'Table 1'
        else:
            ws = wb.create_sheet(title=f'Table {idx}')
        ws.init_from_latex(tex_table)

    return wb
|
1845
|
+
|
1846
|
+
def to_html(self) -> str:
    """ Render every worksheet as HTML, joined by blank lines. """
    return '\n\n'.join(ws.to_html() for ws in self.worksheets)
|
1851
|
+
|
1852
|
+
def to_latex(self):
    """ Render every worksheet as LaTeX, one tabular per sheet. """
    return '\n'.join(ws.to_latex() for ws in self.worksheets)
|
1857
|
+
|
1858
|
+
def _to_json_readonly(self, data, reduction_degree):
    """ Strip volatile/bulky keys from a read-only workbook's JSON dump, in place.

    :param data: the dict obtained from jsonpickle-encoding the workbook
    :param reduction_degree: 0 returns the data untouched; any other value
        applies the key scrubbing
    :return: the same ``data`` object, mutated
    """
    if reduction_degree == 0:
        return data

    # keys that vary between runs or carry bulky style tables
    # (the trailing ones were added after inspecting various real cases)
    volatile_keys = ('filename', 'start_dir', 'header_offset', 'modified',
                     'CRC', 'compress_size', 'file_size',
                     '__reduce__', '_dict', '_fonts', '_fills', '_borders',
                     '_style', '_named_styles')

    def _scrub(node):
        # recursively remove the volatile keys from every dict in the tree
        if isinstance(node, dict):
            for key in volatile_keys:
                node.pop(key, None)
            for child in node.values():
                _scrub(child)
        elif isinstance(node, list):
            for child in node:
                _scrub(child)
        # scalar leaves need no handling

    _scrub(data)
    return data
|
1886
|
+
|
1887
|
+
def to_json(self, reduction_degree=0):
    """ Serialize the workbook to a JSON-compatible dict.

    :param reduction_degree: how much post-processing to apply
        0: the raw jsonpickle output
        1: drop volatile settings; some attribute index tables are replaced
           by hash values
        2: todo — for cross-application use the overall Excel framework may
           change a lot; compare only the most basic attributes instead of
           doing a near-complete content comparison
    """
    # 1 serialize the object first
    s = jsonpickle.encode(self)
    data = json.loads(s)
    if reduction_degree == 0:
        return data

    if self.read_only:
        # read-only workbooks have a different internal layout
        return self._to_json_readonly(data, reduction_degree)

    # 2 replace the composite style entries by their hashes
    for name in ['font', 'border', 'fill']:
        ls = data[f'_{name}s']['py/seq']
        for i, x in enumerate(ls):
            ls[i] = xlmd5(json.dumps(x))

    # 3 replace id references by the corresponding hash values
    def traverse_json(obj, path=""):
        """ Walk the JSON tree, mimicking dict/list indexing to build the path. """
        if isinstance(obj, dict):
            for k, v in obj.items():
                for name in ['font', 'border', 'fill']:
                    if k == f'{name}Id':
                        obj[k] = data[f'_{name}s']['py/seq'][v]

                new_path = f"{path}['{k}']" if path else k
                traverse_json(v, new_path)
        elif isinstance(obj, list):
            for i, v in enumerate(obj):
                new_path = f"{path}[{i}]"
                traverse_json(v, new_path)
        else:
            pass  # scalar leaves need no further handling

    traverse_json(data)

    # 4 drop the entries that should not take part in comparisons
    def del_volatile_attrs():
        del data['properties']['modified']
        del data['properties']['created']

        del data['_fonts']  # font styles
        del data['_borders']  # border styles
        del data['_fills']  # fill styles

        del data['_named_styles']  # named styles
        del data['_cell_styles']  # cell styles

    del_volatile_attrs()

    return data
|
1946
|
+
|
1947
|
+
def to_md5(self, reduction_degree=1):
    """ md5 over to_json's output; typically used to test whether two workbooks are the same. """
    return xlmd5(json.dumps(self.to_json(reduction_degree)))
|
1950
|
+
|
1951
|
+
def autofit(self):
    """ Run the per-sheet column-width autofit on every worksheet. """
    for ws in self.worksheets:
        ws.autofit()
|
1954
|
+
|
1955
|
+
def extract_summary(self, *, samples_num=5, limit_length=2500):
    """ Extract a summary of the whole Excel workbook.

    :param samples_num: number of sample rows per field
    :param limit_length: rough character budget for the summary
    """
    wb = self

    all_sheets_summary = []

    for ws in wb._sheets:  # _sheets is used so non-data sheets are visited too
        # standard worksheets go through the full summary machinery
        if isinstance(ws, openpyxl.worksheet.worksheet.Worksheet):
            # locate the used range and the header range
            used_range = ws.get_usedrange()
            if used_range:
                header_range, data_range = split_header_and_data(ws, used_range)

                # extract the header structure
                header_structure = extract_header_structure(ws, header_range)

                # filter/sort range: the data range with its first row number
                # moved one row up (never below 1)
                filterRange = re.sub(r'\d+',
                                     lambda m: str(max(int(m.group()) - 1, 1)),
                                     data_range, count=1)

                summary = ({
                    "sheetName": ws.title,
                    "sheetType": "Worksheet",
                    "usedRange": used_range,
                    "headerRange": header_range,
                    "header": header_structure,
                    'dataRange': data_range,
                    'filterRange': filterRange,
                    'sortRange': filterRange,
                    'data': extract_field_summaries(ws, header_range, data_range, samples_num)
                })

                if not summary['data']:
                    # no readable data: most likely a pivot table, whose
                    # values are computed and not stored, so unreadable here
                    summary['sheetType'] = 'PivotTable'
                    del summary['data']
            else:
                summary = ({
                    "sheetName": ws.title,
                    "sheetType": "DialogOrMacroSheet",
                    "usedRange": None,
                })

        # other sheet types only get a basic summary
        else:
            summary = ({
                "sheetName": ws.title,
                "sheetType": ws.__class__.__name__  # sheet class name as the type
            })

        all_sheets_summary.append(summary)

    workbook_summary = {
        "fileName": Path(self.path).name if self.path else None,
        "sheetNames": wb.sheetnames,
        "sheets": all_sheets_summary,
    }

    WorkbookSummary(workbook_summary).reduce_summarys(limit_length=limit_length)

    return workbook_summary
|
2016
|
+
|
2017
|
+
def extract_summary2(self):
    """ Alternative summary extraction that works cell by cell. """
    wb = self

    all_sheets_summary = []

    for ws in wb._sheets:  # _sheets is used so non-data sheets are visited too
        # standard worksheets go through the full summary machinery
        if isinstance(ws, (openpyxl.worksheet.worksheet.Worksheet)):
            # locate the used range
            used_range = ws.get_usedrange()
            if used_range:
                raw_used_range = ws.get_raw_usedrange()
                summary = ({
                    "sheetName": ws.title,
                    "sheetType": "Worksheet",
                    "rawUsedRange": raw_used_range,
                    "usedRange": used_range,
                    'cells': extract_cells_content(ws)
                })

                if not summary['cells']:
                    # no readable cells: most likely a pivot table whose
                    # values are computed (JSA-like hosts might manage to
                    # read them)
                    summary['sheetType'] = 'PivotTable'
                    del summary['cells']
            else:
                summary = ({
                    "sheetName": ws.title,
                    "sheetType": "DialogOrMacroSheet",
                    "usedRange": None,
                })

        # other sheet types only get a basic summary
        else:
            summary = ({
                "sheetName": ws.title,
                "sheetType": ws.__class__.__name__  # sheet class name as the type
            })

        all_sheets_summary.append(summary)

    workbook_summary = {
        "fileName": Path(self.path).name if self.path else None,
        "sheetNames": wb.sheetnames,
        "sheets": all_sheets_summary,
    }

    return workbook_summary
|
2064
|
+
|
2065
|
+
|
2066
|
+
# Graft the XlWorkbook helpers onto openpyxl's Workbook class.
inject_members(XlWorkbook, openpyxl.Workbook)
|
2067
|
+
|
2068
|
+
|
2069
|
+
def __3_extract_summary():
    """ Section marker: table-summary extraction helpers follow. """
|
2071
|
+
|
2072
|
+
|
2073
|
+
def score_row(row):
    """ Heuristic header score for one row: +1 per string cell, -1 per non-string cell.

    Empty cells are ignored. Header rows tend to be all strings, so a high
    score suggests the row is part of the header.
    """
    total = 0
    for cell in row:
        value = cell.value
        if value is None:
            continue
        total += 1 if is_string_type(value) else -1
    return total
|
2088
|
+
|
2089
|
+
|
2090
|
+
def find_header_row(ws, used_range, max_rows_to_check=10):
    """ Locate the header rows of a worksheet.

    Scores the first ``max_rows_to_check`` rows of ``used_range`` with
    ``score_row`` and picks the row with the best weighted score as the
    header's last row; everything from the top of the range down to it is
    considered header.

    :return: the header's range address string
    """
    bounds = parse_range_address(used_range)
    top, bottom = bounds['top'], bounds['bottom']
    left, right = bounds['left'], bounds['right']

    # only inspect up to max_rows_to_check rows
    rows_to_check = min(bottom - top + 1, max_rows_to_check)

    # score every candidate row
    row_scores = [
        score_row(row)
        for row in ws.iter_rows(min_row=top, max_row=top + rows_to_check - 1,
                                min_col=left, max_col=right)
    ]

    # weight each score by the drop to the next row: the header usually ends
    # where the score falls off sharply
    weighted_scores = []
    for i, score in enumerate(row_scores):
        drop = (score - row_scores[i + 1]) if i < len(row_scores) - 1 else 0
        weighted_scores.append(score + drop)

    # the best weighted row is the header's last row
    header_row = weighted_scores.index(max(weighted_scores)) + top

    # everything from the top of used_range down to header_row is the header
    return build_range_address(left=left, top=top, right=right, bottom=header_row)
|
2119
|
+
|
2120
|
+
|
2121
|
+
def split_header_and_data(ws, used_range, max_rows_to_check=50):
    """ Split a worksheet's used_range into a header range and a data range. """
    header_range = find_header_row(ws, used_range, max_rows_to_check)
    first_data_row = parse_range_address(header_range)['bottom'] + 1
    used = parse_range_address(used_range)

    # the data block sits right under the header and runs to the end of used_range
    data_range = build_range_address(left=used['left'], top=first_data_row,
                                     right=used['right'], bottom=used['bottom'])
    return header_range, data_range
|
2131
|
+
|
2132
|
+
|
2133
|
+
def extract_header_structure(ws, header_range):
    """ Extract the header structure for the given header cell range.

    :return: dict mapping a cell or merged-range address to its rendered
        value; merged header cells are keyed by the whole merged range
    """
    header_range_details = parse_range_address(header_range)

    header_structure = {}
    merged_addresses = set()

    # merged cells first
    for merged_range in ws.merged_cells.ranges:
        # bounds = (min_col, min_row, max_col, max_row); keep ranges whose
        # rows overlap the header range
        if merged_range.bounds[1] <= header_range_details['bottom'] \
                and merged_range.bounds[3] >= header_range_details['top']:
            top_left_cell = ws.cell(row=merged_range.bounds[1], column=merged_range.bounds[0])
            address = build_range_address(left=merged_range.bounds[0], top=merged_range.bounds[1],
                                          right=merged_range.bounds[2], bottom=merged_range.bounds[3])
            header_structure[address] = top_left_cell.get_render_value()
            # remember every member cell so it is skipped below
            for row in range(merged_range.bounds[1], merged_range.bounds[3] + 1):
                for col in range(merged_range.bounds[0], merged_range.bounds[2] + 1):
                    merged_addresses.add((row, col))

    # then the plain, unmerged cells
    for row in ws.iter_rows(min_row=header_range_details['top'], max_row=header_range_details['bottom'],
                            min_col=header_range_details['left'], max_col=header_range_details['right']):
        for cell in row:
            # add any cell not already covered by a merged range
            # (cells without a value are deliberately included as well)
            if (cell.row, cell.column) not in merged_addresses:
                header_structure[cell.coordinate] = cell.get_render_value()

    return header_structure
|
2162
|
+
|
2163
|
+
|
2164
|
+
def determine_field_type_and_summary(ws, col, start_row, end_row, rows):
    """ Build the summary of one field (column) over the given row span.

    :param ws: worksheet to read from
    :param col: 1-based column index of the field
    :param start_row: first data row (inclusive)
    :param end_row: last data row (inclusive)
    :param rows: sample row numbers, chosen by the caller so that all
        fields share the same samples
    """
    # 1 read the whole column to learn the dominant number formats and value range
    data = defaultdict(list)
    for i in range(start_row, end_row + 1):
        cell = ws.cell(i, col)
        k, v = cell.get_number_format(), cell.value
        data[k].append(v)

    # formats ordered by how many cells use them, most common first
    data2 = sorted(data.items(), key=lambda item: len(item[1]), reverse=True)
    number_formats = [x[0] for x in data2]

    # 2 collect the sample values to display
    sample_values = []
    for i in rows:
        cell = ws.cell(i, col)
        value = cell.get_render_value()
        if isinstance(value, str) and len(value) > 20:
            value = value[:17] + '...'  # clip long texts
        sample_values.append(value)

    # 3 numeric range (judging only the dominant format is enough)
    numeric_range = None
    for x in data2:
        try:
            fmt, values = x
            values = [v for v in values if (v is not None and not isinstance(v, str))]
            numeric_range = [min(values), max(values)]
            numeric_range[0] = xl_render_value(numeric_range[0], fmt)
            numeric_range[1] = xl_render_value(numeric_range[1], fmt)
            break
        except (TypeError, ValueError) as e:
            # empty or mixed column: fall through to the next format
            pass

    summary = {
        "number_formats": number_formats,
        "numeric_range": numeric_range,
        "sample_values": sample_values,
    }

    return summary
|
2208
|
+
|
2209
|
+
|
2210
|
+
def extract_cells_content(ws, usedrange=None):
    """ Extract the content of every cell of a worksheet.

    Algorithm:
    1. Collect all merged ranges, sorted by (row, col), into a stack ``a``
       (sorted_merged_cells_stack), plus an (initially empty) set ``b`` of
       already-consumed cell addresses (used_cells_set).
    2. Visit cells in A1, A2, A3, ..., B1, B2, B3 order; call the current
       cell ``c``.
    3. If ``c`` is in ``b``, remove it from ``b`` and skip it.
    4. Otherwise, if ``c`` coincides with the top element ``a0`` of stack
       ``a``, pop ``a0``, store it in the result keyed by the merged-range
       address, and record all of a0's derived member addresses in ``b``.
    5. Otherwise store ``c``'s own value in the result.
    """
    # 1 preparation
    sorted_merged_cells_stack = ws.get_sorted_merged_cells()[::-1]
    used_cells_set = set()
    if usedrange is None:
        usedrange = ws.get_usedrange()
    usedrange_bound = parse_range_address(usedrange)
    cells = {}  # result mapping: address -> rendered value

    # 2 visit every cell
    def get_val(cell):
        val = cell.value
        if val is None:
            return ''
        else:
            return cell.get_render_value()
        # If this step turns out slow, a simpler form could be used; but the
        # simple version below renders time formats too naively:
        # if not isinstance(val, (str, int, float, bool)):
        #     return val
        # else:
        #     return str(val)

    for i in range(usedrange_bound['top'], usedrange_bound['bottom'] + 1):
        for j in range(usedrange_bound['left'], usedrange_bound['right'] + 1):
            # 2.1 derived cell of a merged range: skip it
            if (i, j) in used_cells_set:
                used_cells_set.remove((i, j))
                continue
            cell = ws.cell(i, j)

            # 2.2 cell is the anchor of a merged range (ranges are sorted, so
            # a single comparison against the stack top suffices)
            if (sorted_merged_cells_stack
                    and sorted_merged_cells_stack[-1].min_row == i
                    and sorted_merged_cells_stack[-1].min_col == j):
                rng = sorted_merged_cells_stack.pop()
                for rng_i in range(rng.min_row, rng.max_row + 1):
                    for rng_j in range(rng.min_col, rng.max_col + 1):
                        used_cells_set.add((rng_i, rng_j))
                cells[rng.coord] = get_val(cell)
                used_cells_set.remove((i, j))
                continue

            # 2.3 ordinary cell
            cells[cell.coordinate] = get_val(cell)

    return cells
|
2265
|
+
|
2266
|
+
|
2267
|
+
def extract_field_summaries(ws, header_range, data_range, samples_num=5):
    """ Build a per-field summary for every column under the header.

    :param samples_num: number of data rows to sample
    """
    # 1 range bookkeeping
    head = parse_range_address(header_range)
    body = parse_range_address(data_range)
    start_row = head['bottom'] + 1
    end_row = body['bottom']

    # 2 pick the sample rows once, up front, shared by every field
    sample_rows = list(range(start_row, end_row + 1))
    if len(sample_rows) > samples_num:
        sample_rows = sorted(random.sample(sample_rows, samples_num))

    # 3 summarise each column, keyed by its header-cell coordinate
    field_summaries = {}
    for col in ws.iter_cols(min_col=head['left'], max_col=head['right']):
        header_cell = ws.cell(head['bottom'], col[0].column)
        if header_cell.celltype() != 1:  # todo rework to skip derived (merged) cells explicitly
            # keyed by coordinate rather than value so duplicate header
            # names do not clash
            field_summaries[header_cell.coordinate] = determine_field_type_and_summary(
                ws, header_cell.column, start_row, end_row, sample_rows
            )

    return field_summaries
|
2295
|
+
|
2296
|
+
|
2297
|
+
class WorkbookSummary:
    """ Post-processing helpers for a workbook summary dict. """

    def __init__(self, data):
        # data is the dict produced by XlWorkbook.extract_summary
        self.data = data

    def reduce_summarys(self, limit_length=2500):
        """ Shrink the summary until it fits a character budget.

        :param limit_length: target length in characters; 2500 chars is
            roughly 1500 tokens
        """
        sheets = self.data['sheets']

        if limit_length == -1:  # -1 disables the limit
            return sheets

        # 1 maybe nothing needs to change
        text1 = json.dumps(sheets, ensure_ascii=False)
        length1 = len(text1)
        if length1 < limit_length:
            return sheets

        # 2 drop one sample from every field, then measure how much that saved
        for sheet in sheets:
            if 'data' in sheet:
                st_data = sheet['data']
                for col_header, col in st_data.items():
                    st_data[col_header]['sample_values'] = col['sample_values'][:-1]

        text2 = json.dumps(sheets, ensure_ascii=False)
        length2 = len(text2)
        bias = length1 - length2  # characters saved per dropped sample round
        # dprint(length1, length2, bias)  # debug
        if length2 <= limit_length:
            return sheets

        # 3 estimate how many samples must go to reach the target length
        n = math.ceil(safe_div(length1 - limit_length, bias))
        m = 5 - n  # samples to keep — NOTE(review): assumes 5 initial samples; confirm
        if m >= 0:
            for sheet in sheets:
                if 'data' in sheet:
                    st_data = sheet['data']
                    for col_header, col in st_data.items():
                        if m > 0:
                            st_data[col_header]['sample_values'] = col['sample_values'][:m]
                        elif m == 0:
                            del st_data[col_header]['sample_values']
            return sheets

        # 4 m < 0: dropping samples is not enough (should never happen, but
        # handle it anyway)
        for sheet in sheets:
            if 'data' in sheet:
                del sheet['data']
            # if it is still too long after this, there are simply too many
            # sheets — some sheets have to go

        # drop trailing sheets until the summary fits
        while len(json.dumps(sheets, ensure_ascii=False)) > limit_length:
            sheets.pop()

        self.data['sheets'] = sheets
        return sheets

    def random_filename(self):
        """ Replace the file name with a random one (anonymization). """
        self.data['fileName'] = str(random.randint(0, 2000)) + '.xlsx'

    def choice_samples(self, samples_num=5):
        """ Keep at most ``samples_num`` samples per field. """
        data = self.data
        for sheet in data['sheets']:
            if 'data' in sheet:
                # decide up front which sample indices to keep
                n = min([len(v['sample_values']) for k, v in sheet['data'].items()])
                rows = list(range(n))
                if len(rows) > samples_num:
                    rows = random.sample(rows, samples_num)
                    rows.sort()
                # extract those samples for every field
                for col_name, col_data in sheet['data'].items():
                    col_data['sample_values'] = [col_data['sample_values'][i] for i in rows]

    def random_delete(self):
        """ Randomly drop parts of the summary (data-augmentation style). """
        data = self.data
        for sheet in data['sheets']:
            # 80% chance to drop the data section
            if 'data' in sheet and random.random() < 0.8:
                del sheet['data']

            # drop either filterRange or sortRange, 50/50
            if 'filterRange' in sheet and 'sortRange' in sheet:
                if random.random() < 0.5:
                    del sheet['filterRange']
                else:
                    del sheet['sortRange']

            # drop one of usedRange / headRange / dataRange with equal chance
            if 'usedRange' in sheet and 'headRange' in sheet and 'dataRange' in sheet:
                r = random.random()
                if r < 1 / 3:
                    del sheet['usedRange']
                elif r < 2 / 3:
                    del sheet['headRange']
                else:
                    del sheet['dataRange']

            # 50% chance to shuffle the header keys
            if random.random() < 0.5:
                sheet['header'] = shuffle_dict_keys(sheet['header'])

        # 50% chance to drop the file name
        if random.random() < 0.5:
            del data['fileName']

        # a single-sheet workbook may be stripped even further
        if len(data['sheets']) == 1:
            if random.random() < 0.5:
                data = data['sheets'][0]

            for name in ['sheetName', 'sheetType']:
                if random.random() < 0.5 and name in data:
                    del data[name]

    def to_str(self):
        """ Serialize the (possibly reduced) summary back to a JSON string. """
        return json.dumps(self.data, ensure_ascii=False)
|
2423
|
+
|
2424
|
+
|
2425
|
+
def extract_workbook_summary(file_path, mode=0,
                             samples_num=5, limit_length=2500, ignore_errors=False):
    """ Extract a summary of a whole Excel workbook.

    :param file_path: path of the workbook to load
    :param mode:
        -1, full summary (all details, every sample)
        0, standard summary (all details, ``samples_num`` random samples)
        1, reduced summary: keeps the logical structure but randomly
           rewrites/drops parts of the content
    :param samples_num: number of sample rows per field (modes 0 and 1)
    :param limit_length: rough character budget of the result
    :param ignore_errors: return {} instead of raising when the file cannot
        be loaded
    :raises ValueError: for an unknown ``mode``
    """
    try:
        wb: XlWorkbook = openpyxl.load_workbook(file_path)
    except Exception:
        if ignore_errors:
            return {}
        else:
            raise  # bugfix: bare raise keeps the original traceback intact ('raise e' truncated it)

    if mode == -1:
        res = wb.extract_summary(samples_num=1000, limit_length=-1)
        res['fileName'] = Path(file_path).name
    elif mode == 0:
        res = wb.extract_summary(samples_num=samples_num, limit_length=limit_length)
        res['fileName'] = Path(file_path).name

    elif mode == 1:
        res = wb.extract_summary(samples_num=samples_num)

        wb_summary = WorkbookSummary(res)
        wb_summary.random_filename()  # anonymize the file name
        wb_summary.random_delete()  # randomly drop parts of the structure
        wb_summary.reduce_summarys(limit_length=limit_length)

        res = wb_summary.data
    else:
        raise ValueError('mode参数值不正确')

    return res
|
2462
|
+
|
2463
|
+
|
2464
|
+
def extract_workbook_summary2(file_path, *,
                              keep_links=False,
                              keep_vba=False,
                              mode=0,
                              return_mode=0,
                              **kwargs):
    """ Extract the "summary2"-style digest of a workbook.

    :param keep_links: whether to keep external workbook links; keeping them
        seems to cause problems when the file is opened.
    :param keep_vba: whether to keep VBA macros
    :param mode:
        0, the original summary2 digest
        1, additionally record the active sheet and selected cell
    :param return_mode: when 1, also return the file-loading time
    :param kwargs: captures other parameters, kept for backward compatibility
        only; currently unused

    Note there is no read_only option here because read_only=True mode does
    not work in this environment.
    """

    # 1 load the workbook
    file_path = Path(file_path)
    res = {'fileName': file_path.name}
    start_time = time.time()
    wb, suffix = load_as_xlsx_file(file_path, keep_links=keep_links, keep_vba=keep_vba)
    if wb is None:
        res['error'] = f'Load file error。{suffix}'
    else:
        res['fileType'] = suffix
    load_time = time.time() - start_time

    # Unsupported file type: no error raised, just the basic file-name info.
    if wb is None:
        return (res, load_time) if return_mode == 1 else res

    # 2 extract the digest
    DictTool.ior(res, wb.extract_summary2())
    if mode == 1:
        ws = wb.active
        res['ActiveSheet'] = ws.title
        if hasattr(ws, 'selected_cell'):
            res['Selection'] = ws.selected_cell

    # res = convert_to_json_compatible(res)

    return (res, load_time) if return_mode == 1 else res
|
2513
|
+
|
2514
|
+
|
2515
|
+
def update_raw_summary2(data):
    """Post-process a raw summary2 dict: add global ratios and normalize key order."""
    # 1 ratio of Chinese characters over all textual content
    if 'chineseContentRatio' not in data:
        # the file name and every sheet name count as content too
        texts = [data['fileName'], *data['sheetNames']]
        texts.extend(v for sheet in data['sheets'] for v in sheet.get('cells', {}).values() if v)
        all_text = ''.join(map(str, texts))
        data['chineseContentRatio'] = round(calc_chinese_ratio(all_text), 4)

    # 2 ratio of non-empty cells, weighted by the area each address covers
    if 'nonEmptyCellRatio' not in data:
        content_area = total_area = 0
        for sheet in data['sheets']:
            for addr, value in sheet.get('cells', {}).items():
                area = get_addr_area(addr)
                total_area += area
                if value != '':
                    content_area += area
        data['nonEmptyCellRatio'] = round(safe_div(content_area, total_area), 4)

    # 3 enforce a canonical key order
    ref_keys = ['fileName', 'fileType', 'chineseContentRatio', 'nonEmptyCellRatio', 'sheetNames', 'sheets']
    if list(data.keys()) != ref_keys:
        data = {k: data[k] for k in ref_keys if k in data}

    return data
|
2543
|
+
|
2544
|
+
|
2545
|
+
def extract_workbook_summary2plus(file_path, **kwargs):
    """ summary2 plus the globally computed ratios """
    # 1 main digest
    data = extract_workbook_summary2(file_path, **kwargs)
    if not data:
        return data

    # 2 derived features
    # todo these features should probably be computed per-table later
    return update_raw_summary2(data)
|
2557
|
+
|
2558
|
+
|
2559
|
+
class WorkbookSummary3:
    """ Building blocks for computing summary3 and its derived variants """

    @classmethod
    def count_length(cls, text):
        """Length of *text*; non-strings are measured via their JSON serialization."""
        if isinstance(text, str):
            return len(text)
        return len(json.dumps(text, ensure_ascii=False, default=str))
|
2567
|
+
|
2568
|
+
@classmethod
|
2569
|
+
def reduce1_delete_empty_cell(cls, summary3):
|
2570
|
+
""" 删除空单元格 """
|
2571
|
+
for sheet in summary3['sheets']:
|
2572
|
+
new_cells = {}
|
2573
|
+
for addr, val in sheet['cells'].items():
|
2574
|
+
if val != '':
|
2575
|
+
new_cells[addr] = val
|
2576
|
+
sheet['cells'] = new_cells
|
2577
|
+
|
2578
|
+
    @classmethod
    def reduce2_truncate_overlong_cells(cls, summary3, summary_limit_len, *, cur_summary_len=None):
        """ Truncate over-long cell contents so the total summary length fits the limit.

        Handles the case where one or a few very long cells push the whole summary
        past the limit, shrinking content while keeping as much useful
        information as possible.

        :param dict summary3: summary data containing multiple sheets and their cells
        :param int summary_limit_len: maximum allowed summary length
        :param int cur_summary_len: current summary length; computed if not given
        :return int: estimated summary length after truncation

        Algorithm:
        1. Compute a baseline cell length and the excess to remove (delta_length).
        2. Collect cells longer than the baseline, sorted by length descending.
        3. Truncate cells tier by tier until the total fits or all are processed.
        """

        # Compute the current length if the caller did not supply it.
        if cur_summary_len is None:
            cur_summary_len = cls.count_length(summary3)

        # 1. Baseline cell length (each sheet's own metadata counts as ~5 cells).
        total_cells_num = sum(len(st['cells']) + 5 for st in summary3['sheets'])
        base_cell_length = int(-60 * math.log(total_cells_num, 10) + 260)  # decays from ~200 toward ~20
        # Clamp to [0.5%, 5%] of the overall limit.
        base_cell_length = min(int(summary_limit_len * 0.05), base_cell_length)
        base_cell_length = max(int(summary_limit_len * 0.005), base_cell_length)

        # 2. Pre-collect all over-long string cells and the length we must shed.
        delta_length = cur_summary_len - summary_limit_len  # how much needs to go
        overlong_cells = [(sheet, addr, val, len(val)) for sheet in summary3['sheets']
                          for addr, val in sheet['cells'].items() if
                          isinstance(val, str) and len(val) > base_cell_length]
        overlong_cells.sort(key=lambda x: -x[3])  # longest first

        # 3. Walk the tiers: after tier i, the i+1 longest cells can all be cut
        # down to the length of the next cell, saving (length - next_len)*(i+1).
        possible_reduction = 0
        for i, (_, _, _, length) in enumerate(overlong_cells):
            next_len = overlong_cells[i + 1][3] if i + 1 < len(overlong_cells) else base_cell_length
            possible_reduction += (length - next_len) * (i + 1)
            if possible_reduction >= delta_length or i == len(overlong_cells) - 1:
                for j in range(i + 1):
                    sheet, addr, val, _ = overlong_cells[j]
                    sheet['cells'][addr] = val[:next_len - 3] + '...'  # truncate and mark the cut
                break

        # Estimated (not recomputed) resulting length.
        return cur_summary_len - possible_reduction
|
2624
|
+
|
2625
|
+
    @classmethod
    def reduce3_fold_rows(cls, summary3, summary_limit_len, *, cur_summary_len=None):
        """Fold long runs of structurally identical rows into a single '...' range cell.

        :param dict summary3: summary data, mutated in place
        :param int summary_limit_len: maximum allowed summary length
        :param int cur_summary_len: current summary length; computed if not given
        """
        if cur_summary_len is None:
            cur_summary_len = cls.count_length(summary3)

        # Each sheet's own metadata is estimated as 5 extra cells.
        total_cells_num = sum([(len(st['cells']) + 5) for st in summary3['sheets']])
        avg_cell_len = cur_summary_len / total_cells_num
        # Target number of cells to delete, rounded to nearest.
        target_reduce_cells_num = int((cur_summary_len - summary_limit_len) / avg_cell_len + 0.5)

        # Within one homogeneous range, keep at least 2 rows at head and tail; folding
        # only pays off for blocks of >= ~10 structurally identical rows (>= 6 folded).
        # By importance, compress from the last sheets / last rows backwards until
        # the reduction target is met.

        for sheet in reversed(summary3['sheets']):
            cells = sheet['cells']
            # 1 Group cells by row number.
            last_line_id = -1
            row_groups = []
            for addr, val in cells.items():
                m = re.search(r'\d+', addr)
                if not m:  # should always match; guard against malformed addresses
                    continue
                line_id = int(m.group())

                # Tag: column letters + '@' for strings, '#' for other value types.
                val_type_tag = '@' if isinstance(val, str) else '#'
                cell_tag = re.sub(r'\d+', '', addr) + val_type_tag

                if line_id == last_line_id:
                    row_groups[-1].append([addr, cell_tag])
                else:
                    row_groups.append([[addr, cell_tag]])
                    last_line_id = line_id

            # 2 Build each row's tag and group consecutive rows sharing the same tag.
            last_row_tag = ''
            rows_groups = []
            for row in row_groups:
                row_tag = ''.join([cell_tag for _, cell_tag in row])
                if row_tag == last_row_tag:
                    rows_groups[-1].append(row)
                else:
                    rows_groups.append([row])
                    last_row_tag = row_tag

            # 3 Compress: copies cells into the enclosing-scope new_cells dict.
            def extract_cells_from_rows(rows):
                for row in rows:
                    for addr, _ in row:
                        new_cells[addr] = cells[addr]

            total_new_cells = []
            for rows in reversed(rows_groups):
                new_cells = {}
                if len(rows) < 10:
                    extract_cells_from_rows(rows)
                else:  # fold the middle rows of the block
                    # If the final summary risks getting too small, narrow the cut.
                    n, m = len(rows), len(rows[0])
                    target_n = int(target_reduce_cells_num / m + 0.5)  # rows we would need to drop
                    if target_n <= 0:  # too few rows to be worth folding
                        extract_cells_from_rows(rows)
                    else:
                        cur_n = n - 4 if target_n > n - 4 else target_n  # rows actually dropped
                        left_n = n - cur_n  # rows kept
                        b = left_n // 2
                        a = left_n - b

                        extract_cells_from_rows(rows[:a])
                        addr = combine_addresses(rows[a][0][0], rows[-b - 1][-1][0])
                        # new_cells[addr] = 'this range has the same structure as the surrounding rows, omitted'
                        new_cells[addr] = '...'
                        extract_cells_from_rows(rows[-b:])

                        target_reduce_cells_num -= cur_n * m
                # 240429 Mon 21:57 — do NOT enable these two lines, they over-trim.
                # Once enough is cut, the remaining cells must still be kept in full.
                # if target_reduce_cells_num <= 0:  # don't break on success; keep the rest
                #     break
                total_new_cells.append(new_cells)

            # Reassemble in original order (groups were processed in reverse).
            new_cells2 = {}
            for rows in reversed(total_new_cells):
                new_cells2.update(rows)
            sheet['cells'] = new_cells2
|
2709
|
+
|
2710
|
+
    @classmethod
    def reduce4_truncate_cells(cls, y, summary_limit_len, *, cur_summary_len=None):
        """Proportionally drop trailing cells of every sheet until the summary fits.

        :param dict y: summary data, mutated in place
        :param int summary_limit_len: maximum allowed summary length
        :param int cur_summary_len: current summary length; computed if not given
        :return int: resulting summary length
        """
        if cur_summary_len is None:
            cur_summary_len = cls.count_length(y)

        # 1 Estimate how many cells need deleting.
        sheet_cells_num = [len(st['cells']) for st in y['sheets']]
        # Each sheet's own metadata is estimated as 5 extra cells.
        total_cells_num = sum(sheet_cells_num) + len(sheet_cells_num) * 5
        avg_cell_len = cur_summary_len / total_cells_num
        # Target number of cells to delete, rounded to nearest.
        target_reduce_cells_num = int((cur_summary_len - summary_limit_len) / avg_cell_len + 0.5)

        # 2 If deleting every cell still wouldn't suffice, wipe all cells outright.
        if total_cells_num < target_reduce_cells_num:
            for st in y['sheets']:
                st['cells'] = {}
            return cls.count_length(y)

        # 3 Otherwise keep only the leading fraction of each sheet's cells,
        # shrinking the fraction until the summary fits.
        left_rate = 1 - target_reduce_cells_num / total_cells_num
        while True:
            for i, st in enumerate(y['sheets']):
                st['cells'] = dict(islice(st['cells'].items(), int(left_rate * sheet_cells_num[i])))
            cur_summary_len = cls.count_length(y)
            if cur_summary_len <= summary_limit_len:
                return cur_summary_len
            if left_rate * total_cells_num < 1:  # no cells left to drop
                break
            else:  # shrink the keep-rate and retry
                left_rate *= 0.8

        return cur_summary_len
|
2743
|
+
|
2744
|
+
@classmethod
|
2745
|
+
def reduce5_truncate_sheets(cls, y, summary_limit_len, *, cur_summary_len=None):
|
2746
|
+
""" 计算平均每张表的长度,保留前面部分的表格 """
|
2747
|
+
if cur_summary_len is None:
|
2748
|
+
cur_summary_len = cls.count_length(y)
|
2749
|
+
|
2750
|
+
n = len(y['sheets'])
|
2751
|
+
avg_sheet_len = cur_summary_len / n
|
2752
|
+
target_reduce_sheet_num = int((cur_summary_len - summary_limit_len) / avg_sheet_len + 0.5)
|
2753
|
+
y['sheets'] = y['sheets'][:n - target_reduce_sheet_num]
|
2754
|
+
|
2755
|
+
while y['sheets']:
|
2756
|
+
cur_summary_len = cls.count_length(y)
|
2757
|
+
if cur_summary_len <= summary_limit_len:
|
2758
|
+
return cur_summary_len
|
2759
|
+
y['sheets'] = y['sheets'][:-1] # 依次尝试删除最后一张表格的详细信息
|
2760
|
+
|
2761
|
+
    @classmethod
    def summary2_to_summary3(cls, summary2, summary_limit_len=4000):
        """ Convert summary2 into summary3 """

        def reduce_step_by_step(y):
            # Tags describing which reduction stages were applied, in order.
            mode_tags = [
                'Delete empty cell',
                'Omit the longer content and replace it with...',
                'Omit lines with the same structure',
                'Omit later lines',
                'Omit later sheets'
            ]

            # 0 the summary is already small enough
            cur_summary_len = cls.count_length(y)
            if cur_summary_len <= summary_limit_len:
                return y

            # 1 delete empty cells
            cls.reduce1_delete_empty_cell(y)
            cur_summary_len = cls.count_length(y)
            if cur_summary_len <= summary_limit_len:
                y['mode'] = ', '.join(mode_tags[:1])
                return y

            # 2 truncate over-long single cells
            cur_summary_len = cls.reduce2_truncate_overlong_cells(y, summary_limit_len, cur_summary_len=cur_summary_len)
            if cur_summary_len <= summary_limit_len:
                y['mode'] = ', '.join(mode_tags[:2])
                return y

            # 3 fold runs of structurally identical rows
            cls.reduce3_fold_rows(y, summary_limit_len, cur_summary_len=cur_summary_len)
            cur_summary_len = cls.count_length(y)
            if cur_summary_len <= summary_limit_len:
                y['mode'] = ', '.join(mode_tags[:3])
                return y

            # 4 proportionally drop trailing cells of every sheet
            cur_summary_len = cls.reduce4_truncate_cells(y, summary_limit_len, cur_summary_len=cur_summary_len)
            if cur_summary_len <= summary_limit_len:
                y['mode'] = ', '.join(mode_tags[:4])
                return y

            # 5 drop whole sheet details from the back
            cls.reduce5_truncate_sheets(y, summary_limit_len, cur_summary_len=cur_summary_len)
            y['mode'] = ', '.join(mode_tags[:5])
            return y

        x = summary2
        y = {
            'fileName': x['fileName'],
            'sheetNames': x['sheetNames'],
            'sheets': x['sheets'],
            'mode': 'Complete information',
        }

        # Ensure 'cells' exists up front so later stages need no special-casing.
        for st in y['sheets']:
            if 'cells' not in st:
                st['cells'] = {}

        y = reduce_step_by_step(y)

        # But strip empty 'cells' dicts from the final result.
        for st in y['sheets']:
            if not st['cells']:
                del st['cells']

        return y
|
2831
|
+
|
2832
|
+
    @classmethod
    def reduce4b(cls, y, summary_limit_len, *, cur_summary_len=None, active_sheet_weight=0.5):
        """Active-sheet-aware variant of reduce4_truncate_cells.

        :param active_sheet_weight: deletion weight of the currently active sheet;
            0.5 means it should proportionally lose only half as many cells as
            the other sheets
        :param int summary_limit_len: maximum allowed summary length
        :param int cur_summary_len: current summary length; computed if not given
        :return int: resulting summary length
        """
        if cur_summary_len is None:
            cur_summary_len = cls.count_length(y)

        cur_summary_len0 = cur_summary_len  # NOTE(review): unused, as is `r` below
        active_sheet = y['ActiveSheet']

        # 1 Estimate how many cells need deleting.
        sheet_cells_num = [len(st['cells']) for st in y['sheets']]
        # Each sheet's own metadata is estimated as 5 extra cells.
        total_cells_num = sum(sheet_cells_num) + len(sheet_cells_num) * 5
        avg_cell_len = cur_summary_len / total_cells_num
        # Target number of cells to delete, rounded to nearest.
        target_reduce_cells_num = int((cur_summary_len - summary_limit_len) / avg_cell_len + 0.5)

        # 2 Reduce the deletion weight of the active sheet.
        # Locate the active sheet and its cell count.
        active_sheet_index = [i for i, st in enumerate(y['sheets']) if st['sheetName'] == active_sheet][0]
        active_cells_num = sheet_cells_num[active_sheet_index]

        # Weight coefficients.
        w = active_sheet_weight  # weight applied to the active sheet
        m = active_cells_num
        n = total_cells_num
        r = target_reduce_cells_num / n

        # Extra keep-weight for non-active sheets, compensating the active sheet's share.
        w2 = 1 + m * (1 - w) / (n - m)

        # 3 If deleting every cell still wouldn't suffice, wipe all cells outright.
        if total_cells_num < target_reduce_cells_num:
            for st in y['sheets']:
                st['cells'] = {}
            return cls.count_length(y)

        # 4 Otherwise keep only the leading fraction of each sheet's cells.
        # todo a smarter selection mechanism could go here
        left_rate = min((summary_limit_len + cur_summary_len) / (2 * cur_summary_len), 0.9)  # gentler first step
        while True:
            for i, st in enumerate(y['sheets']):
                if i == active_sheet_index:
                    # Intended: the active sheet keeps more cells.
                    # NOTE(review): with w < 1 this actually keeps FEWER cells of the
                    # active sheet (left_rate * w shrinks its share) while w2 > 1 keeps
                    # more of the others — seems inverted vs the docstring; confirm.
                    st['cells'] = dict(islice(st['cells'].items(), int(left_rate * w * sheet_cells_num[i])))
                else:
                    # Other sheets keep cells scaled by the w2 weight.
                    st['cells'] = dict(islice(st['cells'].items(), int(left_rate * w2 * sheet_cells_num[i])))
            cur_summary_len = cls.count_length(y)
            if cur_summary_len <= summary_limit_len:
                return cur_summary_len
            if left_rate * total_cells_num < 1:  # no cells left, stop deleting
                break
            else:  # shrink the keep-rate and retry
                left_rate *= min(summary_limit_len / cur_summary_len, 0.9)

        return cur_summary_len
|
2891
|
+
|
2892
|
+
@classmethod
|
2893
|
+
def reduce5b(cls, y, summary_limit_len, *, cur_summary_len=None):
|
2894
|
+
""" 计算平均每张表的长度,保留前面部分的表格 """
|
2895
|
+
if cur_summary_len is None:
|
2896
|
+
cur_summary_len = cls.count_length(y)
|
2897
|
+
|
2898
|
+
n = len(y['sheets'])
|
2899
|
+
active_sheet_name = y['ActiveSheet']
|
2900
|
+
|
2901
|
+
avg_sheet_len = cur_summary_len / n
|
2902
|
+
# target_reduce_sheet_num = int((cur_summary_len - summary_limit_len) / avg_sheet_len + 0.5)
|
2903
|
+
# y['sheets'] = y['sheets'][:n - target_reduce_sheet_num]
|
2904
|
+
|
2905
|
+
while y['sheets']:
|
2906
|
+
cur_summary_len = cls.count_length(y)
|
2907
|
+
if cur_summary_len <= summary_limit_len:
|
2908
|
+
return cur_summary_len
|
2909
|
+
|
2910
|
+
# 如果最后一张表格是激活的表格,尝试删除前一张
|
2911
|
+
if y['sheets'][-1]['sheetName'] == active_sheet_name:
|
2912
|
+
if len(y['sheets']) > 1:
|
2913
|
+
y['sheets'] = y['sheets'][:-2] + [y['sheets'][-1]]
|
2914
|
+
else:
|
2915
|
+
y['sheets'] = []
|
2916
|
+
else:
|
2917
|
+
y['sheets'] = y['sheets'][:-1] # 删除最后一张表格的详细信息
|
2918
|
+
|
2919
|
+
return cur_summary_len
|
2920
|
+
|
2921
|
+
    @classmethod
    def summary2_to_summary3b(cls, summary2, summary_limit_len=4000):
        """ Convert summary2 into summary3 (variant b: active-sheet aware) """

        def reduce_step_by_step(y):
            # Tags describing which reduction stages were applied, in order.
            mode_tags = [
                'Delete empty cell',
                'Omit the longer content and replace it with...',
                'Omit lines with the same structure',
                'Omit later lines',
                'Omit later sheets'
            ]

            # 0 the summary is already small enough
            cur_summary_len = cls.count_length(y)
            if cur_summary_len <= summary_limit_len:
                return y

            # 1 delete empty cells
            cls.reduce1_delete_empty_cell(y)
            cur_summary_len = cls.count_length(y)
            if cur_summary_len <= summary_limit_len:
                y['mode'] = ', '.join(mode_tags[:1])
                return y

            # 2 truncate over-long single cells
            cur_summary_len = cls.reduce2_truncate_overlong_cells(y, summary_limit_len, cur_summary_len=cur_summary_len)
            if cur_summary_len <= summary_limit_len:
                y['mode'] = ', '.join(mode_tags[:2])
                return y

            # 3 fold runs of structurally identical rows
            cls.reduce3_fold_rows(y, summary_limit_len, cur_summary_len=cur_summary_len)
            cur_summary_len = cls.count_length(y)
            if cur_summary_len <= summary_limit_len:
                y['mode'] = ', '.join(mode_tags[:3])
                return y

            # 4 proportionally drop trailing cells of every sheet (active-sheet aware)
            cur_summary_len = cls.reduce4b(y, summary_limit_len, cur_summary_len=cur_summary_len)
            if cur_summary_len <= summary_limit_len:
                y['mode'] = ', '.join(mode_tags[:4])
                return y

            # 5 drop whole sheet details from the back
            cls.reduce5b(y, summary_limit_len, cur_summary_len=cur_summary_len)
            y['mode'] = ', '.join(mode_tags[:5])
            return y

        x = summary2
        if 'error' in x:
            # Load errors are passed through unchanged.
            return x

        y = {
            'fileName': x['fileName'],
            'fileType': x['fileType'],
            'sheetNames': x['sheetNames'],
            'sheets': x['sheets'],
            'mode': 'Complete information',
            'ActiveSheet': x['ActiveSheet'],  # currently active worksheet
        }
        if 'Selection' in x:
            # Cap at 250 chars. Usually tiny, but in extreme cases (many discontiguous
            # selected ranges) the selection string can get very long.
            y['Selection'] = x['Selection'][:250]

        # Ensure 'cells' exists up front so later stages need no special-casing.
        for st in y['sheets']:
            if 'cells' not in st:
                st['cells'] = {}

        y = reduce_step_by_step(y)

        # But strip empty 'cells' dicts from the final result.
        for st in y['sheets']:
            if not st['cells']:
                del st['cells']

        return y
|
2999
|
+
|
3000
|
+
|
3001
|
+
class WorkbookSummary3plus(WorkbookSummary3):
    """ Proper token-based length counting; not planned for production use yet """

    @classmethod
    def count_length(cls, text):
        """Length of *text* in model tokens (non-strings serialized to JSON first)."""
        from pyxlpr.data.gptlib import Tokenizer
        if isinstance(text, str):
            s = text
        else:
            s = json.dumps(text, ensure_ascii=False, default=str)
        return Tokenizer.count_tokens(s)
|
3010
|
+
|
3011
|
+
|
3012
|
+
def extract_workbook_summary3(file_path, summary_limit_len=4000, **kwargs):
    """ summary2 digest converted into the length-bounded summary3 form """
    data = extract_workbook_summary2(file_path, **kwargs)
    if not data:
        return data
    return WorkbookSummary3.summary2_to_summary3(data, summary_limit_len)
|
3019
|
+
|
3020
|
+
|
3021
|
+
def summary2_add_enums(summary2, enum_values):
    """Annotate each sheet with the enumeration values found per column.

    :param summary2: summary2-style data, mutated in place
    :param enum_values: True for the default (20, 10), or a (max_len, max_num)
        pair: values longer than max_len are ignored; columns with more than
        max_num distinct values are not treated as enumerations
    :return: the (mutated) summary2
    """
    # 1 resolve parameters
    if enum_values is True:
        enum_values = (20, 10)
    max_len, max_num = enum_values

    # 2 per-sheet enumeration detection
    for sheet in summary2['sheets']:
        # 2.1 count value occurrences per column
        cols = defaultdict(Counter)
        for addr, val in sheet['cells'].items():
            n = len(str(val))
            if not n or n > max_len:
                continue
            col = re.match(r'[A-Z]+', addr).group()
            cols[col][val] += 1

        # 2.2 keep columns that look like enumerations
        enums = {}
        for k in sorted(cols.keys(), key=column_index_from_string):
            ct = cols[k]
            if len(ct) > max_num:
                continue
            vals = ct.most_common()
            if vals[0][1] == 1:
                # Every value occurs once: not an enumeration. Small sheets can
                # usually be shown in full anyway.
                continue
            enums[k] = [v for v, _ in vals]

        # 2.3 attach the result
        if enums:
            sheet['enums'] = enums
            # enums2 = json.dumps(enums, ensure_ascii=False, default=str)
            # sheet['enums'] = json.loads(enums2)

    return summary2
|
3057
|
+
|
3058
|
+
|
3059
|
+
def extract_workbook_summary3b(file_path,
                               summary_limit_len=4000,
                               timeout_seconds=10,
                               return_mode=0,
                               debug=False,
                               len_mode=0,
                               enum_values=False,
                               **kwargs):
    """ summary3b digest with timeout protection and timing information.

    :param summary_limit_len: maximum summary length
    :param timeout_seconds: extraction time limit
    :param return_mode:
        0, return the summary only
        1, return the summary and the timings
        2, additionally return the raw summary2
    :param len_mode:
        0, use len() as the token-length estimate
        1, use the model tokenizer for the real token length
    :param enum_values: whether to show per-column enumeration values
        False, default: not shown
        True, shown with default parameters (20, 10): drop values longer than 20,
        keep only columns with at most 10 distinct enumeration values
    :param enum_values: 是否展示每列枚举值 (see above)
    :param kwargs: forwarded to the summary2 file-reading step; rarely needed
    """
    res = {}
    res['fileName'] = Path(file_path).name
    load_time = summary2_time = summary3_time = -1
    summary2_res = {}

    def reduce_summary(summary):
        """ If the JSON form of the summary exceeds 4K, drop the 'sheets' field. """
        s = json.dumps(summary, ensure_ascii=False)
        # fix: the original tested `len(s) < 4000`, which deleted 'sheets' from
        # small summaries and kept it in oversized ones — the inverse of intent
        if len(s) > 4000:
            if 'sheets' in summary:
                del summary['sheets']

    try:
        with Timeout(timeout_seconds):
            start_time = time.time()
            res, load_time = extract_workbook_summary2(file_path, mode=1, return_mode=1, **kwargs)
            # res = convert_to_json_compatible(res)
            summary2_res = copy.deepcopy(res)  # snapshot before enums are added
            if enum_values:
                res = summary2_add_enums(res, enum_values)
            summary2_time = time.time() - start_time - load_time

            start_time = time.time()
            if len_mode == 1:
                res = WorkbookSummary3plus.summary2_to_summary3b(res, summary_limit_len)
            else:
                res = WorkbookSummary3.summary2_to_summary3b(res, summary_limit_len)
            summary3_time = time.time() - start_time
    except TimeoutError:
        if debug:
            raise  # bare raise keeps the original traceback
        res['error'] = f'超时,未完成摘要提取:{timeout_seconds}秒'
        reduce_summary(res)
    except Exception as e:
        if debug:
            raise
        res['error'] = f'提取摘要时发生错误:{format_exception(e, 2)}'
        reduce_summary(res)

    time_dict = {'load_time': human_readable_number(load_time),
                 'summary2_time': human_readable_number(summary2_time),
                 'summary3_time': human_readable_number(summary3_time)}

    if return_mode == 1:
        return res, time_dict
    elif return_mode == 2:
        return res, time_dict, summary2_res

    return res
|