pyxllib 0.3.96__py3-none-any.whl → 0.3.197__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyxllib/algo/geo.py +12 -0
- pyxllib/algo/intervals.py +1 -1
- pyxllib/algo/matcher.py +78 -0
- pyxllib/algo/pupil.py +187 -19
- pyxllib/algo/specialist.py +2 -1
- pyxllib/algo/stat.py +38 -2
- {pyxlpr → pyxllib/autogui}/__init__.py +1 -1
- pyxllib/autogui/activewin.py +246 -0
- pyxllib/autogui/all.py +9 -0
- pyxllib/{ext/autogui → autogui}/autogui.py +40 -11
- pyxllib/autogui/uiautolib.py +362 -0
- pyxllib/autogui/wechat.py +827 -0
- pyxllib/autogui/wechat_msg.py +421 -0
- pyxllib/autogui/wxautolib.py +84 -0
- pyxllib/cv/slidercaptcha.py +137 -0
- pyxllib/data/echarts.py +123 -12
- pyxllib/data/jsonlib.py +89 -0
- pyxllib/data/pglib.py +514 -30
- pyxllib/data/sqlite.py +231 -4
- pyxllib/ext/JLineViewer.py +14 -1
- pyxllib/ext/drissionlib.py +277 -0
- pyxllib/ext/kq5034lib.py +0 -1594
- pyxllib/ext/robustprocfile.py +497 -0
- pyxllib/ext/unixlib.py +6 -5
- pyxllib/ext/utools.py +108 -95
- pyxllib/ext/webhook.py +32 -14
- pyxllib/ext/wjxlib.py +88 -0
- pyxllib/ext/wpsapi.py +124 -0
- pyxllib/ext/xlwork.py +9 -0
- pyxllib/ext/yuquelib.py +1003 -71
- pyxllib/file/docxlib.py +1 -1
- pyxllib/file/libreoffice.py +165 -0
- pyxllib/file/movielib.py +9 -0
- pyxllib/file/packlib/__init__.py +112 -75
- pyxllib/file/pdflib.py +1 -1
- pyxllib/file/pupil.py +1 -1
- pyxllib/file/specialist/dirlib.py +1 -1
- pyxllib/file/specialist/download.py +10 -3
- pyxllib/file/specialist/filelib.py +266 -55
- pyxllib/file/xlsxlib.py +205 -50
- pyxllib/file/xlsyncfile.py +341 -0
- pyxllib/prog/cachetools.py +64 -0
- pyxllib/prog/filelock.py +42 -0
- pyxllib/prog/multiprogs.py +940 -0
- pyxllib/prog/newbie.py +9 -2
- pyxllib/prog/pupil.py +129 -60
- pyxllib/prog/specialist/__init__.py +176 -2
- pyxllib/prog/specialist/bc.py +5 -2
- pyxllib/prog/specialist/browser.py +11 -2
- pyxllib/prog/specialist/datetime.py +68 -0
- pyxllib/prog/specialist/tictoc.py +12 -13
- pyxllib/prog/specialist/xllog.py +5 -5
- pyxllib/prog/xlosenv.py +7 -0
- pyxllib/text/airscript.js +744 -0
- pyxllib/text/charclasslib.py +17 -5
- pyxllib/text/jiebalib.py +6 -3
- pyxllib/text/jinjalib.py +32 -0
- pyxllib/text/jsa_ai_prompt.md +271 -0
- pyxllib/text/jscode.py +159 -4
- pyxllib/text/nestenv.py +1 -1
- pyxllib/text/newbie.py +12 -0
- pyxllib/text/pupil/common.py +26 -0
- pyxllib/text/specialist/ptag.py +2 -2
- pyxllib/text/templates/echart_base.html +11 -0
- pyxllib/text/templates/highlight_code.html +17 -0
- pyxllib/text/templates/latex_editor.html +103 -0
- pyxllib/text/xmllib.py +76 -14
- pyxllib/xl.py +2 -1
- pyxllib-0.3.197.dist-info/METADATA +48 -0
- pyxllib-0.3.197.dist-info/RECORD +126 -0
- {pyxllib-0.3.96.dist-info → pyxllib-0.3.197.dist-info}/WHEEL +1 -2
- pyxllib/ext/autogui/__init__.py +0 -8
- pyxllib-0.3.96.dist-info/METADATA +0 -51
- pyxllib-0.3.96.dist-info/RECORD +0 -333
- pyxllib-0.3.96.dist-info/top_level.txt +0 -2
- pyxlpr/ai/__init__.py +0 -5
- pyxlpr/ai/clientlib.py +0 -1281
- pyxlpr/ai/specialist.py +0 -286
- pyxlpr/ai/torch_app.py +0 -172
- pyxlpr/ai/xlpaddle.py +0 -655
- pyxlpr/ai/xltorch.py +0 -705
- pyxlpr/data/__init__.py +0 -11
- pyxlpr/data/coco.py +0 -1325
- pyxlpr/data/datacls.py +0 -365
- pyxlpr/data/datasets.py +0 -200
- pyxlpr/data/gptlib.py +0 -1291
- pyxlpr/data/icdar/__init__.py +0 -96
- pyxlpr/data/icdar/deteval.py +0 -377
- pyxlpr/data/icdar/icdar2013.py +0 -341
- pyxlpr/data/icdar/iou.py +0 -340
- pyxlpr/data/icdar/rrc_evaluation_funcs_1_1.py +0 -463
- pyxlpr/data/imtextline.py +0 -473
- pyxlpr/data/labelme.py +0 -866
- pyxlpr/data/removeline.py +0 -179
- pyxlpr/data/specialist.py +0 -57
- pyxlpr/eval/__init__.py +0 -85
- pyxlpr/paddleocr.py +0 -776
- pyxlpr/ppocr/__init__.py +0 -15
- pyxlpr/ppocr/configs/rec/multi_language/generate_multi_language_configs.py +0 -226
- pyxlpr/ppocr/data/__init__.py +0 -135
- pyxlpr/ppocr/data/imaug/ColorJitter.py +0 -26
- pyxlpr/ppocr/data/imaug/__init__.py +0 -67
- pyxlpr/ppocr/data/imaug/copy_paste.py +0 -170
- pyxlpr/ppocr/data/imaug/east_process.py +0 -437
- pyxlpr/ppocr/data/imaug/gen_table_mask.py +0 -244
- pyxlpr/ppocr/data/imaug/iaa_augment.py +0 -114
- pyxlpr/ppocr/data/imaug/label_ops.py +0 -789
- pyxlpr/ppocr/data/imaug/make_border_map.py +0 -184
- pyxlpr/ppocr/data/imaug/make_pse_gt.py +0 -106
- pyxlpr/ppocr/data/imaug/make_shrink_map.py +0 -126
- pyxlpr/ppocr/data/imaug/operators.py +0 -433
- pyxlpr/ppocr/data/imaug/pg_process.py +0 -906
- pyxlpr/ppocr/data/imaug/randaugment.py +0 -143
- pyxlpr/ppocr/data/imaug/random_crop_data.py +0 -239
- pyxlpr/ppocr/data/imaug/rec_img_aug.py +0 -533
- pyxlpr/ppocr/data/imaug/sast_process.py +0 -777
- pyxlpr/ppocr/data/imaug/text_image_aug/__init__.py +0 -17
- pyxlpr/ppocr/data/imaug/text_image_aug/augment.py +0 -120
- pyxlpr/ppocr/data/imaug/text_image_aug/warp_mls.py +0 -168
- pyxlpr/ppocr/data/lmdb_dataset.py +0 -115
- pyxlpr/ppocr/data/pgnet_dataset.py +0 -104
- pyxlpr/ppocr/data/pubtab_dataset.py +0 -107
- pyxlpr/ppocr/data/simple_dataset.py +0 -372
- pyxlpr/ppocr/losses/__init__.py +0 -61
- pyxlpr/ppocr/losses/ace_loss.py +0 -52
- pyxlpr/ppocr/losses/basic_loss.py +0 -135
- pyxlpr/ppocr/losses/center_loss.py +0 -88
- pyxlpr/ppocr/losses/cls_loss.py +0 -30
- pyxlpr/ppocr/losses/combined_loss.py +0 -67
- pyxlpr/ppocr/losses/det_basic_loss.py +0 -208
- pyxlpr/ppocr/losses/det_db_loss.py +0 -80
- pyxlpr/ppocr/losses/det_east_loss.py +0 -63
- pyxlpr/ppocr/losses/det_pse_loss.py +0 -149
- pyxlpr/ppocr/losses/det_sast_loss.py +0 -121
- pyxlpr/ppocr/losses/distillation_loss.py +0 -272
- pyxlpr/ppocr/losses/e2e_pg_loss.py +0 -140
- pyxlpr/ppocr/losses/kie_sdmgr_loss.py +0 -113
- pyxlpr/ppocr/losses/rec_aster_loss.py +0 -99
- pyxlpr/ppocr/losses/rec_att_loss.py +0 -39
- pyxlpr/ppocr/losses/rec_ctc_loss.py +0 -44
- pyxlpr/ppocr/losses/rec_enhanced_ctc_loss.py +0 -70
- pyxlpr/ppocr/losses/rec_nrtr_loss.py +0 -30
- pyxlpr/ppocr/losses/rec_sar_loss.py +0 -28
- pyxlpr/ppocr/losses/rec_srn_loss.py +0 -47
- pyxlpr/ppocr/losses/table_att_loss.py +0 -109
- pyxlpr/ppocr/metrics/__init__.py +0 -44
- pyxlpr/ppocr/metrics/cls_metric.py +0 -45
- pyxlpr/ppocr/metrics/det_metric.py +0 -82
- pyxlpr/ppocr/metrics/distillation_metric.py +0 -73
- pyxlpr/ppocr/metrics/e2e_metric.py +0 -86
- pyxlpr/ppocr/metrics/eval_det_iou.py +0 -274
- pyxlpr/ppocr/metrics/kie_metric.py +0 -70
- pyxlpr/ppocr/metrics/rec_metric.py +0 -75
- pyxlpr/ppocr/metrics/table_metric.py +0 -50
- pyxlpr/ppocr/modeling/architectures/__init__.py +0 -32
- pyxlpr/ppocr/modeling/architectures/base_model.py +0 -88
- pyxlpr/ppocr/modeling/architectures/distillation_model.py +0 -60
- pyxlpr/ppocr/modeling/backbones/__init__.py +0 -54
- pyxlpr/ppocr/modeling/backbones/det_mobilenet_v3.py +0 -268
- pyxlpr/ppocr/modeling/backbones/det_resnet_vd.py +0 -246
- pyxlpr/ppocr/modeling/backbones/det_resnet_vd_sast.py +0 -285
- pyxlpr/ppocr/modeling/backbones/e2e_resnet_vd_pg.py +0 -265
- pyxlpr/ppocr/modeling/backbones/kie_unet_sdmgr.py +0 -186
- pyxlpr/ppocr/modeling/backbones/rec_mobilenet_v3.py +0 -138
- pyxlpr/ppocr/modeling/backbones/rec_mv1_enhance.py +0 -258
- pyxlpr/ppocr/modeling/backbones/rec_nrtr_mtb.py +0 -48
- pyxlpr/ppocr/modeling/backbones/rec_resnet_31.py +0 -210
- pyxlpr/ppocr/modeling/backbones/rec_resnet_aster.py +0 -143
- pyxlpr/ppocr/modeling/backbones/rec_resnet_fpn.py +0 -307
- pyxlpr/ppocr/modeling/backbones/rec_resnet_vd.py +0 -286
- pyxlpr/ppocr/modeling/heads/__init__.py +0 -54
- pyxlpr/ppocr/modeling/heads/cls_head.py +0 -52
- pyxlpr/ppocr/modeling/heads/det_db_head.py +0 -118
- pyxlpr/ppocr/modeling/heads/det_east_head.py +0 -121
- pyxlpr/ppocr/modeling/heads/det_pse_head.py +0 -37
- pyxlpr/ppocr/modeling/heads/det_sast_head.py +0 -128
- pyxlpr/ppocr/modeling/heads/e2e_pg_head.py +0 -253
- pyxlpr/ppocr/modeling/heads/kie_sdmgr_head.py +0 -206
- pyxlpr/ppocr/modeling/heads/multiheadAttention.py +0 -163
- pyxlpr/ppocr/modeling/heads/rec_aster_head.py +0 -393
- pyxlpr/ppocr/modeling/heads/rec_att_head.py +0 -202
- pyxlpr/ppocr/modeling/heads/rec_ctc_head.py +0 -88
- pyxlpr/ppocr/modeling/heads/rec_nrtr_head.py +0 -826
- pyxlpr/ppocr/modeling/heads/rec_sar_head.py +0 -402
- pyxlpr/ppocr/modeling/heads/rec_srn_head.py +0 -280
- pyxlpr/ppocr/modeling/heads/self_attention.py +0 -406
- pyxlpr/ppocr/modeling/heads/table_att_head.py +0 -246
- pyxlpr/ppocr/modeling/necks/__init__.py +0 -32
- pyxlpr/ppocr/modeling/necks/db_fpn.py +0 -111
- pyxlpr/ppocr/modeling/necks/east_fpn.py +0 -188
- pyxlpr/ppocr/modeling/necks/fpn.py +0 -138
- pyxlpr/ppocr/modeling/necks/pg_fpn.py +0 -314
- pyxlpr/ppocr/modeling/necks/rnn.py +0 -92
- pyxlpr/ppocr/modeling/necks/sast_fpn.py +0 -284
- pyxlpr/ppocr/modeling/necks/table_fpn.py +0 -110
- pyxlpr/ppocr/modeling/transforms/__init__.py +0 -28
- pyxlpr/ppocr/modeling/transforms/stn.py +0 -135
- pyxlpr/ppocr/modeling/transforms/tps.py +0 -308
- pyxlpr/ppocr/modeling/transforms/tps_spatial_transformer.py +0 -156
- pyxlpr/ppocr/optimizer/__init__.py +0 -61
- pyxlpr/ppocr/optimizer/learning_rate.py +0 -228
- pyxlpr/ppocr/optimizer/lr_scheduler.py +0 -49
- pyxlpr/ppocr/optimizer/optimizer.py +0 -160
- pyxlpr/ppocr/optimizer/regularizer.py +0 -52
- pyxlpr/ppocr/postprocess/__init__.py +0 -55
- pyxlpr/ppocr/postprocess/cls_postprocess.py +0 -33
- pyxlpr/ppocr/postprocess/db_postprocess.py +0 -234
- pyxlpr/ppocr/postprocess/east_postprocess.py +0 -143
- pyxlpr/ppocr/postprocess/locality_aware_nms.py +0 -200
- pyxlpr/ppocr/postprocess/pg_postprocess.py +0 -52
- pyxlpr/ppocr/postprocess/pse_postprocess/__init__.py +0 -15
- pyxlpr/ppocr/postprocess/pse_postprocess/pse/__init__.py +0 -29
- pyxlpr/ppocr/postprocess/pse_postprocess/pse/setup.py +0 -14
- pyxlpr/ppocr/postprocess/pse_postprocess/pse_postprocess.py +0 -118
- pyxlpr/ppocr/postprocess/rec_postprocess.py +0 -654
- pyxlpr/ppocr/postprocess/sast_postprocess.py +0 -355
- pyxlpr/ppocr/tools/__init__.py +0 -14
- pyxlpr/ppocr/tools/eval.py +0 -83
- pyxlpr/ppocr/tools/export_center.py +0 -77
- pyxlpr/ppocr/tools/export_model.py +0 -129
- pyxlpr/ppocr/tools/infer/predict_cls.py +0 -151
- pyxlpr/ppocr/tools/infer/predict_det.py +0 -300
- pyxlpr/ppocr/tools/infer/predict_e2e.py +0 -169
- pyxlpr/ppocr/tools/infer/predict_rec.py +0 -414
- pyxlpr/ppocr/tools/infer/predict_system.py +0 -204
- pyxlpr/ppocr/tools/infer/utility.py +0 -629
- pyxlpr/ppocr/tools/infer_cls.py +0 -83
- pyxlpr/ppocr/tools/infer_det.py +0 -134
- pyxlpr/ppocr/tools/infer_e2e.py +0 -122
- pyxlpr/ppocr/tools/infer_kie.py +0 -153
- pyxlpr/ppocr/tools/infer_rec.py +0 -146
- pyxlpr/ppocr/tools/infer_table.py +0 -107
- pyxlpr/ppocr/tools/program.py +0 -596
- pyxlpr/ppocr/tools/test_hubserving.py +0 -117
- pyxlpr/ppocr/tools/train.py +0 -163
- pyxlpr/ppocr/tools/xlprog.py +0 -748
- pyxlpr/ppocr/utils/EN_symbol_dict.txt +0 -94
- pyxlpr/ppocr/utils/__init__.py +0 -24
- pyxlpr/ppocr/utils/dict/ar_dict.txt +0 -117
- pyxlpr/ppocr/utils/dict/arabic_dict.txt +0 -162
- pyxlpr/ppocr/utils/dict/be_dict.txt +0 -145
- pyxlpr/ppocr/utils/dict/bg_dict.txt +0 -140
- pyxlpr/ppocr/utils/dict/chinese_cht_dict.txt +0 -8421
- pyxlpr/ppocr/utils/dict/cyrillic_dict.txt +0 -163
- pyxlpr/ppocr/utils/dict/devanagari_dict.txt +0 -167
- pyxlpr/ppocr/utils/dict/en_dict.txt +0 -63
- pyxlpr/ppocr/utils/dict/fa_dict.txt +0 -136
- pyxlpr/ppocr/utils/dict/french_dict.txt +0 -136
- pyxlpr/ppocr/utils/dict/german_dict.txt +0 -143
- pyxlpr/ppocr/utils/dict/hi_dict.txt +0 -162
- pyxlpr/ppocr/utils/dict/it_dict.txt +0 -118
- pyxlpr/ppocr/utils/dict/japan_dict.txt +0 -4399
- pyxlpr/ppocr/utils/dict/ka_dict.txt +0 -153
- pyxlpr/ppocr/utils/dict/korean_dict.txt +0 -3688
- pyxlpr/ppocr/utils/dict/latin_dict.txt +0 -185
- pyxlpr/ppocr/utils/dict/mr_dict.txt +0 -153
- pyxlpr/ppocr/utils/dict/ne_dict.txt +0 -153
- pyxlpr/ppocr/utils/dict/oc_dict.txt +0 -96
- pyxlpr/ppocr/utils/dict/pu_dict.txt +0 -130
- pyxlpr/ppocr/utils/dict/rs_dict.txt +0 -91
- pyxlpr/ppocr/utils/dict/rsc_dict.txt +0 -134
- pyxlpr/ppocr/utils/dict/ru_dict.txt +0 -125
- pyxlpr/ppocr/utils/dict/ta_dict.txt +0 -128
- pyxlpr/ppocr/utils/dict/table_dict.txt +0 -277
- pyxlpr/ppocr/utils/dict/table_structure_dict.txt +0 -2759
- pyxlpr/ppocr/utils/dict/te_dict.txt +0 -151
- pyxlpr/ppocr/utils/dict/ug_dict.txt +0 -114
- pyxlpr/ppocr/utils/dict/uk_dict.txt +0 -142
- pyxlpr/ppocr/utils/dict/ur_dict.txt +0 -137
- pyxlpr/ppocr/utils/dict/xi_dict.txt +0 -110
- pyxlpr/ppocr/utils/dict90.txt +0 -90
- pyxlpr/ppocr/utils/e2e_metric/Deteval.py +0 -574
- pyxlpr/ppocr/utils/e2e_metric/polygon_fast.py +0 -83
- pyxlpr/ppocr/utils/e2e_utils/extract_batchsize.py +0 -87
- pyxlpr/ppocr/utils/e2e_utils/extract_textpoint_fast.py +0 -457
- pyxlpr/ppocr/utils/e2e_utils/extract_textpoint_slow.py +0 -592
- pyxlpr/ppocr/utils/e2e_utils/pgnet_pp_utils.py +0 -162
- pyxlpr/ppocr/utils/e2e_utils/visual.py +0 -162
- pyxlpr/ppocr/utils/en_dict.txt +0 -95
- pyxlpr/ppocr/utils/gen_label.py +0 -81
- pyxlpr/ppocr/utils/ic15_dict.txt +0 -36
- pyxlpr/ppocr/utils/iou.py +0 -54
- pyxlpr/ppocr/utils/logging.py +0 -69
- pyxlpr/ppocr/utils/network.py +0 -84
- pyxlpr/ppocr/utils/ppocr_keys_v1.txt +0 -6623
- pyxlpr/ppocr/utils/profiler.py +0 -110
- pyxlpr/ppocr/utils/save_load.py +0 -150
- pyxlpr/ppocr/utils/stats.py +0 -72
- pyxlpr/ppocr/utils/utility.py +0 -80
- pyxlpr/ppstructure/__init__.py +0 -13
- pyxlpr/ppstructure/predict_system.py +0 -187
- pyxlpr/ppstructure/table/__init__.py +0 -13
- pyxlpr/ppstructure/table/eval_table.py +0 -72
- pyxlpr/ppstructure/table/matcher.py +0 -192
- pyxlpr/ppstructure/table/predict_structure.py +0 -136
- pyxlpr/ppstructure/table/predict_table.py +0 -221
- pyxlpr/ppstructure/table/table_metric/__init__.py +0 -16
- pyxlpr/ppstructure/table/table_metric/parallel.py +0 -51
- pyxlpr/ppstructure/table/table_metric/table_metric.py +0 -247
- pyxlpr/ppstructure/table/tablepyxl/__init__.py +0 -13
- pyxlpr/ppstructure/table/tablepyxl/style.py +0 -283
- pyxlpr/ppstructure/table/tablepyxl/tablepyxl.py +0 -118
- pyxlpr/ppstructure/utility.py +0 -71
- pyxlpr/xlai.py +0 -10
- /pyxllib/{ext/autogui → autogui}/virtualkey.py +0 -0
- {pyxllib-0.3.96.dist-info → pyxllib-0.3.197.dist-info/licenses}/LICENSE +0 -0
pyxllib/file/xlsxlib.py
CHANGED
@@ -7,12 +7,14 @@
|
|
7
7
|
"""
|
8
8
|
扩展了些自己的openpyxl工具
|
9
9
|
"""
|
10
|
+
import copy
|
11
|
+
|
10
12
|
import time
|
11
13
|
|
12
14
|
from pyxllib.prog.pupil import check_install_package, run_once
|
13
15
|
|
14
16
|
check_install_package('openpyxl')
|
15
|
-
check_install_package('premailer')
|
17
|
+
# check_install_package('premailer')
|
16
18
|
# check_install_package('xlrd2')
|
17
19
|
check_install_package('yattag')
|
18
20
|
check_install_package('jsonpickle')
|
@@ -30,11 +32,13 @@ import io
|
|
30
32
|
|
31
33
|
import xlrd
|
32
34
|
|
35
|
+
import filetype
|
33
36
|
import openpyxl
|
34
37
|
from openpyxl import Workbook
|
35
38
|
from openpyxl.cell.cell import MergedCell
|
36
39
|
from openpyxl.styles import Font, Alignment
|
37
40
|
from openpyxl.utils.cell import get_column_letter, column_index_from_string
|
41
|
+
import openpyxl.worksheet.formula
|
38
42
|
import pandas as pd
|
39
43
|
|
40
44
|
try:
|
@@ -117,7 +121,7 @@ def is_valid_excel_address(address):
|
|
117
121
|
return is_valid_excel_cell(address)
|
118
122
|
|
119
123
|
|
120
|
-
@run_once('str'
|
124
|
+
@run_once('str')
|
121
125
|
def xlfmt2pyfmt_date(xl_fmt):
|
122
126
|
""" 日期的渲染操作
|
123
127
|
|
@@ -233,6 +237,8 @@ def xl_render_value(x, xl_fmt):
|
|
233
237
|
注意,遇到公式是很难计算处理的,大概率只能保持原公式显示
|
234
238
|
因为日期用的比较多,需要时常获得真实的渲染效果,所以这里封装一个接口
|
235
239
|
|
240
|
+
对于JSA等场景,直接使用Cell.Text获取渲染值就行,不需要这里这么复杂的实现
|
241
|
+
|
236
242
|
>>> xl_render_value(datetime.datetime(2020, 1, 1), 'yyyy-mm-dd')
|
237
243
|
'2020-01-01'
|
238
244
|
"""
|
@@ -351,19 +357,73 @@ def convert_xls_to_xlsx(xls_file):
|
|
351
357
|
|
352
358
|
|
353
359
|
def load_as_xlsx_file(file_path, keep_links=False, keep_vba=False):
    """ Open a spreadsheet-like file as a workbook without fully trusting its extension.

    Readers are attempted in this order: the one matching the file's own suffix,
    the one matching the type guessed by ``filetype``, and finally xlsx / xls / csv
    in turn.

    :param file_path: path of the file to load
    :param keep_links: forwarded to ``openpyxl.load_workbook``
    :param keep_vba: forwarded to ``openpyxl.load_workbook``
    :return: ``(workbook, suffix)`` for the first reader that succeeds, otherwise
        ``(None, error)`` where ``error`` is the message produced by the first attempt
    """
    # The nested readers look this variable up when they are called, so it is
    # normalised to a Path before any of them runs.
    file_path = Path(file_path)

    # 0 Readers: each returns (workbook, '') on success or (None, error_text) on failure
    @run_once()
    def read_xlsx():
        # Per the original note, openpyxl refuses files whose suffix is not xlsx/xlsm,
        # so in that case the raw bytes are handed over through an in-memory buffer.
        if file_path.suffix[1:] in ('xlsx', 'xlsm'):
            source = file_path
        else:
            with open(file_path, 'rb') as stream:
                source = io.BytesIO(stream.read())
        try:
            workbook = openpyxl.load_workbook(source,
                                              keep_links=keep_links,
                                              keep_vba=keep_vba)
        except TimeoutError:  # the overall timeout fired; let it propagate
            raise
        except Exception as exc:
            return None, format_exception(exc, 2)
        return workbook, ''

    @run_once()
    def read_xls():
        try:
            workbook = convert_xls_to_xlsx(file_path)
        except Exception as exc:
            return None, format_exception(exc, 2)
        return workbook, ''

    @run_once()
    def read_csv():
        try:
            workbook = convert_csv_to_xlsx(file_path)
        except Exception as exc:
            return None, format_exception(exc, 2)
        return workbook, ''

    readers = {
        'xlsx': read_xlsx,
        'xlsm': read_xlsx,
        'zip': read_xlsx,
        'xls': read_xls,
        'csv': read_csv,
    }

    def read_test(suffix):
        reader = readers.get(suffix)
        if reader is None:
            return None, f'不支持的文件类型:{suffix}'
        workbook, message = reader()
        return workbook, message

    # 1 Trust the extension given in the file name first
    own_suffix = file_path.suffix.lower()[1:]
    wb, first_error = read_test(own_suffix)
    if wb is not None:
        return wb, own_suffix

    # 2 Otherwise try the type detected by filetype
    guessed = filetype.guess(file_path)
    guessed_suffix = guessed.extension if guessed else ''
    wb, _ = read_test(guessed_suffix)
    if wb is not None:
        return wb, guessed_suffix

    # 3 Still nothing: walk through the remaining candidate types
    for candidate in ('xlsx', 'xls', 'csv'):
        wb, _ = read_test(candidate)
        if wb is not None:
            return wb, candidate

    # 4 Genuinely unreadable: report the error from the first attempt
    return None, first_error
|
367
427
|
|
368
428
|
|
369
429
|
def parse_range_address(address):
|
@@ -466,7 +526,7 @@ def is_string_type(value):
|
|
466
526
|
try:
|
467
527
|
pd.to_datetime(value, errors='raise')
|
468
528
|
return False
|
469
|
-
except (ValueError, TypeError, OverflowError):
|
529
|
+
except (ValueError, TypeError, OverflowError, AttributeError):
|
470
530
|
pass
|
471
531
|
|
472
532
|
# 检查是否为浮点数类型
|
@@ -521,10 +581,18 @@ class XlCell(openpyxl.cell.cell.Cell): # 适用于 openpyxl.cell.cell.MergedCel
|
|
521
581
|
|
522
582
|
TODO 这个函数还是可以看看能不能有更好的实现、提速
|
523
583
|
"""
|
584
|
+
|
585
|
+
def try_offset(x, y):
|
586
|
+
try:
|
587
|
+
return isinstance(self.offset(x, y), MergedCell)
|
588
|
+
except ValueError:
|
589
|
+
# 有可能会越界:ValueError: Row numbers must be between 1 and 1048576
|
590
|
+
return False
|
591
|
+
|
524
592
|
_type, status = 0, {}
|
525
593
|
if isinstance(self, MergedCell):
|
526
594
|
_type = 1
|
527
|
-
elif
|
595
|
+
elif try_offset(1, 0) or try_offset(0, 1):
|
528
596
|
# 这里只能判断可能是合并单元格,具体是不是合并单元格,还要
|
529
597
|
rng = self.in_range()
|
530
598
|
status['rng'] = rng
|
@@ -698,7 +766,7 @@ class XlCell(openpyxl.cell.cell.Cell): # 适用于 openpyxl.cell.cell.MergedCel
|
|
698
766
|
# openpyxl的机制,如果没有配置日期格式,读取到的是默认的'mm-dd-yy',其实在中文场景,默认格式应该是后者
|
699
767
|
if fmt == 'mm-dd-yy':
|
700
768
|
return 'yyyy/m/d' # 中文的默认日期格式
|
701
|
-
elif fmt == 'yyyy\-mm\-dd': # 不知道为什么会有提取到这种\的情况,先暴力替换了
|
769
|
+
elif fmt == r'yyyy\-mm\-dd': # 不知道为什么会有提取到这种\的情况,先暴力替换了
|
702
770
|
fmt = 'yyyy-mm-dd'
|
703
771
|
return fmt
|
704
772
|
|
@@ -709,7 +777,10 @@ class XlCell(openpyxl.cell.cell.Cell): # 适用于 openpyxl.cell.cell.MergedCel
|
|
709
777
|
注意,遇到公式是很难计算处理的,大概率只能保持原公式显示
|
710
778
|
因为日期用的比较多,需要时常获得真实的渲染效果,所以这里封装一个接口
|
711
779
|
"""
|
780
|
+
|
712
781
|
x = self.value
|
782
|
+
if isinstance(x, openpyxl.worksheet.formula.ArrayFormula): # 数组公式要特别渲染
|
783
|
+
return x.text
|
713
784
|
xl_fmt = self.get_number_format()
|
714
785
|
return xl_render_value(x, xl_fmt)
|
715
786
|
|
@@ -1136,7 +1207,7 @@ class XlWorksheet(openpyxl.worksheet.worksheet.Worksheet):
|
|
1136
1207
|
|
1137
1208
|
return df
|
1138
1209
|
|
1139
|
-
def copy_range(self, src_addr, dst_cell, *, temp_sheet=False,
|
1210
|
+
def copy_range(self, src_addr, dst_cell, *, temp_sheet=False, return_mode=False):
|
1140
1211
|
""" 将自身cell_range区间的内容、格式,拷贝到目标dst_cell里
|
1141
1212
|
|
1142
1213
|
:param str src_addr: 自身的一片单元格范围
|
@@ -1158,7 +1229,7 @@ class XlWorksheet(openpyxl.worksheet.worksheet.Worksheet):
|
|
1158
1229
|
mid_result = {}
|
1159
1230
|
if temp_sheet:
|
1160
1231
|
ws3 = self.parent.create_sheet('__copy_range')
|
1161
|
-
mid_result = self.copy_range(src_addr, ws3['A1'],
|
1232
|
+
mid_result = self.copy_range(src_addr, ws3['A1'], return_mode=True)
|
1162
1233
|
ws1 = ws3
|
1163
1234
|
src_addr = f'A1:{excel_addr(mid_result["n"], mid_result["m"])}'
|
1164
1235
|
else:
|
@@ -1193,7 +1264,7 @@ class XlWorksheet(openpyxl.worksheet.worksheet.Worksheet):
|
|
1193
1264
|
if temp_sheet:
|
1194
1265
|
self.parent.remove(ws1)
|
1195
1266
|
|
1196
|
-
if
|
1267
|
+
if return_mode:
|
1197
1268
|
return mid_result
|
1198
1269
|
|
1199
1270
|
def reindex_columns(self, orders):
|
@@ -1964,7 +2035,7 @@ class XlWorkbook(openpyxl.Workbook):
|
|
1964
2035
|
'cells': extract_cells_content(ws)
|
1965
2036
|
})
|
1966
2037
|
|
1967
|
-
if not summary['cells']: # 如果没有数据,则大概率是数据透视表,是计算出来的,读取不到~
|
2038
|
+
if not summary['cells']: # 如果没有数据,则大概率是数据透视表,是计算出来的,读取不到~ 但是JSA等场景应该有办法获得
|
1968
2039
|
summary['sheetType'] = 'PivotTable'
|
1969
2040
|
del summary['cells']
|
1970
2041
|
else:
|
@@ -2399,7 +2470,7 @@ def extract_workbook_summary2(file_path, *,
|
|
2399
2470
|
"""
|
2400
2471
|
:param keep_links: 是否保留外部表格链接数据。如果保留,打开好像会有点问题。
|
2401
2472
|
:param mode:
|
2402
|
-
0,最原始的
|
2473
|
+
0,最原始的summary2摘要
|
2403
2474
|
1,添加当前工作表、单元格位置的信息
|
2404
2475
|
:param kwargs: 捕捉其他参数,主要是向下兼容,其实现在并没有用
|
2405
2476
|
|
@@ -2411,7 +2482,12 @@ def extract_workbook_summary2(file_path, *,
|
|
2411
2482
|
res = {}
|
2412
2483
|
res['fileName'] = file_path.name
|
2413
2484
|
start_time = time.time()
|
2414
|
-
wb = load_as_xlsx_file(file_path, keep_links=keep_links, keep_vba=keep_vba)
|
2485
|
+
wb, suffix = load_as_xlsx_file(file_path, keep_links=keep_links, keep_vba=keep_vba)
|
2486
|
+
if wb is None:
|
2487
|
+
res['error'] = f'Load file error。{suffix}'
|
2488
|
+
else:
|
2489
|
+
res['fileType'] = suffix
|
2490
|
+
|
2415
2491
|
load_time = time.time() - start_time
|
2416
2492
|
if wb is None: # 不支持的文件类型,不报错,只是返回最基本的文件名信息
|
2417
2493
|
if return_mode == 1:
|
@@ -2459,7 +2535,7 @@ def update_raw_summary2(data):
|
|
2459
2535
|
|
2460
2536
|
# 3 判断键值顺序
|
2461
2537
|
keys = list(data.keys())
|
2462
|
-
ref_keys = ['fileName', 'chineseContentRatio', 'nonEmptyCellRatio', 'sheetNames', 'sheets']
|
2538
|
+
ref_keys = ['fileName', 'fileType', 'chineseContentRatio', 'nonEmptyCellRatio', 'sheetNames', 'sheets']
|
2463
2539
|
if keys != ref_keys:
|
2464
2540
|
data = {k: data[k] for k in ref_keys if k in data}
|
2465
2541
|
|
@@ -2597,30 +2673,39 @@ class WorkbookSummary3:
|
|
2597
2673
|
for addr, _ in row:
|
2598
2674
|
new_cells[addr] = cells[addr]
|
2599
2675
|
|
2600
|
-
|
2601
|
-
for rows in rows_groups:
|
2676
|
+
total_new_cells = []
|
2677
|
+
for rows in reversed(rows_groups):
|
2678
|
+
new_cells = {}
|
2602
2679
|
if len(rows) < 10:
|
2603
2680
|
extract_cells_from_rows(rows)
|
2604
2681
|
else: # 压缩中间的数据
|
2605
2682
|
# 如果评估到最终摘要可能太小,要收敛下删除的范围
|
2606
2683
|
n, m = len(rows), len(rows[0])
|
2607
2684
|
target_n = int(target_reduce_cells_num / m + 0.5) # 本来应该删除多少行才行
|
2608
|
-
|
2609
|
-
|
2610
|
-
|
2611
|
-
|
2612
|
-
|
2613
|
-
|
2614
|
-
|
2615
|
-
|
2616
|
-
|
2617
|
-
|
2618
|
-
|
2619
|
-
|
2620
|
-
|
2621
|
-
|
2622
|
-
|
2623
|
-
|
2685
|
+
if target_n <= 0: # 如果删除的行数太少,那么就不压缩了
|
2686
|
+
extract_cells_from_rows(rows)
|
2687
|
+
else:
|
2688
|
+
cur_n = n - 4 if target_n > n - 4 else target_n # 实际删除多少行
|
2689
|
+
left_n = n - cur_n # 剩余多少行
|
2690
|
+
b = left_n // 2
|
2691
|
+
a = left_n - b
|
2692
|
+
|
2693
|
+
extract_cells_from_rows(rows[:a])
|
2694
|
+
addr = combine_addresses(rows[a][0][0], rows[-b - 1][-1][0])
|
2695
|
+
# new_cells[addr] = '这块区域的内容跟前面几行、后面几行的内容结构是一致的,省略显示'
|
2696
|
+
new_cells[addr] = '...'
|
2697
|
+
extract_cells_from_rows(rows[-b:])
|
2698
|
+
|
2699
|
+
target_reduce_cells_num -= cur_n * m
|
2700
|
+
# 240429周一21:57,这两行不能开,否则会过渡精简。如果压缩够了,那么后面的单元格需要全量补上。
|
2701
|
+
# if target_reduce_cells_num <= 0: # 满足以后不是直接break,而是要把后续的内容都保留
|
2702
|
+
# break
|
2703
|
+
total_new_cells.append(new_cells)
|
2704
|
+
|
2705
|
+
new_cells2 = {}
|
2706
|
+
for rows in reversed(total_new_cells):
|
2707
|
+
new_cells2.update(rows)
|
2708
|
+
sheet['cells'] = new_cells2
|
2624
2709
|
|
2625
2710
|
@classmethod
|
2626
2711
|
def reduce4_truncate_cells(cls, y, summary_limit_len, *, cur_summary_len=None):
|
@@ -2752,6 +2837,7 @@ class WorkbookSummary3:
|
|
2752
2837
|
if cur_summary_len is None:
|
2753
2838
|
cur_summary_len = cls.count_length(y)
|
2754
2839
|
|
2840
|
+
cur_summary_len0 = cur_summary_len
|
2755
2841
|
active_sheet = y['ActiveSheet']
|
2756
2842
|
|
2757
2843
|
# 1 预计要删除单元格数
|
@@ -2783,7 +2869,8 @@ class WorkbookSummary3:
|
|
2783
2869
|
return cls.count_length(y)
|
2784
2870
|
|
2785
2871
|
# 4 否则每张表按照比例删单元格,只保留前面部分的单元格
|
2786
|
-
|
2872
|
+
# todo 这里应该有更好的筛选机制,后续可以思考思考
|
2873
|
+
left_rate = min((summary_limit_len + cur_summary_len) / (2 * cur_summary_len), 0.9) # 首轮减小一点调整幅度
|
2787
2874
|
while True:
|
2788
2875
|
for i, st in enumerate(y['sheets']):
|
2789
2876
|
if i == active_sheet_index:
|
@@ -2795,10 +2882,10 @@ class WorkbookSummary3:
|
|
2795
2882
|
cur_summary_len = cls.count_length(y)
|
2796
2883
|
if cur_summary_len <= summary_limit_len:
|
2797
2884
|
return cur_summary_len
|
2798
|
-
if left_rate * total_cells_num < 1:
|
2885
|
+
if left_rate * total_cells_num < 1: # 都没有单元格,别删了
|
2799
2886
|
break
|
2800
|
-
else:
|
2801
|
-
left_rate *= 0.
|
2887
|
+
else: # 更新保留比率,再试
|
2888
|
+
left_rate *= min(summary_limit_len / cur_summary_len, 0.9)
|
2802
2889
|
|
2803
2890
|
return cur_summary_len
|
2804
2891
|
|
@@ -2881,8 +2968,12 @@ class WorkbookSummary3:
|
|
2881
2968
|
return y
|
2882
2969
|
|
2883
2970
|
x = summary2
|
2971
|
+
if 'error' in x:
|
2972
|
+
return x
|
2973
|
+
|
2884
2974
|
y = {
|
2885
2975
|
'fileName': x['fileName'],
|
2976
|
+
'fileType': x['fileType'],
|
2886
2977
|
'sheetNames': x['sheetNames'],
|
2887
2978
|
'sheets': x['sheets'],
|
2888
2979
|
'mode': 'Complete information',
|
@@ -2927,33 +3018,90 @@ def extract_workbook_summary3(file_path, summary_limit_len=4000, **kwargs):
|
|
2927
3018
|
return data
|
2928
3019
|
|
2929
3020
|
|
3021
|
+
def summary2_add_enums(summary2, enum_values):
    """ Attach per-column enumerated values to each sheet of a summary2 dict.

    For every sheet, cell values are counted per column letter. A column gets an
    entry in ``sheet['enums']`` (values ordered by descending frequency) when it has
    at most ``max_num`` distinct values and at least one value occurs more than once.

    :param dict summary2: summary whose ``'sheets'`` entry is a list of sheet dicts;
        it is modified in place
    :param enum_values: ``True`` for the default thresholds ``(20, 10)``, or a
        ``(max_len, max_num)`` pair. Values whose text is empty or longer than
        ``max_len`` are ignored.
    :return: the same ``summary2`` object
    """
    # 1 Resolve the thresholds
    if enum_values is True:
        enum_values = (20, 10)
    max_len, max_num = enum_values
    col_pattern = re.compile(r'[A-Z]+')

    # 2 Enumerate values sheet by sheet
    for sheet in summary2['sheets']:
        # 2.1 Count values per column.
        # A sheet may carry no 'cells' entry (elsewhere in this file the key is
        # deleted for sheets classified as PivotTable), so treat that as empty
        # instead of raising KeyError.
        cells = sheet.get('cells') or {}
        cols = defaultdict(Counter)
        for addr, val in cells.items():
            n = len(str(val))
            if not n or n > max_len:
                continue
            matched = col_pattern.match(addr)
            if matched is None:  # address without a leading column letter: nothing to group by
                continue
            cols[matched.group()][val] += 1
        if not cols:
            continue

        # 2.2 Keep the columns that look like enumerations, in spreadsheet column order
        enums = {}
        for k in sorted(cols, key=column_index_from_string):
            ct = cols[k]
            if len(ct) > max_num:
                continue
            vals = ct.most_common()
            # Every value occurring once is not an enumeration (or the table is
            # small enough to be shown in full anyway).
            if vals[0][1] == 1:
                continue
            enums[k] = [v for v, _ in vals]

        # 2.3 Store
        if enums:
            sheet['enums'] = enums

    return summary2
|
3057
|
+
|
3058
|
+
|
2930
3059
|
def extract_workbook_summary3b(file_path,
|
2931
3060
|
summary_limit_len=4000,
|
2932
3061
|
timeout_seconds=10,
|
2933
3062
|
return_mode=0,
|
2934
3063
|
debug=False,
|
2935
3064
|
len_mode=0,
|
3065
|
+
enum_values=False,
|
2936
3066
|
**kwargs):
|
2937
3067
|
"""
|
2938
3068
|
|
2939
3069
|
:param summary_limit_len: 摘要长度限制
|
2940
3070
|
:param timeout_seconds: 超时限制
|
2941
|
-
:param return_mode:
|
3071
|
+
:param return_mode: 返回模式
|
3072
|
+
0,表示只返回摘要
|
3073
|
+
1,表示返回摘要和耗时
|
3074
|
+
2, 再增加返回summary2
|
2942
3075
|
:param len_mode:
|
2943
3076
|
0, 使用len作为token长度评估
|
2944
3077
|
1, 使用模型评估实际token长度
|
3078
|
+
:param enum_values: 是否展示每列枚举值
|
3079
|
+
False, 默认不展示
|
3080
|
+
True, 展示,并且默认参数 (20, 10) 表示长度超过20的丢弃,只保留枚举类型不超过10种值的列
|
2945
3081
|
:param kwargs: 其他是summary2读取文件的时候的参数,其实都不太关键,一般不用特地设置
|
2946
3082
|
"""
|
2947
3083
|
res = {}
|
2948
3084
|
res['fileName'] = Path(file_path).name
|
2949
3085
|
load_time = summary2_time = summary3_time = -1
|
3086
|
+
summary2_res = {}
|
3087
|
+
|
3088
|
+
def reduce_summary(summary):
    """ Drop the 'sheets' field when the JSON form of summary exceeds 4K characters.

    The dict is modified in place; nothing is returned.

    :param dict summary: result dict that may contain a bulky ``'sheets'`` entry
    """
    # default=str is used only so that this length probe cannot fail on values
    # json does not know how to encode; the summary itself is not converted.
    s = json.dumps(summary, ensure_ascii=False, default=str)
    # The original compared with '<', which stripped small summaries and kept
    # the oversized ones -- the opposite of the documented intent.
    if len(s) > 4000:
        summary.pop('sheets', None)
|
2950
3094
|
|
2951
3095
|
try:
|
2952
3096
|
with Timeout(timeout_seconds):
|
2953
3097
|
start_time = time.time()
|
2954
3098
|
res, load_time = extract_workbook_summary2(file_path, mode=1, return_mode=1, **kwargs)
|
2955
3099
|
# res = convert_to_json_compatible(res)
|
3100
|
+
summary2_res = copy.deepcopy(res)
|
3101
|
+
if enum_values:
|
3102
|
+
res = summary2_add_enums(res, enum_values)
|
2956
3103
|
summary2_time = time.time() - start_time - load_time
|
3104
|
+
|
2957
3105
|
start_time = time.time()
|
2958
3106
|
if len_mode == 1:
|
2959
3107
|
res = WorkbookSummary3plus.summary2_to_summary3b(res, summary_limit_len)
|
@@ -2964,13 +3112,20 @@ def extract_workbook_summary3b(file_path,
|
|
2964
3112
|
if debug:
|
2965
3113
|
raise e
|
2966
3114
|
res['error'] = f'超时,未完成摘要提取:{timeout_seconds}秒'
|
3115
|
+
reduce_summary(res)
|
2967
3116
|
except Exception as e:
|
2968
3117
|
if debug:
|
2969
3118
|
raise e
|
2970
3119
|
res['error'] = f'提取摘要时发生错误:{format_exception(e, 2)}'
|
3120
|
+
reduce_summary(res)
|
3121
|
+
|
3122
|
+
time_dict = {'load_time': human_readable_number(load_time),
|
3123
|
+
'summary2_time': human_readable_number(summary2_time),
|
3124
|
+
'summary3_time': human_readable_number(summary3_time)}
|
2971
3125
|
|
2972
3126
|
if return_mode == 1:
|
2973
|
-
return res,
|
2974
|
-
|
2975
|
-
|
3127
|
+
return res, time_dict
|
3128
|
+
elif return_mode == 2:
|
3129
|
+
return res, time_dict, summary2_res
|
3130
|
+
|
2976
3131
|
return res
|