pyxllib 0.3.96__py3-none-any.whl → 0.3.197__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (306)
  1. pyxllib/algo/geo.py +12 -0
  2. pyxllib/algo/intervals.py +1 -1
  3. pyxllib/algo/matcher.py +78 -0
  4. pyxllib/algo/pupil.py +187 -19
  5. pyxllib/algo/specialist.py +2 -1
  6. pyxllib/algo/stat.py +38 -2
  7. {pyxlpr → pyxllib/autogui}/__init__.py +1 -1
  8. pyxllib/autogui/activewin.py +246 -0
  9. pyxllib/autogui/all.py +9 -0
  10. pyxllib/{ext/autogui → autogui}/autogui.py +40 -11
  11. pyxllib/autogui/uiautolib.py +362 -0
  12. pyxllib/autogui/wechat.py +827 -0
  13. pyxllib/autogui/wechat_msg.py +421 -0
  14. pyxllib/autogui/wxautolib.py +84 -0
  15. pyxllib/cv/slidercaptcha.py +137 -0
  16. pyxllib/data/echarts.py +123 -12
  17. pyxllib/data/jsonlib.py +89 -0
  18. pyxllib/data/pglib.py +514 -30
  19. pyxllib/data/sqlite.py +231 -4
  20. pyxllib/ext/JLineViewer.py +14 -1
  21. pyxllib/ext/drissionlib.py +277 -0
  22. pyxllib/ext/kq5034lib.py +0 -1594
  23. pyxllib/ext/robustprocfile.py +497 -0
  24. pyxllib/ext/unixlib.py +6 -5
  25. pyxllib/ext/utools.py +108 -95
  26. pyxllib/ext/webhook.py +32 -14
  27. pyxllib/ext/wjxlib.py +88 -0
  28. pyxllib/ext/wpsapi.py +124 -0
  29. pyxllib/ext/xlwork.py +9 -0
  30. pyxllib/ext/yuquelib.py +1003 -71
  31. pyxllib/file/docxlib.py +1 -1
  32. pyxllib/file/libreoffice.py +165 -0
  33. pyxllib/file/movielib.py +9 -0
  34. pyxllib/file/packlib/__init__.py +112 -75
  35. pyxllib/file/pdflib.py +1 -1
  36. pyxllib/file/pupil.py +1 -1
  37. pyxllib/file/specialist/dirlib.py +1 -1
  38. pyxllib/file/specialist/download.py +10 -3
  39. pyxllib/file/specialist/filelib.py +266 -55
  40. pyxllib/file/xlsxlib.py +205 -50
  41. pyxllib/file/xlsyncfile.py +341 -0
  42. pyxllib/prog/cachetools.py +64 -0
  43. pyxllib/prog/filelock.py +42 -0
  44. pyxllib/prog/multiprogs.py +940 -0
  45. pyxllib/prog/newbie.py +9 -2
  46. pyxllib/prog/pupil.py +129 -60
  47. pyxllib/prog/specialist/__init__.py +176 -2
  48. pyxllib/prog/specialist/bc.py +5 -2
  49. pyxllib/prog/specialist/browser.py +11 -2
  50. pyxllib/prog/specialist/datetime.py +68 -0
  51. pyxllib/prog/specialist/tictoc.py +12 -13
  52. pyxllib/prog/specialist/xllog.py +5 -5
  53. pyxllib/prog/xlosenv.py +7 -0
  54. pyxllib/text/airscript.js +744 -0
  55. pyxllib/text/charclasslib.py +17 -5
  56. pyxllib/text/jiebalib.py +6 -3
  57. pyxllib/text/jinjalib.py +32 -0
  58. pyxllib/text/jsa_ai_prompt.md +271 -0
  59. pyxllib/text/jscode.py +159 -4
  60. pyxllib/text/nestenv.py +1 -1
  61. pyxllib/text/newbie.py +12 -0
  62. pyxllib/text/pupil/common.py +26 -0
  63. pyxllib/text/specialist/ptag.py +2 -2
  64. pyxllib/text/templates/echart_base.html +11 -0
  65. pyxllib/text/templates/highlight_code.html +17 -0
  66. pyxllib/text/templates/latex_editor.html +103 -0
  67. pyxllib/text/xmllib.py +76 -14
  68. pyxllib/xl.py +2 -1
  69. pyxllib-0.3.197.dist-info/METADATA +48 -0
  70. pyxllib-0.3.197.dist-info/RECORD +126 -0
  71. {pyxllib-0.3.96.dist-info → pyxllib-0.3.197.dist-info}/WHEEL +1 -2
  72. pyxllib/ext/autogui/__init__.py +0 -8
  73. pyxllib-0.3.96.dist-info/METADATA +0 -51
  74. pyxllib-0.3.96.dist-info/RECORD +0 -333
  75. pyxllib-0.3.96.dist-info/top_level.txt +0 -2
  76. pyxlpr/ai/__init__.py +0 -5
  77. pyxlpr/ai/clientlib.py +0 -1281
  78. pyxlpr/ai/specialist.py +0 -286
  79. pyxlpr/ai/torch_app.py +0 -172
  80. pyxlpr/ai/xlpaddle.py +0 -655
  81. pyxlpr/ai/xltorch.py +0 -705
  82. pyxlpr/data/__init__.py +0 -11
  83. pyxlpr/data/coco.py +0 -1325
  84. pyxlpr/data/datacls.py +0 -365
  85. pyxlpr/data/datasets.py +0 -200
  86. pyxlpr/data/gptlib.py +0 -1291
  87. pyxlpr/data/icdar/__init__.py +0 -96
  88. pyxlpr/data/icdar/deteval.py +0 -377
  89. pyxlpr/data/icdar/icdar2013.py +0 -341
  90. pyxlpr/data/icdar/iou.py +0 -340
  91. pyxlpr/data/icdar/rrc_evaluation_funcs_1_1.py +0 -463
  92. pyxlpr/data/imtextline.py +0 -473
  93. pyxlpr/data/labelme.py +0 -866
  94. pyxlpr/data/removeline.py +0 -179
  95. pyxlpr/data/specialist.py +0 -57
  96. pyxlpr/eval/__init__.py +0 -85
  97. pyxlpr/paddleocr.py +0 -776
  98. pyxlpr/ppocr/__init__.py +0 -15
  99. pyxlpr/ppocr/configs/rec/multi_language/generate_multi_language_configs.py +0 -226
  100. pyxlpr/ppocr/data/__init__.py +0 -135
  101. pyxlpr/ppocr/data/imaug/ColorJitter.py +0 -26
  102. pyxlpr/ppocr/data/imaug/__init__.py +0 -67
  103. pyxlpr/ppocr/data/imaug/copy_paste.py +0 -170
  104. pyxlpr/ppocr/data/imaug/east_process.py +0 -437
  105. pyxlpr/ppocr/data/imaug/gen_table_mask.py +0 -244
  106. pyxlpr/ppocr/data/imaug/iaa_augment.py +0 -114
  107. pyxlpr/ppocr/data/imaug/label_ops.py +0 -789
  108. pyxlpr/ppocr/data/imaug/make_border_map.py +0 -184
  109. pyxlpr/ppocr/data/imaug/make_pse_gt.py +0 -106
  110. pyxlpr/ppocr/data/imaug/make_shrink_map.py +0 -126
  111. pyxlpr/ppocr/data/imaug/operators.py +0 -433
  112. pyxlpr/ppocr/data/imaug/pg_process.py +0 -906
  113. pyxlpr/ppocr/data/imaug/randaugment.py +0 -143
  114. pyxlpr/ppocr/data/imaug/random_crop_data.py +0 -239
  115. pyxlpr/ppocr/data/imaug/rec_img_aug.py +0 -533
  116. pyxlpr/ppocr/data/imaug/sast_process.py +0 -777
  117. pyxlpr/ppocr/data/imaug/text_image_aug/__init__.py +0 -17
  118. pyxlpr/ppocr/data/imaug/text_image_aug/augment.py +0 -120
  119. pyxlpr/ppocr/data/imaug/text_image_aug/warp_mls.py +0 -168
  120. pyxlpr/ppocr/data/lmdb_dataset.py +0 -115
  121. pyxlpr/ppocr/data/pgnet_dataset.py +0 -104
  122. pyxlpr/ppocr/data/pubtab_dataset.py +0 -107
  123. pyxlpr/ppocr/data/simple_dataset.py +0 -372
  124. pyxlpr/ppocr/losses/__init__.py +0 -61
  125. pyxlpr/ppocr/losses/ace_loss.py +0 -52
  126. pyxlpr/ppocr/losses/basic_loss.py +0 -135
  127. pyxlpr/ppocr/losses/center_loss.py +0 -88
  128. pyxlpr/ppocr/losses/cls_loss.py +0 -30
  129. pyxlpr/ppocr/losses/combined_loss.py +0 -67
  130. pyxlpr/ppocr/losses/det_basic_loss.py +0 -208
  131. pyxlpr/ppocr/losses/det_db_loss.py +0 -80
  132. pyxlpr/ppocr/losses/det_east_loss.py +0 -63
  133. pyxlpr/ppocr/losses/det_pse_loss.py +0 -149
  134. pyxlpr/ppocr/losses/det_sast_loss.py +0 -121
  135. pyxlpr/ppocr/losses/distillation_loss.py +0 -272
  136. pyxlpr/ppocr/losses/e2e_pg_loss.py +0 -140
  137. pyxlpr/ppocr/losses/kie_sdmgr_loss.py +0 -113
  138. pyxlpr/ppocr/losses/rec_aster_loss.py +0 -99
  139. pyxlpr/ppocr/losses/rec_att_loss.py +0 -39
  140. pyxlpr/ppocr/losses/rec_ctc_loss.py +0 -44
  141. pyxlpr/ppocr/losses/rec_enhanced_ctc_loss.py +0 -70
  142. pyxlpr/ppocr/losses/rec_nrtr_loss.py +0 -30
  143. pyxlpr/ppocr/losses/rec_sar_loss.py +0 -28
  144. pyxlpr/ppocr/losses/rec_srn_loss.py +0 -47
  145. pyxlpr/ppocr/losses/table_att_loss.py +0 -109
  146. pyxlpr/ppocr/metrics/__init__.py +0 -44
  147. pyxlpr/ppocr/metrics/cls_metric.py +0 -45
  148. pyxlpr/ppocr/metrics/det_metric.py +0 -82
  149. pyxlpr/ppocr/metrics/distillation_metric.py +0 -73
  150. pyxlpr/ppocr/metrics/e2e_metric.py +0 -86
  151. pyxlpr/ppocr/metrics/eval_det_iou.py +0 -274
  152. pyxlpr/ppocr/metrics/kie_metric.py +0 -70
  153. pyxlpr/ppocr/metrics/rec_metric.py +0 -75
  154. pyxlpr/ppocr/metrics/table_metric.py +0 -50
  155. pyxlpr/ppocr/modeling/architectures/__init__.py +0 -32
  156. pyxlpr/ppocr/modeling/architectures/base_model.py +0 -88
  157. pyxlpr/ppocr/modeling/architectures/distillation_model.py +0 -60
  158. pyxlpr/ppocr/modeling/backbones/__init__.py +0 -54
  159. pyxlpr/ppocr/modeling/backbones/det_mobilenet_v3.py +0 -268
  160. pyxlpr/ppocr/modeling/backbones/det_resnet_vd.py +0 -246
  161. pyxlpr/ppocr/modeling/backbones/det_resnet_vd_sast.py +0 -285
  162. pyxlpr/ppocr/modeling/backbones/e2e_resnet_vd_pg.py +0 -265
  163. pyxlpr/ppocr/modeling/backbones/kie_unet_sdmgr.py +0 -186
  164. pyxlpr/ppocr/modeling/backbones/rec_mobilenet_v3.py +0 -138
  165. pyxlpr/ppocr/modeling/backbones/rec_mv1_enhance.py +0 -258
  166. pyxlpr/ppocr/modeling/backbones/rec_nrtr_mtb.py +0 -48
  167. pyxlpr/ppocr/modeling/backbones/rec_resnet_31.py +0 -210
  168. pyxlpr/ppocr/modeling/backbones/rec_resnet_aster.py +0 -143
  169. pyxlpr/ppocr/modeling/backbones/rec_resnet_fpn.py +0 -307
  170. pyxlpr/ppocr/modeling/backbones/rec_resnet_vd.py +0 -286
  171. pyxlpr/ppocr/modeling/heads/__init__.py +0 -54
  172. pyxlpr/ppocr/modeling/heads/cls_head.py +0 -52
  173. pyxlpr/ppocr/modeling/heads/det_db_head.py +0 -118
  174. pyxlpr/ppocr/modeling/heads/det_east_head.py +0 -121
  175. pyxlpr/ppocr/modeling/heads/det_pse_head.py +0 -37
  176. pyxlpr/ppocr/modeling/heads/det_sast_head.py +0 -128
  177. pyxlpr/ppocr/modeling/heads/e2e_pg_head.py +0 -253
  178. pyxlpr/ppocr/modeling/heads/kie_sdmgr_head.py +0 -206
  179. pyxlpr/ppocr/modeling/heads/multiheadAttention.py +0 -163
  180. pyxlpr/ppocr/modeling/heads/rec_aster_head.py +0 -393
  181. pyxlpr/ppocr/modeling/heads/rec_att_head.py +0 -202
  182. pyxlpr/ppocr/modeling/heads/rec_ctc_head.py +0 -88
  183. pyxlpr/ppocr/modeling/heads/rec_nrtr_head.py +0 -826
  184. pyxlpr/ppocr/modeling/heads/rec_sar_head.py +0 -402
  185. pyxlpr/ppocr/modeling/heads/rec_srn_head.py +0 -280
  186. pyxlpr/ppocr/modeling/heads/self_attention.py +0 -406
  187. pyxlpr/ppocr/modeling/heads/table_att_head.py +0 -246
  188. pyxlpr/ppocr/modeling/necks/__init__.py +0 -32
  189. pyxlpr/ppocr/modeling/necks/db_fpn.py +0 -111
  190. pyxlpr/ppocr/modeling/necks/east_fpn.py +0 -188
  191. pyxlpr/ppocr/modeling/necks/fpn.py +0 -138
  192. pyxlpr/ppocr/modeling/necks/pg_fpn.py +0 -314
  193. pyxlpr/ppocr/modeling/necks/rnn.py +0 -92
  194. pyxlpr/ppocr/modeling/necks/sast_fpn.py +0 -284
  195. pyxlpr/ppocr/modeling/necks/table_fpn.py +0 -110
  196. pyxlpr/ppocr/modeling/transforms/__init__.py +0 -28
  197. pyxlpr/ppocr/modeling/transforms/stn.py +0 -135
  198. pyxlpr/ppocr/modeling/transforms/tps.py +0 -308
  199. pyxlpr/ppocr/modeling/transforms/tps_spatial_transformer.py +0 -156
  200. pyxlpr/ppocr/optimizer/__init__.py +0 -61
  201. pyxlpr/ppocr/optimizer/learning_rate.py +0 -228
  202. pyxlpr/ppocr/optimizer/lr_scheduler.py +0 -49
  203. pyxlpr/ppocr/optimizer/optimizer.py +0 -160
  204. pyxlpr/ppocr/optimizer/regularizer.py +0 -52
  205. pyxlpr/ppocr/postprocess/__init__.py +0 -55
  206. pyxlpr/ppocr/postprocess/cls_postprocess.py +0 -33
  207. pyxlpr/ppocr/postprocess/db_postprocess.py +0 -234
  208. pyxlpr/ppocr/postprocess/east_postprocess.py +0 -143
  209. pyxlpr/ppocr/postprocess/locality_aware_nms.py +0 -200
  210. pyxlpr/ppocr/postprocess/pg_postprocess.py +0 -52
  211. pyxlpr/ppocr/postprocess/pse_postprocess/__init__.py +0 -15
  212. pyxlpr/ppocr/postprocess/pse_postprocess/pse/__init__.py +0 -29
  213. pyxlpr/ppocr/postprocess/pse_postprocess/pse/setup.py +0 -14
  214. pyxlpr/ppocr/postprocess/pse_postprocess/pse_postprocess.py +0 -118
  215. pyxlpr/ppocr/postprocess/rec_postprocess.py +0 -654
  216. pyxlpr/ppocr/postprocess/sast_postprocess.py +0 -355
  217. pyxlpr/ppocr/tools/__init__.py +0 -14
  218. pyxlpr/ppocr/tools/eval.py +0 -83
  219. pyxlpr/ppocr/tools/export_center.py +0 -77
  220. pyxlpr/ppocr/tools/export_model.py +0 -129
  221. pyxlpr/ppocr/tools/infer/predict_cls.py +0 -151
  222. pyxlpr/ppocr/tools/infer/predict_det.py +0 -300
  223. pyxlpr/ppocr/tools/infer/predict_e2e.py +0 -169
  224. pyxlpr/ppocr/tools/infer/predict_rec.py +0 -414
  225. pyxlpr/ppocr/tools/infer/predict_system.py +0 -204
  226. pyxlpr/ppocr/tools/infer/utility.py +0 -629
  227. pyxlpr/ppocr/tools/infer_cls.py +0 -83
  228. pyxlpr/ppocr/tools/infer_det.py +0 -134
  229. pyxlpr/ppocr/tools/infer_e2e.py +0 -122
  230. pyxlpr/ppocr/tools/infer_kie.py +0 -153
  231. pyxlpr/ppocr/tools/infer_rec.py +0 -146
  232. pyxlpr/ppocr/tools/infer_table.py +0 -107
  233. pyxlpr/ppocr/tools/program.py +0 -596
  234. pyxlpr/ppocr/tools/test_hubserving.py +0 -117
  235. pyxlpr/ppocr/tools/train.py +0 -163
  236. pyxlpr/ppocr/tools/xlprog.py +0 -748
  237. pyxlpr/ppocr/utils/EN_symbol_dict.txt +0 -94
  238. pyxlpr/ppocr/utils/__init__.py +0 -24
  239. pyxlpr/ppocr/utils/dict/ar_dict.txt +0 -117
  240. pyxlpr/ppocr/utils/dict/arabic_dict.txt +0 -162
  241. pyxlpr/ppocr/utils/dict/be_dict.txt +0 -145
  242. pyxlpr/ppocr/utils/dict/bg_dict.txt +0 -140
  243. pyxlpr/ppocr/utils/dict/chinese_cht_dict.txt +0 -8421
  244. pyxlpr/ppocr/utils/dict/cyrillic_dict.txt +0 -163
  245. pyxlpr/ppocr/utils/dict/devanagari_dict.txt +0 -167
  246. pyxlpr/ppocr/utils/dict/en_dict.txt +0 -63
  247. pyxlpr/ppocr/utils/dict/fa_dict.txt +0 -136
  248. pyxlpr/ppocr/utils/dict/french_dict.txt +0 -136
  249. pyxlpr/ppocr/utils/dict/german_dict.txt +0 -143
  250. pyxlpr/ppocr/utils/dict/hi_dict.txt +0 -162
  251. pyxlpr/ppocr/utils/dict/it_dict.txt +0 -118
  252. pyxlpr/ppocr/utils/dict/japan_dict.txt +0 -4399
  253. pyxlpr/ppocr/utils/dict/ka_dict.txt +0 -153
  254. pyxlpr/ppocr/utils/dict/korean_dict.txt +0 -3688
  255. pyxlpr/ppocr/utils/dict/latin_dict.txt +0 -185
  256. pyxlpr/ppocr/utils/dict/mr_dict.txt +0 -153
  257. pyxlpr/ppocr/utils/dict/ne_dict.txt +0 -153
  258. pyxlpr/ppocr/utils/dict/oc_dict.txt +0 -96
  259. pyxlpr/ppocr/utils/dict/pu_dict.txt +0 -130
  260. pyxlpr/ppocr/utils/dict/rs_dict.txt +0 -91
  261. pyxlpr/ppocr/utils/dict/rsc_dict.txt +0 -134
  262. pyxlpr/ppocr/utils/dict/ru_dict.txt +0 -125
  263. pyxlpr/ppocr/utils/dict/ta_dict.txt +0 -128
  264. pyxlpr/ppocr/utils/dict/table_dict.txt +0 -277
  265. pyxlpr/ppocr/utils/dict/table_structure_dict.txt +0 -2759
  266. pyxlpr/ppocr/utils/dict/te_dict.txt +0 -151
  267. pyxlpr/ppocr/utils/dict/ug_dict.txt +0 -114
  268. pyxlpr/ppocr/utils/dict/uk_dict.txt +0 -142
  269. pyxlpr/ppocr/utils/dict/ur_dict.txt +0 -137
  270. pyxlpr/ppocr/utils/dict/xi_dict.txt +0 -110
  271. pyxlpr/ppocr/utils/dict90.txt +0 -90
  272. pyxlpr/ppocr/utils/e2e_metric/Deteval.py +0 -574
  273. pyxlpr/ppocr/utils/e2e_metric/polygon_fast.py +0 -83
  274. pyxlpr/ppocr/utils/e2e_utils/extract_batchsize.py +0 -87
  275. pyxlpr/ppocr/utils/e2e_utils/extract_textpoint_fast.py +0 -457
  276. pyxlpr/ppocr/utils/e2e_utils/extract_textpoint_slow.py +0 -592
  277. pyxlpr/ppocr/utils/e2e_utils/pgnet_pp_utils.py +0 -162
  278. pyxlpr/ppocr/utils/e2e_utils/visual.py +0 -162
  279. pyxlpr/ppocr/utils/en_dict.txt +0 -95
  280. pyxlpr/ppocr/utils/gen_label.py +0 -81
  281. pyxlpr/ppocr/utils/ic15_dict.txt +0 -36
  282. pyxlpr/ppocr/utils/iou.py +0 -54
  283. pyxlpr/ppocr/utils/logging.py +0 -69
  284. pyxlpr/ppocr/utils/network.py +0 -84
  285. pyxlpr/ppocr/utils/ppocr_keys_v1.txt +0 -6623
  286. pyxlpr/ppocr/utils/profiler.py +0 -110
  287. pyxlpr/ppocr/utils/save_load.py +0 -150
  288. pyxlpr/ppocr/utils/stats.py +0 -72
  289. pyxlpr/ppocr/utils/utility.py +0 -80
  290. pyxlpr/ppstructure/__init__.py +0 -13
  291. pyxlpr/ppstructure/predict_system.py +0 -187
  292. pyxlpr/ppstructure/table/__init__.py +0 -13
  293. pyxlpr/ppstructure/table/eval_table.py +0 -72
  294. pyxlpr/ppstructure/table/matcher.py +0 -192
  295. pyxlpr/ppstructure/table/predict_structure.py +0 -136
  296. pyxlpr/ppstructure/table/predict_table.py +0 -221
  297. pyxlpr/ppstructure/table/table_metric/__init__.py +0 -16
  298. pyxlpr/ppstructure/table/table_metric/parallel.py +0 -51
  299. pyxlpr/ppstructure/table/table_metric/table_metric.py +0 -247
  300. pyxlpr/ppstructure/table/tablepyxl/__init__.py +0 -13
  301. pyxlpr/ppstructure/table/tablepyxl/style.py +0 -283
  302. pyxlpr/ppstructure/table/tablepyxl/tablepyxl.py +0 -118
  303. pyxlpr/ppstructure/utility.py +0 -71
  304. pyxlpr/xlai.py +0 -10
  305. /pyxllib/{ext/autogui → autogui}/virtualkey.py +0 -0
  306. {pyxllib-0.3.96.dist-info → pyxllib-0.3.197.dist-info/licenses}/LICENSE +0 -0
pyxllib/file/xlsxlib.py CHANGED
@@ -7,12 +7,14 @@
7
7
  """
8
8
  扩展了些自己的openpyxl工具
9
9
  """
10
+ import copy
11
+
10
12
  import time
11
13
 
12
14
  from pyxllib.prog.pupil import check_install_package, run_once
13
15
 
14
16
  check_install_package('openpyxl')
15
- check_install_package('premailer')
17
+ # check_install_package('premailer')
16
18
  # check_install_package('xlrd2')
17
19
  check_install_package('yattag')
18
20
  check_install_package('jsonpickle')
@@ -30,11 +32,13 @@ import io
30
32
 
31
33
  import xlrd
32
34
 
35
+ import filetype
33
36
  import openpyxl
34
37
  from openpyxl import Workbook
35
38
  from openpyxl.cell.cell import MergedCell
36
39
  from openpyxl.styles import Font, Alignment
37
40
  from openpyxl.utils.cell import get_column_letter, column_index_from_string
41
+ import openpyxl.worksheet.formula
38
42
  import pandas as pd
39
43
 
40
44
  try:
@@ -117,7 +121,7 @@ def is_valid_excel_address(address):
117
121
  return is_valid_excel_cell(address)
118
122
 
119
123
 
120
- @run_once('str', debug=True)
124
+ @run_once('str')
121
125
  def xlfmt2pyfmt_date(xl_fmt):
122
126
  """ 日期的渲染操作
123
127
 
@@ -233,6 +237,8 @@ def xl_render_value(x, xl_fmt):
233
237
  注意,遇到公式是很难计算处理的,大概率只能保持原公式显示
234
238
  因为日期用的比较多,需要时常获得真实的渲染效果,所以这里封装一个接口
235
239
 
240
+ 对于JSA等场景,直接使用Cell.Text获取渲染值就行,不需要这里这么复杂的实现
241
+
236
242
  >>> xl_render_value(datetime.datetime(2020, 1, 1), 'yyyy-mm-dd')
237
243
  '2020-01-01'
238
244
  """
@@ -351,19 +357,73 @@ def convert_xls_to_xlsx(xls_file):
351
357
 
352
358
 
353
359
  def load_as_xlsx_file(file_path, keep_links=False, keep_vba=False):
360
+ """ 这个不能全信文件给的扩展名,需要智能判断 """
361
+
362
+ # 0 工具函数
363
+ @run_once()
364
+ def read_xlsx():
365
+ file = file_path
366
+ # 如果文件原本的后缀不是xlsx,openpyxl是读不了的,要绕个弯
367
+ if file.suffix[1:] not in ('xlsx', 'xlsm'):
368
+ with open(file_path, 'rb') as f2:
369
+ data = f2.read()
370
+ file = io.BytesIO(data)
371
+ try:
372
+ return openpyxl.load_workbook(file,
373
+ keep_links=keep_links,
374
+ keep_vba=keep_vba), ''
375
+ except Exception as e:
376
+ if isinstance(e, TimeoutError): # 这里触发的是总的超时设定
377
+ raise e
378
+ return None, format_exception(e, 2)
379
+
380
+ @run_once()
381
+ def read_xls():
382
+ try:
383
+ return convert_xls_to_xlsx(file_path), ''
384
+ except Exception as e:
385
+ return None, format_exception(e, 2)
386
+
387
+ @run_once()
388
+ def read_csv():
389
+ try:
390
+ return convert_csv_to_xlsx(file_path), ''
391
+ except Exception as e:
392
+ return None, format_exception(e, 2)
393
+
394
+ def read_test(suffix):
395
+ if suffix in ('xlsx', 'xlsm', 'zip'):
396
+ wb, error = read_xlsx()
397
+ elif suffix == 'xls':
398
+ wb, error = read_xls()
399
+ elif suffix == 'csv':
400
+ wb, error = read_csv()
401
+ else:
402
+ wb, error = None, f'不支持的文件类型:{suffix}'
403
+ return wb, error
404
+
405
+ # 1 优先相信用户输入的文件名类型
354
406
  file_path = Path(file_path)
355
- suffix = file_path.suffix.lower()
356
- if suffix in ('.xlsx', '.xlsm'):
357
- wb = openpyxl.load_workbook(file_path,
358
- keep_links=keep_links,
359
- keep_vba=keep_vba)
360
- elif suffix == '.xls':
361
- wb = convert_xls_to_xlsx(file_path)
362
- elif suffix == '.csv':
363
- wb = convert_csv_to_xlsx(file_path)
364
- else:
365
- return None
366
- return wb
407
+ suffix = file_path.suffix.lower()[1:]
408
+ wb, error = read_test(suffix)
409
+ if wb is not None:
410
+ return wb, suffix
411
+
412
+ # 2 如果处理不了,则尝试用filetype判断的类型
413
+ suffix2 = filetype.guess(file_path)
414
+ suffix2 = suffix2.extension if suffix2 else ''
415
+ wb, _ = read_test(suffix2)
416
+ if wb is not None:
417
+ return wb, suffix2
418
+
419
+ # 3 如果还处理不了,再把其他可能的情况试一遍
420
+ for suffix in ('xlsx', 'xls', 'csv'):
421
+ wb, _ = read_test(suffix)
422
+ if wb is not None:
423
+ return wb, suffix
424
+
425
+ # 4 确实是处理不了的类型,返回报错信息
426
+ return None, error
367
427
 
368
428
 
369
429
  def parse_range_address(address):
@@ -466,7 +526,7 @@ def is_string_type(value):
466
526
  try:
467
527
  pd.to_datetime(value, errors='raise')
468
528
  return False
469
- except (ValueError, TypeError, OverflowError):
529
+ except (ValueError, TypeError, OverflowError, AttributeError):
470
530
  pass
471
531
 
472
532
  # 检查是否为浮点数类型
@@ -521,10 +581,18 @@ class XlCell(openpyxl.cell.cell.Cell): # 适用于 openpyxl.cell.cell.MergedCel
521
581
 
522
582
  TODO 这个函数还是可以看看能不能有更好的实现、提速
523
583
  """
584
+
585
+ def try_offset(x, y):
586
+ try:
587
+ return isinstance(self.offset(x, y), MergedCell)
588
+ except ValueError:
589
+ # 有可能会越界:ValueError: Row numbers must be between 1 and 1048576
590
+ return False
591
+
524
592
  _type, status = 0, {}
525
593
  if isinstance(self, MergedCell):
526
594
  _type = 1
527
- elif isinstance(self.offset(1, 0), MergedCell) or isinstance(self.offset(0, 1), MergedCell):
595
+ elif try_offset(1, 0) or try_offset(0, 1):
528
596
  # 这里只能判断可能是合并单元格,具体是不是合并单元格,还要
529
597
  rng = self.in_range()
530
598
  status['rng'] = rng
@@ -698,7 +766,7 @@ class XlCell(openpyxl.cell.cell.Cell): # 适用于 openpyxl.cell.cell.MergedCel
698
766
  # openpyxl的机制,如果没有配置日期格式,读取到的是默认的'mm-dd-yy',其实在中文场景,默认格式应该是后者
699
767
  if fmt == 'mm-dd-yy':
700
768
  return 'yyyy/m/d' # 中文的默认日期格式
701
- elif fmt == 'yyyy\-mm\-dd': # 不知道为什么会有提取到这种\的情况,先暴力替换了
769
+ elif fmt == r'yyyy\-mm\-dd': # 不知道为什么会有提取到这种\的情况,先暴力替换了
702
770
  fmt = 'yyyy-mm-dd'
703
771
  return fmt
704
772
 
@@ -709,7 +777,10 @@ class XlCell(openpyxl.cell.cell.Cell): # 适用于 openpyxl.cell.cell.MergedCel
709
777
  注意,遇到公式是很难计算处理的,大概率只能保持原公式显示
710
778
  因为日期用的比较多,需要时常获得真实的渲染效果,所以这里封装一个接口
711
779
  """
780
+
712
781
  x = self.value
782
+ if isinstance(x, openpyxl.worksheet.formula.ArrayFormula): # 数组公式要特别渲染
783
+ return x.text
713
784
  xl_fmt = self.get_number_format()
714
785
  return xl_render_value(x, xl_fmt)
715
786
 
@@ -1136,7 +1207,7 @@ class XlWorksheet(openpyxl.worksheet.worksheet.Worksheet):
1136
1207
 
1137
1208
  return df
1138
1209
 
1139
- def copy_range(self, src_addr, dst_cell, *, temp_sheet=False, return_mid_result=False):
1210
+ def copy_range(self, src_addr, dst_cell, *, temp_sheet=False, return_mode=False):
1140
1211
  """ 将自身cell_range区间的内容、格式,拷贝到目标dst_cell里
1141
1212
 
1142
1213
  :param str src_addr: 自身的一片单元格范围
@@ -1158,7 +1229,7 @@ class XlWorksheet(openpyxl.worksheet.worksheet.Worksheet):
1158
1229
  mid_result = {}
1159
1230
  if temp_sheet:
1160
1231
  ws3 = self.parent.create_sheet('__copy_range')
1161
- mid_result = self.copy_range(src_addr, ws3['A1'], return_mid_result=True)
1232
+ mid_result = self.copy_range(src_addr, ws3['A1'], return_mode=True)
1162
1233
  ws1 = ws3
1163
1234
  src_addr = f'A1:{excel_addr(mid_result["n"], mid_result["m"])}'
1164
1235
  else:
@@ -1193,7 +1264,7 @@ class XlWorksheet(openpyxl.worksheet.worksheet.Worksheet):
1193
1264
  if temp_sheet:
1194
1265
  self.parent.remove(ws1)
1195
1266
 
1196
- if return_mid_result:
1267
+ if return_mode:
1197
1268
  return mid_result
1198
1269
 
1199
1270
  def reindex_columns(self, orders):
@@ -1964,7 +2035,7 @@ class XlWorkbook(openpyxl.Workbook):
1964
2035
  'cells': extract_cells_content(ws)
1965
2036
  })
1966
2037
 
1967
- if not summary['cells']: # 如果没有数据,则大概率是数据透视表,是计算出来的,读取不到~
2038
+ if not summary['cells']: # 如果没有数据,则大概率是数据透视表,是计算出来的,读取不到~ 但是JSA等场景应该有办法获得
1968
2039
  summary['sheetType'] = 'PivotTable'
1969
2040
  del summary['cells']
1970
2041
  else:
@@ -2399,7 +2470,7 @@ def extract_workbook_summary2(file_path, *,
2399
2470
  """
2400
2471
  :param keep_links: 是否保留外部表格链接数据。如果保留,打开好像会有点问题。
2401
2472
  :param mode:
2402
- 0,最原始的summary3摘要
2473
+ 0,最原始的summary2摘要
2403
2474
  1,添加当前工作表、单元格位置的信息
2404
2475
  :param kwargs: 捕捉其他参数,主要是向下兼容,其实现在并没有用
2405
2476
 
@@ -2411,7 +2482,12 @@ def extract_workbook_summary2(file_path, *,
2411
2482
  res = {}
2412
2483
  res['fileName'] = file_path.name
2413
2484
  start_time = time.time()
2414
- wb = load_as_xlsx_file(file_path, keep_links=keep_links, keep_vba=keep_vba)
2485
+ wb, suffix = load_as_xlsx_file(file_path, keep_links=keep_links, keep_vba=keep_vba)
2486
+ if wb is None:
2487
+ res['error'] = f'Load file error。{suffix}'
2488
+ else:
2489
+ res['fileType'] = suffix
2490
+
2415
2491
  load_time = time.time() - start_time
2416
2492
  if wb is None: # 不支持的文件类型,不报错,只是返回最基本的文件名信息
2417
2493
  if return_mode == 1:
@@ -2459,7 +2535,7 @@ def update_raw_summary2(data):
2459
2535
 
2460
2536
  # 3 判断键值顺序
2461
2537
  keys = list(data.keys())
2462
- ref_keys = ['fileName', 'chineseContentRatio', 'nonEmptyCellRatio', 'sheetNames', 'sheets']
2538
+ ref_keys = ['fileName', 'fileType', 'chineseContentRatio', 'nonEmptyCellRatio', 'sheetNames', 'sheets']
2463
2539
  if keys != ref_keys:
2464
2540
  data = {k: data[k] for k in ref_keys if k in data}
2465
2541
 
@@ -2597,30 +2673,39 @@ class WorkbookSummary3:
2597
2673
  for addr, _ in row:
2598
2674
  new_cells[addr] = cells[addr]
2599
2675
 
2600
- new_cells = {}
2601
- for rows in rows_groups:
2676
+ total_new_cells = []
2677
+ for rows in reversed(rows_groups):
2678
+ new_cells = {}
2602
2679
  if len(rows) < 10:
2603
2680
  extract_cells_from_rows(rows)
2604
2681
  else: # 压缩中间的数据
2605
2682
  # 如果评估到最终摘要可能太小,要收敛下删除的范围
2606
2683
  n, m = len(rows), len(rows[0])
2607
2684
  target_n = int(target_reduce_cells_num / m + 0.5) # 本来应该删除多少行才行
2608
- cur_n = n - 4 if target_n > n - 4 else target_n # 实际删除多少行
2609
- left_n = n - cur_n # 剩余多少行
2610
- b = left_n // 2
2611
- a = left_n - b
2612
-
2613
- extract_cells_from_rows(rows[:a])
2614
- addr = combine_addresses(rows[a][0][0], rows[-b - 1][-1][0])
2615
- # new_cells[addr] = '这块区域的内容跟前面几行、后面几行的内容结构是一致的,省略显示'
2616
- new_cells[addr] = '...'
2617
- extract_cells_from_rows(rows[-b:])
2618
-
2619
- target_reduce_cells_num -= cur_n * m
2620
- if target_reduce_cells_num <= 0:
2621
- break
2622
-
2623
- sheet['cells'] = new_cells
2685
+ if target_n <= 0: # 如果删除的行数太少,那么就不压缩了
2686
+ extract_cells_from_rows(rows)
2687
+ else:
2688
+ cur_n = n - 4 if target_n > n - 4 else target_n # 实际删除多少行
2689
+ left_n = n - cur_n # 剩余多少行
2690
+ b = left_n // 2
2691
+ a = left_n - b
2692
+
2693
+ extract_cells_from_rows(rows[:a])
2694
+ addr = combine_addresses(rows[a][0][0], rows[-b - 1][-1][0])
2695
+ # new_cells[addr] = '这块区域的内容跟前面几行、后面几行的内容结构是一致的,省略显示'
2696
+ new_cells[addr] = '...'
2697
+ extract_cells_from_rows(rows[-b:])
2698
+
2699
+ target_reduce_cells_num -= cur_n * m
2700
+ # 240429周一21:57,这两行不能开,否则会过渡精简。如果压缩够了,那么后面的单元格需要全量补上。
2701
+ # if target_reduce_cells_num <= 0: # 满足以后不是直接break,而是要把后续的内容都保留
2702
+ # break
2703
+ total_new_cells.append(new_cells)
2704
+
2705
+ new_cells2 = {}
2706
+ for rows in reversed(total_new_cells):
2707
+ new_cells2.update(rows)
2708
+ sheet['cells'] = new_cells2
2624
2709
 
2625
2710
  @classmethod
2626
2711
  def reduce4_truncate_cells(cls, y, summary_limit_len, *, cur_summary_len=None):
@@ -2752,6 +2837,7 @@ class WorkbookSummary3:
2752
2837
  if cur_summary_len is None:
2753
2838
  cur_summary_len = cls.count_length(y)
2754
2839
 
2840
+ cur_summary_len0 = cur_summary_len
2755
2841
  active_sheet = y['ActiveSheet']
2756
2842
 
2757
2843
  # 1 预计要删除单元格数
@@ -2783,7 +2869,8 @@ class WorkbookSummary3:
2783
2869
  return cls.count_length(y)
2784
2870
 
2785
2871
  # 4 否则每张表按照比例删单元格,只保留前面部分的单元格
2786
- left_rate = 1 - r # 原始保留比例
2872
+ # todo 这里应该有更好的筛选机制,后续可以思考思考
2873
+ left_rate = min((summary_limit_len + cur_summary_len) / (2 * cur_summary_len), 0.9) # 首轮减小一点调整幅度
2787
2874
  while True:
2788
2875
  for i, st in enumerate(y['sheets']):
2789
2876
  if i == active_sheet_index:
@@ -2795,10 +2882,10 @@ class WorkbookSummary3:
2795
2882
  cur_summary_len = cls.count_length(y)
2796
2883
  if cur_summary_len <= summary_limit_len:
2797
2884
  return cur_summary_len
2798
- if left_rate * total_cells_num < 1:
2885
+ if left_rate * total_cells_num < 1: # 都没有单元格,别删了
2799
2886
  break
2800
- else:
2801
- left_rate *= 0.8 # 缩小保留比例,再试
2887
+ else: # 更新保留比率,再试
2888
+ left_rate *= min(summary_limit_len / cur_summary_len, 0.9)
2802
2889
 
2803
2890
  return cur_summary_len
2804
2891
 
@@ -2881,8 +2968,12 @@ class WorkbookSummary3:
2881
2968
  return y
2882
2969
 
2883
2970
  x = summary2
2971
+ if 'error' in x:
2972
+ return x
2973
+
2884
2974
  y = {
2885
2975
  'fileName': x['fileName'],
2976
+ 'fileType': x['fileType'],
2886
2977
  'sheetNames': x['sheetNames'],
2887
2978
  'sheets': x['sheets'],
2888
2979
  'mode': 'Complete information',
@@ -2927,33 +3018,90 @@ def extract_workbook_summary3(file_path, summary_limit_len=4000, **kwargs):
2927
3018
  return data
2928
3019
 
2929
3020
 
3021
+ def summary2_add_enums(summary2, enum_values):
3022
+ # 1 预备
3023
+ if enum_values is True:
3024
+ enum_values = (20, 10)
3025
+ max_len, max_num = enum_values
3026
+
3027
+ # 2 枚举值
3028
+ for sheet in summary2['sheets']:
3029
+ # 2.1 遍历计数
3030
+ cols = defaultdict(Counter)
3031
+ for addr, val in sheet['cells'].items():
3032
+ n = len(str(val))
3033
+ if not n or n > max_len:
3034
+ continue
3035
+ col = re.match(r'[A-Z]+', addr).group()
3036
+ cols[col][val] += 1
3037
+
3038
+ # 2.2 添加枚举值列
3039
+ enums = {}
3040
+ keys = sorted(cols.keys(), key=column_index_from_string)
3041
+ for k in keys:
3042
+ ct = cols[k]
3043
+ if len(ct) > max_num:
3044
+ continue
3045
+ vals = ct.most_common()
3046
+ if vals[0][1] == 1: # 都只出现了一次,也不认为是枚举值,跳过。或者是小数据表,一般也能全量展示。
3047
+ continue
3048
+ enums[k] = [v for v, _ in vals]
3049
+
3050
+ # 2.3 保存
3051
+ if enums:
3052
+ sheet['enums'] = enums
3053
+ # enums2 = json.dumps(enums, ensure_ascii=False, default=str)
3054
+ # sheet['enums'] = json.loads(enums2)
3055
+
3056
+ return summary2
3057
+
3058
+
2930
3059
  def extract_workbook_summary3b(file_path,
2931
3060
  summary_limit_len=4000,
2932
3061
  timeout_seconds=10,
2933
3062
  return_mode=0,
2934
3063
  debug=False,
2935
3064
  len_mode=0,
3065
+ enum_values=False,
2936
3066
  **kwargs):
2937
3067
  """
2938
3068
 
2939
3069
  :param summary_limit_len: 摘要长度限制
2940
3070
  :param timeout_seconds: 超时限制
2941
- :param return_mode: 返回模式,0表示只返回摘要,1表示返回摘要和耗时
3071
+ :param return_mode: 返回模式
3072
+ 0,表示只返回摘要
3073
+ 1,表示返回摘要和耗时
3074
+ 2, 再增加返回summary2
2942
3075
  :param len_mode:
2943
3076
  0, 使用len作为token长度评估
2944
3077
  1, 使用模型评估实际token长度
3078
+ :param enum_values: 是否展示每列枚举值
3079
+ False, 默认不展示
3080
+ True, 展示,并且默认参数 (20, 10) 表示长度超过20的丢弃,只保留枚举类型不超过10种值的列
2945
3081
  :param kwargs: 其他是summary2读取文件的时候的参数,其实都不太关键,一般不用特地设置
2946
3082
  """
2947
3083
  res = {}
2948
3084
  res['fileName'] = Path(file_path).name
2949
3085
  load_time = summary2_time = summary3_time = -1
3086
+ summary2_res = {}
3087
+
3088
+ def reduce_summary(summary):
3089
+ """ 如果转json后的summary超过4K,去掉可能的sheets字段 """
3090
+ s = json.dumps(summary, ensure_ascii=False)
3091
+ if len(s) < 4000:
3092
+ if 'sheets' in summary:
3093
+ del summary['sheets']
2950
3094
 
2951
3095
  try:
2952
3096
  with Timeout(timeout_seconds):
2953
3097
  start_time = time.time()
2954
3098
  res, load_time = extract_workbook_summary2(file_path, mode=1, return_mode=1, **kwargs)
2955
3099
  # res = convert_to_json_compatible(res)
3100
+ summary2_res = copy.deepcopy(res)
3101
+ if enum_values:
3102
+ res = summary2_add_enums(res, enum_values)
2956
3103
  summary2_time = time.time() - start_time - load_time
3104
+
2957
3105
  start_time = time.time()
2958
3106
  if len_mode == 1:
2959
3107
  res = WorkbookSummary3plus.summary2_to_summary3b(res, summary_limit_len)
@@ -2964,13 +3112,20 @@ def extract_workbook_summary3b(file_path,
2964
3112
  if debug:
2965
3113
  raise e
2966
3114
  res['error'] = f'超时,未完成摘要提取:{timeout_seconds}秒'
3115
+ reduce_summary(res)
2967
3116
  except Exception as e:
2968
3117
  if debug:
2969
3118
  raise e
2970
3119
  res['error'] = f'提取摘要时发生错误:{format_exception(e, 2)}'
3120
+ reduce_summary(res)
3121
+
3122
+ time_dict = {'load_time': human_readable_number(load_time),
3123
+ 'summary2_time': human_readable_number(summary2_time),
3124
+ 'summary3_time': human_readable_number(summary3_time)}
2971
3125
 
2972
3126
  if return_mode == 1:
2973
- return res, {'load_time': human_readable_number(load_time),
2974
- 'summary2_time': human_readable_number(summary2_time),
2975
- 'summary3_time': human_readable_number(summary3_time)}
3127
+ return res, time_dict
3128
+ elif return_mode == 2:
3129
+ return res, time_dict, summary2_res
3130
+
2976
3131
  return res