pyxllib 0.3.96__py3-none-any.whl → 0.3.200__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (358)
  1. pyxllib/__init__.py +21 -21
  2. pyxllib/algo/__init__.py +8 -8
  3. pyxllib/algo/disjoint.py +54 -54
  4. pyxllib/algo/geo.py +541 -529
  5. pyxllib/algo/intervals.py +964 -964
  6. pyxllib/algo/matcher.py +389 -311
  7. pyxllib/algo/newbie.py +166 -166
  8. pyxllib/algo/pupil.py +629 -461
  9. pyxllib/algo/shapelylib.py +67 -67
  10. pyxllib/algo/specialist.py +241 -240
  11. pyxllib/algo/stat.py +494 -458
  12. pyxllib/algo/treelib.py +149 -149
  13. pyxllib/algo/unitlib.py +66 -66
  14. {pyxlpr → pyxllib/autogui}/__init__.py +5 -5
  15. pyxllib/autogui/activewin.py +246 -0
  16. pyxllib/autogui/all.py +9 -0
  17. pyxllib/{ext/autogui → autogui}/autogui.py +852 -823
  18. pyxllib/autogui/uiautolib.py +362 -0
  19. pyxllib/{ext/autogui → autogui}/virtualkey.py +102 -102
  20. pyxllib/autogui/wechat.py +827 -0
  21. pyxllib/autogui/wechat_msg.py +421 -0
  22. pyxllib/autogui/wxautolib.py +84 -0
  23. pyxllib/cv/__init__.py +5 -5
  24. pyxllib/cv/expert.py +267 -267
  25. pyxllib/cv/imfile.py +159 -159
  26. pyxllib/cv/imhash.py +39 -39
  27. pyxllib/cv/pupil.py +9 -9
  28. pyxllib/cv/rgbfmt.py +1525 -1525
  29. pyxllib/cv/slidercaptcha.py +137 -0
  30. pyxllib/cv/trackbartools.py +251 -251
  31. pyxllib/cv/xlcvlib.py +1040 -1040
  32. pyxllib/cv/xlpillib.py +423 -423
  33. pyxllib/data/echarts.py +240 -129
  34. pyxllib/data/jsonlib.py +89 -0
  35. pyxllib/data/oss.py +72 -72
  36. pyxllib/data/pglib.py +1127 -643
  37. pyxllib/data/sqlite.py +568 -341
  38. pyxllib/data/sqllib.py +297 -297
  39. pyxllib/ext/JLineViewer.py +505 -492
  40. pyxllib/ext/__init__.py +6 -6
  41. pyxllib/ext/demolib.py +246 -246
  42. pyxllib/ext/drissionlib.py +277 -0
  43. pyxllib/ext/kq5034lib.py +12 -1606
  44. pyxllib/ext/old.py +663 -663
  45. pyxllib/ext/qt.py +449 -449
  46. pyxllib/ext/robustprocfile.py +497 -0
  47. pyxllib/ext/seleniumlib.py +76 -76
  48. pyxllib/ext/tk.py +173 -173
  49. pyxllib/ext/unixlib.py +827 -826
  50. pyxllib/ext/utools.py +351 -338
  51. pyxllib/ext/webhook.py +124 -101
  52. pyxllib/ext/win32lib.py +40 -40
  53. pyxllib/ext/wjxlib.py +88 -0
  54. pyxllib/ext/wpsapi.py +124 -0
  55. pyxllib/ext/xlwork.py +9 -0
  56. pyxllib/ext/yuquelib.py +1105 -173
  57. pyxllib/file/__init__.py +17 -17
  58. pyxllib/file/docxlib.py +761 -761
  59. pyxllib/file/gitlib.py +309 -309
  60. pyxllib/file/libreoffice.py +165 -0
  61. pyxllib/file/movielib.py +148 -139
  62. pyxllib/file/newbie.py +10 -10
  63. pyxllib/file/onenotelib.py +1469 -1469
  64. pyxllib/file/packlib/__init__.py +330 -293
  65. pyxllib/file/packlib/zipfile.py +2441 -2441
  66. pyxllib/file/pdflib.py +426 -426
  67. pyxllib/file/pupil.py +185 -185
  68. pyxllib/file/specialist/__init__.py +685 -685
  69. pyxllib/file/specialist/dirlib.py +799 -799
  70. pyxllib/file/specialist/download.py +193 -186
  71. pyxllib/file/specialist/filelib.py +2829 -2618
  72. pyxllib/file/xlsxlib.py +3131 -2976
  73. pyxllib/file/xlsyncfile.py +341 -0
  74. pyxllib/prog/__init__.py +5 -5
  75. pyxllib/prog/cachetools.py +64 -0
  76. pyxllib/prog/deprecatedlib.py +233 -233
  77. pyxllib/prog/filelock.py +42 -0
  78. pyxllib/prog/ipyexec.py +253 -253
  79. pyxllib/prog/multiprogs.py +940 -0
  80. pyxllib/prog/newbie.py +451 -444
  81. pyxllib/prog/pupil.py +1197 -1128
  82. pyxllib/prog/sitepackages.py +33 -33
  83. pyxllib/prog/specialist/__init__.py +391 -217
  84. pyxllib/prog/specialist/bc.py +203 -200
  85. pyxllib/prog/specialist/browser.py +497 -488
  86. pyxllib/prog/specialist/common.py +347 -347
  87. pyxllib/prog/specialist/datetime.py +199 -131
  88. pyxllib/prog/specialist/tictoc.py +240 -241
  89. pyxllib/prog/specialist/xllog.py +180 -180
  90. pyxllib/prog/xlosenv.py +108 -101
  91. pyxllib/stdlib/__init__.py +17 -17
  92. pyxllib/stdlib/tablepyxl/__init__.py +10 -10
  93. pyxllib/stdlib/tablepyxl/style.py +303 -303
  94. pyxllib/stdlib/tablepyxl/tablepyxl.py +130 -130
  95. pyxllib/text/__init__.py +8 -8
  96. pyxllib/text/ahocorasick.py +39 -39
  97. pyxllib/text/airscript.js +744 -0
  98. pyxllib/text/charclasslib.py +121 -109
  99. pyxllib/text/jiebalib.py +267 -264
  100. pyxllib/text/jinjalib.py +32 -0
  101. pyxllib/text/jsa_ai_prompt.md +271 -0
  102. pyxllib/text/jscode.py +922 -767
  103. pyxllib/text/latex/__init__.py +158 -158
  104. pyxllib/text/levenshtein.py +303 -303
  105. pyxllib/text/nestenv.py +1215 -1215
  106. pyxllib/text/newbie.py +300 -288
  107. pyxllib/text/pupil/__init__.py +8 -8
  108. pyxllib/text/pupil/common.py +1121 -1095
  109. pyxllib/text/pupil/xlalign.py +326 -326
  110. pyxllib/text/pycode.py +47 -47
  111. pyxllib/text/specialist/__init__.py +8 -8
  112. pyxllib/text/specialist/common.py +112 -112
  113. pyxllib/text/specialist/ptag.py +186 -186
  114. pyxllib/text/spellchecker.py +172 -172
  115. pyxllib/text/templates/echart_base.html +11 -0
  116. pyxllib/text/templates/highlight_code.html +17 -0
  117. pyxllib/text/templates/latex_editor.html +103 -0
  118. pyxllib/text/vbacode.py +17 -17
  119. pyxllib/text/xmllib.py +747 -685
  120. pyxllib/xl.py +42 -38
  121. pyxllib/xlcv.py +17 -17
  122. pyxllib-0.3.200.dist-info/METADATA +48 -0
  123. pyxllib-0.3.200.dist-info/RECORD +126 -0
  124. {pyxllib-0.3.96.dist-info → pyxllib-0.3.200.dist-info}/WHEEL +1 -2
  125. {pyxllib-0.3.96.dist-info → pyxllib-0.3.200.dist-info/licenses}/LICENSE +190 -190
  126. pyxllib/ext/autogui/__init__.py +0 -8
  127. pyxllib-0.3.96.dist-info/METADATA +0 -51
  128. pyxllib-0.3.96.dist-info/RECORD +0 -333
  129. pyxllib-0.3.96.dist-info/top_level.txt +0 -2
  130. pyxlpr/ai/__init__.py +0 -5
  131. pyxlpr/ai/clientlib.py +0 -1281
  132. pyxlpr/ai/specialist.py +0 -286
  133. pyxlpr/ai/torch_app.py +0 -172
  134. pyxlpr/ai/xlpaddle.py +0 -655
  135. pyxlpr/ai/xltorch.py +0 -705
  136. pyxlpr/data/__init__.py +0 -11
  137. pyxlpr/data/coco.py +0 -1325
  138. pyxlpr/data/datacls.py +0 -365
  139. pyxlpr/data/datasets.py +0 -200
  140. pyxlpr/data/gptlib.py +0 -1291
  141. pyxlpr/data/icdar/__init__.py +0 -96
  142. pyxlpr/data/icdar/deteval.py +0 -377
  143. pyxlpr/data/icdar/icdar2013.py +0 -341
  144. pyxlpr/data/icdar/iou.py +0 -340
  145. pyxlpr/data/icdar/rrc_evaluation_funcs_1_1.py +0 -463
  146. pyxlpr/data/imtextline.py +0 -473
  147. pyxlpr/data/labelme.py +0 -866
  148. pyxlpr/data/removeline.py +0 -179
  149. pyxlpr/data/specialist.py +0 -57
  150. pyxlpr/eval/__init__.py +0 -85
  151. pyxlpr/paddleocr.py +0 -776
  152. pyxlpr/ppocr/__init__.py +0 -15
  153. pyxlpr/ppocr/configs/rec/multi_language/generate_multi_language_configs.py +0 -226
  154. pyxlpr/ppocr/data/__init__.py +0 -135
  155. pyxlpr/ppocr/data/imaug/ColorJitter.py +0 -26
  156. pyxlpr/ppocr/data/imaug/__init__.py +0 -67
  157. pyxlpr/ppocr/data/imaug/copy_paste.py +0 -170
  158. pyxlpr/ppocr/data/imaug/east_process.py +0 -437
  159. pyxlpr/ppocr/data/imaug/gen_table_mask.py +0 -244
  160. pyxlpr/ppocr/data/imaug/iaa_augment.py +0 -114
  161. pyxlpr/ppocr/data/imaug/label_ops.py +0 -789
  162. pyxlpr/ppocr/data/imaug/make_border_map.py +0 -184
  163. pyxlpr/ppocr/data/imaug/make_pse_gt.py +0 -106
  164. pyxlpr/ppocr/data/imaug/make_shrink_map.py +0 -126
  165. pyxlpr/ppocr/data/imaug/operators.py +0 -433
  166. pyxlpr/ppocr/data/imaug/pg_process.py +0 -906
  167. pyxlpr/ppocr/data/imaug/randaugment.py +0 -143
  168. pyxlpr/ppocr/data/imaug/random_crop_data.py +0 -239
  169. pyxlpr/ppocr/data/imaug/rec_img_aug.py +0 -533
  170. pyxlpr/ppocr/data/imaug/sast_process.py +0 -777
  171. pyxlpr/ppocr/data/imaug/text_image_aug/__init__.py +0 -17
  172. pyxlpr/ppocr/data/imaug/text_image_aug/augment.py +0 -120
  173. pyxlpr/ppocr/data/imaug/text_image_aug/warp_mls.py +0 -168
  174. pyxlpr/ppocr/data/lmdb_dataset.py +0 -115
  175. pyxlpr/ppocr/data/pgnet_dataset.py +0 -104
  176. pyxlpr/ppocr/data/pubtab_dataset.py +0 -107
  177. pyxlpr/ppocr/data/simple_dataset.py +0 -372
  178. pyxlpr/ppocr/losses/__init__.py +0 -61
  179. pyxlpr/ppocr/losses/ace_loss.py +0 -52
  180. pyxlpr/ppocr/losses/basic_loss.py +0 -135
  181. pyxlpr/ppocr/losses/center_loss.py +0 -88
  182. pyxlpr/ppocr/losses/cls_loss.py +0 -30
  183. pyxlpr/ppocr/losses/combined_loss.py +0 -67
  184. pyxlpr/ppocr/losses/det_basic_loss.py +0 -208
  185. pyxlpr/ppocr/losses/det_db_loss.py +0 -80
  186. pyxlpr/ppocr/losses/det_east_loss.py +0 -63
  187. pyxlpr/ppocr/losses/det_pse_loss.py +0 -149
  188. pyxlpr/ppocr/losses/det_sast_loss.py +0 -121
  189. pyxlpr/ppocr/losses/distillation_loss.py +0 -272
  190. pyxlpr/ppocr/losses/e2e_pg_loss.py +0 -140
  191. pyxlpr/ppocr/losses/kie_sdmgr_loss.py +0 -113
  192. pyxlpr/ppocr/losses/rec_aster_loss.py +0 -99
  193. pyxlpr/ppocr/losses/rec_att_loss.py +0 -39
  194. pyxlpr/ppocr/losses/rec_ctc_loss.py +0 -44
  195. pyxlpr/ppocr/losses/rec_enhanced_ctc_loss.py +0 -70
  196. pyxlpr/ppocr/losses/rec_nrtr_loss.py +0 -30
  197. pyxlpr/ppocr/losses/rec_sar_loss.py +0 -28
  198. pyxlpr/ppocr/losses/rec_srn_loss.py +0 -47
  199. pyxlpr/ppocr/losses/table_att_loss.py +0 -109
  200. pyxlpr/ppocr/metrics/__init__.py +0 -44
  201. pyxlpr/ppocr/metrics/cls_metric.py +0 -45
  202. pyxlpr/ppocr/metrics/det_metric.py +0 -82
  203. pyxlpr/ppocr/metrics/distillation_metric.py +0 -73
  204. pyxlpr/ppocr/metrics/e2e_metric.py +0 -86
  205. pyxlpr/ppocr/metrics/eval_det_iou.py +0 -274
  206. pyxlpr/ppocr/metrics/kie_metric.py +0 -70
  207. pyxlpr/ppocr/metrics/rec_metric.py +0 -75
  208. pyxlpr/ppocr/metrics/table_metric.py +0 -50
  209. pyxlpr/ppocr/modeling/architectures/__init__.py +0 -32
  210. pyxlpr/ppocr/modeling/architectures/base_model.py +0 -88
  211. pyxlpr/ppocr/modeling/architectures/distillation_model.py +0 -60
  212. pyxlpr/ppocr/modeling/backbones/__init__.py +0 -54
  213. pyxlpr/ppocr/modeling/backbones/det_mobilenet_v3.py +0 -268
  214. pyxlpr/ppocr/modeling/backbones/det_resnet_vd.py +0 -246
  215. pyxlpr/ppocr/modeling/backbones/det_resnet_vd_sast.py +0 -285
  216. pyxlpr/ppocr/modeling/backbones/e2e_resnet_vd_pg.py +0 -265
  217. pyxlpr/ppocr/modeling/backbones/kie_unet_sdmgr.py +0 -186
  218. pyxlpr/ppocr/modeling/backbones/rec_mobilenet_v3.py +0 -138
  219. pyxlpr/ppocr/modeling/backbones/rec_mv1_enhance.py +0 -258
  220. pyxlpr/ppocr/modeling/backbones/rec_nrtr_mtb.py +0 -48
  221. pyxlpr/ppocr/modeling/backbones/rec_resnet_31.py +0 -210
  222. pyxlpr/ppocr/modeling/backbones/rec_resnet_aster.py +0 -143
  223. pyxlpr/ppocr/modeling/backbones/rec_resnet_fpn.py +0 -307
  224. pyxlpr/ppocr/modeling/backbones/rec_resnet_vd.py +0 -286
  225. pyxlpr/ppocr/modeling/heads/__init__.py +0 -54
  226. pyxlpr/ppocr/modeling/heads/cls_head.py +0 -52
  227. pyxlpr/ppocr/modeling/heads/det_db_head.py +0 -118
  228. pyxlpr/ppocr/modeling/heads/det_east_head.py +0 -121
  229. pyxlpr/ppocr/modeling/heads/det_pse_head.py +0 -37
  230. pyxlpr/ppocr/modeling/heads/det_sast_head.py +0 -128
  231. pyxlpr/ppocr/modeling/heads/e2e_pg_head.py +0 -253
  232. pyxlpr/ppocr/modeling/heads/kie_sdmgr_head.py +0 -206
  233. pyxlpr/ppocr/modeling/heads/multiheadAttention.py +0 -163
  234. pyxlpr/ppocr/modeling/heads/rec_aster_head.py +0 -393
  235. pyxlpr/ppocr/modeling/heads/rec_att_head.py +0 -202
  236. pyxlpr/ppocr/modeling/heads/rec_ctc_head.py +0 -88
  237. pyxlpr/ppocr/modeling/heads/rec_nrtr_head.py +0 -826
  238. pyxlpr/ppocr/modeling/heads/rec_sar_head.py +0 -402
  239. pyxlpr/ppocr/modeling/heads/rec_srn_head.py +0 -280
  240. pyxlpr/ppocr/modeling/heads/self_attention.py +0 -406
  241. pyxlpr/ppocr/modeling/heads/table_att_head.py +0 -246
  242. pyxlpr/ppocr/modeling/necks/__init__.py +0 -32
  243. pyxlpr/ppocr/modeling/necks/db_fpn.py +0 -111
  244. pyxlpr/ppocr/modeling/necks/east_fpn.py +0 -188
  245. pyxlpr/ppocr/modeling/necks/fpn.py +0 -138
  246. pyxlpr/ppocr/modeling/necks/pg_fpn.py +0 -314
  247. pyxlpr/ppocr/modeling/necks/rnn.py +0 -92
  248. pyxlpr/ppocr/modeling/necks/sast_fpn.py +0 -284
  249. pyxlpr/ppocr/modeling/necks/table_fpn.py +0 -110
  250. pyxlpr/ppocr/modeling/transforms/__init__.py +0 -28
  251. pyxlpr/ppocr/modeling/transforms/stn.py +0 -135
  252. pyxlpr/ppocr/modeling/transforms/tps.py +0 -308
  253. pyxlpr/ppocr/modeling/transforms/tps_spatial_transformer.py +0 -156
  254. pyxlpr/ppocr/optimizer/__init__.py +0 -61
  255. pyxlpr/ppocr/optimizer/learning_rate.py +0 -228
  256. pyxlpr/ppocr/optimizer/lr_scheduler.py +0 -49
  257. pyxlpr/ppocr/optimizer/optimizer.py +0 -160
  258. pyxlpr/ppocr/optimizer/regularizer.py +0 -52
  259. pyxlpr/ppocr/postprocess/__init__.py +0 -55
  260. pyxlpr/ppocr/postprocess/cls_postprocess.py +0 -33
  261. pyxlpr/ppocr/postprocess/db_postprocess.py +0 -234
  262. pyxlpr/ppocr/postprocess/east_postprocess.py +0 -143
  263. pyxlpr/ppocr/postprocess/locality_aware_nms.py +0 -200
  264. pyxlpr/ppocr/postprocess/pg_postprocess.py +0 -52
  265. pyxlpr/ppocr/postprocess/pse_postprocess/__init__.py +0 -15
  266. pyxlpr/ppocr/postprocess/pse_postprocess/pse/__init__.py +0 -29
  267. pyxlpr/ppocr/postprocess/pse_postprocess/pse/setup.py +0 -14
  268. pyxlpr/ppocr/postprocess/pse_postprocess/pse_postprocess.py +0 -118
  269. pyxlpr/ppocr/postprocess/rec_postprocess.py +0 -654
  270. pyxlpr/ppocr/postprocess/sast_postprocess.py +0 -355
  271. pyxlpr/ppocr/tools/__init__.py +0 -14
  272. pyxlpr/ppocr/tools/eval.py +0 -83
  273. pyxlpr/ppocr/tools/export_center.py +0 -77
  274. pyxlpr/ppocr/tools/export_model.py +0 -129
  275. pyxlpr/ppocr/tools/infer/predict_cls.py +0 -151
  276. pyxlpr/ppocr/tools/infer/predict_det.py +0 -300
  277. pyxlpr/ppocr/tools/infer/predict_e2e.py +0 -169
  278. pyxlpr/ppocr/tools/infer/predict_rec.py +0 -414
  279. pyxlpr/ppocr/tools/infer/predict_system.py +0 -204
  280. pyxlpr/ppocr/tools/infer/utility.py +0 -629
  281. pyxlpr/ppocr/tools/infer_cls.py +0 -83
  282. pyxlpr/ppocr/tools/infer_det.py +0 -134
  283. pyxlpr/ppocr/tools/infer_e2e.py +0 -122
  284. pyxlpr/ppocr/tools/infer_kie.py +0 -153
  285. pyxlpr/ppocr/tools/infer_rec.py +0 -146
  286. pyxlpr/ppocr/tools/infer_table.py +0 -107
  287. pyxlpr/ppocr/tools/program.py +0 -596
  288. pyxlpr/ppocr/tools/test_hubserving.py +0 -117
  289. pyxlpr/ppocr/tools/train.py +0 -163
  290. pyxlpr/ppocr/tools/xlprog.py +0 -748
  291. pyxlpr/ppocr/utils/EN_symbol_dict.txt +0 -94
  292. pyxlpr/ppocr/utils/__init__.py +0 -24
  293. pyxlpr/ppocr/utils/dict/ar_dict.txt +0 -117
  294. pyxlpr/ppocr/utils/dict/arabic_dict.txt +0 -162
  295. pyxlpr/ppocr/utils/dict/be_dict.txt +0 -145
  296. pyxlpr/ppocr/utils/dict/bg_dict.txt +0 -140
  297. pyxlpr/ppocr/utils/dict/chinese_cht_dict.txt +0 -8421
  298. pyxlpr/ppocr/utils/dict/cyrillic_dict.txt +0 -163
  299. pyxlpr/ppocr/utils/dict/devanagari_dict.txt +0 -167
  300. pyxlpr/ppocr/utils/dict/en_dict.txt +0 -63
  301. pyxlpr/ppocr/utils/dict/fa_dict.txt +0 -136
  302. pyxlpr/ppocr/utils/dict/french_dict.txt +0 -136
  303. pyxlpr/ppocr/utils/dict/german_dict.txt +0 -143
  304. pyxlpr/ppocr/utils/dict/hi_dict.txt +0 -162
  305. pyxlpr/ppocr/utils/dict/it_dict.txt +0 -118
  306. pyxlpr/ppocr/utils/dict/japan_dict.txt +0 -4399
  307. pyxlpr/ppocr/utils/dict/ka_dict.txt +0 -153
  308. pyxlpr/ppocr/utils/dict/korean_dict.txt +0 -3688
  309. pyxlpr/ppocr/utils/dict/latin_dict.txt +0 -185
  310. pyxlpr/ppocr/utils/dict/mr_dict.txt +0 -153
  311. pyxlpr/ppocr/utils/dict/ne_dict.txt +0 -153
  312. pyxlpr/ppocr/utils/dict/oc_dict.txt +0 -96
  313. pyxlpr/ppocr/utils/dict/pu_dict.txt +0 -130
  314. pyxlpr/ppocr/utils/dict/rs_dict.txt +0 -91
  315. pyxlpr/ppocr/utils/dict/rsc_dict.txt +0 -134
  316. pyxlpr/ppocr/utils/dict/ru_dict.txt +0 -125
  317. pyxlpr/ppocr/utils/dict/ta_dict.txt +0 -128
  318. pyxlpr/ppocr/utils/dict/table_dict.txt +0 -277
  319. pyxlpr/ppocr/utils/dict/table_structure_dict.txt +0 -2759
  320. pyxlpr/ppocr/utils/dict/te_dict.txt +0 -151
  321. pyxlpr/ppocr/utils/dict/ug_dict.txt +0 -114
  322. pyxlpr/ppocr/utils/dict/uk_dict.txt +0 -142
  323. pyxlpr/ppocr/utils/dict/ur_dict.txt +0 -137
  324. pyxlpr/ppocr/utils/dict/xi_dict.txt +0 -110
  325. pyxlpr/ppocr/utils/dict90.txt +0 -90
  326. pyxlpr/ppocr/utils/e2e_metric/Deteval.py +0 -574
  327. pyxlpr/ppocr/utils/e2e_metric/polygon_fast.py +0 -83
  328. pyxlpr/ppocr/utils/e2e_utils/extract_batchsize.py +0 -87
  329. pyxlpr/ppocr/utils/e2e_utils/extract_textpoint_fast.py +0 -457
  330. pyxlpr/ppocr/utils/e2e_utils/extract_textpoint_slow.py +0 -592
  331. pyxlpr/ppocr/utils/e2e_utils/pgnet_pp_utils.py +0 -162
  332. pyxlpr/ppocr/utils/e2e_utils/visual.py +0 -162
  333. pyxlpr/ppocr/utils/en_dict.txt +0 -95
  334. pyxlpr/ppocr/utils/gen_label.py +0 -81
  335. pyxlpr/ppocr/utils/ic15_dict.txt +0 -36
  336. pyxlpr/ppocr/utils/iou.py +0 -54
  337. pyxlpr/ppocr/utils/logging.py +0 -69
  338. pyxlpr/ppocr/utils/network.py +0 -84
  339. pyxlpr/ppocr/utils/ppocr_keys_v1.txt +0 -6623
  340. pyxlpr/ppocr/utils/profiler.py +0 -110
  341. pyxlpr/ppocr/utils/save_load.py +0 -150
  342. pyxlpr/ppocr/utils/stats.py +0 -72
  343. pyxlpr/ppocr/utils/utility.py +0 -80
  344. pyxlpr/ppstructure/__init__.py +0 -13
  345. pyxlpr/ppstructure/predict_system.py +0 -187
  346. pyxlpr/ppstructure/table/__init__.py +0 -13
  347. pyxlpr/ppstructure/table/eval_table.py +0 -72
  348. pyxlpr/ppstructure/table/matcher.py +0 -192
  349. pyxlpr/ppstructure/table/predict_structure.py +0 -136
  350. pyxlpr/ppstructure/table/predict_table.py +0 -221
  351. pyxlpr/ppstructure/table/table_metric/__init__.py +0 -16
  352. pyxlpr/ppstructure/table/table_metric/parallel.py +0 -51
  353. pyxlpr/ppstructure/table/table_metric/table_metric.py +0 -247
  354. pyxlpr/ppstructure/table/tablepyxl/__init__.py +0 -13
  355. pyxlpr/ppstructure/table/tablepyxl/style.py +0 -283
  356. pyxlpr/ppstructure/table/tablepyxl/tablepyxl.py +0 -118
  357. pyxlpr/ppstructure/utility.py +0 -71
  358. pyxlpr/xlai.py +0 -10
pyxllib/algo/stat.py CHANGED
@@ -1,458 +1,494 @@
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- # @Author : 陈坤泽
- # @Email : 877362867@qq.com
- # @Date : 2021/06/03 23:04
-
- """ Statistics utilities
-
- Mainly pandas and table operations
- """
-
- import sys
- from collections import defaultdict, Counter
-
- import pandas as pd
-
- from pyxllib.prog.pupil import dprint, typename
- from pyxllib.file.specialist import XlPath
-
- pd.options.display.unicode.east_asian_width = True  # improve column alignment when printing East Asian text
-
-
- def treetable(childreds, parents, arg3=None, nodename_colname=None):
-     """ Takes childreds, a list of child node ids, and parents, a list of parent node ids
-
-     The two lists must have equal length.
-     Docs: http://note.youdao.com/noteshare?id=126200f45d301fcb4364d06a0cae8376
-
-     There are two calling conventions
-     >> treetable(childreds, parents) --> DataFrame (a new df)
-     >> treetable(df, child_colname, parent_colname) --> DataFrame (the modified df)
-
-     Returns a two-dimensional table listing
-         the new childreds (virtual nodes may be appended at the end)
-         the new parents
-         the tree_order computed for each child row after sorting the tree
-         and the depth of every node
-
-     >> ls1 = [6, 2, 4, 5, 3], ls2 = [7, 1, 2, 2, 1], treetable(ls1, ls2)
-       child_id parent_id  depth  tree_order  tree_struct
-     5        7      root      1           1  = = 7
-     0        6         7      2           2  = = = = 6
-     6        1      root      1           3  = = 1
-     1        2         1      2           4  = = = = 2
-     2        4         2      3           5  = = = = = = 4
-     3        5         2      3           6  = = = = = = 5
-     4        3         1      2           7  = = = = 3
-     """
-     # 0 Preprocess the arguments
-     if isinstance(childreds, pd.DataFrame):
-         df = childreds
-         child_colname = parents
-         parent_colname = arg3
-         if not arg3: raise TypeError
-         childreds = df[child_colname].tolist()
-         parents = df[parent_colname].tolist()
-     else:
-         df = None
-
-     # 1 Create the root node, ensuring every node except root has a record
-     lefts = set(parents) - set(childreds)  # nodes in parents that never appear in childreds
-     cs, ps = list(childreds), list(parents)
-
-     if len(lefts) == 0:
-         # an empty lefts set implies a cycle; a non-empty one still does not guarantee a proper tree
-         raise ValueError('有环,不是树结构')
-     elif len(lefts) == 1:  # exactly one node never appears as a child, so it is the root
-         root = list(lefts)[0]
-     else:  # several parents have no record, so attach one shared root parent above them
-         root = 'root'
-         allnode = set(parents) | set(childreds)  # the set of all nodes
-         while root in allnode: root += '-'  # keep appending '-' until the name does not occur in the input
-         # append the nodes
-         lefts = list(lefts)
-         lefts.sort(key=lambda x: parents.index(x))
-         for t in lefts:
-             cs.append(t)
-             ps.append(root)
-
-     n = len(cs)
-     depth, tree_order, len_childs = [-1] * n, [-1] * n, [0] * n
-
-     # 2 Build dd, a mapping from parent node to its child rows
-     dd = defaultdict(list)
-     for i in range(n): dd[ps[i]] += [i]
-
-     # 3 dfs
-     cnt = 1
-
-     def dfs(node, d):
-         """visit all children of node"""
-         nonlocal cnt
-         for i in dd.get(node, []):
-             tree_order[i], depth[i], len_childs[i] = cnt, d, len(dd[cs[i]])
-             cnt += 1
-             dfs(cs[i], d + 1)
-
-     dfs(root, 1)
-
-     # 4 Output formatting
-     tree_struct = list(map(lambda i: f"{'_ _ ' * depth[i]}{cs[i]}" + (f'[{len_childs[i]}]' if len_childs[i] else ''),
-                            range(n)))
-
-     if df is None:
-         ls = list(zip(cs, ps, depth, tree_order, len_childs, tree_struct))
-         df = pd.DataFrame.from_records(ls, columns=('child_id', 'parent_id',
-                                                     'depth', 'tree_order', 'len_childs', 'tree_struct'))
-     else:
-         k = len(df)
-         df = df.append(pd.DataFrame({child_colname: cs[k:], parent_colname: ps[k:]}), sort=False, ignore_index=True)
-         if nodename_colname:
-             tree_struct = list(
-                 map(lambda i: f"{'_ _ ' * depth[i]}{cs[i]} {df.iloc[i][nodename_colname]}"
-                               + (f'[{len_childs[i]}]' if len_childs[i] else ''), range(n)))
-         df['depth'], df['tree_order'], df['len_childs'], df['tree_struct'] = depth, tree_order, len_childs, tree_struct
-     df.sort_values('tree_order', inplace=True)  # note: sorting may sometimes have to be skipped to keep the input order
-     return df
-
-
- def treetable_flatten(df, *, reverse=False, childid_colname='id', parentid_colname='parent_id', format_colname=None):
-     """ Produce a horizontally flattened knowledge tree: columns depth-3, depth-2, depth-1 are the 3rd-, 2nd- and 1st-from-last levels
-
-     :param df: the DataFrame data
-     :param reverse:
-         False, list node info normally as depth1, depth2, depth3...
-         True, list the ancestry in reverse, i.e. the last level parent1 first, then the second-to-last parent2...
-     :param childid_colname: the child node column
-     :param parentid_colname: the parent node column
-     :param format_colname: the value to display
-         None, use the value of childid_colname by default
-         str, the name of a column whose value is used (so a ready-made format can be applied)
-     :return:
-     """
-     # 1 Build the helper mappings
-     if format_colname is None: format_colname = parentid_colname
-     parentid = dict()  # parentid[k] = v: node k has parent v
-     nodeval = dict()  # nodeval[k] = v: node k displays value v
-     if len(df[df.index.duplicated()]):
-         dprint(len(set(df.index)), len(df.index))  # duplicated index values
-         raise ValueError
-
-     for idx, row in df.iterrows():
-         parentid[row[childid_colname]] = row[parentid_colname]
-         nodeval[row[childid_colname]] = str(row[format_colname])
-
-     # 2 Walk upward from every node to collect all of its ancestors
-     parents = []
-     for idx, row in df.iterrows():
-         ps = [nodeval[row[childid_colname]]]  # names of all ancestors, including the node itself
-         p = row[parentid_colname]
-         while p in parentid:
-             ps.append(nodeval[p])
-             p = parentid[p]
-         parents.append(ps)
-     num_depth = max(map(len, parents), default=0)
-
-     # 3 The final display format can be adjusted flexibly here
-     df['parents'] = parents
-     if reverse:
-         for j in range(num_depth, 0, -1): df[f'depth-{j}'] = ''
-         for idx, row in df.iterrows():
-             for j in range(1, len(row.parents) + 1):
-                 df.loc[idx, f'depth-{j}'] = row.parents[j - 1]
-     else:
-         for j in range(num_depth): df[f'depth{j}'] = ''
-         for idx, row in df.iterrows():
-             for j in range(len(row.parents)):
-                 df.loc[idx, f'depth{j}'] = row.parents[-j - 1]
-     df.drop('parents', axis=1, inplace=True)
-     return df
-
-
- def write_dataframes_to_excel(outfile, dataframes, order_mode='序号'):
-     """ Write several DataFrames into one Excel file, adding a sequence-number column
-
-     :param str outfile: path of the output Excel file
-     :param dict dataframes: dict of the DataFrames to save, keyed by sheet name
-     :param str order_mode: numbering mode, either 'default' or '序号' (sequence number), defaulting to '序号'
-
-     >> write_dataframes_to_excel('test.xlsx', {'images': df1, 'annotations': df2})
-
-     # TODO after saving, the workbook could be reopened with openpyxl etc. for fine-tuning
-
-     The implementation prettifies some common table structures where possible; for non-standard
-     structures it keeps the default df layout and does no special handling.
-     """
-     with pd.ExcelWriter(str(outfile), engine='xlsxwriter') as writer:
-         head_format = writer.book.add_format({'font_size': 12, 'font_color': 'blue',
-                                               'align': 'left', 'valign': 'vcenter'})
-         for sheet_name, df in dataframes.items():
-             if df.index.nlevels == 1 and df.columns.nlevels == 1:
-                 if order_mode == '序号':
-                     # write the table with a sequence-number column
-                     if '序号' not in df.columns:
-                         df = df.copy()
-                         df.insert(0, '序号', range(1, len(df) + 1))
-                 else:
-                     df = df.reset_index()
-                     df.columns = ['_index'] + list(df.columns[1:])
-                 df.to_excel(writer, sheet_name=sheet_name, freeze_panes=(1, 0), index=False)
-             else:
-                 # write the table as-is
-                 df.to_excel(writer, sheet_name=sheet_name, freeze_panes=(1, df.index.nlevels))
-
-             # format the header row
-             if df.columns.nlevels == 1:
-                 start = df.index.nlevels
-                 if start == 1:
-                     start = 0
-                 for col_num, value in enumerate(df.columns, start=start):
-                     writer.sheets[sheet_name].write(0, col_num, value, head_format)
-
-
- def read_dataframes_from_excel(infile):
-     """ Read several DataFrames from an Excel file
-
-     :param str infile: path of the Excel file
-     :return: dict of the DataFrames read, keyed by worksheet name
-     :rtype: dict
-
-     Note that this function is not well suited to multi-level index/columns; in that case it is better to read
-     manually, since read_excel accepts header=[0,1], index=[0,1,2] and the like to locate the header rows.
-     """
-     dataframes = {}
-     with pd.ExcelFile(infile) as xls:
-         sheet_names = xls.sheet_names
-         for sheet_name in sheet_names:
-             df = pd.read_excel(xls, sheet_name=sheet_name)
-             if '_index' in df.columns:
-                 df = df.drop('_index', axis=1)
-             dataframes[sheet_name] = df
-     return dataframes
-
-
- def update_dataframes_to_excel(outfile, dataframes, order_mode='序号'):
-     """ Update the sheet data inside an xlsx file """
-     outfile = XlPath(outfile)
-     if outfile.is_file():
-         data = read_dataframes_from_excel(outfile)
-     else:
-         data = {}
-     data.update(dataframes)
-     write_dataframes_to_excel(outfile, data, order_mode)
-
-
- def xlpivot(df, index=None, columns=None, values=None):
-     """ A pivot-table helper wrapped around pandas
-
-     :param df: the data table
-     :param index: how to partition the rows
-     :param columns: how to partition the columns
-     :param values: the values to display
-         Callable[items, value]: a function producing the output value
-     :return: the pivot table
-
-     Usage example:
-     def func(items):  # receives the matched rows
-         x = items.iloc[0]
-         return f'{x["precision"]:.0f},{x["recall"]:.0f},{x["hmean"]:.2f},{x["fps"]}'  # the value to display
-     >> df2 = xlpivot(df, ['model_type'], ['dataset', 'total_frame'], {'precision,recall,hmean,fps': func})
-     """
-
-     # 1 Normalize the grouping specs
-     def reset_groups(keys):
-         if isinstance(keys, (list, tuple)):
-             return list(keys)
-         elif keys:
-             return [keys]
-         else:
-             return []
-
-     index_, columns_ = reset_groups(index), reset_groups(columns)
-
-     # 2 Normalize the target-value spec
-     if callable(values):
-         values_ = {'values': values}
-     elif isinstance(values, dict):
-         values_ = values
-     else:
-         raise TypeError
-
-     # 3 Group
-     keys = index_ + columns_
-     dfgp = df.groupby(keys)
-     data = defaultdict(list)
-     for ks, items in dfgp:
-         # store the grouping (keys) values
-         if len(keys) == 1:
-             data[keys[0]].append(ks)
-         else:
-             for i, k in enumerate(keys):
-                 data[k].append(ks[i])
-         # then store the generated values
-         for k, func in values_.items():
-             data[k].append(func(items))
-     df2 = pd.DataFrame.from_dict(data)
-
-     # 4 Build the display table
-     if index and columns:
-         view_table = df2.pivot(index=index, columns=columns, values=list(values_.keys()))
-     elif index:
-         view_table = df2.set_index(index_)
-     else:  # columns only, no index
-         view_table = df2.set_index(index_).T
-     return view_table
-
-
- def count_key_combinations(df, col_names, count_col_name='count'):
-     """ Count how often every combination of the listed column values occurs
-
-     :param df:
-     :param col_names: ['a', 'b', 'c']
-     :param count_col_name: name of the added occurrence-count column, 'count' by default
-     :return: a new df with the combination counts
-
-     This works much like SqlCodeGenerator keys_count, one2many and can replace both
-     """
-     from collections import Counter
-
-     # 0 Handle the arguments
-     if isinstance(col_names, str):
-         col_names = [col_names]
-
-     # 1 Count the occurrences of each combination
-     cols = [df[name] for name in col_names]
-     ct = Counter(tuple(zip(*cols)))
-
-     # 2 Build the new statistics df
-     ls = []
-     for k, v in ct.most_common():
-         ls.append([*k, v])
-     df2 = pd.DataFrame.from_records(ls, columns=list(col_names) + [count_col_name])
-     return df2
-
-
- def pareto_accumulate(weights, accuracy=0.01, *, print_mode=False, value_unit_type='K'):
-     """ Pareto accumulation
-
-     Useful for analyzing which weights or frequencies dominate;
-     by the 80/20 rule, 20% of the data often solves 80% of the problem.
-
-     :param weights: a sequence of weights
-     :param accuracy: accumulation granularity; toward the tail there may be many values with tiny weights,
-         so the running total is not recorded at every step: a point is recorded only once the newly
-         accumulated weight reaches accuracy.
-         Note this is a percentage of the grand total, so the minimum update step is 1%.
-     :param print_mode: whether to render the result directly
-     :return: [(count of values so far, >= current threshold, accumulated weight), ...]
-
-     >>> pareto_accumulate([1, 2, 3, 9, 8, 7, 4, 6, 5])
-     [(1, 9, 9), (2, 8, 17), (3, 7, 24), (4, 6, 30), (5, 5, 35), (6, 4, 39), (7, 3, 42), (8, 2, 44), (9, 1, 45)]
-     >>> pareto_accumulate([1, 2, 3, 9, 8, 7, 4, 6, 5], 0.1)
-     [(1, 9, 9), (2, 8, 17), (3, 7, 24), (4, 6, 30), (5, 5, 35), (7, 3, 42), (9, 1, 45)]
-     """
-     # 1 Base computation
-     points = []
-     weights = sorted(weights, reverse=True)
-
-     total = sum(weights)
-     accuracy = total * accuracy
-
-     acc = 0
-     delta = 0
-     for i, w in enumerate(weights, start=1):
-         acc += w
-         delta += w
-         if delta >= accuracy:
-             points.append((i, w, acc))
-             delta = 0
-     if delta:
-         points.append((len(weights), weights[-1], acc))
-
-     # 2 Present the result
-     def fmt(p):
-         from pyxllib.prog.newbie import human_readable_number
-         ls = [f'{human_readable_number(p[0], "")}条≥{human_readable_number(p[1])}',
-               f'{human_readable_number(p[2], value_unit_type)}({p[2] / total_size:.0%})']
-         return ','.join(map(str, ls))
-
-     total_size = points[-1][2]
-     labels = [fmt(p) for p in points]
-
-     pts = [[p[0], p[2]] for p in points]
-
-     if print_mode:
-         if sys.platform == 'win32':
-             from pyxllib.data.echarts import Line
-             from pyxllib.prog.specialist import browser
-
-             x = Line()
-             x.add_series('帕累托累积权重', pts, labels=labels, label={'position': 'right'})
-             browser(x)
-         else:
-             print(*labels, sep='\n')
-
-     return pts, labels
-
-
- class XlDataFrame(pd.DataFrame):
-     def check_dtypes(self):
-         """ Inspect the data types
-         column 1 is the column name, column 2 the type shown by dtypes, column 3 my extended tally of the actual value types
-         """
-         d = self.dtypes
-         ls = [[k, d[k], Counter([typename(x) for x in v]).most_common()] for k, v in self.iteritems()]
-         df = pd.DataFrame.from_records(ls, columns=['name', 'type', 'detail'])
-         return df
-
-
- class ModifiableRow:
-     def __init__(self, df, index):
-         self.df = df
-         self.index = index
-
-     def __getitem__(self, item):
-         return self.df.at[self.index, item]
-
-     def __setitem__(self, key, value):
-         self.df.at[self.index, key] = value
-
-
- def print_full_dataframe(df):
-     """
-     Temporarily adjust display options to print a DataFrame in full
-
-     :param pd.DataFrame df: the DataFrame to display in full
-     """
-     with pd.option_context('display.max_rows', None,
-                            'display.max_columns', None,
-                            'display.width', 1000,
-                            'display.max_colwidth', None):
-         print(df)
-
-     pd.options('display.max_rows', 60)
-
-
- def custom_fillna(df, default_fill_value='', numeric_fill_value=None, specific_fill=None):
-     """ Fill NaN values in a DataFrame with extra flexibility.
-
-     :param pandas.DataFrame df: the DataFrame to process.
-     :param str default_fill_value: default fill value for NaN in non-numeric columns.
-     :param numeric_fill_value: fill value for NaN in numeric columns; defaults to None if unspecified.
-     :param dict specific_fill: per-column fill values for NaN; defaults to None if unspecified.
-     :return: the pandas.DataFrame with NaN filled according to the given rules.
-
-     >>> df = pd.DataFrame({'A': [1, 2, None], 'B': [None, 'x', 'y'], 'C': [None, None, None]})
-     >>> custom_fillna(df, 'filled', 0, {'C': 'special'})
-     """
-     for column in df.columns:
-         # if the column has an entry in specific_fill, fill with that value
-         if specific_fill and column in specific_fill:
-             df[column] = df[column].fillna(specific_fill[column])
-         # if the column is numeric and numeric_fill_value is given, fill with it
-         elif numeric_fill_value is not None and pd.api.types.is_numeric_dtype(df[column]):
-             df[column] = df[column].fillna(numeric_fill_value)
-         # otherwise fill non-numeric columns with default_fill_value
-         elif pd.api.types.is_object_dtype(df[column]) or pd.api.types.is_string_dtype(df[column]):
-             df[column] = df[column].fillna(default_fill_value)
-         # more branches could be added here for other data types such as datetime
-     return df
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ # @Author : 陈坤泽
+ # @Email : 877362867@qq.com
+ # @Date : 2021/06/03 23:04
+
+ """ Statistics utilities
+
+ Mainly pandas and table operations
+ """
+
+ import sys
+ from collections import defaultdict, Counter
+
+ import pandas as pd
+
+ from pyxllib.prog.pupil import dprint, typename
+ from pyxllib.file.specialist import XlPath
+
+ pd.options.display.unicode.east_asian_width = True  # improve column alignment when printing East Asian text
+ try:
+     pd.set_option('future.no_silent_downcasting', True)
+ except Exception as e:
+     pass
+
+
+ def treetable(childreds, parents, arg3=None, nodename_colname=None):
+     """ Takes childreds, a list of child node ids, and parents, a list of parent node ids
+
+     The two lists must have equal length.
+     Docs: http://note.youdao.com/noteshare?id=126200f45d301fcb4364d06a0cae8376
+
+     There are two calling conventions
+     >> treetable(childreds, parents) --> DataFrame (a new df)
+     >> treetable(df, child_colname, parent_colname) --> DataFrame (the modified df)
+
+     Returns a two-dimensional table listing
+         the new childreds (virtual nodes may be appended at the end)
+         the new parents
+         the tree_order computed for each child row after sorting the tree
+         and the depth of every node
+
+     >> ls1 = [6, 2, 4, 5, 3], ls2 = [7, 1, 2, 2, 1], treetable(ls1, ls2)
+       child_id parent_id  depth  tree_order  tree_struct
+     5        7      root      1           1  = = 7
+     0        6         7      2           2  = = = = 6
+     6        1      root      1           3  = = 1
+     1        2         1      2           4  = = = = 2
+     2        4         2      3           5  = = = = = = 4
+     3        5         2      3           6  = = = = = = 5
+     4        3         1      2           7  = = = = 3
+     """
+     # 0 Preprocess the arguments
+     if isinstance(childreds, pd.DataFrame):
+         df = childreds
+         child_colname = parents
+         parent_colname = arg3
+         if not arg3: raise TypeError
+         childreds = df[child_colname].tolist()
+         parents = df[parent_colname].tolist()
+     else:
+         df = None
+
+     # 1 Create the root node, ensuring every node except root has a record
+     lefts = set(parents) - set(childreds)  # nodes in parents that never appear in childreds
+     cs, ps = list(childreds), list(parents)
+
+     if len(lefts) == 0:
+         # an empty lefts set implies a cycle; a non-empty one still does not guarantee a proper tree
+         raise ValueError('有环,不是树结构')
+     elif len(lefts) == 1:  # exactly one node never appears as a child, so it is the root
+         root = list(lefts)[0]
+     else:  # several parents have no record, so attach one shared root parent above them
+         root = 'root'
+         allnode = set(parents) | set(childreds)  # the set of all nodes
+         while root in allnode: root += '-'  # keep appending '-' until the name does not occur in the input
+         # append the nodes
+         lefts = list(lefts)
+         lefts.sort(key=lambda x: parents.index(x))
+         for t in lefts:
+             cs.append(t)
+             ps.append(root)
+
+     n = len(cs)
+     depth, tree_order, len_childs = [-1] * n, [-1] * n, [0] * n
+
+     # 2 Build dd, a mapping from parent node to its child rows
+     dd = defaultdict(list)
+     for i in range(n): dd[ps[i]] += [i]
+
+     # 3 dfs
+     cnt = 1
+
+     def dfs(node, d):
+         """visit all children of node"""
+         nonlocal cnt
+         for i in dd.get(node, []):
+             tree_order[i], depth[i], len_childs[i] = cnt, d, len(dd[cs[i]])
+             cnt += 1
+             dfs(cs[i], d + 1)
+
+     dfs(root, 1)
+
+     # 4 Output formatting
+     tree_struct = list(map(lambda i: f"{'_ _ ' * depth[i]}{cs[i]}" + (f'[{len_childs[i]}]' if len_childs[i] else ''),
+                            range(n)))
+
+     if df is None:
+         ls = list(zip(cs, ps, depth, tree_order, len_childs, tree_struct))
+         df = pd.DataFrame.from_records(ls, columns=('child_id', 'parent_id',
+                                                     'depth', 'tree_order', 'len_childs', 'tree_struct'))
+     else:
+         k = len(df)
+         df = df.append(pd.DataFrame({child_colname: cs[k:], parent_colname: ps[k:]}), sort=False, ignore_index=True)
+         if nodename_colname:
+             tree_struct = list(
+                 map(lambda i: f"{'_ _ ' * depth[i]}{cs[i]} {df.iloc[i][nodename_colname]}"
+                               + (f'[{len_childs[i]}]' if len_childs[i] else ''), range(n)))
+         df['depth'], df['tree_order'], df['len_childs'], df['tree_struct'] = depth, tree_order, len_childs, tree_struct
+     df.sort_values('tree_order', inplace=True)  # note: sorting may sometimes have to be skipped to keep the input order
+     return df
+
+
+ def treetable_flatten(df, *, reverse=False, childid_colname='id', parentid_colname='parent_id', format_colname=None):
+     """ Produce a horizontally flattened knowledge tree: columns depth-3, depth-2, depth-1 are the 3rd-, 2nd- and 1st-from-last levels
+
+     :param df: the DataFrame data
+     :param reverse:
+         False, list node info normally as depth1, depth2, depth3...
+         True, list the ancestry in reverse, i.e. the last level parent1 first, then the second-to-last parent2...
+     :param childid_colname: the child node column
+     :param parentid_colname: the parent node column
+     :param format_colname: the value to display
+         None, use the value of childid_colname by default
+         str, the name of a column whose value is used (so a ready-made format can be applied)
+     :return:
+     """
+     # 1 Build the helper mappings
+     if format_colname is None: format_colname = parentid_colname
+     parentid = dict()  # parentid[k] = v: node k has parent v
+     nodeval = dict()  # nodeval[k] = v: node k displays value v
+     if len(df[df.index.duplicated()]):
+         dprint(len(set(df.index)), len(df.index))  # duplicated index values
+         raise ValueError
+
+     for idx, row in df.iterrows():
+         parentid[row[childid_colname]] = row[parentid_colname]
+         nodeval[row[childid_colname]] = str(row[format_colname])
+
+     # 2 Walk upward from every node to collect all of its ancestors
+     parents = []
+     for idx, row in df.iterrows():
+         ps = [nodeval[row[childid_colname]]]  # names of all ancestors, including the node itself
+         p = row[parentid_colname]
+         while p in parentid:
+             ps.append(nodeval[p])
+             p = parentid[p]
+         parents.append(ps)
+     num_depth = max(map(len, parents), default=0)
+
+     # 3 The final display format can be adjusted flexibly here
+     df['parents'] = parents
+     if reverse:
+         for j in range(num_depth, 0, -1): df[f'depth-{j}'] = ''
+         for idx, row in df.iterrows():
+             for j in range(1, len(row.parents) + 1):
+                 df.loc[idx, f'depth-{j}'] = row.parents[j - 1]
+     else:
+         for j in range(num_depth): df[f'depth{j}'] = ''
+         for idx, row in df.iterrows():
+             for j in range(len(row.parents)):
+                 df.loc[idx, f'depth{j}'] = row.parents[-j - 1]
+     df.drop('parents', axis=1, inplace=True)
+     return df
+
+
+ def write_dataframes_to_excel(outfile, dataframes, order_mode='序号'):
+     """ Write several DataFrames into one Excel file, adding a sequence-number column
+
+     :param str outfile: path of the output Excel file
+     :param dict dataframes: dict of the DataFrames to save, keyed by sheet name
+     :param str order_mode: numbering mode, either 'default' or '序号' (sequence number), defaulting to '序号'
+
+     >> write_dataframes_to_excel('test.xlsx', {'images': df1, 'annotations': df2})
+
+     # TODO after saving, the workbook could be reopened with openpyxl etc. for fine-tuning
+
+     The implementation prettifies some common table structures where possible; for non-standard
+     structures it keeps the default df layout and does no special handling.
+     """
+     with pd.ExcelWriter(str(outfile), engine='xlsxwriter') as writer:
+         head_format = writer.book.add_format({'font_size': 12, 'font_color': 'blue',
+                                               'align': 'left', 'valign': 'vcenter'})
+         for sheet_name, df in dataframes.items():
+             if df.index.nlevels == 1 and df.columns.nlevels == 1:
+                 if order_mode == '序号':
+                     # write the table with a sequence-number column
+                     if '序号' not in df.columns:
+                         df = df.copy()
+                         df.insert(0, '序号', range(1, len(df) + 1))
+                 else:
+                     df = df.reset_index()
+                     df.columns = ['_index'] + list(df.columns[1:])
+                 df.to_excel(writer, sheet_name=sheet_name, freeze_panes=(1, 0), index=False)
+             else:
+                 # write the table as-is
+                 df.to_excel(writer, sheet_name=sheet_name, freeze_panes=(1, df.index.nlevels))
+
+             # format the header row
+             if df.columns.nlevels == 1:
+                 start = df.index.nlevels
+                 if start == 1:
+                     start = 0
+                 for col_num, value in enumerate(df.columns, start=start):
+                     writer.sheets[sheet_name].write(0, col_num, value, head_format)
+
+
+ def read_dataframes_from_excel(infile):
+     """ Read several DataFrames from an Excel file
+
+     :param str infile: path of the Excel file
+     :return: dict of the DataFrames read, keyed by worksheet name
+     :rtype: dict
+
+     Note that this function is not well suited to multi-level index/columns; in that case it is better to read
+     manually, since read_excel accepts header=[0,1], index=[0,1,2] and the like to locate the header rows.
+     """
+     dataframes = {}
+     with pd.ExcelFile(infile) as xls:
+         sheet_names = xls.sheet_names
+         for sheet_name in sheet_names:
+             df = pd.read_excel(xls, sheet_name=sheet_name)
+             if '_index' in df.columns:
+                 df = df.drop('_index', axis=1)
+             dataframes[sheet_name] = df
+     return dataframes
+
+
+ def update_dataframes_to_excel(outfile, dataframes, order_mode='序号'):
+     """ Update the sheet data inside an xlsx file """
+     outfile = XlPath(outfile)
+     if outfile.is_file():
+         data = read_dataframes_from_excel(outfile)
+     else:
+         data = {}
+     data.update(dataframes)
+     write_dataframes_to_excel(outfile, data, order_mode)
+
+
+ def xlpivot(df, index=None, columns=None, values=None):
+     """ A pivot-table helper wrapped around pandas
+
+     :param df: the data table
+     :param index: how to partition the rows
+     :param columns: how to partition the columns
+     :param values: the values to display
+         Callable[items, value]: a function producing the output value
+         list[str]: a list of column names is also accepted, meaning the raw values are shown; if the raw value
+             is not unique, the values are joined with commas. This usage is no longer a pivot table in the
+             traditional sense, though
+     :return: the pivot table
+
+     Usage example:
+     def func(items):  # receives the matched rows
+         x = items.iloc[0]
+         return f'{x["precision"]:.0f},{x["recall"]:.0f},{x["hmean"]:.2f},{x["fps"]}'  # the value to display
+
+     >> df2 = xlpivot(df, ['model_type'], ['dataset', 'total_frame'],
+                      {'precision,recall,hmean,fps': func})
+
+     Tip: to enforce a specific order after grouping, special prefixes with ordering numbers can be used for alignment
+     """
+
+     # 1 Normalize the grouping specs
+     def reset_groups(keys):
+         if isinstance(keys, (list, tuple)):
+             return list(keys)
+         elif keys:
+             return [keys]
+         else:
+             return []
+
+     index_, columns_ = reset_groups(index), reset_groups(columns)
+
+     # 2 Normalize the target-value spec
+     def make_col_func(col):
+         def func(rows):
+             if len(rows):
+                 return ', '.join(map(str, rows[col].values))
+             return ''
+
+         return func
+
+     if isinstance(values, (list, tuple)):
+         values = {v: make_col_func(v) for v in values}
+
+     if callable(values):
+         values_ = {'values': values}
+     elif isinstance(values, dict):
+         values_ = values
+     else:
+         raise TypeError
+
+     # 3 Group
+     assert len(df), 'df是空的'
+
+     keys = index_ + columns_
+     dfgp = df.groupby(keys)
+     data = defaultdict(list)
+     for ks, items in dfgp:
+         # store the grouping (keys) values
+         if len(keys) == 1:
+             data[keys[0]].append(ks)
+         else:
+             for i, k in enumerate(keys):
+                 data[k].append(ks[i])
+         # then store the generated values
+         for k, func in values_.items():
+             data[k].append(func(items))
+     df2 = pd.DataFrame.from_dict(data)
+
+     # 4 Build the display table
+     if index and columns:
+         view_table = df2.pivot(index=index, columns=columns, values=list(values_.keys()))
+     elif index:
+         view_table = df2.set_index(index_)
+     else:  # columns only, no index
+         view_table = df2.set_index(index_).T
+     return view_table
+
+
+ def count_key_combinations(df, col_names, count_col_name='count'):
+     """ Count how often every combination of the listed column values occurs
+
+     :param df:
+     :param col_names: ['a', 'b', 'c']
+     :param count_col_name: name of the added occurrence-count column, 'count' by default
+     :return: a new df with the combination counts
+
+     This works much like SqlCodeGenerator's keys_count, one2many and can replace both
+     """
+     from collections import Counter
+
+     # 0 Handle the arguments
+     if isinstance(col_names, str):
+         col_names = [col_names]
+
+     # 1 Count the occurrences of each combination
+     cols = [df[name] for name in col_names]
+     ct = Counter(tuple(zip(*cols)))
+
+     # 2 Build the new statistics df
+     ls = []
+     for k, v in ct.most_common():
+         ls.append([*k, v])
+     df2 = pd.DataFrame.from_records(ls, columns=list(col_names) + [count_col_name])
+     return df2
+
+
+ def pareto_accumulate(weights, accuracy=0.01, *, print_mode=False, value_unit_type='K'):
+     """ Pareto accumulation
+
+     Useful for analyzing which weights or frequencies dominate;
+     by the 80/20 rule, 20% of the data often solves 80% of the problem.
+
+     :param weights: a sequence of weights
+     :param accuracy: accumulation granularity; toward the tail there may be many values with tiny weights,
+         so the running total is not recorded at every step: a point is recorded only once the newly
+         accumulated weight reaches accuracy.
+         Note this is a percentage of the grand total, so the minimum update step is 1%.
+     :param print_mode: whether to render the result directly
+     :return: [(count of values so far, >= current threshold, accumulated weight), ...]
+
+     >>> pareto_accumulate([1, 2, 3, 9, 8, 7, 4, 6, 5])
+     [(1, 9, 9), (2, 8, 17), (3, 7, 24), (4, 6, 30), (5, 5, 35), (6, 4, 39), (7, 3, 42), (8, 2, 44), (9, 1, 45)]
+     >>> pareto_accumulate([1, 2, 3, 9, 8, 7, 4, 6, 5], 0.1)
+     [(1, 9, 9), (2, 8, 17), (3, 7, 24), (4, 6, 30), (5, 5, 35), (7, 3, 42), (9, 1, 45)]
+     """
+     # 1 Base computation
+     points = []
+     weights = sorted(weights, reverse=True)
+
+     total = sum(weights)
+     accuracy = total * accuracy
+
+     acc = 0
+     delta = 0
+     for i, w in enumerate(weights, start=1):
+         acc += w
+         delta += w
+         if delta >= accuracy:
+             points.append((i, w, acc))
+             delta = 0
+     if delta:
+         points.append((len(weights), weights[-1], acc))
+
+     # 2 Present the result
+     def fmt(p):
+         from pyxllib.prog.newbie import human_readable_number
+         ls = [f'{human_readable_number(p[0], "万")}条≥{human_readable_number(p[1])}',
+               f'{human_readable_number(p[2], value_unit_type)}({p[2] / total_size:.0%})']
+         return ','.join(map(str, ls))
+
+     total_size = points[-1][2]
+     labels = [fmt(p) for p in points]
+
+     pts = [[p[0], p[2]] for p in points]
+
+     if print_mode:
+         if sys.platform == 'win32':
+             from pyxllib.data.echarts import Line
+             from pyxllib.prog.specialist import browser
+
+             x = Line()
+             x.add_series('帕累托累积权重', pts, labels=labels, label={'position': 'right'})
+             browser(x)
+         else:
+             print(*labels, sep='\n')
+
+     return pts, labels
+
+
+ class XlDataFrame(pd.DataFrame):
+     def check_dtypes(self):
+         """ Inspect the data types
+         column 1 is the column name, column 2 the type shown by dtypes, column 3 my extended tally of the actual value types
+         """
+         d = self.dtypes
+         ls = [[k, d[k], Counter([typename(x) for x in v]).most_common()] for k, v in self.iteritems()]
+         df = pd.DataFrame.from_records(ls, columns=['name', 'type', 'detail'])
+         return df
+
+
+ class ModifiableRow:
+     def __init__(self, df, index):
+         self.df = df
+         self.index = index
+
+     def __getitem__(self, item):
+         return self.df.at[self.index, item]
+
+     def __setitem__(self, key, value):
+         self.df.at[self.index, key] = value
+
+
+ def print_full_dataframe(df):
+     """
+     Temporarily adjust display options to print a DataFrame in full
+
+     :param pd.DataFrame df: the DataFrame to display in full
+     """
+     with pd.option_context('display.max_rows', None,
+                            'display.max_columns', None,
+                            'display.width', 1000,
+                            'display.max_colwidth', None):
+         print(df)
+
+     pd.options.display.max_rows = 60
+
+
+ def custom_fillna(df, default_fill_value='', numeric_fill_value=None, specific_fill=None):
+     """ Fill NaN values in a DataFrame with extra flexibility.
+
+     :param pandas.DataFrame df: the DataFrame to process.
+     :param str default_fill_value: default fill value for NaN in non-numeric columns.
+     :param numeric_fill_value: fill value for NaN in numeric columns; defaults to None if unspecified.
+     :param dict specific_fill: per-column fill values for NaN; defaults to None if unspecified.
+     :return: the pandas.DataFrame with NaN filled according to the given rules.
+
+     >>> df = pd.DataFrame({'A': [1, 2, None], 'B': [None, 'x', 'y'], 'C': [None, None, None]})
+     >>> custom_fillna(df, 'filled', 0, {'C': 'special'})
+     """
+     for column in df.columns:
+         # if the column has an entry in specific_fill, fill with that value
+         if specific_fill and column in specific_fill:
+             df[column] = df[column].fillna(specific_fill[column])
+         # if the column is numeric and numeric_fill_value is given, fill with it
+         elif numeric_fill_value is not None and pd.api.types.is_numeric_dtype(df[column]):
+             df[column] = df[column].fillna(numeric_fill_value)
+         # otherwise fill non-numeric columns with default_fill_value
+         elif pd.api.types.is_object_dtype(df[column]) or pd.api.types.is_string_dtype(df[column]):
+             df[column] = df[column].fillna(default_fill_value)
+         # more branches could be added here for other data types such as datetime
+     return df
+
+
+ def dataframe_to_list(df):
+     """Convert a DataFrame to a list structure: the first row is the header, the rest are the data"""
+     # take the header (column names) as the first list element
+     headers = df.columns.tolist()
+
+     # take the data rows, each row as a list, gathered into one big list
+     data_rows = df.values.tolist()
+
+     # merge the header and the data rows into the final list
+     result_list = [headers] + data_rows
+
+     return result_list
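
Of the stat.py changes above, the behavioral ones are: a guarded opt-in to pandas' future.no_silent_downcasting option, list[str] support for the values argument of xlpivot (plus an assertion that df is non-empty), the '万' unit in pareto_accumulate's labels, the broken pd.options(...) call in print_full_dataframe rewritten as an attribute assignment, and the new dataframe_to_list helper. A minimal sketch of how the two new API surfaces could be exercised, assuming pyxllib 0.3.200 is installed; the sample data and column names below are invented for illustration:

import pandas as pd

from pyxllib.algo.stat import xlpivot, dataframe_to_list

# invented sample data for this demo
df = pd.DataFrame({
    'model_type': ['det', 'det', 'rec'],
    'dataset': ['icdar', 'icdar', 'icdar'],
    'hmean': [0.81, 0.83, 0.92],
})

# values given as list[str] (new in 0.3.200): raw values are displayed,
# and non-unique raw values are comma-joined ('0.81, 0.83' for det/icdar)
view = xlpivot(df, index=['model_type'], columns=['dataset'], values=['hmean'])
print(view)

# dataframe_to_list (new in 0.3.200): flattens a DataFrame to [header, *rows]
print(dataframe_to_list(df))
# [['model_type', 'dataset', 'hmean'], ['det', 'icdar', 0.81], ['det', 'icdar', 0.83], ['rec', 'icdar', 0.92]]

The Callable and dict forms of values continue to work as they did in 0.3.96.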