pyxllib 0.3.96__py3-none-any.whl → 0.3.200__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (358)
  1. pyxllib/__init__.py +21 -21
  2. pyxllib/algo/__init__.py +8 -8
  3. pyxllib/algo/disjoint.py +54 -54
  4. pyxllib/algo/geo.py +541 -529
  5. pyxllib/algo/intervals.py +964 -964
  6. pyxllib/algo/matcher.py +389 -311
  7. pyxllib/algo/newbie.py +166 -166
  8. pyxllib/algo/pupil.py +629 -461
  9. pyxllib/algo/shapelylib.py +67 -67
  10. pyxllib/algo/specialist.py +241 -240
  11. pyxllib/algo/stat.py +494 -458
  12. pyxllib/algo/treelib.py +149 -149
  13. pyxllib/algo/unitlib.py +66 -66
  14. {pyxlpr → pyxllib/autogui}/__init__.py +5 -5
  15. pyxllib/autogui/activewin.py +246 -0
  16. pyxllib/autogui/all.py +9 -0
  17. pyxllib/{ext/autogui → autogui}/autogui.py +852 -823
  18. pyxllib/autogui/uiautolib.py +362 -0
  19. pyxllib/{ext/autogui → autogui}/virtualkey.py +102 -102
  20. pyxllib/autogui/wechat.py +827 -0
  21. pyxllib/autogui/wechat_msg.py +421 -0
  22. pyxllib/autogui/wxautolib.py +84 -0
  23. pyxllib/cv/__init__.py +5 -5
  24. pyxllib/cv/expert.py +267 -267
  25. pyxllib/cv/imfile.py +159 -159
  26. pyxllib/cv/imhash.py +39 -39
  27. pyxllib/cv/pupil.py +9 -9
  28. pyxllib/cv/rgbfmt.py +1525 -1525
  29. pyxllib/cv/slidercaptcha.py +137 -0
  30. pyxllib/cv/trackbartools.py +251 -251
  31. pyxllib/cv/xlcvlib.py +1040 -1040
  32. pyxllib/cv/xlpillib.py +423 -423
  33. pyxllib/data/echarts.py +240 -129
  34. pyxllib/data/jsonlib.py +89 -0
  35. pyxllib/data/oss.py +72 -72
  36. pyxllib/data/pglib.py +1127 -643
  37. pyxllib/data/sqlite.py +568 -341
  38. pyxllib/data/sqllib.py +297 -297
  39. pyxllib/ext/JLineViewer.py +505 -492
  40. pyxllib/ext/__init__.py +6 -6
  41. pyxllib/ext/demolib.py +246 -246
  42. pyxllib/ext/drissionlib.py +277 -0
  43. pyxllib/ext/kq5034lib.py +12 -1606
  44. pyxllib/ext/old.py +663 -663
  45. pyxllib/ext/qt.py +449 -449
  46. pyxllib/ext/robustprocfile.py +497 -0
  47. pyxllib/ext/seleniumlib.py +76 -76
  48. pyxllib/ext/tk.py +173 -173
  49. pyxllib/ext/unixlib.py +827 -826
  50. pyxllib/ext/utools.py +351 -338
  51. pyxllib/ext/webhook.py +124 -101
  52. pyxllib/ext/win32lib.py +40 -40
  53. pyxllib/ext/wjxlib.py +88 -0
  54. pyxllib/ext/wpsapi.py +124 -0
  55. pyxllib/ext/xlwork.py +9 -0
  56. pyxllib/ext/yuquelib.py +1105 -173
  57. pyxllib/file/__init__.py +17 -17
  58. pyxllib/file/docxlib.py +761 -761
  59. pyxllib/file/gitlib.py +309 -309
  60. pyxllib/file/libreoffice.py +165 -0
  61. pyxllib/file/movielib.py +148 -139
  62. pyxllib/file/newbie.py +10 -10
  63. pyxllib/file/onenotelib.py +1469 -1469
  64. pyxllib/file/packlib/__init__.py +330 -293
  65. pyxllib/file/packlib/zipfile.py +2441 -2441
  66. pyxllib/file/pdflib.py +426 -426
  67. pyxllib/file/pupil.py +185 -185
  68. pyxllib/file/specialist/__init__.py +685 -685
  69. pyxllib/file/specialist/dirlib.py +799 -799
  70. pyxllib/file/specialist/download.py +193 -186
  71. pyxllib/file/specialist/filelib.py +2829 -2618
  72. pyxllib/file/xlsxlib.py +3131 -2976
  73. pyxllib/file/xlsyncfile.py +341 -0
  74. pyxllib/prog/__init__.py +5 -5
  75. pyxllib/prog/cachetools.py +64 -0
  76. pyxllib/prog/deprecatedlib.py +233 -233
  77. pyxllib/prog/filelock.py +42 -0
  78. pyxllib/prog/ipyexec.py +253 -253
  79. pyxllib/prog/multiprogs.py +940 -0
  80. pyxllib/prog/newbie.py +451 -444
  81. pyxllib/prog/pupil.py +1197 -1128
  82. pyxllib/prog/sitepackages.py +33 -33
  83. pyxllib/prog/specialist/__init__.py +391 -217
  84. pyxllib/prog/specialist/bc.py +203 -200
  85. pyxllib/prog/specialist/browser.py +497 -488
  86. pyxllib/prog/specialist/common.py +347 -347
  87. pyxllib/prog/specialist/datetime.py +199 -131
  88. pyxllib/prog/specialist/tictoc.py +240 -241
  89. pyxllib/prog/specialist/xllog.py +180 -180
  90. pyxllib/prog/xlosenv.py +108 -101
  91. pyxllib/stdlib/__init__.py +17 -17
  92. pyxllib/stdlib/tablepyxl/__init__.py +10 -10
  93. pyxllib/stdlib/tablepyxl/style.py +303 -303
  94. pyxllib/stdlib/tablepyxl/tablepyxl.py +130 -130
  95. pyxllib/text/__init__.py +8 -8
  96. pyxllib/text/ahocorasick.py +39 -39
  97. pyxllib/text/airscript.js +744 -0
  98. pyxllib/text/charclasslib.py +121 -109
  99. pyxllib/text/jiebalib.py +267 -264
  100. pyxllib/text/jinjalib.py +32 -0
  101. pyxllib/text/jsa_ai_prompt.md +271 -0
  102. pyxllib/text/jscode.py +922 -767
  103. pyxllib/text/latex/__init__.py +158 -158
  104. pyxllib/text/levenshtein.py +303 -303
  105. pyxllib/text/nestenv.py +1215 -1215
  106. pyxllib/text/newbie.py +300 -288
  107. pyxllib/text/pupil/__init__.py +8 -8
  108. pyxllib/text/pupil/common.py +1121 -1095
  109. pyxllib/text/pupil/xlalign.py +326 -326
  110. pyxllib/text/pycode.py +47 -47
  111. pyxllib/text/specialist/__init__.py +8 -8
  112. pyxllib/text/specialist/common.py +112 -112
  113. pyxllib/text/specialist/ptag.py +186 -186
  114. pyxllib/text/spellchecker.py +172 -172
  115. pyxllib/text/templates/echart_base.html +11 -0
  116. pyxllib/text/templates/highlight_code.html +17 -0
  117. pyxllib/text/templates/latex_editor.html +103 -0
  118. pyxllib/text/vbacode.py +17 -17
  119. pyxllib/text/xmllib.py +747 -685
  120. pyxllib/xl.py +42 -38
  121. pyxllib/xlcv.py +17 -17
  122. pyxllib-0.3.200.dist-info/METADATA +48 -0
  123. pyxllib-0.3.200.dist-info/RECORD +126 -0
  124. {pyxllib-0.3.96.dist-info → pyxllib-0.3.200.dist-info}/WHEEL +1 -2
  125. {pyxllib-0.3.96.dist-info → pyxllib-0.3.200.dist-info/licenses}/LICENSE +190 -190
  126. pyxllib/ext/autogui/__init__.py +0 -8
  127. pyxllib-0.3.96.dist-info/METADATA +0 -51
  128. pyxllib-0.3.96.dist-info/RECORD +0 -333
  129. pyxllib-0.3.96.dist-info/top_level.txt +0 -2
  130. pyxlpr/ai/__init__.py +0 -5
  131. pyxlpr/ai/clientlib.py +0 -1281
  132. pyxlpr/ai/specialist.py +0 -286
  133. pyxlpr/ai/torch_app.py +0 -172
  134. pyxlpr/ai/xlpaddle.py +0 -655
  135. pyxlpr/ai/xltorch.py +0 -705
  136. pyxlpr/data/__init__.py +0 -11
  137. pyxlpr/data/coco.py +0 -1325
  138. pyxlpr/data/datacls.py +0 -365
  139. pyxlpr/data/datasets.py +0 -200
  140. pyxlpr/data/gptlib.py +0 -1291
  141. pyxlpr/data/icdar/__init__.py +0 -96
  142. pyxlpr/data/icdar/deteval.py +0 -377
  143. pyxlpr/data/icdar/icdar2013.py +0 -341
  144. pyxlpr/data/icdar/iou.py +0 -340
  145. pyxlpr/data/icdar/rrc_evaluation_funcs_1_1.py +0 -463
  146. pyxlpr/data/imtextline.py +0 -473
  147. pyxlpr/data/labelme.py +0 -866
  148. pyxlpr/data/removeline.py +0 -179
  149. pyxlpr/data/specialist.py +0 -57
  150. pyxlpr/eval/__init__.py +0 -85
  151. pyxlpr/paddleocr.py +0 -776
  152. pyxlpr/ppocr/__init__.py +0 -15
  153. pyxlpr/ppocr/configs/rec/multi_language/generate_multi_language_configs.py +0 -226
  154. pyxlpr/ppocr/data/__init__.py +0 -135
  155. pyxlpr/ppocr/data/imaug/ColorJitter.py +0 -26
  156. pyxlpr/ppocr/data/imaug/__init__.py +0 -67
  157. pyxlpr/ppocr/data/imaug/copy_paste.py +0 -170
  158. pyxlpr/ppocr/data/imaug/east_process.py +0 -437
  159. pyxlpr/ppocr/data/imaug/gen_table_mask.py +0 -244
  160. pyxlpr/ppocr/data/imaug/iaa_augment.py +0 -114
  161. pyxlpr/ppocr/data/imaug/label_ops.py +0 -789
  162. pyxlpr/ppocr/data/imaug/make_border_map.py +0 -184
  163. pyxlpr/ppocr/data/imaug/make_pse_gt.py +0 -106
  164. pyxlpr/ppocr/data/imaug/make_shrink_map.py +0 -126
  165. pyxlpr/ppocr/data/imaug/operators.py +0 -433
  166. pyxlpr/ppocr/data/imaug/pg_process.py +0 -906
  167. pyxlpr/ppocr/data/imaug/randaugment.py +0 -143
  168. pyxlpr/ppocr/data/imaug/random_crop_data.py +0 -239
  169. pyxlpr/ppocr/data/imaug/rec_img_aug.py +0 -533
  170. pyxlpr/ppocr/data/imaug/sast_process.py +0 -777
  171. pyxlpr/ppocr/data/imaug/text_image_aug/__init__.py +0 -17
  172. pyxlpr/ppocr/data/imaug/text_image_aug/augment.py +0 -120
  173. pyxlpr/ppocr/data/imaug/text_image_aug/warp_mls.py +0 -168
  174. pyxlpr/ppocr/data/lmdb_dataset.py +0 -115
  175. pyxlpr/ppocr/data/pgnet_dataset.py +0 -104
  176. pyxlpr/ppocr/data/pubtab_dataset.py +0 -107
  177. pyxlpr/ppocr/data/simple_dataset.py +0 -372
  178. pyxlpr/ppocr/losses/__init__.py +0 -61
  179. pyxlpr/ppocr/losses/ace_loss.py +0 -52
  180. pyxlpr/ppocr/losses/basic_loss.py +0 -135
  181. pyxlpr/ppocr/losses/center_loss.py +0 -88
  182. pyxlpr/ppocr/losses/cls_loss.py +0 -30
  183. pyxlpr/ppocr/losses/combined_loss.py +0 -67
  184. pyxlpr/ppocr/losses/det_basic_loss.py +0 -208
  185. pyxlpr/ppocr/losses/det_db_loss.py +0 -80
  186. pyxlpr/ppocr/losses/det_east_loss.py +0 -63
  187. pyxlpr/ppocr/losses/det_pse_loss.py +0 -149
  188. pyxlpr/ppocr/losses/det_sast_loss.py +0 -121
  189. pyxlpr/ppocr/losses/distillation_loss.py +0 -272
  190. pyxlpr/ppocr/losses/e2e_pg_loss.py +0 -140
  191. pyxlpr/ppocr/losses/kie_sdmgr_loss.py +0 -113
  192. pyxlpr/ppocr/losses/rec_aster_loss.py +0 -99
  193. pyxlpr/ppocr/losses/rec_att_loss.py +0 -39
  194. pyxlpr/ppocr/losses/rec_ctc_loss.py +0 -44
  195. pyxlpr/ppocr/losses/rec_enhanced_ctc_loss.py +0 -70
  196. pyxlpr/ppocr/losses/rec_nrtr_loss.py +0 -30
  197. pyxlpr/ppocr/losses/rec_sar_loss.py +0 -28
  198. pyxlpr/ppocr/losses/rec_srn_loss.py +0 -47
  199. pyxlpr/ppocr/losses/table_att_loss.py +0 -109
  200. pyxlpr/ppocr/metrics/__init__.py +0 -44
  201. pyxlpr/ppocr/metrics/cls_metric.py +0 -45
  202. pyxlpr/ppocr/metrics/det_metric.py +0 -82
  203. pyxlpr/ppocr/metrics/distillation_metric.py +0 -73
  204. pyxlpr/ppocr/metrics/e2e_metric.py +0 -86
  205. pyxlpr/ppocr/metrics/eval_det_iou.py +0 -274
  206. pyxlpr/ppocr/metrics/kie_metric.py +0 -70
  207. pyxlpr/ppocr/metrics/rec_metric.py +0 -75
  208. pyxlpr/ppocr/metrics/table_metric.py +0 -50
  209. pyxlpr/ppocr/modeling/architectures/__init__.py +0 -32
  210. pyxlpr/ppocr/modeling/architectures/base_model.py +0 -88
  211. pyxlpr/ppocr/modeling/architectures/distillation_model.py +0 -60
  212. pyxlpr/ppocr/modeling/backbones/__init__.py +0 -54
  213. pyxlpr/ppocr/modeling/backbones/det_mobilenet_v3.py +0 -268
  214. pyxlpr/ppocr/modeling/backbones/det_resnet_vd.py +0 -246
  215. pyxlpr/ppocr/modeling/backbones/det_resnet_vd_sast.py +0 -285
  216. pyxlpr/ppocr/modeling/backbones/e2e_resnet_vd_pg.py +0 -265
  217. pyxlpr/ppocr/modeling/backbones/kie_unet_sdmgr.py +0 -186
  218. pyxlpr/ppocr/modeling/backbones/rec_mobilenet_v3.py +0 -138
  219. pyxlpr/ppocr/modeling/backbones/rec_mv1_enhance.py +0 -258
  220. pyxlpr/ppocr/modeling/backbones/rec_nrtr_mtb.py +0 -48
  221. pyxlpr/ppocr/modeling/backbones/rec_resnet_31.py +0 -210
  222. pyxlpr/ppocr/modeling/backbones/rec_resnet_aster.py +0 -143
  223. pyxlpr/ppocr/modeling/backbones/rec_resnet_fpn.py +0 -307
  224. pyxlpr/ppocr/modeling/backbones/rec_resnet_vd.py +0 -286
  225. pyxlpr/ppocr/modeling/heads/__init__.py +0 -54
  226. pyxlpr/ppocr/modeling/heads/cls_head.py +0 -52
  227. pyxlpr/ppocr/modeling/heads/det_db_head.py +0 -118
  228. pyxlpr/ppocr/modeling/heads/det_east_head.py +0 -121
  229. pyxlpr/ppocr/modeling/heads/det_pse_head.py +0 -37
  230. pyxlpr/ppocr/modeling/heads/det_sast_head.py +0 -128
  231. pyxlpr/ppocr/modeling/heads/e2e_pg_head.py +0 -253
  232. pyxlpr/ppocr/modeling/heads/kie_sdmgr_head.py +0 -206
  233. pyxlpr/ppocr/modeling/heads/multiheadAttention.py +0 -163
  234. pyxlpr/ppocr/modeling/heads/rec_aster_head.py +0 -393
  235. pyxlpr/ppocr/modeling/heads/rec_att_head.py +0 -202
  236. pyxlpr/ppocr/modeling/heads/rec_ctc_head.py +0 -88
  237. pyxlpr/ppocr/modeling/heads/rec_nrtr_head.py +0 -826
  238. pyxlpr/ppocr/modeling/heads/rec_sar_head.py +0 -402
  239. pyxlpr/ppocr/modeling/heads/rec_srn_head.py +0 -280
  240. pyxlpr/ppocr/modeling/heads/self_attention.py +0 -406
  241. pyxlpr/ppocr/modeling/heads/table_att_head.py +0 -246
  242. pyxlpr/ppocr/modeling/necks/__init__.py +0 -32
  243. pyxlpr/ppocr/modeling/necks/db_fpn.py +0 -111
  244. pyxlpr/ppocr/modeling/necks/east_fpn.py +0 -188
  245. pyxlpr/ppocr/modeling/necks/fpn.py +0 -138
  246. pyxlpr/ppocr/modeling/necks/pg_fpn.py +0 -314
  247. pyxlpr/ppocr/modeling/necks/rnn.py +0 -92
  248. pyxlpr/ppocr/modeling/necks/sast_fpn.py +0 -284
  249. pyxlpr/ppocr/modeling/necks/table_fpn.py +0 -110
  250. pyxlpr/ppocr/modeling/transforms/__init__.py +0 -28
  251. pyxlpr/ppocr/modeling/transforms/stn.py +0 -135
  252. pyxlpr/ppocr/modeling/transforms/tps.py +0 -308
  253. pyxlpr/ppocr/modeling/transforms/tps_spatial_transformer.py +0 -156
  254. pyxlpr/ppocr/optimizer/__init__.py +0 -61
  255. pyxlpr/ppocr/optimizer/learning_rate.py +0 -228
  256. pyxlpr/ppocr/optimizer/lr_scheduler.py +0 -49
  257. pyxlpr/ppocr/optimizer/optimizer.py +0 -160
  258. pyxlpr/ppocr/optimizer/regularizer.py +0 -52
  259. pyxlpr/ppocr/postprocess/__init__.py +0 -55
  260. pyxlpr/ppocr/postprocess/cls_postprocess.py +0 -33
  261. pyxlpr/ppocr/postprocess/db_postprocess.py +0 -234
  262. pyxlpr/ppocr/postprocess/east_postprocess.py +0 -143
  263. pyxlpr/ppocr/postprocess/locality_aware_nms.py +0 -200
  264. pyxlpr/ppocr/postprocess/pg_postprocess.py +0 -52
  265. pyxlpr/ppocr/postprocess/pse_postprocess/__init__.py +0 -15
  266. pyxlpr/ppocr/postprocess/pse_postprocess/pse/__init__.py +0 -29
  267. pyxlpr/ppocr/postprocess/pse_postprocess/pse/setup.py +0 -14
  268. pyxlpr/ppocr/postprocess/pse_postprocess/pse_postprocess.py +0 -118
  269. pyxlpr/ppocr/postprocess/rec_postprocess.py +0 -654
  270. pyxlpr/ppocr/postprocess/sast_postprocess.py +0 -355
  271. pyxlpr/ppocr/tools/__init__.py +0 -14
  272. pyxlpr/ppocr/tools/eval.py +0 -83
  273. pyxlpr/ppocr/tools/export_center.py +0 -77
  274. pyxlpr/ppocr/tools/export_model.py +0 -129
  275. pyxlpr/ppocr/tools/infer/predict_cls.py +0 -151
  276. pyxlpr/ppocr/tools/infer/predict_det.py +0 -300
  277. pyxlpr/ppocr/tools/infer/predict_e2e.py +0 -169
  278. pyxlpr/ppocr/tools/infer/predict_rec.py +0 -414
  279. pyxlpr/ppocr/tools/infer/predict_system.py +0 -204
  280. pyxlpr/ppocr/tools/infer/utility.py +0 -629
  281. pyxlpr/ppocr/tools/infer_cls.py +0 -83
  282. pyxlpr/ppocr/tools/infer_det.py +0 -134
  283. pyxlpr/ppocr/tools/infer_e2e.py +0 -122
  284. pyxlpr/ppocr/tools/infer_kie.py +0 -153
  285. pyxlpr/ppocr/tools/infer_rec.py +0 -146
  286. pyxlpr/ppocr/tools/infer_table.py +0 -107
  287. pyxlpr/ppocr/tools/program.py +0 -596
  288. pyxlpr/ppocr/tools/test_hubserving.py +0 -117
  289. pyxlpr/ppocr/tools/train.py +0 -163
  290. pyxlpr/ppocr/tools/xlprog.py +0 -748
  291. pyxlpr/ppocr/utils/EN_symbol_dict.txt +0 -94
  292. pyxlpr/ppocr/utils/__init__.py +0 -24
  293. pyxlpr/ppocr/utils/dict/ar_dict.txt +0 -117
  294. pyxlpr/ppocr/utils/dict/arabic_dict.txt +0 -162
  295. pyxlpr/ppocr/utils/dict/be_dict.txt +0 -145
  296. pyxlpr/ppocr/utils/dict/bg_dict.txt +0 -140
  297. pyxlpr/ppocr/utils/dict/chinese_cht_dict.txt +0 -8421
  298. pyxlpr/ppocr/utils/dict/cyrillic_dict.txt +0 -163
  299. pyxlpr/ppocr/utils/dict/devanagari_dict.txt +0 -167
  300. pyxlpr/ppocr/utils/dict/en_dict.txt +0 -63
  301. pyxlpr/ppocr/utils/dict/fa_dict.txt +0 -136
  302. pyxlpr/ppocr/utils/dict/french_dict.txt +0 -136
  303. pyxlpr/ppocr/utils/dict/german_dict.txt +0 -143
  304. pyxlpr/ppocr/utils/dict/hi_dict.txt +0 -162
  305. pyxlpr/ppocr/utils/dict/it_dict.txt +0 -118
  306. pyxlpr/ppocr/utils/dict/japan_dict.txt +0 -4399
  307. pyxlpr/ppocr/utils/dict/ka_dict.txt +0 -153
  308. pyxlpr/ppocr/utils/dict/korean_dict.txt +0 -3688
  309. pyxlpr/ppocr/utils/dict/latin_dict.txt +0 -185
  310. pyxlpr/ppocr/utils/dict/mr_dict.txt +0 -153
  311. pyxlpr/ppocr/utils/dict/ne_dict.txt +0 -153
  312. pyxlpr/ppocr/utils/dict/oc_dict.txt +0 -96
  313. pyxlpr/ppocr/utils/dict/pu_dict.txt +0 -130
  314. pyxlpr/ppocr/utils/dict/rs_dict.txt +0 -91
  315. pyxlpr/ppocr/utils/dict/rsc_dict.txt +0 -134
  316. pyxlpr/ppocr/utils/dict/ru_dict.txt +0 -125
  317. pyxlpr/ppocr/utils/dict/ta_dict.txt +0 -128
  318. pyxlpr/ppocr/utils/dict/table_dict.txt +0 -277
  319. pyxlpr/ppocr/utils/dict/table_structure_dict.txt +0 -2759
  320. pyxlpr/ppocr/utils/dict/te_dict.txt +0 -151
  321. pyxlpr/ppocr/utils/dict/ug_dict.txt +0 -114
  322. pyxlpr/ppocr/utils/dict/uk_dict.txt +0 -142
  323. pyxlpr/ppocr/utils/dict/ur_dict.txt +0 -137
  324. pyxlpr/ppocr/utils/dict/xi_dict.txt +0 -110
  325. pyxlpr/ppocr/utils/dict90.txt +0 -90
  326. pyxlpr/ppocr/utils/e2e_metric/Deteval.py +0 -574
  327. pyxlpr/ppocr/utils/e2e_metric/polygon_fast.py +0 -83
  328. pyxlpr/ppocr/utils/e2e_utils/extract_batchsize.py +0 -87
  329. pyxlpr/ppocr/utils/e2e_utils/extract_textpoint_fast.py +0 -457
  330. pyxlpr/ppocr/utils/e2e_utils/extract_textpoint_slow.py +0 -592
  331. pyxlpr/ppocr/utils/e2e_utils/pgnet_pp_utils.py +0 -162
  332. pyxlpr/ppocr/utils/e2e_utils/visual.py +0 -162
  333. pyxlpr/ppocr/utils/en_dict.txt +0 -95
  334. pyxlpr/ppocr/utils/gen_label.py +0 -81
  335. pyxlpr/ppocr/utils/ic15_dict.txt +0 -36
  336. pyxlpr/ppocr/utils/iou.py +0 -54
  337. pyxlpr/ppocr/utils/logging.py +0 -69
  338. pyxlpr/ppocr/utils/network.py +0 -84
  339. pyxlpr/ppocr/utils/ppocr_keys_v1.txt +0 -6623
  340. pyxlpr/ppocr/utils/profiler.py +0 -110
  341. pyxlpr/ppocr/utils/save_load.py +0 -150
  342. pyxlpr/ppocr/utils/stats.py +0 -72
  343. pyxlpr/ppocr/utils/utility.py +0 -80
  344. pyxlpr/ppstructure/__init__.py +0 -13
  345. pyxlpr/ppstructure/predict_system.py +0 -187
  346. pyxlpr/ppstructure/table/__init__.py +0 -13
  347. pyxlpr/ppstructure/table/eval_table.py +0 -72
  348. pyxlpr/ppstructure/table/matcher.py +0 -192
  349. pyxlpr/ppstructure/table/predict_structure.py +0 -136
  350. pyxlpr/ppstructure/table/predict_table.py +0 -221
  351. pyxlpr/ppstructure/table/table_metric/__init__.py +0 -16
  352. pyxlpr/ppstructure/table/table_metric/parallel.py +0 -51
  353. pyxlpr/ppstructure/table/table_metric/table_metric.py +0 -247
  354. pyxlpr/ppstructure/table/tablepyxl/__init__.py +0 -13
  355. pyxlpr/ppstructure/table/tablepyxl/style.py +0 -283
  356. pyxlpr/ppstructure/table/tablepyxl/tablepyxl.py +0 -118
  357. pyxlpr/ppstructure/utility.py +0 -71
  358. pyxlpr/xlai.py +0 -10
pyxlpr/ppocr/tools/xlprog.py (deleted)
@@ -1,748 +0,0 @@
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- # @Author : 陈坤泽
- # @Email : 877362867@qq.com
- # @Date : 2022/02/21 11:07
-
- """
- A higher-level engineering wrapper around PaddleOcr
- """
- import collections
- import os
- import sys
- import re
-
- import pandas as pd
- import yaml
- import shutil
- import copy
- import inspect
- import math
- import json
-
- import numpy as np
- from tqdm import tqdm
-
- from pyxlpr.ppocr.tools.program import preprocess
- from pyxlpr.ppocr.data import build_dataloader
-
- from pyxllib.algo.geo import rect_bounds, ltrb2xywh
- from pyxllib.file.specialist import XlPath, ensure_localfile, ensure_localdir
- from pyxllib.cv.xlcvlib import xlcv
- from pyxllib.prog.newbie import round_int
-
-
- class PaddleOcrBaseConfig:
-     """ A wrapper around paddle(ocr)'s standard config files. To keep my own configuration simple,
-     this adds an intermediate layer that makes unified parameter setup and modification easier.
-     """
-
-     def __init__(self):
-         self.cfg = {}
-
-     def __1_config(self):
-         """ Config-file related functionality """
-         pass
-
-     def autoset(self):
-         """ A convenient hook for common, shared settings """
-
-         x = self.cfg['Global']
-         x['use_visualdl'] = True
-         x['print_batch_step'] = 1000  # Measured in iters. The default was a tiny 2; raised here. With few iters per epoch, output is still printed once per epoch.
-         x['pretrained_model'] = None
-         # Number of epochs between model checkpoints. The default was 1200; deliberately set very large here,
-         # which effectively disables periodic saving. Set it manually if needed.
-         # Even without interval checkpoints, the best model is still saved by default based on eval.
-         x['save_epoch_step'] = 100000
-
-         self.set_save_dir('models/' + inspect.stack()[3].function)
-
-     def resume(self, train=False):
-         """ If checkpoints is not set, try to load the best_accuracy or latest model.
-
-         Which model is loaded by default depends on whether we are in Train mode:
-         training loads latest; everything else prefers best_accuracy.
-         """
-         if train:  # during training, prefer resuming the previous run
-             candidates = ['latest', 'best_accuracy']
-         else:  # in other contexts, prefer the best model by default
-             candidates = ['best_accuracy', 'latest']
-
-         for name in candidates:
-             f = XlPath(self.cfg['Global']['save_model_dir']) / name
-             if f.with_suffix('.pdparams').exists():
-                 self.cfg['Global']['checkpoints'] = f
-                 return
-
-     def config_from_content(self, content):
-         self.cfg = yaml.safe_load(content)
-         self.autoset()
-         return self.cfg
-
-     def config_from_template(self, subpath):
-         """
-         :param subpath: e.g. 'det/det_mv3_db'
-         """
-         f = os.path.join(sys.modules['pyxlpr.ppocr'].__path__[0], 'configs', subpath + '.yml')
-         return self.config_from_content(XlPath(f).read_text())
-
-     def set_save_dir(self, save_dir):
-         """ Several runtime output paths can be unified under one place, so they only need to be set once """
-         # self.d['Global']
-         save_dir = XlPath(save_dir)
-         x = self.cfg['Global']
-         x['save_model_dir'] = save_dir  # model directory used during train
-         x['save_inference_dir'] = save_dir / 'infer'  # output directory used by export_model
-         # Not yet sure what this option actually does; it may be db-specific.
-         x['save_res_path'] = save_dir / 'predicts.txt'
-
-     def set_simpledataset(self, mode, data_dir, label_file_list, ratio_list=None):
-         """ Paddle's official SimpleDataSet data format.
-
-         :param str mode: Train or Eval, selects the training or the validation set
-         :param PathLike data_dir: root directory of the data
-         :param list label_file_list: list of label files [txtfile1, txtfile2, ...]
-             In each txt file, every line is the annotation of one image:
-             column 1 is the image path relative to data_dir, then a \t, then column 2 is the json.dumps annotation data,
-             where the transcription field stores the text content and points stores the quadrilateral box.
-         :param list ratio_list: with a single label file you may pass one number, but a list is preferred for consistency.
-             Each value is a decimal in 0~1.0, the fraction of samples to take.
-             The official paddle implementation samples randomly, with no ordering guarantee.
-         """
-         # Note: some field formats differ between SimpleDataSet and XlSimpleDataSet,
-         # so to be safe, self.cfg[mode]['dataset'] is reset wholesale.
-         node = self.cfg[mode]['dataset']
-         x = {'name': 'SimpleDataSet',
-              'data_dir': XlPath(data_dir),
-              'label_file_list': label_file_list}
-         if ratio_list:
-             x['ratio_list'] = ratio_list
-         x['transforms'] = node['transforms']
-         self.cfg[mode]['dataset'] = x
-
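(For reference, a single label line in the SimpleDataSet format described above would look like this sketch; the file name, text and coordinates are made up for illustration, and the gap between the two columns is a literal tab.)

    imgs/0001.jpg	[{"transcription": "hello", "points": [[10, 10], [90, 10], [90, 40], [10, 40]]}]
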
-     def set_xlsimpledataset(self, mode, data_dir, data_list):
-         """ Configure my own XlSimpleDataSet data format.
-
-         Converts various native formats directly into paddle's in-memory format at runtime,
-         so there is no need to keep generating redundant intermediate data files.
-         The main extension so far is support for the xllabelme annotation format, e.g. labelme_det.
-
-         :param str mode: Train or Eval, selects the training or the validation set
-         :param PathLike data_dir: root directory of the data
-         :param list data_list: the concrete data list; each entry is a dict
-             [required] type: the concrete data format; currently supports labelme_det, icdar2015, refineAgree
-                 (see the from_-prefixed member methods of the XlSimpleDataSet class for everything supported)
-             The other fields are optional; see the from_ definitions for the supported extensions. Common ones:
-             [ratio] a decimal fraction; a negative value takes samples from the end.
-                 Typically used when you don't want to physically split Train/Eval data,
-                 and prefer to split training and validation sets algorithmically in code.
-
-         """
-         node = self.cfg[mode]['dataset']
-         x = {'name': 'XlSimpleDataSet',
-              'data_dir': XlPath(data_dir),
-              'data_list': data_list}
-         x['transforms'] = node['transforms']
-         self.cfg[mode]['dataset'] = x
-
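(A sketch of a call with one data_list entry in the shape described above, mirroring what set_xllabelme_dataset does further below; paths and the ratio are illustrative.)

    cfg = XlDetText()
    cfg.det1_mobile_init(pretrained=2)  # load a base config first, so cfg.cfg['Train']['dataset'] exists
    cfg.set_xlsimpledataset('Train', 'data', [{'type': 'labelme_det', 'ratio': 0.9}])
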
-     @classmethod
-     def _rset_posix_path(cls, d):
-         from pathlib import Path
-
-         if isinstance(d, list):
-             for i, x in enumerate(d):
-                 if isinstance(x, (Path, XlPath)):
-                     d[i] = x.as_posix()
-                 else:
-                     cls._rset_posix_path(x)
-         elif isinstance(d, dict):
-             for k, v in d.items():
-                 if isinstance(v, (Path, XlPath)):
-                     d[k] = v.as_posix()
-                 else:
-                     cls._rset_posix_path(v)
-
-     def rset_posix_path(self):
-         """ The config dict may contain XlPath/Path objects, which have to be recursively converted to str for storage.
-
-         The r in rset stands for recursive set.
-         """
-         d = copy.deepcopy(self.cfg)
-         self._rset_posix_path(d)
-         return d
-
-     def write_cfg_tempfile(self):
-         """ Write the config to a file in the temp directory and return the file path """
-         p = XlPath.tempfile('.yml')
-         # TODO Before writing, every XlPath in the config is converted to its as_posix() str.
-         self._rset_posix_path(self.cfg)
-         p.write_yaml(self.cfg)
-         return str(p)
-
-     def add_config_to_cmd_argv(self):
-         """ Append the config to the command line via the -c option """
-         sys.argv = sys.argv + ['-c', self.write_cfg_tempfile()]
-
-     def set_iter_num(self, num):
-         """ Set the training length by iteration count.
-
-         Paddle configs do not natively support measuring training length in iters,
-         so epoch_num has to be back-computed from batch_size_per_card and the amount of data.
-
-         Note: configure the dataset first, then set the iteration count!
-         """
-         config, device, logger, _ = preprocess(from_dict=self.rset_posix_path(), use_visualdl=False)
-         train_dataloader = build_dataloader(config, 'Train', device, logger)
-         per_epoch_iter_num = len(train_dataloader)  # iterations per epoch
-         self.cfg['Global']['epoch_num'] = math.ceil(num / per_epoch_iter_num)
-
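(A worked example of the back-computation, with illustrative numbers: 1200 training images at batch_size_per_card=8 give 150 iters per epoch, so requesting 150000 iters sets epoch_num to 1000.)

    per_epoch_iter_num = 1200 // 8                      # 150 iters per epoch
    epoch_num = math.ceil(150000 / per_epoch_iter_num)  # -> 1000
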
-     def __2_main(self):
-         """ Script-style entry points """
-         pass
-
-     def train(self, resume=False):
-         from pyxlpr.ppocr.tools.train import main
-
-         if resume:
-             self.resume(True)
-         config, device, logger, vdl_writer = preprocess(is_train=True, from_dict=self.rset_posix_path())
-         main(config, device, logger, vdl_writer)
-
-     def eval(self, resume=True, *, dataset_mode='Eval'):
-         """
-         :param dataset_mode: which dataset to use; defaults to Eval, Train also works
-         """
-         from pyxlpr.ppocr.tools.eval import main
-
-         if resume:
-             self.resume()
-
-         config, device, logger, vdl_writer = preprocess(from_dict=self.rset_posix_path())
-         for k in ['name', 'data_dir', 'data_list']:
-             config['Eval']['dataset'][k] = config[dataset_mode]['dataset'][k]
-         metric = main(config, device, logger)
-         return metric
-
-     def infer_det(self, resume=True):
-         from pyxlpr.ppocr.tools.infer_det import main
-
-         if resume:
-             self.resume()
-         config, device, logger, vdl_writer = preprocess(from_dict=self.rset_posix_path())
-         main(config, logger)
-
-     def export_model(self, resume=True):
-         from pyxlpr.ppocr.tools.export_model import main
-
-         if resume:
-             self.resume()
-         config, device, logger, vdl_writer = preprocess(from_dict=self.rset_posix_path())
-         main(config, logger)
-
-     def __3_pretrained(self):
-         """ Wrappers around pretrained-model configuration """
-
-     @classmethod
-     def get_pretrained_model_backbone(cls, name):
-         """ Take only the backbone weights """
-         local_file = XlPath.userdir() / f'.paddleocr/pretrained/{name}.pdparams'
-         url = f'https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/{name}.pdparams'
-         ensure_localfile(local_file, url)
-         return local_file.parent / local_file.stem  # drop the .pdparams suffix
-
-     @classmethod
-     def get_pretrained_model_ppocr(cls, name):
-         local_dir = XlPath.userdir() / f'.paddleocr/pretrained/{name}'
-         url = f'https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/{name}.tar'
-         ensure_localdir(local_dir, url, wrap=-1)
-         return local_dir / 'best_accuracy'  # the ocr model trained by ppocr
-
-     def set_pretrained_model_backbone(self, name):
-         path = self.get_pretrained_model_backbone(name)
-         self.cfg['Global']['pretrained_model'] = path
-
-     def set_pretrained_model_ppocr(self, name):
-         path = self.get_pretrained_model_ppocr(name)
-         self.cfg['Global']['pretrained_model'] = path
-
-     def set_pretrained_infer_model(self, local_dir, url):
-         """ A config parameter I added myself, used at metric time """
-         local_dir = XlPath.userdir() / f'.paddleocr/pretrained_infer/{local_dir}'
-         path = ensure_localdir(local_dir, url, wrap=-1)
-         self.cfg['Global']['pretrained_infer_model'] = path
-
-     def set_pretrained_model(self, pretrained, models):
-         """ A further wrapper over the methods above, to reduce code complexity at the high-level interface.
-
-         :param bool|int pretrained:
-             0  no pretrained weights
-             1  use the backbone weights
-             2  use the full ppocr weights
-             3  the best model from a previous custom training run
-         :param models: the models to load when pretrained is 1 or 2
-         """
-         if pretrained == 1:
-             self.set_pretrained_model_backbone(models[0])
-         elif pretrained == 2:
-             self.set_pretrained_model_ppocr(models[1])
-         elif pretrained == 3:
-             self.cfg['Global']['pretrained_model'] = self.cfg['Global']['save_model_dir'] / 'best_accuracy'
-
-     def __call__(self, *args, **kwargs):
-         # keeps the fire library from erroring when methods return self
-         pass
-
-
- class XlDetText(PaddleOcrBaseConfig):
-     """ Config dedicated to detection models
-     """
-
-     def autolabel(self, datadir, *, model_type=0, **kwargs):
-         """ Pre-annotate detection and recognition.
-
-         TODO For det1_mobile, should model_type default to 2?
-
-         """
-         pocr = self.build_ppocr(model_type, **kwargs)
-         pocr.ocr2labelme(datadir, det=True, rec=True)
-
-     def set_deploy_args_det(self):
-         """ Deploy-time parameters for the detection model, which are not necessarily the same as for eval.
-         Put differently, eval should stay as close as possible to the real deployment configuration.
-
-         Many text-detection config files use eval settings that differ from deployment, so this adjusts them automatically to match deploy.
-
-         Of course, if some eval setting genuinely works better than the deploy one, deploy could adopt the eval configuration instead.
-         """
-         for x in self.cfg['Eval']['dataset']['transforms']:
-             if 'DetResizeForTest' in x:
-                 x['DetResizeForTest'] = {'limit_side_len': 960, 'limit_type': 'max'}
-
-     def det1_mobile_init(self, *, pretrained=2):
-         """
-         Official experiment: ic15, 1000 train + 500 val images, batch_size_per_card=8, epoch=1200.
-         That is 1.2M training samples in total, which divided by the batch size is 150k iters.
-         In the hesuan experiments each iter took about 0.4 seconds, so a full run is 150000 * 0.4 / 3600 ≈ 17 hours.
-
-         At batchsize=8, the hesuan training run used 6.7G of GPU memory.
-         Data from other experiments will be added here as it comes in, but from memory the usage is roughly the same.
-
-         The deployment files total about 3M.
-
-         TODO Support for more than one datalist has not been tested yet, but should mostly work.
-         """
-         # 1 Load the base config
-         cfg = self.config_from_template('det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0')
-         self.set_pretrained_model(pretrained, ['MobileNetV3_large_x0_5_pretrained', 'ch_ppocr_mobile_v2.0_det_train'])
-         self.set_deploy_args_det()
-
-         # 2 The pretrained weights also come with a deploy model, for later metric analysis
-         infer_model_url = 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar'
-         self.set_pretrained_infer_model('ch_ppocr_mobile_v2.0_det_infer', infer_model_url)
-
-     def det1_server_init(self, *, pretrained=2):
-         """
-         Training uses 10.2 G of GPU memory.
-
-         The deployment files total about 47M.
-         """
-         # 1 Load the base config
-         cfg = self.config_from_template('det/ch_ppocr_v2.0/ch_det_res18_db_v2.0')
-         self.set_pretrained_model(pretrained, ['ResNet18_vd_pretrained', 'ch_ppocr_server_v2.0_det_train'])
-         self.set_deploy_args_det()
-
-         # 2 The pretrained weights also come with a deploy model, for later metric analysis
-         infer_model_url = 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_infer.tar'
-         self.set_pretrained_infer_model('ch_ppocr_server_v2.0_det_infer', infer_model_url)
-
-     def det2_init(self, *, pretrained=1):
-         """ PP-OCRv2, released 2021-09-07.
-         I haven't test-run it yet, so I'm not sure this configuration is right.
-
-         2022-02-23 Wed 18:11: it runs now, but still isn't fully right; the metric results are odd and I can't explain them.
-         """
-         cfg = self.config_from_template('det/ch_PP-OCRv2/ch_PP-OCRv2_det_distill')
-         if pretrained:
-             x = cfg['Architecture']['Models']
-
-             # self.set_pretrained_model_ppocr('ch_PP-OCRv2_det_distill_train')
-             x['Student']['pretrained'] = self.get_pretrained_model_backbone('MobileNetV3_large_x0_5_pretrained')
-             # x['Student']['pretrained'] = self.get_pretrained_model_ppocr('ch_PP-OCRv2_det_distill_train')
-             x['Teacher']['pretrained'] = self.get_pretrained_model_ppocr('ch_ppocr_server_v2.0_det_train')
-
-         self.set_deploy_args_det()
-
-         infer_model_url = 'https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar'
-         self.set_pretrained_infer_model('ch_PP-OCRv2_det_infer', infer_model_url)
-
-         return self
-
-     def build_ppocr(self, model_type=2, **kwargs):
-         """ Get the interface class used for deployment:
-         export the deploy model and load it.
-
-         :param model_type:
-             0: the original PaddleOcr
-             1: the deploy files bundled with the config (requires the Global.pretrained_infer_model parameter)
-             2: the finetuned model
-         :param kwargs: extra detection config parameters, e.g. the common det_db_unclip_ratio=1.5
-         """
-         from pyxlpr.paddleocr import PaddleOCR
-
-         if model_type == 0:
-             ppocr = PaddleOCR.build_ppocr(**kwargs)
-         elif model_type == 1:
-             d = self.cfg['Global']['pretrained_infer_model']
-             if not d:
-                 return {}
-             ppocr = PaddleOCR.build_ppocr(det_model_dir=d, **kwargs)
-         else:
-             self.export_model(True)
-             ppocr = PaddleOCR.build_ppocr(det_model_dir=self.cfg['Global']['save_inference_dir'], **kwargs)
-
-         return ppocr
-
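(A sketch of how the three model_type variants would be obtained, reusing the det1_mobile recipe defined further below; purely illustrative.)

    cfg = XlDetText()
    cfg.det1_mobile()                         # base config + dataset + iteration count
    baseline = cfg.build_ppocr(model_type=0)  # stock PaddleOcr
    bundled = cfg.build_ppocr(model_type=1)   # deploy model shipped with the config
    finetuned = cfg.build_ppocr(model_type=2, det_db_unclip_ratio=1.5)  # exported finetuned model
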
-     def _build_dataset(self, config, logger, dataset_mode='Eval'):
-         from pyxlpr.ppocr.data import build_dataset
-         # Note the dataset switch here differs a little from PaddleOcrBaseConfig.eval,
-         # because deployment has to replace the transforms as well.
-         src = config[dataset_mode]['dataset']
-         config['Eval']['dataset'] = {'name': src['name'],
-                                      'data_dir': src['data_dir'],
-                                      'data_list': src['data_list'],
-                                      'transforms': [{'DetLabelEncode': None}]}
-         dataset = build_dataset(config, 'Eval', logger)
-         return config, dataset
-
-     def eval_deploy(self, model_type=2, dataset_mode='Eval', **kwargs):
-         ppocr = self.build_ppocr(model_type, **kwargs)
-         config, device, logger, vdl_writer = preprocess(from_dict=self.rset_posix_path())
-         config, dataset = self._build_dataset(config, logger, dataset_mode)
-         metric = ppocr.det_metric(dataset)
-         logger.info(str(metric))
-         return metric
-
-     def metric(self, *, print_mode=False):
-         """ Produce a combined evaluation report, typically like:
-             type          train_dataset  eval_dataset
-             ①PaddleOCR*   32.35*56       100.0*190
-             ②pretrained   17.57*43       50.0*22
-             ③pretrained*  17.57*184      50.0*192
-             ④finetune     93.05*49       100.0*20
-             ⑤finetune*    93.05*173      100.0*164
-
-         A few patterns:
-         1. Accuracy ②=③, speed ③>②. If the accuracies differ, the official pretrained model and deploy files are probably inconsistent.
-         2. Accuracy ④=⑤, speed ⑤>④. If the accuracies differ, eval and deployment may be preprocessing images differently;
-             det had exactly this problem (different processed image sizes), fixed via set_deploy_args_det.
-         3. Dropping the two eval-stage rows leaves a comparison among the three models ①③⑤:
-             ① the turnkey model shipped with PaddleOCR usually beats ③ a model trained from the open config, and loses to ⑤ the customized one,
-             i.e. accuracy: ③ < ① < ⑤.
-         """
-         import pandas as pd
-         from pyxllib.algo.stat import xlpivot
-
-         # 1 Collect the results of each model
-         eval_list = []
-
-         def core(title, eval_func):
-             for dataset in ['a、Train', 'b、Eval']:
-                 m = eval_func(dataset[2:])  # m, metric
-                 m = {k: (round_int(v) if k in ('fps', 'total_frame') else round(v * 100, 2)) for k, v in m.items()}
-                 m2 = {'model_type': title, 'dataset': dataset}
-                 m2.update(m)
-                 eval_list.append(m2)
-
-         core('①PaddleOCR*', lambda m: self.eval_deploy(model_type=0, dataset_mode=m))
-         core('②pretrained', lambda m: self.eval(resume=False, dataset_mode=m))
-         core('③pretrained*', lambda m: self.eval_deploy(model_type=1, dataset_mode=m))
-         core('④finetune', lambda m: self.eval(resume=True, dataset_mode=m))
-         core('⑤finetune*', lambda m: self.eval_deploy(model_type=2, dataset_mode=m))
-
-         # 2 The final summary table
-         df = pd.DataFrame.from_records(eval_list)
-         outfile = self.cfg['Global']['save_model_dir'] / 'results/metric.html'
-         os.makedirs(outfile.parent, exist_ok=True)
-
-         def func(items):
-             x = items.iloc[0]
-             return f'{x["precision"]:.0f},{x["recall"]:.0f},{x["hmean"]:.2f},{x["fps"]}'
-
-         df2 = xlpivot(df, ['model_type'], ['dataset', 'total_frame'], {'precision,recall,hmean,fps': func})
-         stat_html = df2.to_html()
-         stat_html = stat_html.replace('<th></th>', f'<th>{sys.argv[2]}</th>', 1)
-         outfile.write_text(stat_html)
-
-         if 'metric' in sys.argv:
-             print(df2)
-             return
-
-         if print_mode:
-             print(df2)
-
-         return df
-
-     def create_visual_results(self, *, model_type=2, max_samples=None, **kwargs):
-         """ Generate visualization results into a directory.
-
-         :param max_samples: cap on the number of visualization images; sometimes only a few samples are needed
-
-         [Algorithm flow] The basic idea is to convert the data into coco format and then rely on the coco interfaces;
-         a good test of whether my earlier APIs hold up.
-         1. Initialize the specified ppocr
-         2. Generate a set of detection results with ppocr
-         3. Compare against gt to build a coco dataset
-         4. Produce the coco visualizations
-         5. Produce the coco data-analysis tables
-         """
-         import PIL.Image
-         from pyxlpr.data.coco import CocoGtData, CocoMatch
-
-         ppocr = self.build_ppocr(model_type, **kwargs)
-         for dataset_mode in ['Train', 'Eval']:  # generate results for both sets, into two separate directories
-             gt = {'images': [],
-                   'annotations': [],
-                   'categories': CocoGtData.gen_categories(['text'])}
-             dt = []
-             k = 1
-
-             config, device, logger, vdl_writer = preprocess(from_dict=self.rset_posix_path())
-             config, dataset = self._build_dataset(config, logger, dataset_mode)
-             out_dir = self.cfg['Global']['save_model_dir'] / f'results/{dataset_mode}'
-             data_dir = self.cfg['Eval']['dataset']['data_dir']
-             for img_id, x in enumerate(dataset, start=1):
-                 if max_samples and img_id > max_samples:
-                     break
-
-                 # 1 Copy the image into the mirrored output directory
-                 src_img_path = x['img_path']
-                 rel_img_path = XlPath(src_img_path).relpath(data_dir)
-                 dst_img_path = out_dir / rel_img_path
-                 os.makedirs(dst_img_path.parent, exist_ok=True)
-                 if not dst_img_path.is_file():
-                     shutil.copy2(src_img_path, dst_img_path)
-
-                 # 2 Build the gt annotations for this image
-                 w, h = PIL.Image.open(str(dst_img_path)).size
-                 gt['images'].append(CocoGtData.gen_image(img_id, rel_img_path, h, w))
-                 for p in x['polys']:
-                     gt['annotations'].append(
-                         CocoGtData.gen_annotation(id=k, image_id=img_id, points=p, text=x['texts']))
-                     k += 1
-
-                 # 3 Build the dt annotations
-                 img = xlcv.read_from_buffer(x['image'])
-                 for p in ppocr.ocr(img, rec=False):
-                     dt.append({'image_id': img_id, 'category_id': 1, 'segmentation': np.array(p).reshape([1, -1]),
-                                'bbox': ltrb2xywh(rect_bounds(p)), 'score': 1.0})
-
-             cm = CocoMatch(gt, dt)
-             cm.to_labelme_match(out_dir, segmentation=True)
-             cm.to_excel(out_dir / 'cocomatch.xlsx')
-
-     def __config_demo(self):
-         """ Common configuration examples """
-
-     def set_xllabelme_dataset(self, data_dir, ratio_list):
-         """ Configure text-detection annotations in xllabelme format.
-
-         A simple dataset convention of my own design.
-
-         :param data_dir: data root directory
-         :param list[float, float] ratio_list: fractions for the training and validation sets.
-             Negative values take from the end; the underlying random seed is fixed,
-             so the concrete files chosen are stable across runs.
-             For small datasets [0.9, -0.1] is the usual recommendation; for larger ones [0.8, -0.2] works.
-         """
-         self.set_xlsimpledataset('Train', data_dir, [{'type': 'labelme_det', 'ratio': ratio_list[0]}])
-         self.set_xlsimpledataset('Eval', data_dir, [{'type': 'labelme_det', 'ratio': ratio_list[1]}])
-
-     def det1_mobile_raw(self):
-         """ Config example using paddle's native data format """
-         self.det1_mobile_init(pretrained=2)  # base config
-         self.set_save_dir('train/det1_mobile_raw')  # where models are saved
-         self.set_simpledataset('Train', 'data', ['data/ppdet_train.txt'])
-         self.set_simpledataset('Eval', 'data', ['data/ppdet_val.txt'])
-         self.set_iter_num(150000)
-         return self
-
-     def det1_mobile(self):
-         """ Detection training with labelme-format annotations """
-         self.det1_mobile_init(pretrained=2)  # base config
-         self.set_save_dir('train/det1_mobile')  # where models are saved
-         self.set_xllabelme_dataset('data', [0.9, -0.1])  # configure the dataset
-         self.set_iter_num(150000)  # set the iteration count
-         return self
-
-     def det1_server(self):
-         self.det1_server_init(pretrained=2)  # base config
-         self.set_save_dir('train/det1_server')  # where models are saved
-         self.set_xllabelme_dataset('data', [0.9, -0.1])  # configure the dataset
-         self.set_iter_num(150000)  # set the iteration count
-         return self
-
-
- class XlRec(PaddleOcrBaseConfig):
-     """ Config dedicated to recognition models
-     """
-
-     def stat_texts(self, xllabelme_data_dir, *, ref_dict='ppocr_keys_v1.txt'):
-         """ Inspect how annotated sentences and characters are distributed (statistics texts)
-
-         :param xllabelme_data_dir: directory of the xllabelme-format annotations
-         :param ref_dict: reference dictionary file
-         """
-         from collections import Counter
-         from pyxllib.algo.pupil import ValuesStat
-         from pyxllib.algo.stat import dataframes_to_excel
-         from pyxlpr.ppocr.utils import get_dict_content
-
-         root = XlPath(xllabelme_data_dir)
-         outfile = root.parent / 'stat_texts.xlsx'
-
-         # 1 Read the data
-         sentances_counter = Counter()  # each sentence, and how many times the same sentence appears
-         for f in root.rglob('*.json'):
-             for sp in f.read_json()['shapes']:
-                 attr = json.loads(sp['label'])
-                 if 'text' in attr:
-                     text = attr['text']
-                     sentances_counter[text] += 1
-
-         # 2 Count frequencies: sentances per sentence, words per word, chars per character
-         chars_counter = Counter()
-         words_counter = Counter()
-         for sentance, cnt in sentances_counter.items():
-             for word in sentance.split():  # split on whitespace for now; strictly speaking, Chinese needs jieba segmentation to be accurate
-                 words_counter[word] += cnt
-             for ch in sentance:  # count every character, including spaces
-                 chars_counter[ch] += cnt
-
-         # 3 Convert to DataFrames
-         char_dict = set(get_dict_content(ref_dict).splitlines())
-         ls = []
-         new_chars = []
-         for char, cnt in chars_counter.most_common():
-             ls.append([char, cnt, '' if char in char_dict else 'True'])
-             if char not in char_dict and char != ' ':
-                 new_chars.append(char)
-         chars_df = pd.DataFrame.from_records(ls, columns=['char', 'count', 'new_char'])
-
-         words_df = pd.DataFrame.from_records(words_counter.most_common(), columns=['word', 'count'])
-         sentances_df = pd.DataFrame.from_records([[sentance, cnt, len(sentance)]
-                                                   for sentance, cnt in sentances_counter.most_common()],
-                                                  columns=['sentance', 'count', 'length'])
-
-         # distribution of sentence lengths
-         ct = Counter()
-         lengths = []
-         for _, row in sentances_df.iterrows():
-             ct[row['length']] += row['count']
-             lengths += [row['length']] * row['count']  # not elegant, but needed to stay compatible with ValuesStat
-         # ct = sentances_df.groupby('length').sum().to_dict()['count']
-         max_len = max(sentances_df['length'])
-         sentances_length_df = pd.DataFrame.from_records([[i, ct.get(i, 0)] for i in range(max_len + 1)],
-                                                         columns=['length', 'count'])
-
-         # 4 Frequency statistics
-         def summary(title, vals):
-             msg = ValuesStat(vals).summary(['g', '.2f', '.2f', 'g', 'g'])
-             # print(msg)
-             return [title] + re.findall(r':\s+(\S+)', msg)
-
-         print('【stat_texts】')
-         print(f'输出文件:{outfile.as_posix()}')
-
-         print(f'不在字典中的{len(new_chars)}个字符:' + ''.join(new_chars))
-
-         ls = [
-             summary('字符频数', chars_df['count']),
-             summary('词组频数', words_df['count']),
-             summary('句子频数', sentances_df['count']),
-             summary('句子长度', lengths),
-         ]
-         df = pd.DataFrame.from_records(ls, columns=['title', '总和', '均值标准差', '总数', '最小值', '最大值'])
-         print(df)
-
-         # 5 Save the analysis workbook
-         sheets = {'字符': chars_df, '词组': words_df,
-                   '句子': sentances_df, '句子长度': sentances_length_df}
-         dataframes_to_excel(outfile, sheets)
-
-     def create_recdata(self, src, dst, *, print_mode=True, recreate=False):
-         """ Convert xllabelme annotations into the recognition data format paddle supports;
-         i.e. extract text-line data for training a text recognition model.
-
-         :param src: xllabelme_data_dir
-         :param dst: root directory of the output
-         :param recreate: if the output directory already exists, delete it and regenerate
-
-         Note: this generation method is for reference only; the processing is not very general yet.
-         """
-         # 0
-         src, dst = XlPath(src), XlPath(dst)
-         if recreate and dst.is_dir():
-             dst.delete()  # delete any existing copy
-
-         # 1 Generate the images
-         chars = set()
-         labels1, labels2 = [], []
-         for f in tqdm(list(src.rglob('*.json')), desc='提取文本行数据', disable=not print_mode):
-             data = f.read_json()
-             impath = f.parent / data['imagePath']
-             im = xlcv.read(impath)
-             for i, sp in enumerate(data['shapes'], start=1):
-                 # group a: crop the text line by its bounding rectangle
-                 name = f'imgs/{f.stem}_r{i:03}.jpg'
-                 text = json.loads(sp['label'])['text']
-                 chars |= set(text)
-                 xlcv.write(xlcv.get_sub(im, sp['points']), dst / name)
-                 labels1.append(f'{name}\t{text}')
-
-                 # group b: crop the text line with warp rectification
-                 name = f'imgs/{f.stem}_w{i:03}.jpg'
-                 xlcv.write(xlcv.get_sub(im, sp['points'], warp_quad=True), dst / name)
-                 labels2.append(f'{name}\t{text}')
-
-         # 2 The dictionary file
-         chars -= set(' \n\t')  # drop spaces and other whitespace characters
-         (dst / 'char_dict.txt').write_text('\n'.join(sorted(chars)))
-
-         # 3 The label files
-         (dst / 'labels_rect.txt').write_text('\n'.join(labels1))
-         (dst / 'labels_warp.txt').write_text('\n'.join(labels2))
-         (dst / 'labels_total.txt').write_text('\n'.join(labels1 + labels2))
-
-         return self
-
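(For reference, a sketch of the lines the generated label files would contain, following the f'{name}\t{text}' format above; file names and text are made up, and the column separator is a literal tab.)

    imgs/scan01_r001.jpg	hello world
    imgs/scan01_w001.jpg	hello world
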
-     def set_rec_dataset(self, data_dir, label_file_list):
-         """ Configure the recognition dataset.
-
-         :param data_dir: root directory of the data
-         :param list[str|list] label_file_list: list of label files;
-             a str is the relative path of a label file,
-             a list[str, float] adds, besides the path, a ratio value for the fraction of samples to take.
-
-         TODO I'd like to integrate this setup, but haven't found a good design yet; splitting the data
-         manually and configuring it in autoset is not much trouble either.
-         """
-
-         # self.cfg['Train']['dataset']['data_dir'] = Paths.eleclabel / 'recdata'
-         # self.cfg['Train']['dataset']['label_file_list'] = [Paths.eleclabel / 'recdata/labels_ab.txt']
-         # self.cfg['Eval']['dataset']['data_dir'] = Paths.eleclabel / 'recdata'
-         # self.cfg['Eval']['dataset']['label_file_list'] = [Paths.eleclabel / 'recdata/labels_ab.txt']
-
-         raise NotImplementedError
-
-
- class XlCls:
-     """ Classification models. This basically uses native paddlepaddle rather than the more powerful paddleclas. """
-
-
- class XlOcr:
-     """ Wraps the text-technology stack: standardized workflows for detection and recognition """
-
-     def __init__(self, root):
-         self.root = XlPath(root)  # project root directory
-
-     def step1_autolabel(self):
-         """ Pre-annotate detection and recognition """
-
-     def step2_refinelabel(self):
-         """ Manually refine the labels """
-
-     def step3_det(self):
-         """ Train the detection model """