pyxllib 0.3.96__py3-none-any.whl → 0.3.197__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (306)
  1. pyxllib/algo/geo.py +12 -0
  2. pyxllib/algo/intervals.py +1 -1
  3. pyxllib/algo/matcher.py +78 -0
  4. pyxllib/algo/pupil.py +187 -19
  5. pyxllib/algo/specialist.py +2 -1
  6. pyxllib/algo/stat.py +38 -2
  7. {pyxlpr → pyxllib/autogui}/__init__.py +1 -1
  8. pyxllib/autogui/activewin.py +246 -0
  9. pyxllib/autogui/all.py +9 -0
  10. pyxllib/{ext/autogui → autogui}/autogui.py +40 -11
  11. pyxllib/autogui/uiautolib.py +362 -0
  12. pyxllib/autogui/wechat.py +827 -0
  13. pyxllib/autogui/wechat_msg.py +421 -0
  14. pyxllib/autogui/wxautolib.py +84 -0
  15. pyxllib/cv/slidercaptcha.py +137 -0
  16. pyxllib/data/echarts.py +123 -12
  17. pyxllib/data/jsonlib.py +89 -0
  18. pyxllib/data/pglib.py +514 -30
  19. pyxllib/data/sqlite.py +231 -4
  20. pyxllib/ext/JLineViewer.py +14 -1
  21. pyxllib/ext/drissionlib.py +277 -0
  22. pyxllib/ext/kq5034lib.py +0 -1594
  23. pyxllib/ext/robustprocfile.py +497 -0
  24. pyxllib/ext/unixlib.py +6 -5
  25. pyxllib/ext/utools.py +108 -95
  26. pyxllib/ext/webhook.py +32 -14
  27. pyxllib/ext/wjxlib.py +88 -0
  28. pyxllib/ext/wpsapi.py +124 -0
  29. pyxllib/ext/xlwork.py +9 -0
  30. pyxllib/ext/yuquelib.py +1003 -71
  31. pyxllib/file/docxlib.py +1 -1
  32. pyxllib/file/libreoffice.py +165 -0
  33. pyxllib/file/movielib.py +9 -0
  34. pyxllib/file/packlib/__init__.py +112 -75
  35. pyxllib/file/pdflib.py +1 -1
  36. pyxllib/file/pupil.py +1 -1
  37. pyxllib/file/specialist/dirlib.py +1 -1
  38. pyxllib/file/specialist/download.py +10 -3
  39. pyxllib/file/specialist/filelib.py +266 -55
  40. pyxllib/file/xlsxlib.py +205 -50
  41. pyxllib/file/xlsyncfile.py +341 -0
  42. pyxllib/prog/cachetools.py +64 -0
  43. pyxllib/prog/filelock.py +42 -0
  44. pyxllib/prog/multiprogs.py +940 -0
  45. pyxllib/prog/newbie.py +9 -2
  46. pyxllib/prog/pupil.py +129 -60
  47. pyxllib/prog/specialist/__init__.py +176 -2
  48. pyxllib/prog/specialist/bc.py +5 -2
  49. pyxllib/prog/specialist/browser.py +11 -2
  50. pyxllib/prog/specialist/datetime.py +68 -0
  51. pyxllib/prog/specialist/tictoc.py +12 -13
  52. pyxllib/prog/specialist/xllog.py +5 -5
  53. pyxllib/prog/xlosenv.py +7 -0
  54. pyxllib/text/airscript.js +744 -0
  55. pyxllib/text/charclasslib.py +17 -5
  56. pyxllib/text/jiebalib.py +6 -3
  57. pyxllib/text/jinjalib.py +32 -0
  58. pyxllib/text/jsa_ai_prompt.md +271 -0
  59. pyxllib/text/jscode.py +159 -4
  60. pyxllib/text/nestenv.py +1 -1
  61. pyxllib/text/newbie.py +12 -0
  62. pyxllib/text/pupil/common.py +26 -0
  63. pyxllib/text/specialist/ptag.py +2 -2
  64. pyxllib/text/templates/echart_base.html +11 -0
  65. pyxllib/text/templates/highlight_code.html +17 -0
  66. pyxllib/text/templates/latex_editor.html +103 -0
  67. pyxllib/text/xmllib.py +76 -14
  68. pyxllib/xl.py +2 -1
  69. pyxllib-0.3.197.dist-info/METADATA +48 -0
  70. pyxllib-0.3.197.dist-info/RECORD +126 -0
  71. {pyxllib-0.3.96.dist-info → pyxllib-0.3.197.dist-info}/WHEEL +1 -2
  72. pyxllib/ext/autogui/__init__.py +0 -8
  73. pyxllib-0.3.96.dist-info/METADATA +0 -51
  74. pyxllib-0.3.96.dist-info/RECORD +0 -333
  75. pyxllib-0.3.96.dist-info/top_level.txt +0 -2
  76. pyxlpr/ai/__init__.py +0 -5
  77. pyxlpr/ai/clientlib.py +0 -1281
  78. pyxlpr/ai/specialist.py +0 -286
  79. pyxlpr/ai/torch_app.py +0 -172
  80. pyxlpr/ai/xlpaddle.py +0 -655
  81. pyxlpr/ai/xltorch.py +0 -705
  82. pyxlpr/data/__init__.py +0 -11
  83. pyxlpr/data/coco.py +0 -1325
  84. pyxlpr/data/datacls.py +0 -365
  85. pyxlpr/data/datasets.py +0 -200
  86. pyxlpr/data/gptlib.py +0 -1291
  87. pyxlpr/data/icdar/__init__.py +0 -96
  88. pyxlpr/data/icdar/deteval.py +0 -377
  89. pyxlpr/data/icdar/icdar2013.py +0 -341
  90. pyxlpr/data/icdar/iou.py +0 -340
  91. pyxlpr/data/icdar/rrc_evaluation_funcs_1_1.py +0 -463
  92. pyxlpr/data/imtextline.py +0 -473
  93. pyxlpr/data/labelme.py +0 -866
  94. pyxlpr/data/removeline.py +0 -179
  95. pyxlpr/data/specialist.py +0 -57
  96. pyxlpr/eval/__init__.py +0 -85
  97. pyxlpr/paddleocr.py +0 -776
  98. pyxlpr/ppocr/__init__.py +0 -15
  99. pyxlpr/ppocr/configs/rec/multi_language/generate_multi_language_configs.py +0 -226
  100. pyxlpr/ppocr/data/__init__.py +0 -135
  101. pyxlpr/ppocr/data/imaug/ColorJitter.py +0 -26
  102. pyxlpr/ppocr/data/imaug/__init__.py +0 -67
  103. pyxlpr/ppocr/data/imaug/copy_paste.py +0 -170
  104. pyxlpr/ppocr/data/imaug/east_process.py +0 -437
  105. pyxlpr/ppocr/data/imaug/gen_table_mask.py +0 -244
  106. pyxlpr/ppocr/data/imaug/iaa_augment.py +0 -114
  107. pyxlpr/ppocr/data/imaug/label_ops.py +0 -789
  108. pyxlpr/ppocr/data/imaug/make_border_map.py +0 -184
  109. pyxlpr/ppocr/data/imaug/make_pse_gt.py +0 -106
  110. pyxlpr/ppocr/data/imaug/make_shrink_map.py +0 -126
  111. pyxlpr/ppocr/data/imaug/operators.py +0 -433
  112. pyxlpr/ppocr/data/imaug/pg_process.py +0 -906
  113. pyxlpr/ppocr/data/imaug/randaugment.py +0 -143
  114. pyxlpr/ppocr/data/imaug/random_crop_data.py +0 -239
  115. pyxlpr/ppocr/data/imaug/rec_img_aug.py +0 -533
  116. pyxlpr/ppocr/data/imaug/sast_process.py +0 -777
  117. pyxlpr/ppocr/data/imaug/text_image_aug/__init__.py +0 -17
  118. pyxlpr/ppocr/data/imaug/text_image_aug/augment.py +0 -120
  119. pyxlpr/ppocr/data/imaug/text_image_aug/warp_mls.py +0 -168
  120. pyxlpr/ppocr/data/lmdb_dataset.py +0 -115
  121. pyxlpr/ppocr/data/pgnet_dataset.py +0 -104
  122. pyxlpr/ppocr/data/pubtab_dataset.py +0 -107
  123. pyxlpr/ppocr/data/simple_dataset.py +0 -372
  124. pyxlpr/ppocr/losses/__init__.py +0 -61
  125. pyxlpr/ppocr/losses/ace_loss.py +0 -52
  126. pyxlpr/ppocr/losses/basic_loss.py +0 -135
  127. pyxlpr/ppocr/losses/center_loss.py +0 -88
  128. pyxlpr/ppocr/losses/cls_loss.py +0 -30
  129. pyxlpr/ppocr/losses/combined_loss.py +0 -67
  130. pyxlpr/ppocr/losses/det_basic_loss.py +0 -208
  131. pyxlpr/ppocr/losses/det_db_loss.py +0 -80
  132. pyxlpr/ppocr/losses/det_east_loss.py +0 -63
  133. pyxlpr/ppocr/losses/det_pse_loss.py +0 -149
  134. pyxlpr/ppocr/losses/det_sast_loss.py +0 -121
  135. pyxlpr/ppocr/losses/distillation_loss.py +0 -272
  136. pyxlpr/ppocr/losses/e2e_pg_loss.py +0 -140
  137. pyxlpr/ppocr/losses/kie_sdmgr_loss.py +0 -113
  138. pyxlpr/ppocr/losses/rec_aster_loss.py +0 -99
  139. pyxlpr/ppocr/losses/rec_att_loss.py +0 -39
  140. pyxlpr/ppocr/losses/rec_ctc_loss.py +0 -44
  141. pyxlpr/ppocr/losses/rec_enhanced_ctc_loss.py +0 -70
  142. pyxlpr/ppocr/losses/rec_nrtr_loss.py +0 -30
  143. pyxlpr/ppocr/losses/rec_sar_loss.py +0 -28
  144. pyxlpr/ppocr/losses/rec_srn_loss.py +0 -47
  145. pyxlpr/ppocr/losses/table_att_loss.py +0 -109
  146. pyxlpr/ppocr/metrics/__init__.py +0 -44
  147. pyxlpr/ppocr/metrics/cls_metric.py +0 -45
  148. pyxlpr/ppocr/metrics/det_metric.py +0 -82
  149. pyxlpr/ppocr/metrics/distillation_metric.py +0 -73
  150. pyxlpr/ppocr/metrics/e2e_metric.py +0 -86
  151. pyxlpr/ppocr/metrics/eval_det_iou.py +0 -274
  152. pyxlpr/ppocr/metrics/kie_metric.py +0 -70
  153. pyxlpr/ppocr/metrics/rec_metric.py +0 -75
  154. pyxlpr/ppocr/metrics/table_metric.py +0 -50
  155. pyxlpr/ppocr/modeling/architectures/__init__.py +0 -32
  156. pyxlpr/ppocr/modeling/architectures/base_model.py +0 -88
  157. pyxlpr/ppocr/modeling/architectures/distillation_model.py +0 -60
  158. pyxlpr/ppocr/modeling/backbones/__init__.py +0 -54
  159. pyxlpr/ppocr/modeling/backbones/det_mobilenet_v3.py +0 -268
  160. pyxlpr/ppocr/modeling/backbones/det_resnet_vd.py +0 -246
  161. pyxlpr/ppocr/modeling/backbones/det_resnet_vd_sast.py +0 -285
  162. pyxlpr/ppocr/modeling/backbones/e2e_resnet_vd_pg.py +0 -265
  163. pyxlpr/ppocr/modeling/backbones/kie_unet_sdmgr.py +0 -186
  164. pyxlpr/ppocr/modeling/backbones/rec_mobilenet_v3.py +0 -138
  165. pyxlpr/ppocr/modeling/backbones/rec_mv1_enhance.py +0 -258
  166. pyxlpr/ppocr/modeling/backbones/rec_nrtr_mtb.py +0 -48
  167. pyxlpr/ppocr/modeling/backbones/rec_resnet_31.py +0 -210
  168. pyxlpr/ppocr/modeling/backbones/rec_resnet_aster.py +0 -143
  169. pyxlpr/ppocr/modeling/backbones/rec_resnet_fpn.py +0 -307
  170. pyxlpr/ppocr/modeling/backbones/rec_resnet_vd.py +0 -286
  171. pyxlpr/ppocr/modeling/heads/__init__.py +0 -54
  172. pyxlpr/ppocr/modeling/heads/cls_head.py +0 -52
  173. pyxlpr/ppocr/modeling/heads/det_db_head.py +0 -118
  174. pyxlpr/ppocr/modeling/heads/det_east_head.py +0 -121
  175. pyxlpr/ppocr/modeling/heads/det_pse_head.py +0 -37
  176. pyxlpr/ppocr/modeling/heads/det_sast_head.py +0 -128
  177. pyxlpr/ppocr/modeling/heads/e2e_pg_head.py +0 -253
  178. pyxlpr/ppocr/modeling/heads/kie_sdmgr_head.py +0 -206
  179. pyxlpr/ppocr/modeling/heads/multiheadAttention.py +0 -163
  180. pyxlpr/ppocr/modeling/heads/rec_aster_head.py +0 -393
  181. pyxlpr/ppocr/modeling/heads/rec_att_head.py +0 -202
  182. pyxlpr/ppocr/modeling/heads/rec_ctc_head.py +0 -88
  183. pyxlpr/ppocr/modeling/heads/rec_nrtr_head.py +0 -826
  184. pyxlpr/ppocr/modeling/heads/rec_sar_head.py +0 -402
  185. pyxlpr/ppocr/modeling/heads/rec_srn_head.py +0 -280
  186. pyxlpr/ppocr/modeling/heads/self_attention.py +0 -406
  187. pyxlpr/ppocr/modeling/heads/table_att_head.py +0 -246
  188. pyxlpr/ppocr/modeling/necks/__init__.py +0 -32
  189. pyxlpr/ppocr/modeling/necks/db_fpn.py +0 -111
  190. pyxlpr/ppocr/modeling/necks/east_fpn.py +0 -188
  191. pyxlpr/ppocr/modeling/necks/fpn.py +0 -138
  192. pyxlpr/ppocr/modeling/necks/pg_fpn.py +0 -314
  193. pyxlpr/ppocr/modeling/necks/rnn.py +0 -92
  194. pyxlpr/ppocr/modeling/necks/sast_fpn.py +0 -284
  195. pyxlpr/ppocr/modeling/necks/table_fpn.py +0 -110
  196. pyxlpr/ppocr/modeling/transforms/__init__.py +0 -28
  197. pyxlpr/ppocr/modeling/transforms/stn.py +0 -135
  198. pyxlpr/ppocr/modeling/transforms/tps.py +0 -308
  199. pyxlpr/ppocr/modeling/transforms/tps_spatial_transformer.py +0 -156
  200. pyxlpr/ppocr/optimizer/__init__.py +0 -61
  201. pyxlpr/ppocr/optimizer/learning_rate.py +0 -228
  202. pyxlpr/ppocr/optimizer/lr_scheduler.py +0 -49
  203. pyxlpr/ppocr/optimizer/optimizer.py +0 -160
  204. pyxlpr/ppocr/optimizer/regularizer.py +0 -52
  205. pyxlpr/ppocr/postprocess/__init__.py +0 -55
  206. pyxlpr/ppocr/postprocess/cls_postprocess.py +0 -33
  207. pyxlpr/ppocr/postprocess/db_postprocess.py +0 -234
  208. pyxlpr/ppocr/postprocess/east_postprocess.py +0 -143
  209. pyxlpr/ppocr/postprocess/locality_aware_nms.py +0 -200
  210. pyxlpr/ppocr/postprocess/pg_postprocess.py +0 -52
  211. pyxlpr/ppocr/postprocess/pse_postprocess/__init__.py +0 -15
  212. pyxlpr/ppocr/postprocess/pse_postprocess/pse/__init__.py +0 -29
  213. pyxlpr/ppocr/postprocess/pse_postprocess/pse/setup.py +0 -14
  214. pyxlpr/ppocr/postprocess/pse_postprocess/pse_postprocess.py +0 -118
  215. pyxlpr/ppocr/postprocess/rec_postprocess.py +0 -654
  216. pyxlpr/ppocr/postprocess/sast_postprocess.py +0 -355
  217. pyxlpr/ppocr/tools/__init__.py +0 -14
  218. pyxlpr/ppocr/tools/eval.py +0 -83
  219. pyxlpr/ppocr/tools/export_center.py +0 -77
  220. pyxlpr/ppocr/tools/export_model.py +0 -129
  221. pyxlpr/ppocr/tools/infer/predict_cls.py +0 -151
  222. pyxlpr/ppocr/tools/infer/predict_det.py +0 -300
  223. pyxlpr/ppocr/tools/infer/predict_e2e.py +0 -169
  224. pyxlpr/ppocr/tools/infer/predict_rec.py +0 -414
  225. pyxlpr/ppocr/tools/infer/predict_system.py +0 -204
  226. pyxlpr/ppocr/tools/infer/utility.py +0 -629
  227. pyxlpr/ppocr/tools/infer_cls.py +0 -83
  228. pyxlpr/ppocr/tools/infer_det.py +0 -134
  229. pyxlpr/ppocr/tools/infer_e2e.py +0 -122
  230. pyxlpr/ppocr/tools/infer_kie.py +0 -153
  231. pyxlpr/ppocr/tools/infer_rec.py +0 -146
  232. pyxlpr/ppocr/tools/infer_table.py +0 -107
  233. pyxlpr/ppocr/tools/program.py +0 -596
  234. pyxlpr/ppocr/tools/test_hubserving.py +0 -117
  235. pyxlpr/ppocr/tools/train.py +0 -163
  236. pyxlpr/ppocr/tools/xlprog.py +0 -748
  237. pyxlpr/ppocr/utils/EN_symbol_dict.txt +0 -94
  238. pyxlpr/ppocr/utils/__init__.py +0 -24
  239. pyxlpr/ppocr/utils/dict/ar_dict.txt +0 -117
  240. pyxlpr/ppocr/utils/dict/arabic_dict.txt +0 -162
  241. pyxlpr/ppocr/utils/dict/be_dict.txt +0 -145
  242. pyxlpr/ppocr/utils/dict/bg_dict.txt +0 -140
  243. pyxlpr/ppocr/utils/dict/chinese_cht_dict.txt +0 -8421
  244. pyxlpr/ppocr/utils/dict/cyrillic_dict.txt +0 -163
  245. pyxlpr/ppocr/utils/dict/devanagari_dict.txt +0 -167
  246. pyxlpr/ppocr/utils/dict/en_dict.txt +0 -63
  247. pyxlpr/ppocr/utils/dict/fa_dict.txt +0 -136
  248. pyxlpr/ppocr/utils/dict/french_dict.txt +0 -136
  249. pyxlpr/ppocr/utils/dict/german_dict.txt +0 -143
  250. pyxlpr/ppocr/utils/dict/hi_dict.txt +0 -162
  251. pyxlpr/ppocr/utils/dict/it_dict.txt +0 -118
  252. pyxlpr/ppocr/utils/dict/japan_dict.txt +0 -4399
  253. pyxlpr/ppocr/utils/dict/ka_dict.txt +0 -153
  254. pyxlpr/ppocr/utils/dict/korean_dict.txt +0 -3688
  255. pyxlpr/ppocr/utils/dict/latin_dict.txt +0 -185
  256. pyxlpr/ppocr/utils/dict/mr_dict.txt +0 -153
  257. pyxlpr/ppocr/utils/dict/ne_dict.txt +0 -153
  258. pyxlpr/ppocr/utils/dict/oc_dict.txt +0 -96
  259. pyxlpr/ppocr/utils/dict/pu_dict.txt +0 -130
  260. pyxlpr/ppocr/utils/dict/rs_dict.txt +0 -91
  261. pyxlpr/ppocr/utils/dict/rsc_dict.txt +0 -134
  262. pyxlpr/ppocr/utils/dict/ru_dict.txt +0 -125
  263. pyxlpr/ppocr/utils/dict/ta_dict.txt +0 -128
  264. pyxlpr/ppocr/utils/dict/table_dict.txt +0 -277
  265. pyxlpr/ppocr/utils/dict/table_structure_dict.txt +0 -2759
  266. pyxlpr/ppocr/utils/dict/te_dict.txt +0 -151
  267. pyxlpr/ppocr/utils/dict/ug_dict.txt +0 -114
  268. pyxlpr/ppocr/utils/dict/uk_dict.txt +0 -142
  269. pyxlpr/ppocr/utils/dict/ur_dict.txt +0 -137
  270. pyxlpr/ppocr/utils/dict/xi_dict.txt +0 -110
  271. pyxlpr/ppocr/utils/dict90.txt +0 -90
  272. pyxlpr/ppocr/utils/e2e_metric/Deteval.py +0 -574
  273. pyxlpr/ppocr/utils/e2e_metric/polygon_fast.py +0 -83
  274. pyxlpr/ppocr/utils/e2e_utils/extract_batchsize.py +0 -87
  275. pyxlpr/ppocr/utils/e2e_utils/extract_textpoint_fast.py +0 -457
  276. pyxlpr/ppocr/utils/e2e_utils/extract_textpoint_slow.py +0 -592
  277. pyxlpr/ppocr/utils/e2e_utils/pgnet_pp_utils.py +0 -162
  278. pyxlpr/ppocr/utils/e2e_utils/visual.py +0 -162
  279. pyxlpr/ppocr/utils/en_dict.txt +0 -95
  280. pyxlpr/ppocr/utils/gen_label.py +0 -81
  281. pyxlpr/ppocr/utils/ic15_dict.txt +0 -36
  282. pyxlpr/ppocr/utils/iou.py +0 -54
  283. pyxlpr/ppocr/utils/logging.py +0 -69
  284. pyxlpr/ppocr/utils/network.py +0 -84
  285. pyxlpr/ppocr/utils/ppocr_keys_v1.txt +0 -6623
  286. pyxlpr/ppocr/utils/profiler.py +0 -110
  287. pyxlpr/ppocr/utils/save_load.py +0 -150
  288. pyxlpr/ppocr/utils/stats.py +0 -72
  289. pyxlpr/ppocr/utils/utility.py +0 -80
  290. pyxlpr/ppstructure/__init__.py +0 -13
  291. pyxlpr/ppstructure/predict_system.py +0 -187
  292. pyxlpr/ppstructure/table/__init__.py +0 -13
  293. pyxlpr/ppstructure/table/eval_table.py +0 -72
  294. pyxlpr/ppstructure/table/matcher.py +0 -192
  295. pyxlpr/ppstructure/table/predict_structure.py +0 -136
  296. pyxlpr/ppstructure/table/predict_table.py +0 -221
  297. pyxlpr/ppstructure/table/table_metric/__init__.py +0 -16
  298. pyxlpr/ppstructure/table/table_metric/parallel.py +0 -51
  299. pyxlpr/ppstructure/table/table_metric/table_metric.py +0 -247
  300. pyxlpr/ppstructure/table/tablepyxl/__init__.py +0 -13
  301. pyxlpr/ppstructure/table/tablepyxl/style.py +0 -283
  302. pyxlpr/ppstructure/table/tablepyxl/tablepyxl.py +0 -118
  303. pyxlpr/ppstructure/utility.py +0 -71
  304. pyxlpr/xlai.py +0 -10
  305. /pyxllib/{ext/autogui → autogui}/virtualkey.py +0 -0
  306. {pyxllib-0.3.96.dist-info → pyxllib-0.3.197.dist-info/licenses}/LICENSE +0 -0
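Reading the rename entries above (items 7, 10, and 305): the old pyxllib/ext/autogui subpackage became the top-level pyxllib/autogui package, while the entire pyxlpr tree (the PaddleOCR wrappers) was dropped from the wheel. A minimal, hypothetical compatibility sketch for code importing the moved autogui module (only the package paths come from the diff; the exact importable symbols are assumptions):

    # Hypothetical shim for the autogui move in pyxllib 0.3.96 -> 0.3.197
    try:
        from pyxllib.autogui import autogui      # 0.3.197: top-level subpackage
    except ImportError:
        from pyxllib.ext.autogui import autogui  # 0.3.96: old location under pyxllib.ext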
pyxlpr/ppocr/tools/xlprog.py
@@ -1,748 +0,0 @@
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- # @Author : 陈坤泽
- # @Email : 877362867@qq.com
- # @Date : 2022/02/21 11:07
-
- """
- A higher-level engineering wrapper around PaddleOCR
- """
- import collections
- import os
- import sys
- import re
-
- import pandas as pd
- import yaml
- import shutil
- import copy
- import inspect
- import math
- import json
-
- import numpy as np
- from tqdm import tqdm
-
- from pyxlpr.ppocr.tools.program import preprocess
- from pyxlpr.ppocr.data import build_dataloader
-
- from pyxllib.algo.geo import rect_bounds, ltrb2xywh
- from pyxllib.file.specialist import XlPath, ensure_localfile, ensure_localdir
- from pyxllib.cv.xlcvlib import xlcv
- from pyxllib.prog.newbie import round_int
-
-
- class PaddleOcrBaseConfig:
-     """ A wrapper around the standard paddle(ocr) config files: a middle layer
-     that simplifies configuration for my own use and centralizes common
-     parameter settings and tweaks
-     """
-
-     def __init__(self):
-         self.cfg = {}
-
-     def __1_config(self):
-         """ Configuration-related features """
-         pass
-
-     def autoset(self):
-         """ A hook for generic settings shared by all configs """
-
-         x = self.cfg['Global']
-         x['use_visualdl'] = True
-         x['print_batch_step'] = 1000  # unit is iters; the default was a tiny 2, raised here. When epochs are short, each epoch still prints.
-         x['pretrained_model'] = None
-         # Save the model every N epochs. The default was 1200; set deliberately huge here,
-         # which effectively disables periodic saving (enable manually if needed).
-         # Even so, the best model is still saved based on eval.
-         x['save_epoch_step'] = 100000
-
-         self.set_save_dir('models/' + inspect.stack()[3].function)
-
-     def resume(self, train=False):
-         """ If no checkpoints are configured, try to load the best_accuracy or latest model
-
-         Which one is loaded by default depends on whether we are in Train mode:
-         train should load latest, other modes prefer best_accuracy
-         """
-         if train:  # when training, prefer restoring the most recent model
-             candidates = ['latest', 'best_accuracy']
-         else:  # in other scenarios, look for the best model by default
-             candidates = ['best_accuracy', 'latest']
-
-         for name in candidates:
-             f = XlPath(self.cfg['Global']['save_model_dir']) / name
-             if f.with_suffix('.pdparams').exists():
-                 self.cfg['Global']['checkpoints'] = f
-                 return
-
-     def config_from_content(self, content):
-         self.cfg = yaml.safe_load(content)
-         self.autoset()
-         return self.cfg
-
-     def config_from_template(self, subpath):
-         """
-         :param subpath: e.g. 'det/det_mv3_db'
-         """
-         f = os.path.join(sys.modules['pyxlpr.ppocr'].__path__[0], 'configs', subpath + '.yml')
-         return self.config_from_content(XlPath(f).read_text())
-
-     def set_save_dir(self, save_dir):
-         """ Unifies the many runtime output paths under one directory, set only once """
-         # self.d['Global']
-         save_dir = XlPath(save_dir)
-         x = self.cfg['Global']
-         x['save_model_dir'] = save_dir  # model directory during train
-         x['save_inference_dir'] = save_dir / 'infer'  # directory used by export_model
-         # Not yet sure exactly what this option does, or whether it is db-specific
-         x['save_res_path'] = save_dir / 'predicts.txt'
-
-     def set_simpledataset(self, mode, data_dir, label_file_list, ratio_list=None):
-         """ Paddle's official SimpleDataSet data format
-
-         :param str mode: Train or Eval, i.e. configure the training or validation set
-         :param PathLike data_dir: root directory of the data
-         :param list label_file_list: list of label files [txtfile1, textfile2, ...]
-             each line of a txt file is the annotation of one image:
-             column 1 is the image path relative to data_dir, then a \t separator,
-             column 2 is the json.dumps annotation, whose transcription field holds
-             the text content and whose points field holds the quadrilateral box
-         :param list ratio_list: with a single label_file_list entry a single number
-             is accepted, but a list is preferred; each value in 0~1.0 is the
-             fraction of samples to take (the official paddle implementation
-             samples randomly, with no ordering guarantee)
-         """
-         # Note that some field formats differ between SimpleDataSet and XlSimpleDataSet,
-         # so to be safe the whole self.cfg[mode]['dataset'] is reset
-         node = self.cfg[mode]['dataset']
-         x = {'name': 'SimpleDataSet',
-              'data_dir': XlPath(data_dir),
-              'label_file_list': label_file_list}
-         if ratio_list:
-             x['ratio_list'] = ratio_list
-         x['transforms'] = node['transforms']
-         self.cfg[mode]['dataset'] = x
-
-     def set_xlsimpledataset(self, mode, data_dir, data_list):
-         """ Configure my own XlSimpleDataSet data format
-
-         Converts various native formats directly into paddle's in-memory format at
-         runtime, avoiding redundant intermediate data files. The main extension so
-         far is support for the xllabelme annotation format, e.g. labelme_det.
-
-         :param str mode: Train or Eval, i.e. configure the training or validation set
-         :param PathLike data_dir: root directory of the data
-         :param list data_list: the concrete data entries, each one a dict
-             [required] type: the data format; currently labelme_det, icdar2015, refineAgree
-                 see the from_-prefixed methods of XlSimpleDataSet for what is supported
-             other fields are optional; see the from_ methods for extensions, commonly:
-             [ratio] a fraction, which may be negative to take samples from the end;
-                 handy for splitting Train/Eval in code instead of physically
-                 separating the datasets
-
-         """
-         node = self.cfg[mode]['dataset']
-         x = {'name': 'XlSimpleDataSet',
-              'data_dir': XlPath(data_dir),
-              'data_list': data_list}
-         x['transforms'] = node['transforms']
-         self.cfg[mode]['dataset'] = x
-
-     @classmethod
-     def _rset_posix_path(cls, d):
-         from pathlib import Path
-
-         if isinstance(d, list):
-             for i, x in enumerate(d):
-                 if isinstance(x, (Path, XlPath)):
-                     d[i] = x.as_posix()
-                 else:
-                     cls._rset_posix_path(x)
-         elif isinstance(d, dict):
-             for k, v in d.items():
-                 if isinstance(v, (Path, XlPath)):
-                     d[k] = v.as_posix()
-                 else:
-                     cls._rset_posix_path(v)
-
-     def rset_posix_path(self):
-         """ The config dict may contain XlPath/Path objects, which have to be
-         recursively converted to str for storage
-
-         "rset" means recursive set
-         """
-         d = copy.deepcopy(self.cfg)
-         self._rset_posix_path(d)
-         return d
-
-     def write_cfg_tempfile(self):
-         """ Write the config to a file in the temp directory and return its path """
-         p = XlPath.tempfile('.yml')
-         # TODO before writing, every XlPath in the config is converted to an as_posix str
-         self._rset_posix_path(self.cfg)
-         p.write_yaml(self.cfg)
-         return str(p)
-
-     def add_config_to_cmd_argv(self):
-         """ Pass the config on the command line via the -c option """
-         sys.argv = sys.argv + ['-c', self.write_cfg_tempfile()]
-
-     def set_iter_num(self, num):
-         """ Set the training length by iteration count
-
-         Paddle's config does not natively support measuring training length in
-         iters, so epoch_num is derived from batch_size_per_card and the dataset size.
-
-         Note: configure the datasets first, then set the iteration count!
-         """
-         config, device, logger, _ = preprocess(from_dict=self.rset_posix_path(), use_visualdl=False)
-         train_dataloader = build_dataloader(config, 'Train', device, logger)
-         per_epoch_iter_num = len(train_dataloader)  # iterations per epoch
-         self.cfg['Global']['epoch_num'] = math.ceil(num / per_epoch_iter_num)
-
-     def __2_main(self):
-         """ Script-level utilities """
-         pass
-
-     def train(self, resume=False):
-         from pyxlpr.ppocr.tools.train import main
-
-         if resume:
-             self.resume(True)
-         config, device, logger, vdl_writer = preprocess(is_train=True, from_dict=self.rset_posix_path())
-         main(config, device, logger, vdl_writer)
-
-     def eval(self, resume=True, *, dataset_mode='Eval'):
-         """
-         :param dataset_mode: which dataset to use; Eval by default, Train also works
-         """
-         from pyxlpr.ppocr.tools.eval import main
-
-         if resume:
-             self.resume()
-
-         config, device, logger, vdl_writer = preprocess(from_dict=self.rset_posix_path())
-         for k in ['name', 'data_dir', 'data_list']:
-             config['Eval']['dataset'][k] = config[dataset_mode]['dataset'][k]
-         metric = main(config, device, logger)
-         return metric
-
-     def infer_det(self, resume=True):
-         from pyxlpr.ppocr.tools.infer_det import main
-
-         if resume:
-             self.resume()
-         config, device, logger, vdl_writer = preprocess(from_dict=self.rset_posix_path())
-         main(config, logger)
-
-     def export_model(self, resume=True):
-         from pyxlpr.ppocr.tools.export_model import main
-
-         if resume:
-             self.resume()
-         config, device, logger, vdl_writer = preprocess(from_dict=self.rset_posix_path())
-         main(config, logger)
-
-     def __3_pretrained(self):
-         """ Wrappers around pretrained-model configuration """
-
-     @classmethod
-     def get_pretrained_model_backbone(cls, name):
-         """ Take only the backbone weights """
-         local_file = XlPath.userdir() / f'.paddleocr/pretrained/{name}.pdparams'
-         url = f'https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/{name}.pdparams'
-         ensure_localfile(local_file, url)
-         return local_file.parent / local_file.stem  # drop the .pdparams suffix
-
-     @classmethod
-     def get_pretrained_model_ppocr(cls, name):
-         local_dir = XlPath.userdir() / f'.paddleocr/pretrained/{name}'
-         url = f'https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/{name}.tar'
-         ensure_localdir(local_dir, url, wrap=-1)
-         return local_dir / 'best_accuracy'  # an ocr model trained by ppocr
-
-     def set_pretrained_model_backbone(self, name):
-         path = self.get_pretrained_model_backbone(name)
-         self.cfg['Global']['pretrained_model'] = path
-
-     def set_pretrained_model_ppocr(self, name):
-         path = self.get_pretrained_model_ppocr(name)
-         self.cfg['Global']['pretrained_model'] = path
-
-     def set_pretrained_infer_model(self, local_dir, url):
-         """ A config parameter I added myself, used during metric """
-         local_dir = XlPath.userdir() / f'.paddleocr/pretrained_infer/{local_dir}'
-         path = ensure_localdir(local_dir, url, wrap=-1)
-         self.cfg['Global']['pretrained_infer_model'] = path
-
-     def set_pretrained_model(self, pretrained, models):
-         """ A further wrapper over the above, to simplify high-level config code
-
-         :param bool|int pretrained:
-             0 do not use pretrained weights
-             1 use the backbone weights
-             2 use the full ppocr weights
-             3 the best model from a previous custom training run
-         :param models: the models to load when pretrained is 1 or 2
-         """
-         if pretrained == 1:
-             self.set_pretrained_model_backbone(models[0])
-         elif pretrained == 2:
-             self.set_pretrained_model_ppocr(models[1])
-         elif pretrained == 3:
-             self.cfg['Global']['pretrained_model'] = self.cfg['Global']['save_model_dir'] / 'best_accuracy'
-
-     def __call__(self, *args, **kwargs):
-         # lets the fire library work with `return self` chains without raising
-         pass
-
-
- class XlDetText(PaddleOcrBaseConfig):
-     """ Config dedicated to text detection models
-     """
-
-     def autolabel(self, datadir, *, model_type=0, **kwargs):
-         """ Pre-annotate detection and recognition
-
-         TODO default model_type to 2 when using det1_mobile?
-
-         """
-         pocr = self.build_ppocr(model_type, **kwargs)
-         pocr.ocr2labelme(datadir, det=True, rec=True)
-
-     def set_deploy_args_det(self):
-         """ Detection-model parameters at deployment are not necessarily the same
-         as for eval; put differently, eval should stay as close as possible to the
-         real deployment configuration
-
-         Many text-detection config files use eval settings that differ from
-         deployment, so this adjusts them automatically to match the deploy setup
-
-         Of course, where an eval setting genuinely beats the deploy one, deployment
-         could adopt the eval configuration instead
-         """
-         for x in self.cfg['Eval']['dataset']['transforms']:
-             if 'DetResizeForTest' in x:
-                 x['DetResizeForTest'] = {'limit_side_len': 960, 'limit_type': 'max'}
-
-     def det1_mobile_init(self, *, pretrained=2):
-         """
-         Official experiment: ic15, 1000 train + 500 val images, batch_size_per_card=8, epoch=1200,
-         i.e. 1.2M total training samples, which divided by the batch size is 150k iters.
-         In the hesuan experiment each iter took about 0.4 s, so a full run is
-         roughly 150000 * 0.4 / 3600 ≈ 17 hours.
-
-         With batchsize=8, hesuan training used about 6.7 GB of GPU memory.
-         More experiment numbers may be added later, but from memory the cost is about the same.
-
-         The deploy files total about 3 MB.
-
-         TODO behavior with more than one datalist entry is untested, but should mostly be fine
-         """
-         # 1 Load the base config
-         cfg = self.config_from_template('det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0')
-         self.set_pretrained_model(pretrained, ['MobileNetV3_large_x0_5_pretrained', 'ch_ppocr_mobile_v2.0_det_train'])
-         self.set_deploy_args_det()
-
-         # 2 Also provide a deploy model for the pretrained weights, for later metric analysis
-         infer_model_url = 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar'
-         self.set_pretrained_infer_model('ch_ppocr_mobile_v2.0_det_infer', infer_model_url)
-
-     def det1_server_init(self, *, pretrained=2):
-         """
-         Training uses 10.2 GB of GPU memory.
-
-         The deploy files total about 47 MB.
-         """
-         # 1 Load the base config
-         cfg = self.config_from_template('det/ch_ppocr_v2.0/ch_det_res18_db_v2.0')
-         self.set_pretrained_model(pretrained, ['ResNet18_vd_pretrained', 'ch_ppocr_server_v2.0_det_train'])
-         self.set_deploy_args_det()
-
-         # 2 Also provide a deploy model for the pretrained weights, for later metric analysis
-         infer_model_url = 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_infer.tar'
-         self.set_pretrained_infer_model('ch_ppocr_server_v2.0_det_infer', infer_model_url)
-
-     def det2_init(self, *, pretrained=1):
-         """ PP-OCRv2, released 2021-09-07.
-         I have not test-run it yet, so this configuration may not be right.
-
-         2022-02-23 Wed 18:11: it runs now, but something is still off; the metric results are puzzling.
-         """
-         cfg = self.config_from_template('det/ch_PP-OCRv2/ch_PP-OCRv2_det_distill')
-         if pretrained:
-             x = cfg['Architecture']['Models']
-
-             # self.set_pretrained_model_ppocr('ch_PP-OCRv2_det_distill_train')
-             x['Student']['pretrained'] = self.get_pretrained_model_backbone('MobileNetV3_large_x0_5_pretrained')
-             # x['Student']['pretrained'] = self.get_pretrained_model_ppocr('ch_PP-OCRv2_det_distill_train')
-             x['Teacher']['pretrained'] = self.get_pretrained_model_ppocr('ch_ppocr_server_v2.0_det_train')
-
-         self.set_deploy_args_det()
-
-         infer_model_url = 'https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar'
-         self.set_pretrained_infer_model('ch_PP-OCRv2_det_infer', infer_model_url)
-
-         return self
-
-     def build_ppocr(self, model_type=2, **kwargs):
-         """ Get the interface class used for deployment:
-         export the deploy model and load it
-
-         :param model_type:
-             0, the original PaddleOcr
-             1, the deploy files bundled with the config (requires the Global.pretrained_infer_model parameter)
-             2, the finetuned model
-         :param kwargs: extra detection-model parameters, e.g. the common det_db_unclip_ratio=1.5
-         """
-         from pyxlpr.paddleocr import PaddleOCR
-
-         if model_type == 0:
-             ppocr = PaddleOCR.build_ppocr(**kwargs)
-         elif model_type == 1:
-             d = self.cfg['Global']['pretrained_infer_model']
-             if not d:
-                 return {}
-             ppocr = PaddleOCR.build_ppocr(det_model_dir=d, **kwargs)
-         else:
-             self.export_model(True)
-             ppocr = PaddleOCR.build_ppocr(det_model_dir=self.cfg['Global']['save_inference_dir'], **kwargs)
-
-         return ppocr
-
-     def _build_dataset(self, config, logger, dataset_mode='Eval'):
-         from pyxlpr.ppocr.data import build_dataset
-         # Note the dataset switch differs a bit from PaddleOCRConfig.eval here,
-         # because deployment has to replace the transforms as well
-         src = config[dataset_mode]['dataset']
-         config['Eval']['dataset'] = {'name': src['name'],
-                                      'data_dir': src['data_dir'],
-                                      'data_list': src['data_list'],
-                                      'transforms': [{'DetLabelEncode': None}]}
-         dataset = build_dataset(config, 'Eval', logger)
-         return config, dataset
-
-     def eval_deploy(self, model_type=2, dataset_mode='Eval', **kwargs):
-         ppocr = self.build_ppocr(model_type, **kwargs)
-         config, device, logger, vdl_writer = preprocess(from_dict=self.rset_posix_path())
-         config, dataset = self._build_dataset(config, logger, dataset_mode)
-         metric = ppocr.det_metric(dataset)
-         logger.info(str(metric))
-         return metric
-
-     def metric(self, *, print_mode=False):
-         """ Produce a combined evaluation report, typically like:
-             type            train_dataset   eval_dataset
-             ①PaddleOCR*     32.35*56        100.0*190
-             ②pretrained     17.57*43        50.0*22
-             ③pretrained*    17.57*184       50.0*192
-             ④finetune       93.05*49        100.0*20
-             ⑤finetune*      93.05*173       100.0*164
-
-         A few regularities:
-         1. Accuracy ②=③, speed ③>②. If the accuracies differ, the official
-            pretrained model and deploy files are probably inconsistent.
-         2. Accuracy ④=⑤, speed ⑤>④. If the accuracies differ, eval and deployment
-            probably process images differently; det had exactly this problem
-            (different processed image sizes), fixed via set_deploy_args_det.
-         3. Setting those two eval-stage rows aside, this really compares models ①③⑤:
-            ① the ready-made PaddleOCR model is usually better than ③ the open-source
-            pretrained one and worse than ⑤ the customized one,
-            i.e. accuracy: ③ < ① < ⑤
-         """
-         import pandas as pd
-         from pyxllib.algo.stat import xlpivot
-
-         # 1 Collect the results of each model
-         eval_list = []
-
-         def core(title, eval_func):
-             for dataset in ['a、Train', 'b、Eval']:
-                 m = eval_func(dataset[2:])  # m, metric
-                 m = {k: (round_int(v) if k in ('fps', 'total_frame') else round(v * 100, 2)) for k, v in m.items()}
-                 m2 = {'model_type': title, 'dataset': dataset}
-                 m2.update(m)
-                 eval_list.append(m2)
-
-         core('①PaddleOCR*', lambda m: self.eval_deploy(model_type=0, dataset_mode=m))
-         core('②pretrained', lambda m: self.eval(resume=False, dataset_mode=m))
-         core('③pretrained*', lambda m: self.eval_deploy(model_type=1, dataset_mode=m))
-         core('④finetune', lambda m: self.eval(resume=True, dataset_mode=m))
-         core('⑤finetune*', lambda m: self.eval_deploy(model_type=2, dataset_mode=m))
-
-         # 2 The final summary table
-         df = pd.DataFrame.from_records(eval_list)
-         outfile = self.cfg['Global']['save_model_dir'] / f'results/metric.html'
-         os.makedirs(outfile.parent, exist_ok=True)
-
-         def func(items):
-             x = items.iloc[0]
-             return f'{x["precision"]:.0f},{x["recall"]:.0f},{x["hmean"]:.2f},{x["fps"]}'
-
-         df2 = xlpivot(df, ['model_type'], ['dataset', 'total_frame'], {'precision,recall,hmean,fps': func})
-         stat_html = df2.to_html()
-         stat_html = stat_html.replace('<th></th>', f'<th>{sys.argv[2]}</th>', 1)
-         outfile.write_text(stat_html)
-
-         if 'metric' in sys.argv:
-             print(df2)
-             return
-
-         if print_mode:
-             print(df2)
-
-         return df
-
-     def create_visual_results(self, *, model_type=2, max_samples=None, **kwargs):
-         """ Generate visualization results into a directory
-
-         :param max_samples: cap on the number of visualization images; often only a few samples are needed
-
-         [Workflow] The basic idea is to convert the data to coco format and then use
-         the coco interfaces; a good test of whether my earlier interfaces hold up:
-         1. Initialize the specified ppocr
-         2. Generate a set of detection results with ppocr
-         3. Compare against gt to produce a coco dataset
-         4. Generate the coco visualizations
-         5. Generate the coco analysis spreadsheet
-         """
-         import PIL.Image
-         from pyxlpr.data.coco import CocoGtData, CocoMatch
-
-         ppocr = self.build_ppocr(model_type, **kwargs)
-         for dataset_mode in ['Train', 'Eval']:  # generate results for both sets, in two separate directories
-             gt = {'images': [],
-                   'annotations': [],
-                   'categories': CocoGtData.gen_categories(['text'])}
-             dt = []
-             k = 1
-
-             config, device, logger, vdl_writer = preprocess(from_dict=self.rset_posix_path())
-             config, dataset = self._build_dataset(config, logger, dataset_mode)
-             out_dir = self.cfg['Global']['save_model_dir'] / f'results/{dataset_mode}'
-             data_dir = self.cfg['Eval']['dataset']['data_dir']
-             for img_id, x in enumerate(dataset, start=1):
-                 if max_samples and img_id > max_samples:
-                     break
-
-                 # 1 Copy the image into the relative directory
-                 src_img_path = x['img_path']
-                 rel_img_path = XlPath(src_img_path).relpath(data_dir)
-                 dst_img_path = out_dir / rel_img_path
-                 os.makedirs(dst_img_path.parent, exist_ok=True)
-                 if not dst_img_path.is_file():
-                     shutil.copy2(src_img_path, dst_img_path)
-
-                 # 2 Generate the image's gt annotations
-                 w, h = PIL.Image.open(str(dst_img_path)).size
-                 gt['images'].append(CocoGtData.gen_image(img_id, rel_img_path, h, w))
-                 for p in x['polys']:
-                     gt['annotations'].append(
-                         CocoGtData.gen_annotation(id=k, image_id=img_id, points=p, text=x['texts']))
-                     k += 1
-
-                 # 3 Generate the dt annotations
-                 img = xlcv.read_from_buffer(x['image'])
-                 for p in ppocr.ocr(img, rec=False):
-                     dt.append({'image_id': img_id, 'category_id': 1, 'segmentation': np.array(p).reshape([1, -1]),
-                                'bbox': ltrb2xywh(rect_bounds(p)), 'score': 1.0})
-
-             cm = CocoMatch(gt, dt)
-             cm.to_labelme_match(out_dir, segmentation=True)
-             cm.to_excel(out_dir / 'cocomatch.xlsx')
-
-     def __config_demo(self):
-         """ Common configuration examples """
-
-     def set_xllabelme_dataset(self, data_dir, ratio_list):
-         """ Configure xllabelme-format text detection annotation data
-
-         A simple dataset convention of my own design
-
-         :param data_dir: data root directory
-         :param list[float, float] ratio_list: fractions for the training and validation sets
-             values may be negative, meaning take from the end; a random seed is set
-             underneath, so the selected files are fixed across runs.
-             For small datasets [0.9, -0.1] is generally recommended; for larger ones [0.8, -0.2].
-         """
-         self.set_xlsimpledataset('Train', data_dir, [{'type': 'labelme_det', 'ratio': ratio_list[0]}])
-         self.set_xlsimpledataset('Eval', data_dir, [{'type': 'labelme_det', 'ratio': ratio_list[1]}])
-
-     def det1_mobile_raw(self):
-         """ Config example using paddle's native data format """
-         self.det1_mobile_init(pretrained=2)  # base config
-         self.set_save_dir('train/det1_mobile_raw')  # where models are saved
-         self.set_simpledataset('Train', 'data', ['data/ppdet_train.txt'])
-         self.set_simpledataset('Eval', 'data', ['data/ppdet_val.txt'])
-         self.set_iter_num(150000)
-         return self
-
-     def det1_mobile(self):
-         """ Detection training on labelme-format annotations """
-         self.det1_mobile_init(pretrained=2)  # base config
-         self.set_save_dir('train/det1_mobile')  # where models are saved
-         self.set_xllabelme_dataset('data', [0.9, -0.1])  # datasets
-         self.set_iter_num(150000)  # iteration count
-         return self
-
-     def det1_server(self):
-         self.det1_server_init(pretrained=2)  # base config
-         self.set_save_dir('train/det1_server')  # where models are saved
-         self.set_xllabelme_dataset('data', [0.9, -0.1])  # datasets
-         self.set_iter_num(150000)  # iteration count
-         return self
-
-
- class XlRec(PaddleOcrBaseConfig):
-     """ Config dedicated to text recognition models
-     """
-
-     def stat_texts(self, xllabelme_data_dir, *, ref_dict='ppocr_keys_v1.txt'):
-         """ Inspect the sentences and characters appearing in the annotations (statistics texts)
-
-         :param xllabelme_data_dir: directory of xllabelme-format annotation data
-         :param ref_dict: reference dictionary file
-         """
-         from collections import Counter
-         from pyxllib.algo.pupil import ValuesStat
-         from pyxllib.algo.stat import dataframes_to_excel
-         from pyxlpr.ppocr.utils import get_dict_content
-
-         root = XlPath(xllabelme_data_dir)
-         outfile = root.parent / 'stat_texts.xlsx'
-
-         # 1 Read the data
-         sentances_counter = Counter()  # each sentence and how many times it occurs
-         for f in root.rglob('*.json'):
-             for sp in f.read_json()['shapes']:
-                 attr = json.loads(sp['label'])
-                 if 'text' in attr:
-                     text = attr['text']
-                     sentances_counter[text] += 1
-
-         # 2 Count frequencies: sentances per sentence, words per word, chars per character
-         chars_counter = Counter()
-         words_counter = Counter()
-         for sentance, cnt in sentances_counter.items():
-             for word in sentance.split():  # split on spaces for now; strictly, Chinese is segmented more accurately with jieba
-                 words_counter[word] += cnt
-             for ch in sentance:  # count each character's occurrences, including spaces
-                 chars_counter[ch] += cnt
-
-         # 3 Convert to DataFrames
-         char_dict = set(get_dict_content(ref_dict).splitlines())
-         ls = []
-         new_chars = []
-         for char, cnt in chars_counter.most_common():
-             ls.append([char, cnt, '' if char in char_dict else 'True'])
-             if char not in char_dict and char != ' ':
-                 new_chars.append(char)
-         chars_df = pd.DataFrame.from_records(ls, columns=['char', 'count', 'new_char'])
-
-         words_df = pd.DataFrame.from_records(words_counter.most_common(), columns=['word', 'count'])
-         sentances_df = pd.DataFrame.from_records([[sentance, cnt, len(sentance)]
-                                                   for sentance, cnt in sentances_counter.most_common()],
-                                                  columns=['sentance', 'count', 'length'])
-
-         # Distribution of sentence lengths
-         ct = Counter()
-         lengths = []
-         for _, row in sentances_df.iterrows():
-             ct[row['length']] += row['count']
-             lengths += [row['length']] * row['count']  # not elegant, but required for compatibility with ValuesStat
-         # ct = sentances_df.groupby('length').sum().to_dict()['count']
-         max_len = max(sentances_df['length'])
-         sentances_length_df = pd.DataFrame.from_records([[i, ct.get(i, 0)] for i in range(max_len + 1)],
-                                                         columns=['length', 'count'])
-
-         # 4 Frequency statistics
-         def summary(title, vals):
-             msg = ValuesStat(vals).summary(['g', '.2f', '.2f', 'g', 'g'])
-             # print(msg)
-             return [title] + re.findall(r':\s+(\S+)', msg)
-
-         print('[stat_texts]')
-         print(f'Output file: {outfile.as_posix()}')
-
-         print(f'{len(new_chars)} characters not in the dictionary: ' + ''.join(new_chars))
-
-         ls = [
-             summary('char freq', chars_df['count']),
-             summary('word freq', words_df['count']),
-             summary('sentence freq', sentances_df['count']),
-             summary('sentence length', lengths),
-         ]
-         df = pd.DataFrame.from_records(ls, columns=['title', 'sum', 'mean±std', 'count', 'min', 'max'])
-         print(df)
-
-         # 5 Write the analysis workbook
-         sheets = {'chars': chars_df, 'words': words_df,
-                   'sentences': sentances_df, 'sentence_lengths': sentances_length_df}
-         dataframes_to_excel(outfile, sheets)
-
-     def create_recdata(self, src, dst, *, print_mode=True, recreate=False):
-         """ Convert xllabelme-format annotations into paddle's recognition data
-         format, i.e. extract text-line data for training a text recognition model
-
-         :param src: xllabelme_data_dir
-         :param dst: root directory of the destination
-         :param recreate: if the destination exists, delete it and regenerate
-
-         Note: this generator is for reference only; it is not especially general
-         """
-         # 0
-         src, dst = XlPath(src), XlPath(dst)
-         if recreate and dst.is_dir():
-             dst.delete()  # delete the destination if it already exists
-
-         # 1 Generate the images
-         chars = set()
-         labels1, labels2 = [], []
-         for f in tqdm(list(src.rglob('*.json')), desc='extracting text lines', disable=not print_mode):
-             data = f.read_json()
-             impath = f.parent / data['imagePath']
-             im = xlcv.read(impath)
-             for i, sp in enumerate(data['shapes'], start=1):
-                 # group a: crop each text line by its bounding rectangle
-                 name = f'imgs/{f.stem}_r{i:03}.jpg'
-                 text = json.loads(sp['label'])['text']
-                 chars |= set(text)
-                 xlcv.write(xlcv.get_sub(im, sp['points']), dst / name)
-                 labels1.append(f'{name}\t{text}')
-
-                 # group b: crop each text line with a warp transform for rectification
-                 name = f'imgs/{f.stem}_w{i:03}.jpg'
-                 xlcv.write(xlcv.get_sub(im, sp['points'], warp_quad=True), dst / name)
-                 labels2.append(f'{name}\t{text}')
-
-         # 2 The dictionary file
-         chars -= set(' \n\t')  # whitespace characters have to be removed
-         (dst / 'char_dict.txt').write_text('\n'.join(sorted(chars)))
-
-         # 3 The label files
-         (dst / 'labels_rect.txt').write_text('\n'.join(labels1))
-         (dst / 'labels_warp.txt').write_text('\n'.join(labels2))
-         (dst / 'labels_total.txt').write_text('\n'.join(labels1 + labels2))
-
-         return self
-
-     def set_rec_dataset(self, data_dir, label_file_list):
-         """ Configure the recognition dataset
-
-         :param data_dir: root directory of the data
-         :param list[str|list] label_file_list: list of label files
-             str: relative path of a label file
-             list[str, float]: the label file path plus a ratio giving the fraction of samples to take
-
-         TODO an integrated setter would be nice, but no good design yet; splitting
-         the data by hand and configuring it in autoset is not much trouble either
-         """
-
-         # self.cfg['Train']['dataset']['data_dir'] = Paths.eleclabel / 'recdata'
-         # self.cfg['Train']['dataset']['label_file_list'] = [Paths.eleclabel / 'recdata/labels_ab.txt']
-         # self.cfg['Eval']['dataset']['data_dir'] = Paths.eleclabel / 'recdata'
-         # self.cfg['Eval']['dataset']['label_file_list'] = [Paths.eleclabel / 'recdata/labels_ab.txt']
-
-         raise NotImplementedError
-
-
- class XlCls:
-     """ Classification models; this basically uses native paddlepaddle rather than the stronger paddleclas """
-
-
- class XlOcr:
-     """ Wraps the text-tech stack: standardized workflows for detection and recognition """
-
-     def __init__(self, root):
-         self.root = XlPath(root)  # project root directory
-
-     def step1_autolabel(self):
-         """ Pre-annotate detection and recognition """
-
-     def step2_refinelabel(self):
-         """ Manually refine the labelme annotations """
-
-     def step3_det(self):
-         """ Train the detection model """