pyxllib 0.3.96__py3-none-any.whl → 0.3.197__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (306)
  1. pyxllib/algo/geo.py +12 -0
  2. pyxllib/algo/intervals.py +1 -1
  3. pyxllib/algo/matcher.py +78 -0
  4. pyxllib/algo/pupil.py +187 -19
  5. pyxllib/algo/specialist.py +2 -1
  6. pyxllib/algo/stat.py +38 -2
  7. {pyxlpr → pyxllib/autogui}/__init__.py +1 -1
  8. pyxllib/autogui/activewin.py +246 -0
  9. pyxllib/autogui/all.py +9 -0
  10. pyxllib/{ext/autogui → autogui}/autogui.py +40 -11
  11. pyxllib/autogui/uiautolib.py +362 -0
  12. pyxllib/autogui/wechat.py +827 -0
  13. pyxllib/autogui/wechat_msg.py +421 -0
  14. pyxllib/autogui/wxautolib.py +84 -0
  15. pyxllib/cv/slidercaptcha.py +137 -0
  16. pyxllib/data/echarts.py +123 -12
  17. pyxllib/data/jsonlib.py +89 -0
  18. pyxllib/data/pglib.py +514 -30
  19. pyxllib/data/sqlite.py +231 -4
  20. pyxllib/ext/JLineViewer.py +14 -1
  21. pyxllib/ext/drissionlib.py +277 -0
  22. pyxllib/ext/kq5034lib.py +0 -1594
  23. pyxllib/ext/robustprocfile.py +497 -0
  24. pyxllib/ext/unixlib.py +6 -5
  25. pyxllib/ext/utools.py +108 -95
  26. pyxllib/ext/webhook.py +32 -14
  27. pyxllib/ext/wjxlib.py +88 -0
  28. pyxllib/ext/wpsapi.py +124 -0
  29. pyxllib/ext/xlwork.py +9 -0
  30. pyxllib/ext/yuquelib.py +1003 -71
  31. pyxllib/file/docxlib.py +1 -1
  32. pyxllib/file/libreoffice.py +165 -0
  33. pyxllib/file/movielib.py +9 -0
  34. pyxllib/file/packlib/__init__.py +112 -75
  35. pyxllib/file/pdflib.py +1 -1
  36. pyxllib/file/pupil.py +1 -1
  37. pyxllib/file/specialist/dirlib.py +1 -1
  38. pyxllib/file/specialist/download.py +10 -3
  39. pyxllib/file/specialist/filelib.py +266 -55
  40. pyxllib/file/xlsxlib.py +205 -50
  41. pyxllib/file/xlsyncfile.py +341 -0
  42. pyxllib/prog/cachetools.py +64 -0
  43. pyxllib/prog/filelock.py +42 -0
  44. pyxllib/prog/multiprogs.py +940 -0
  45. pyxllib/prog/newbie.py +9 -2
  46. pyxllib/prog/pupil.py +129 -60
  47. pyxllib/prog/specialist/__init__.py +176 -2
  48. pyxllib/prog/specialist/bc.py +5 -2
  49. pyxllib/prog/specialist/browser.py +11 -2
  50. pyxllib/prog/specialist/datetime.py +68 -0
  51. pyxllib/prog/specialist/tictoc.py +12 -13
  52. pyxllib/prog/specialist/xllog.py +5 -5
  53. pyxllib/prog/xlosenv.py +7 -0
  54. pyxllib/text/airscript.js +744 -0
  55. pyxllib/text/charclasslib.py +17 -5
  56. pyxllib/text/jiebalib.py +6 -3
  57. pyxllib/text/jinjalib.py +32 -0
  58. pyxllib/text/jsa_ai_prompt.md +271 -0
  59. pyxllib/text/jscode.py +159 -4
  60. pyxllib/text/nestenv.py +1 -1
  61. pyxllib/text/newbie.py +12 -0
  62. pyxllib/text/pupil/common.py +26 -0
  63. pyxllib/text/specialist/ptag.py +2 -2
  64. pyxllib/text/templates/echart_base.html +11 -0
  65. pyxllib/text/templates/highlight_code.html +17 -0
  66. pyxllib/text/templates/latex_editor.html +103 -0
  67. pyxllib/text/xmllib.py +76 -14
  68. pyxllib/xl.py +2 -1
  69. pyxllib-0.3.197.dist-info/METADATA +48 -0
  70. pyxllib-0.3.197.dist-info/RECORD +126 -0
  71. {pyxllib-0.3.96.dist-info → pyxllib-0.3.197.dist-info}/WHEEL +1 -2
  72. pyxllib/ext/autogui/__init__.py +0 -8
  73. pyxllib-0.3.96.dist-info/METADATA +0 -51
  74. pyxllib-0.3.96.dist-info/RECORD +0 -333
  75. pyxllib-0.3.96.dist-info/top_level.txt +0 -2
  76. pyxlpr/ai/__init__.py +0 -5
  77. pyxlpr/ai/clientlib.py +0 -1281
  78. pyxlpr/ai/specialist.py +0 -286
  79. pyxlpr/ai/torch_app.py +0 -172
  80. pyxlpr/ai/xlpaddle.py +0 -655
  81. pyxlpr/ai/xltorch.py +0 -705
  82. pyxlpr/data/__init__.py +0 -11
  83. pyxlpr/data/coco.py +0 -1325
  84. pyxlpr/data/datacls.py +0 -365
  85. pyxlpr/data/datasets.py +0 -200
  86. pyxlpr/data/gptlib.py +0 -1291
  87. pyxlpr/data/icdar/__init__.py +0 -96
  88. pyxlpr/data/icdar/deteval.py +0 -377
  89. pyxlpr/data/icdar/icdar2013.py +0 -341
  90. pyxlpr/data/icdar/iou.py +0 -340
  91. pyxlpr/data/icdar/rrc_evaluation_funcs_1_1.py +0 -463
  92. pyxlpr/data/imtextline.py +0 -473
  93. pyxlpr/data/labelme.py +0 -866
  94. pyxlpr/data/removeline.py +0 -179
  95. pyxlpr/data/specialist.py +0 -57
  96. pyxlpr/eval/__init__.py +0 -85
  97. pyxlpr/paddleocr.py +0 -776
  98. pyxlpr/ppocr/__init__.py +0 -15
  99. pyxlpr/ppocr/configs/rec/multi_language/generate_multi_language_configs.py +0 -226
  100. pyxlpr/ppocr/data/__init__.py +0 -135
  101. pyxlpr/ppocr/data/imaug/ColorJitter.py +0 -26
  102. pyxlpr/ppocr/data/imaug/__init__.py +0 -67
  103. pyxlpr/ppocr/data/imaug/copy_paste.py +0 -170
  104. pyxlpr/ppocr/data/imaug/east_process.py +0 -437
  105. pyxlpr/ppocr/data/imaug/gen_table_mask.py +0 -244
  106. pyxlpr/ppocr/data/imaug/iaa_augment.py +0 -114
  107. pyxlpr/ppocr/data/imaug/label_ops.py +0 -789
  108. pyxlpr/ppocr/data/imaug/make_border_map.py +0 -184
  109. pyxlpr/ppocr/data/imaug/make_pse_gt.py +0 -106
  110. pyxlpr/ppocr/data/imaug/make_shrink_map.py +0 -126
  111. pyxlpr/ppocr/data/imaug/operators.py +0 -433
  112. pyxlpr/ppocr/data/imaug/pg_process.py +0 -906
  113. pyxlpr/ppocr/data/imaug/randaugment.py +0 -143
  114. pyxlpr/ppocr/data/imaug/random_crop_data.py +0 -239
  115. pyxlpr/ppocr/data/imaug/rec_img_aug.py +0 -533
  116. pyxlpr/ppocr/data/imaug/sast_process.py +0 -777
  117. pyxlpr/ppocr/data/imaug/text_image_aug/__init__.py +0 -17
  118. pyxlpr/ppocr/data/imaug/text_image_aug/augment.py +0 -120
  119. pyxlpr/ppocr/data/imaug/text_image_aug/warp_mls.py +0 -168
  120. pyxlpr/ppocr/data/lmdb_dataset.py +0 -115
  121. pyxlpr/ppocr/data/pgnet_dataset.py +0 -104
  122. pyxlpr/ppocr/data/pubtab_dataset.py +0 -107
  123. pyxlpr/ppocr/data/simple_dataset.py +0 -372
  124. pyxlpr/ppocr/losses/__init__.py +0 -61
  125. pyxlpr/ppocr/losses/ace_loss.py +0 -52
  126. pyxlpr/ppocr/losses/basic_loss.py +0 -135
  127. pyxlpr/ppocr/losses/center_loss.py +0 -88
  128. pyxlpr/ppocr/losses/cls_loss.py +0 -30
  129. pyxlpr/ppocr/losses/combined_loss.py +0 -67
  130. pyxlpr/ppocr/losses/det_basic_loss.py +0 -208
  131. pyxlpr/ppocr/losses/det_db_loss.py +0 -80
  132. pyxlpr/ppocr/losses/det_east_loss.py +0 -63
  133. pyxlpr/ppocr/losses/det_pse_loss.py +0 -149
  134. pyxlpr/ppocr/losses/det_sast_loss.py +0 -121
  135. pyxlpr/ppocr/losses/distillation_loss.py +0 -272
  136. pyxlpr/ppocr/losses/e2e_pg_loss.py +0 -140
  137. pyxlpr/ppocr/losses/kie_sdmgr_loss.py +0 -113
  138. pyxlpr/ppocr/losses/rec_aster_loss.py +0 -99
  139. pyxlpr/ppocr/losses/rec_att_loss.py +0 -39
  140. pyxlpr/ppocr/losses/rec_ctc_loss.py +0 -44
  141. pyxlpr/ppocr/losses/rec_enhanced_ctc_loss.py +0 -70
  142. pyxlpr/ppocr/losses/rec_nrtr_loss.py +0 -30
  143. pyxlpr/ppocr/losses/rec_sar_loss.py +0 -28
  144. pyxlpr/ppocr/losses/rec_srn_loss.py +0 -47
  145. pyxlpr/ppocr/losses/table_att_loss.py +0 -109
  146. pyxlpr/ppocr/metrics/__init__.py +0 -44
  147. pyxlpr/ppocr/metrics/cls_metric.py +0 -45
  148. pyxlpr/ppocr/metrics/det_metric.py +0 -82
  149. pyxlpr/ppocr/metrics/distillation_metric.py +0 -73
  150. pyxlpr/ppocr/metrics/e2e_metric.py +0 -86
  151. pyxlpr/ppocr/metrics/eval_det_iou.py +0 -274
  152. pyxlpr/ppocr/metrics/kie_metric.py +0 -70
  153. pyxlpr/ppocr/metrics/rec_metric.py +0 -75
  154. pyxlpr/ppocr/metrics/table_metric.py +0 -50
  155. pyxlpr/ppocr/modeling/architectures/__init__.py +0 -32
  156. pyxlpr/ppocr/modeling/architectures/base_model.py +0 -88
  157. pyxlpr/ppocr/modeling/architectures/distillation_model.py +0 -60
  158. pyxlpr/ppocr/modeling/backbones/__init__.py +0 -54
  159. pyxlpr/ppocr/modeling/backbones/det_mobilenet_v3.py +0 -268
  160. pyxlpr/ppocr/modeling/backbones/det_resnet_vd.py +0 -246
  161. pyxlpr/ppocr/modeling/backbones/det_resnet_vd_sast.py +0 -285
  162. pyxlpr/ppocr/modeling/backbones/e2e_resnet_vd_pg.py +0 -265
  163. pyxlpr/ppocr/modeling/backbones/kie_unet_sdmgr.py +0 -186
  164. pyxlpr/ppocr/modeling/backbones/rec_mobilenet_v3.py +0 -138
  165. pyxlpr/ppocr/modeling/backbones/rec_mv1_enhance.py +0 -258
  166. pyxlpr/ppocr/modeling/backbones/rec_nrtr_mtb.py +0 -48
  167. pyxlpr/ppocr/modeling/backbones/rec_resnet_31.py +0 -210
  168. pyxlpr/ppocr/modeling/backbones/rec_resnet_aster.py +0 -143
  169. pyxlpr/ppocr/modeling/backbones/rec_resnet_fpn.py +0 -307
  170. pyxlpr/ppocr/modeling/backbones/rec_resnet_vd.py +0 -286
  171. pyxlpr/ppocr/modeling/heads/__init__.py +0 -54
  172. pyxlpr/ppocr/modeling/heads/cls_head.py +0 -52
  173. pyxlpr/ppocr/modeling/heads/det_db_head.py +0 -118
  174. pyxlpr/ppocr/modeling/heads/det_east_head.py +0 -121
  175. pyxlpr/ppocr/modeling/heads/det_pse_head.py +0 -37
  176. pyxlpr/ppocr/modeling/heads/det_sast_head.py +0 -128
  177. pyxlpr/ppocr/modeling/heads/e2e_pg_head.py +0 -253
  178. pyxlpr/ppocr/modeling/heads/kie_sdmgr_head.py +0 -206
  179. pyxlpr/ppocr/modeling/heads/multiheadAttention.py +0 -163
  180. pyxlpr/ppocr/modeling/heads/rec_aster_head.py +0 -393
  181. pyxlpr/ppocr/modeling/heads/rec_att_head.py +0 -202
  182. pyxlpr/ppocr/modeling/heads/rec_ctc_head.py +0 -88
  183. pyxlpr/ppocr/modeling/heads/rec_nrtr_head.py +0 -826
  184. pyxlpr/ppocr/modeling/heads/rec_sar_head.py +0 -402
  185. pyxlpr/ppocr/modeling/heads/rec_srn_head.py +0 -280
  186. pyxlpr/ppocr/modeling/heads/self_attention.py +0 -406
  187. pyxlpr/ppocr/modeling/heads/table_att_head.py +0 -246
  188. pyxlpr/ppocr/modeling/necks/__init__.py +0 -32
  189. pyxlpr/ppocr/modeling/necks/db_fpn.py +0 -111
  190. pyxlpr/ppocr/modeling/necks/east_fpn.py +0 -188
  191. pyxlpr/ppocr/modeling/necks/fpn.py +0 -138
  192. pyxlpr/ppocr/modeling/necks/pg_fpn.py +0 -314
  193. pyxlpr/ppocr/modeling/necks/rnn.py +0 -92
  194. pyxlpr/ppocr/modeling/necks/sast_fpn.py +0 -284
  195. pyxlpr/ppocr/modeling/necks/table_fpn.py +0 -110
  196. pyxlpr/ppocr/modeling/transforms/__init__.py +0 -28
  197. pyxlpr/ppocr/modeling/transforms/stn.py +0 -135
  198. pyxlpr/ppocr/modeling/transforms/tps.py +0 -308
  199. pyxlpr/ppocr/modeling/transforms/tps_spatial_transformer.py +0 -156
  200. pyxlpr/ppocr/optimizer/__init__.py +0 -61
  201. pyxlpr/ppocr/optimizer/learning_rate.py +0 -228
  202. pyxlpr/ppocr/optimizer/lr_scheduler.py +0 -49
  203. pyxlpr/ppocr/optimizer/optimizer.py +0 -160
  204. pyxlpr/ppocr/optimizer/regularizer.py +0 -52
  205. pyxlpr/ppocr/postprocess/__init__.py +0 -55
  206. pyxlpr/ppocr/postprocess/cls_postprocess.py +0 -33
  207. pyxlpr/ppocr/postprocess/db_postprocess.py +0 -234
  208. pyxlpr/ppocr/postprocess/east_postprocess.py +0 -143
  209. pyxlpr/ppocr/postprocess/locality_aware_nms.py +0 -200
  210. pyxlpr/ppocr/postprocess/pg_postprocess.py +0 -52
  211. pyxlpr/ppocr/postprocess/pse_postprocess/__init__.py +0 -15
  212. pyxlpr/ppocr/postprocess/pse_postprocess/pse/__init__.py +0 -29
  213. pyxlpr/ppocr/postprocess/pse_postprocess/pse/setup.py +0 -14
  214. pyxlpr/ppocr/postprocess/pse_postprocess/pse_postprocess.py +0 -118
  215. pyxlpr/ppocr/postprocess/rec_postprocess.py +0 -654
  216. pyxlpr/ppocr/postprocess/sast_postprocess.py +0 -355
  217. pyxlpr/ppocr/tools/__init__.py +0 -14
  218. pyxlpr/ppocr/tools/eval.py +0 -83
  219. pyxlpr/ppocr/tools/export_center.py +0 -77
  220. pyxlpr/ppocr/tools/export_model.py +0 -129
  221. pyxlpr/ppocr/tools/infer/predict_cls.py +0 -151
  222. pyxlpr/ppocr/tools/infer/predict_det.py +0 -300
  223. pyxlpr/ppocr/tools/infer/predict_e2e.py +0 -169
  224. pyxlpr/ppocr/tools/infer/predict_rec.py +0 -414
  225. pyxlpr/ppocr/tools/infer/predict_system.py +0 -204
  226. pyxlpr/ppocr/tools/infer/utility.py +0 -629
  227. pyxlpr/ppocr/tools/infer_cls.py +0 -83
  228. pyxlpr/ppocr/tools/infer_det.py +0 -134
  229. pyxlpr/ppocr/tools/infer_e2e.py +0 -122
  230. pyxlpr/ppocr/tools/infer_kie.py +0 -153
  231. pyxlpr/ppocr/tools/infer_rec.py +0 -146
  232. pyxlpr/ppocr/tools/infer_table.py +0 -107
  233. pyxlpr/ppocr/tools/program.py +0 -596
  234. pyxlpr/ppocr/tools/test_hubserving.py +0 -117
  235. pyxlpr/ppocr/tools/train.py +0 -163
  236. pyxlpr/ppocr/tools/xlprog.py +0 -748
  237. pyxlpr/ppocr/utils/EN_symbol_dict.txt +0 -94
  238. pyxlpr/ppocr/utils/__init__.py +0 -24
  239. pyxlpr/ppocr/utils/dict/ar_dict.txt +0 -117
  240. pyxlpr/ppocr/utils/dict/arabic_dict.txt +0 -162
  241. pyxlpr/ppocr/utils/dict/be_dict.txt +0 -145
  242. pyxlpr/ppocr/utils/dict/bg_dict.txt +0 -140
  243. pyxlpr/ppocr/utils/dict/chinese_cht_dict.txt +0 -8421
  244. pyxlpr/ppocr/utils/dict/cyrillic_dict.txt +0 -163
  245. pyxlpr/ppocr/utils/dict/devanagari_dict.txt +0 -167
  246. pyxlpr/ppocr/utils/dict/en_dict.txt +0 -63
  247. pyxlpr/ppocr/utils/dict/fa_dict.txt +0 -136
  248. pyxlpr/ppocr/utils/dict/french_dict.txt +0 -136
  249. pyxlpr/ppocr/utils/dict/german_dict.txt +0 -143
  250. pyxlpr/ppocr/utils/dict/hi_dict.txt +0 -162
  251. pyxlpr/ppocr/utils/dict/it_dict.txt +0 -118
  252. pyxlpr/ppocr/utils/dict/japan_dict.txt +0 -4399
  253. pyxlpr/ppocr/utils/dict/ka_dict.txt +0 -153
  254. pyxlpr/ppocr/utils/dict/korean_dict.txt +0 -3688
  255. pyxlpr/ppocr/utils/dict/latin_dict.txt +0 -185
  256. pyxlpr/ppocr/utils/dict/mr_dict.txt +0 -153
  257. pyxlpr/ppocr/utils/dict/ne_dict.txt +0 -153
  258. pyxlpr/ppocr/utils/dict/oc_dict.txt +0 -96
  259. pyxlpr/ppocr/utils/dict/pu_dict.txt +0 -130
  260. pyxlpr/ppocr/utils/dict/rs_dict.txt +0 -91
  261. pyxlpr/ppocr/utils/dict/rsc_dict.txt +0 -134
  262. pyxlpr/ppocr/utils/dict/ru_dict.txt +0 -125
  263. pyxlpr/ppocr/utils/dict/ta_dict.txt +0 -128
  264. pyxlpr/ppocr/utils/dict/table_dict.txt +0 -277
  265. pyxlpr/ppocr/utils/dict/table_structure_dict.txt +0 -2759
  266. pyxlpr/ppocr/utils/dict/te_dict.txt +0 -151
  267. pyxlpr/ppocr/utils/dict/ug_dict.txt +0 -114
  268. pyxlpr/ppocr/utils/dict/uk_dict.txt +0 -142
  269. pyxlpr/ppocr/utils/dict/ur_dict.txt +0 -137
  270. pyxlpr/ppocr/utils/dict/xi_dict.txt +0 -110
  271. pyxlpr/ppocr/utils/dict90.txt +0 -90
  272. pyxlpr/ppocr/utils/e2e_metric/Deteval.py +0 -574
  273. pyxlpr/ppocr/utils/e2e_metric/polygon_fast.py +0 -83
  274. pyxlpr/ppocr/utils/e2e_utils/extract_batchsize.py +0 -87
  275. pyxlpr/ppocr/utils/e2e_utils/extract_textpoint_fast.py +0 -457
  276. pyxlpr/ppocr/utils/e2e_utils/extract_textpoint_slow.py +0 -592
  277. pyxlpr/ppocr/utils/e2e_utils/pgnet_pp_utils.py +0 -162
  278. pyxlpr/ppocr/utils/e2e_utils/visual.py +0 -162
  279. pyxlpr/ppocr/utils/en_dict.txt +0 -95
  280. pyxlpr/ppocr/utils/gen_label.py +0 -81
  281. pyxlpr/ppocr/utils/ic15_dict.txt +0 -36
  282. pyxlpr/ppocr/utils/iou.py +0 -54
  283. pyxlpr/ppocr/utils/logging.py +0 -69
  284. pyxlpr/ppocr/utils/network.py +0 -84
  285. pyxlpr/ppocr/utils/ppocr_keys_v1.txt +0 -6623
  286. pyxlpr/ppocr/utils/profiler.py +0 -110
  287. pyxlpr/ppocr/utils/save_load.py +0 -150
  288. pyxlpr/ppocr/utils/stats.py +0 -72
  289. pyxlpr/ppocr/utils/utility.py +0 -80
  290. pyxlpr/ppstructure/__init__.py +0 -13
  291. pyxlpr/ppstructure/predict_system.py +0 -187
  292. pyxlpr/ppstructure/table/__init__.py +0 -13
  293. pyxlpr/ppstructure/table/eval_table.py +0 -72
  294. pyxlpr/ppstructure/table/matcher.py +0 -192
  295. pyxlpr/ppstructure/table/predict_structure.py +0 -136
  296. pyxlpr/ppstructure/table/predict_table.py +0 -221
  297. pyxlpr/ppstructure/table/table_metric/__init__.py +0 -16
  298. pyxlpr/ppstructure/table/table_metric/parallel.py +0 -51
  299. pyxlpr/ppstructure/table/table_metric/table_metric.py +0 -247
  300. pyxlpr/ppstructure/table/tablepyxl/__init__.py +0 -13
  301. pyxlpr/ppstructure/table/tablepyxl/style.py +0 -283
  302. pyxlpr/ppstructure/table/tablepyxl/tablepyxl.py +0 -118
  303. pyxlpr/ppstructure/utility.py +0 -71
  304. pyxlpr/xlai.py +0 -10
  305. /pyxllib/{ext/autogui → autogui}/virtualkey.py +0 -0
  306. {pyxllib-0.3.96.dist-info → pyxllib-0.3.197.dist-info/licenses}/LICENSE +0 -0
pyxlpr/data/labelme.py DELETED
@@ -1,866 +0,0 @@
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- # @Author : 陈坤泽
- # @Email : 877362867@qq.com
- # @Date : 2020/08/15 00:59
-
- import os
- from tqdm import tqdm
- import json
- import ujson
- import copy
- from collections import Counter
-
- import numpy as np
-
- from pyxllib.prog.newbie import round_int
- from pyxllib.prog.pupil import DictTool
- from pyxllib.prog.specialist import get_xllog, Iterate
- from pyxllib.file.specialist import PathGroups, get_encoding, XlPath
- from pyxllib.prog.specialist import mtqdm
- from pyxllib.cv.expert import xlpil
- from pyxllib.algo.geo import ltrb2xywh, rect_bounds, warp_points, resort_quad_points, rect2polygon, get_warp_mat
-
-
- def __0_basic():
-     """ Notes for each sub-module can be written here """
-
-
- class BasicLabelDataset:
-     """ Basic operations for datasets with one annotation file per image """
-
-     def __init__(self, root, relpath2data=None, *, reads=True, prt=False, fltr=None, slt=None, extdata=None):
-         """
-         :param root: root directory of the data
-         :param dict[str, readed_data] relpath2data: {relpath: data1, 'a/1.txt': data2, ...}
-             If no concrete data values are passed in, the data values are initialized automatically from the directory contents
-
-             relpath is the relative path string of the corresponding XlPath annotation file
-             data1, data2 are the parsed annotation data, stored in different formats depending on the case
-                 json files keep the in-memory json object structure
-                 txt files may be parsed into a somewhat structured form
-         :param extdata: extra/extended information to store
-         :param fltr: short for filter, a PathGroups filtering rule, usually used for image matching
-             None: no filter rule; groups are kept even when no file in the slt format exists
-             a string rule such as 'json': uses select_group_which_hassuffix, the group must contain the given suffix
-             judge(k, v): a custom function rule
-         :param slt: short for select, the annotation file suffix to select
-             If slt is passed, this Basic base class only presets the file parameter; the data part is set to None and must be read explicitly later
-
-         >> BasicLabelData('textGroup/aabb', {'a.json': ..., 'a/1.json': ...})
-         >> BasicLabelData('textGroup/aabb', slt='json')
-         >> BasicLabelData('textGroup/aabb', fltr='jpg', slt='json')  # only json files that have a matching jpg image
-         >> BasicLabelData('textGroup/aabb', fltr='jpg|png', slt='json')
-         """
-
-         # 1 basic setup
-         root = XlPath(root)
-         self.root, self.rp2data, self.extdata = root, relpath2data or {}, extdata or {}
-         self.pathgs = None
-
-         if relpath2data is not None or slt is None:
-             return
-
-         # 2 if no default data was given and slt was passed, read annotations via the default file-association rules
-         relpath2data = {}
-         gs = PathGroups.groupby(XlPath(root).rglob_files())
-         if isinstance(fltr, str):
-             gs = gs.select_group_which_hassuffix(fltr)
-         elif callable(fltr):
-             gs = gs.select_group(fltr)
-         self.pathgs = gs
-
-         # 3 read the data
-         for stem, suffixs in tqdm(gs.data.items(), f'{self.__class__.__name__}读取数据', disable=not prt):
-             f = XlPath(stem + f'.{slt}')
-             if reads and f.exists():
-                 # dprint(f)  # an empty json raises: json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
-                 relpath2data[f.relpath(self.root)] = f.read_auto()
-             else:
-                 relpath2data[f.relpath(self.root)] = None
-
-         self.rp2data = relpath2data
-
-     def __len__(self):
-         return len(self.rp2data)
-
-     def read(self, relpath, **kwargs):
-         """
-         :param relpath: must be a slash-style relative path such as 'a/1.txt', 'b/2.json'
-         """
-         self.rp2data[relpath] = (self.root / relpath).read_auto(**kwargs)
-
-     def reads(self, prt=False, **kwargs):
-         """ For performance, initialization does not read the data by default; call reads to actually load it """
-         for k in tqdm(self.rp2data.keys(), f'读取{self.__class__.__name__}数据', disable=not prt):
-             self.rp2data[k] = (self.root / k).read_auto(**kwargs)
-
-     def write(self, relpath, **kwargs):
-         """
-         :param relpath: must be a slash-style relative path such as 'a/1.txt', 'b/2.json'
-         """
-         data = self.rp2data[relpath]
-         file = self.root / relpath
-         if file.is_file():  # if the file exists, keep its original encoding
-             with open(str(file), 'rb') as f:
-                 bstr = f.read()
-             encoding = get_encoding(bstr)
-             kwargs['encoding'] = encoding
-             kwargs['if_exists'] = 'replace'
-             file.write_auto(data, **kwargs)
-         else:  # otherwise just write it out
-             file.write_auto(data, **kwargs)
-
-     def writes(self, *, max_workers=8, print_mode=False, **kwargs):
-         """ Rewrite every annotation file
-
-         Either the data was modified in memory and has to be written back,
-         or the in-memory data was converted from coco or another format and new annotation files have to be generated
-         """
-         mtqdm(lambda x: self.write(x, **kwargs), self.rp2data.keys(), desc=f'{self.__class__.__name__}写入标注数据',
-               max_workers=max_workers, disable=not print_mode)
-
-
- def __1_labelme():
-     """ """
-
-
- # I display these in the order "red, orange, yellow, green, blue, indigo, purple"
- LABEL_COLORMAP7 = [(0, 0, 0), (255, 0, 0), (255, 125, 0), (255, 255, 0),
-                    (0, 255, 0), (0, 0, 255), (0, 255, 255), (255, 0, 255)]
-
-
- def is_labelme_json_data(data):
-     """ Whether data is in the labelme annotation format
-     :param data: dict
-     :return: True or False
-     """
-     if not isinstance(data, dict):
-         return False
-     has_keys = set('version flags shapes imagePath imageData imageHeight imageWidth'.split())
-     return not (has_keys - data.keys())
-
-
- def reduce_labelme_jsonfile(jsonpath):
-     """ Remove imageData """
-     p = str(jsonpath)
-
-     with open(p, 'rb') as f:
-         bstr = f.read()
-     encoding = get_encoding(bstr)
-     data = ujson.loads(bstr.decode(encoding=encoding))
-
-     if is_labelme_json_data(data) and data['imageData']:
-         data['imageData'] = None
-         XlPath(p).write_json(data, encoding=encoding, if_exists='replace')
-
-
- def reduce_labelme_dir(d, print_mode=False):
-     """ Slim down all labelme json files in a directory """
-
-     def printf(*args, **kwargs):
-         if print_mode:
-             print(*args, **kwargs)
-
-     i = 0
-     for f in XlPath(d).rglob_files('*.json'):
-         data = f.read_json()
-         if data.get('imageData'):
-             data['imageData'] = None
-             f.write_json(data)
-             i += 1
-             printf(i, f)
-
-
- class ToLabelmeJson:
-     """ Convert an annotation format into the labelme form
-
-     It is best to initialize with the image path, which provides some useful related information
-     Then implement a custom get_data interface that initializes self.data; after it runs, the dict data can be read from self.data
-     Custom shapes can be made by modifying the get_shape function if needed
-     Call write to save to a file
-
-     document: https://www.yuque.com/xlpr/pyxllib/ks5h4o
-     """
-
-     # Other people may be using this high-level interface of my library, so it should not raise unexpected warnings, at least until this functionality is split out of my own library first
-     # @deprecated(reason='use LabelmeData instead')
-     def __init__(self, imgpath):
-         """
-         :param imgpath: optional image path; strongly recommended, otherwise the generated label json will lack the image width/height
-         """
-         self.imgpath = XlPath(imgpath)
-         # read the image data; some complex conversion rules may need the original image data
-         if self.imgpath:
-             # usually only the size is needed; reading with pil is faster and avoids loading the image rgb data
-             self.img = xlpil.read(self.imgpath)
-         else:
-             self.img = None
-         self.data = self.get_data_base()  # the dict data stored in the json
-
-     def get_data(self, infile):
-         """ Format conversion interface; subclasses must implement this method themselves
-
-         :param infile: the annotation data to parse
-         """
-         raise NotImplementedError('get_data方法必须在子类中实现')
-
-     def get_data_base(self, name='', height=0, width=0):
-         """ Get the skeleton of a labelme annotation file (this is the standard structure; it can also be customized)
-
-         If no image was given at initialization, values such as name can be passed in here
-         """
-         # 1 default attributes, plus the image name and size
-         if self.imgpath:
-             name = self.imgpath.name
-             height, width = self.img.height, self.img.width
-         # 2 build the structural skeleton
-         data = {'version': '4.5.6',
-                 'flags': {},
-                 'shapes': [],
-                 'imagePath': name,
-                 'imageData': None,
-                 'imageWidth': width,
-                 'imageHeight': height,
-                 }
-         return data
-
-     def get_shape(self, label, points, shape_type=None, dtype=None, group_id=None, **kwargs):
-         """ The most basic add-shape helper
-
-         :param shape_type: inferred from the number of points; usually polygon by default
-             formats that must be specified manually: line, circle
-         :param dtype: can reset the numeric type used to store points; floats by default, converting to int is more compact
-         :param group_id: originally meant for grouping, but its value is appended to the label in parentheses, which allows some special tricks during visualization
-         """
-         # 1 normalize the point set format
-         points = np.array(points, dtype=dtype).reshape(-1, 2).tolist()
-         # 2 determine the shape type
-         if shape_type is None:
-             m = len(points)
-             if m == 1:
-                 shape_type = 'point'
-             elif m == 2:
-                 shape_type = 'rectangle'
-             elif m >= 3:
-                 shape_type = 'polygon'
-             else:
-                 raise ValueError
-         # 3 create the annotation
-         shape = {'flags': {},
-                  'group_id': group_id,
-                  'label': str(label),
-                  'points': points,
-                  'shape_type': shape_type}
-         shape.update(kwargs)
-         return shape
-
-     def get_shape2(self, **kwargs):
-         """ A purely dict-based interface """
-         label = kwargs.get('label', '')
-         points = kwargs['points']  # this field is required
-         kw = copy.deepcopy(kwargs)
-         del kw['label']
-         del kw['points']
-         return self.get_shape(label, points, **kw)
-
-     def add_shape(self, *args, **kwargs):
-         self.data['shapes'].append(self.get_shape(*args, **kwargs))
-
-     def add_shape2(self, **kwargs):
-         self.data['shapes'].append(self.get_shape2(**kwargs))
-
-     def write(self, dst=None, if_exists='replace'):
-         """
-         :param dst: write the json file to the target path dst; defaults to a same-named json file next to self.imgpath
-         :return: the path of the written file
-         """
-         if dst is None and self.imgpath:
-             dst = self.imgpath.with_suffix('.json')
-         # the stdlib json supports indent=None, but ujson requires an explicit indent=0
-         return XlPath(dst).write_auto(self.data, if_exists=if_exists, indent=0)
-
-     @classmethod
-     def create_json(cls, imgpath, annotation):
-         """ Input an image path p and the corresponding annotation data (usually a txt file in the same directory) """
-         try:
-             obj = cls(imgpath)
-         except TypeError as e:  # log an error for anything that cannot be parsed
-             get_xllog().exception(e)
-             return
-         obj.get_data(annotation)
-         obj.write()  # save the json file to the image's directory
-
-     @classmethod
-     def main_normal(cls, imdir, labeldir=None, label_file_suffix='.txt'):
-         """ A higher-level wrapper: given a directory, annotate all images in it directly
-
-         :param imdir: image directory
-         :param labeldir: annotation data directory, defaults to the same directory as imdir
-         :return:
-         """
-         ims = XlPath(imdir).rglob_images()
-         if not labeldir: labeldir = imdir
-         txts = [(XlPath(labeldir) / (f.stem + label_file_suffix)) for f in ims]
-         cls.main_pair(ims, txts)
-
-     @classmethod
-     def main_pair(cls, images, labels):
-         """ Process images and labels paired one to one """
-         Iterate(zip(images, labels)).run(lambda x: cls.create_json(x[0], x[1]),
-                                          pinterval='20%', max_workers=8)
-
-
- class Quad2Labelme(ToLabelmeJson):
-     """ Convert quadrilateral-style annotations to labelme """
-
-     def get_data(self, infile):
-         lines = XlPath(infile).read_text().splitlines()
-         for line in lines:
-             # this is usually what to change: the parsing rule for each line
-             vals = line.split(',', maxsplit=8)
-             if len(vals) < 9: continue
-             pts = [int(v) for v in vals[:8]]  # the point set
-             label = vals[-1]  # the annotated text
-             # get_shape also has a shape_type parameter that can be set
-             # a 2-point rectangle or a polygon with 3+ points is detected automatically, no need to set shape_type
-             self.add_shape(label, pts)
-
-
- class LabelmeDict:
-     """ Dict data in the labelme format
-
-     The member functions here basically all operate in place
-     """
-
-     @classmethod
-     def gen_data(cls, imfile=None, **kwargs):
-         """ The main skeleton structure
-         :param imfile: an image path can be passed in
-         """
-         # 1 initialization when an image path is given
-         if imfile:
-             file = XlPath(imfile)
-             name = file.name
-             img = xlpil.read(file)
-             height, width = img.height, img.width
-         else:
-             name, height, width = '', 0, 0
-
-         # 2 field values
-         data = {'version': '5.1.7',
-                 'flags': {},
-                 'shapes': [],
-                 'imagePath': name,
-                 'imageData': None,
-                 'imageWidth': width,
-                 'imageHeight': height,
-                 }
-         if kwargs:
-             data.update(kwargs)
-         return data
-
-     @classmethod
-     def gen_ocr_data(cls, imfile=None, **kwargs):
-         """ Supports calling PaddleOCR for pre-recognition
-
-         This interface is kept for convenience; PaddleOCR.labelme_ocr is recommended for batch recognition instead
-         """
-         from paddleocr import PaddleOCR
-         ppocr = PaddleOCR.get_paddleocr()
-
-         data = cls.gen_data(imfile, **kwargs)
-         lines = ppocr.ocr(str(imfile))
-         for line in lines:
-             pts, [text, score] = line
-             pts = [[int(p[0]), int(p[1])] for p in pts]  # convert to int
-             sp = cls.gen_shape({'text': text, 'score': round(float(score), 4)}, pts)
-             data['shapes'].append(sp)
-         return data
-
-     @classmethod
-     def gen_shape(cls, label, points, shape_type=None, dtype=None, group_id=None, **kwargs):
-         """ The most basic add-shape helper
-
-         :param label: dict input is supported and will be encoded as a json-format string
-         :param shape_type: inferred from the number of points; usually polygon by default
-             formats that must be specified manually: line, circle
-         :param dtype: can reset the numeric type used to store points; floats by default, converting to int is more compact
-         :param group_id: originally meant for grouping, but its value is appended to the label in parentheses, which allows some special tricks during visualization
-         """
-         # 1 normalize the point set format
-         points = np.array(points, dtype=dtype).reshape(-1, 2).tolist()
-         # 2 determine the shape type
-         if shape_type is None:
-             m = len(points)
-             if m == 1:
-                 shape_type = 'point'
-             elif m == 2:
-                 shape_type = 'rectangle'
-             elif m >= 3:
-                 shape_type = 'polygon'
-             else:
-                 raise ValueError
-         # 3 create the annotation
-         if isinstance(label, dict):
-             label = json.dumps(label, ensure_ascii=False)
-         shape = {'flags': {},
-                  'group_id': group_id,
-                  'label': str(label),
-                  'points': points,
-                  'shape_type': shape_type}
-         shape.update(kwargs)
-         return shape
-
-     @classmethod
-     def gen_shape2(cls, **kwargs):
-         """ A purely dict-based interface """
-         label = kwargs.get('label', '')
-         points = kwargs['points']  # this field is required
-         kw = copy.deepcopy(kwargs)
-         if 'label' in kw:
-             del kw['label']
-         if 'points' in kw:
-             del kw['points']
-         return cls.gen_shape(label, points, **kw)
-
-     @classmethod
-     def reduce(cls, lmdict, *, inplace=True):
-         if not inplace:
-             lmdict = copy.deepcopy(lmdict)
-
-         lmdict['imageData'] = None
-         return lmdict
-
-     @classmethod
-     def refine_structure(cls, old_json_path, *, old_img_path=None,
-                          new_stem_name=None, new_img_suffix=None):
-         """ Reset a labelme annotation file; a fairly comprehensive adjustment and optimization interface
-
-         :param old_json_path: original json path
-         :param old_img_path: original image path; optional, derived from old_json_path
-         :param new_stem_name: new stem name; when omitted, the json's stem is used
-         :param new_img_suffix: whether to change the image suffix, commonly used to unify image formats
-             when omitted, the suffix of the image that was found is used; if no image is found, the imagePath suffix is used
-         """
-         from pyxllib.cv.expert import xlcv
-
-         # 1 parse the parameters
-         old_json_path = XlPath(old_json_path)
-         parent = old_json_path.parent
-         lmdict = old_json_path.read_json()
-
-         if old_img_path is None:
-             old_img_path = parent / lmdict['imagePath']
-             if not old_img_path.is_file():
-                 # if the imagePath image does not exist, derive it from the json name; if that still does not exist, fall back to the imagePath suffix
-                 try:
-                     old_img_path = next(parent.glob_images(f'{old_json_path.stem}.*'))
-                 except StopIteration:
-                     old_img_path = parent / (old_json_path.stem + XlPath(lmdict['imagePath']).suffix)
-
-         if new_stem_name is None:
-             new_stem_name = old_json_path.stem
-
-         if new_img_suffix is None:
-             new_img_suffix = old_img_path.suffix
-
-         # 2 rename and reset
-         new_json_path = parent / (new_stem_name + '.json')
-         new_img_path = parent / (new_stem_name + new_img_suffix)
-
-         # optimize the json data
-         cls.reduce(lmdict)
-         lmdict['imagePath'] = new_img_path.name
-         new_json_path.write_json(lmdict)
-         if new_json_path.as_posix() != old_json_path.as_posix():
-             old_json_path.delete()
-
-         # TODO trim over-long floats in points? does xllabelme already do this by default?
-
-         # optimize the image
-         if old_img_path.is_file():
-             xlcv.write(xlcv.read(old_img_path), new_img_path)
-             if new_img_path.as_posix() != old_img_path.as_posix():
-                 old_img_path.delete()
-
-     @classmethod
-     def flip_points(cls, lmdict, direction, *, inplace=True):
-         """
-         :param direction: the rotation direction for points
-             1 means rotate 90 degrees clockwise, 2 means 180 degrees clockwise, ...
-             -1 means rotate 90 degrees counter-clockwise, ...
-         :return:
-         """
-         if not inplace:
-             lmdict = copy.deepcopy(lmdict)
-
-         w, h = lmdict['imageWidth'], lmdict['imageHeight']
-         pts = [[[0, 0], [w, 0], [w, h], [0, h]],
-                [[h, 0], [h, w], [0, w], [0, 0]],
-                [[w, h], [0, h], [0, 0], [w, 0]],
-                [[0, w], [0, 0], [h, 0], [h, w]]]
-         warp_mat = get_warp_mat(pts[0], pts[direction % 4])
-
-         if direction % 2:
-             lmdict['imageWidth'], lmdict['imageHeight'] = lmdict['imageHeight'], lmdict['imageWidth']
-         shapes = lmdict['shapes']
-         for i, shape in enumerate(shapes):
-             pts = [warp_points(x, warp_mat)[0].tolist() for x in shape['points']]
-             if shape['shape_type'] == 'rectangle':
-                 pts = resort_quad_points(rect2polygon(pts))
-                 shape['points'] = [pts[0], pts[2]]
-             elif shape['shape_type'] == 'polygon' and len(pts) == 4:
-                 shape['points'] = resort_quad_points(pts)
-             else:  # other shapes are left untouched for now, without raising an error
-                 pass
-         return lmdict
-
-     @classmethod
-     def flip_img_and_json(cls, impath, direction):
-         """ Rotate impath; a matching json, if present, is handled automatically
-         demo_flip_labelme demonstrates how to use this image/labelme flipping feature
-
-         :param XlPath impath: image path
-         :param direction: marks the image's current orientation: 0 normal, 1 flipped right, 2 flipped down, 3 flipped left
-             clockwise 0123 describes the current image orientation
-             this parameter could even be None, calling a model to detect it when not given, but that model is not very accurate, so this feature is not implemented for now
-         """
-         # rotate the image
-         im = xlpil.read(impath)
-         im = xlpil.flip_direction(im, direction)
-         xlpil.write(im, impath)
-
-         # the json part
-         jsonpath = impath.with_suffix('.json')
-         if jsonpath.exists():
-             lmdict = jsonpath.read_json('utf8')  # must be in labelme format; other formats are not supported
-             cls.flip_points(lmdict, -direction)  # in-place by default
-             jsonpath.write_json(lmdict, 'utf8')
-
-     @classmethod
-     def update_labelattr(cls, lmdict, *, points=False, inplace=True):
-         """
-
-         :param points: whether to update geometric info such as points and bbox in labelattr
-             and to add points when there is no geometric info at all
-         """
-         if not inplace:
-             lmdict = copy.deepcopy(lmdict)
-
-         for shape in lmdict['shapes']:
-             # 1 attribute dict; at least initialize a label attribute first
-             labelattr = DictTool.json_loads(shape['label'], 'label')
-             # 2 fill in the other extended attribute values
-             keys = set(shape.keys())
-             stdkeys = set('label,points,group_id,shape_type,flags'.split(','))
-             for k in (keys - stdkeys):
-                 labelattr[k] = shape[k]
-                 del shape[k]  # the original extended field has to be removed
-
-             # 3 handle geometric info such as points
-             if points:
-                 if 'bbox' in labelattr:
-                     labelattr['bbox'] = ltrb2xywh(rect_bounds(shape['points']))
-                 else:
-                     labelattr['points'] = shape['points']
-
-             # + write back to the shape
-             shape['label'] = json.dumps(labelattr, ensure_ascii=False)
-         return lmdict
-
-     @classmethod
-     def to_quad_pts(cls, shape):
-         """ Turn one shape annotation into a 4-point quadrilateral """
-         pts = shape['points']
-         t = shape['shape_type']
-         if t == 'rectangle':
-             return rect2polygon(pts)
-         elif t == 'polygon':
-             if len(pts) != 4:
-                 # use the bounding rectangle as a fallback for now
-                 xs = [p[0] for p in pts]
-                 ys = [p[1] for p in pts]
-                 r = [(min(xs), min(ys)), (max(xs), max(ys))]
-                 pts = rect2polygon(r)
-             return pts
-         else:
-             raise NotImplementedError(f'{t}')
-
-
- class LabelmeDataset(BasicLabelDataset):
-     def __init__(self, root, relpath2data=None, *, reads=True, prt=False, fltr='json', slt='json', extdata=None):
-         """
-         :param root: root directory of the files
-         :param relpath2data: {jsonfile: lmdict, ...}, where each lmdict is standard labelme file content
-             If no concrete data values are passed in, the data values are initialized automatically from the directory contents
-
-         2021-06-02 (Wed) 16:26: for engineering and other reasons, the is_labelme_json_data check was removed;
-             try to select the correct json files via the fltr and slt mechanisms instead
-         """
-         super().__init__(root, relpath2data, reads=reads, prt=prt, fltr=fltr, slt=slt, extdata=extdata)
-
-         # existing data has already been read; empty labelme annotations have to be filled in here
-         if self.pathgs:
-             for stem, suffixs in tqdm(self.pathgs.data.items(), f'{self.__class__.__name__}优化数据', disable=not prt):
-                 f = XlPath(stem + f'.{slt}')
-                 if reads and not f.exists():
-                     self.rp2data[f.relpath(self.root)] = LabelmeDict.gen_data(XlPath.init(stem, suffix=suffixs[0]))
-
-         # clean up rp2data by dropping dicts that are not actually labelme
-         rp2data = {}
-         for k, v in self.rp2data.items():
-             if is_labelme_json_data(v):
-                 rp2data[k] = v
-         self.rp2data = rp2data
-
-     def reduces(self):
-         """ Remove the imageData field values """
-         for lmdict in self.rp2data.values():
-             LabelmeDict.reduce(lmdict)
-
-     def refine_structures(self, *, img_suffix=None):
-         """ Reset the whole labelme dataset
-
-         :param img_suffix: whether to unify the image suffix, e.g. .jpg
-
-         Different scenarios have different issues; understand this function's logic and what kind of problems it solves before calling it
-         """
-         # some dicts may have a wrong imagePath; this method can fix that
-         for jsonfile in tqdm(self.rp2data.keys(), desc='labelme字典优化'):
-             LabelmeDict.refine_structure(self.root / jsonfile, new_img_suffix=img_suffix)
-
-     def update_labelattrs(self, *, points=False):
-         """ Upgrade shape['label'] to a dict type
-
-         can handle issues such as content_class in the old real-estate annotations
-         """
-         for jsonfile, lmdict in self.rp2data.items():
-             LabelmeDict.update_labelattr(lmdict, points=points)
-
-     def to_excel(self, savepath):
-         """ Convert to a dataframe table for inspection
-
-         There are too many details here; labelme can first be converted to coco and then coco to excel.
-         coco assigns ids to images and boxes and can show some extra attributes.
-         """
-         from pyxlpr.data.coco import CocoParser
-         gt_dict = self.to_coco_gt_dict()
-         CocoParser(gt_dict).to_excel(savepath)
-
-     @classmethod
-     def plot(self, img, lmdict):
-         """ Draw the annotations onto a static image """
-         raise NotImplementedError
-
-     def to_coco_gt_dict(self, categories=None):
-         """ Convert labelme to the coco gt annotation format
-
-         There are two main situations
-             1. raw data was converted to labelme annotations and is converted to coco for the first time; ids and related data can all be regenerated
-                 raw_data --visualize--> labelme --export--> coco
-             2. the data was originally coco, was converted to labelme and edited, and now has to go back to coco; original values should be preserved as much as possible
-                 coco --> labelme --manual edits--> labelme' --> coco'
-                 in this case special markers are added when converting coco to labelme, to make converting back easier
-             3. cases 1 and 2 can be chained, giving repeated labelme <-> coco round trips
-
-         :param categories: the categories
-             by default only one category {'id': 0, 'name': 'text', 'supercategory'} is set
-             customization of the category_id of all annotations is supported
-         :return: gt_dict
-             note: if you care about the file or ann order, manipulate self.data yourself before calling this to_coco function
-             for requirements on image_id and annotation_id, process further with CocoData
-         """
-         from pyxlpr.data.coco import CocoGtData
-
-         if not categories:
-             if 'categories' in self.extdata:
-                 # labelme converted from coco stores the original categories
-                 categories = self.extdata['categories']
-             else:
-                 categories = [{'id': 0, 'name': 'text', 'supercategory': ''}]
-
-         # 1 first pass: structural processing  jsonfile, lmdict --> data(image, shapes)
-         img_id, ann_id, data = 0, 0, []
-         for jsonfile, lmdict in self.rp2data.items():
-             # 1.0 upgrade labels to dict type
-             lmdict = LabelmeDict.update_labelattr(lmdict, points=True)
-
-             for sp in lmdict['shapes']:  # convert label to a dict
-                 sp['label'] = json.loads(sp['label'])
-
-             # 1.1 find the image entry among the shapes
-             image = None
-             # 1.1.1 xltype='image'
-             for sp in filter(lambda x: x.get('xltype', None) == 'image', lmdict['shapes']):
-                 image = DictTool.json_loads(sp['label'])
-                 if not image:
-                     raise ValueError(sp['label'])
-                 # TODO remove fields such as coco_eval?
-                 del image['xltype']
-                 break
-             # 1.1.2 if the shapes contain no image-level annotation, generate one
-             if image is None:
-                 # TODO prepend the relative path to file_name?
-                 image = CocoGtData.gen_image(-1, lmdict['imagePath'],
-                                              lmdict['imageHeight'], lmdict['imageWidth'])
-             img_id = max(img_id, image.get('id', -1))
-
-             # 1.2 iterate over the shapes
-             shapes = []
-             for sp in lmdict['shapes']:
-                 label = sp['label']
-                 if 'xltype' not in label:
-                     # a normal annotation box
-                     d = sp['label'].copy()
-                     # DictTool.isub_(d, '')
-                     ann_id = max(ann_id, d.get('id', -1))
-                     shapes.append(d)
-                 elif label['xltype'] == 'image':
-                     # image, image-level annotation data; already handled above, skip here
-                     pass
-                 elif label['xltype'] == 'seg':
-                     # seg, a derived segmentation box, can be dropped when converting back to coco
-                     pass
-                 else:
-                     raise ValueError
-             data.append([image, shapes])
-
-         # 2 second pass: handle ids and similar issues
-         images, annotations = [], []
-         for image, shapes in data:
-             # 2.1 image
-             if image.get('id', -1) == -1:
-                 img_id += 1
-                 image['id'] = img_id
-             images.append(image)
-
-             # 2.2 annotations
-             for sp in shapes:
-                 sp['image_id'] = img_id
-                 if sp.get('id', -1) == -1:
-                     ann_id += 1
-                     sp['id'] = ann_id
-                 # if a box has no category, a default one is set (external business code is strongly advised to set category_id itself)
-                 if 'category_id' not in sp:
-                     sp['category_id'] = categories[0]['id']
-                 DictTool.isub(sp, ['category_name'])
-                 ann = CocoGtData.gen_annotation(**sp)
-                 annotations.append(ann)
-
-         # 3 result
-         gt_dict = CocoGtData.gen_gt_dict(images, annotations, categories)
-         return gt_dict
-
-     def to_ppdet(self, outfile=None, print_mode=True):
-         """ Convert to Paddle's text detection format
-
-         images are stored as relative paths, by default relative to self's root parameter
-         """
-         lines = []
-
-         # 1 convert to one annotation line per image
-         for jsonfile, lmdict in tqdm(self.rp2data.items(), disable=not print_mode):
-             shapes = []  # the annotation list in pp format
-
-             for sp in lmdict['shapes']:
-                 attrs = DictTool.json_loads(sp['label'], 'text')
-                 d = {'transcription': attrs['text'],
-                      'points': round_int(LabelmeDict.to_quad_pts(sp), ndim=2)}
-                 shapes.append(d)
-             imfile = os.path.split(jsonfile)[0] + f'/{lmdict["imagePath"]}'
-             lines.append(f'{imfile}\t{json.dumps(shapes, ensure_ascii=False)}')
-
-         # 2 output
-         content = '\n'.join(lines)
-         if outfile:
-             XlPath(outfile).write_text(content)
-         return content
-
-     def get_char_count_dict(self):
-         """ A helper needed for text recognition: check which characters appear
-
-         return dict: a dict where k is a character that appears and v is its count, sorted from most to least frequent
-             roughly a Counter-like structure
-         """
-         texts = []
-         for lmdict in self.rp2data.values():
-             for sp in lmdict['shapes']:
-                 text = DictTool.json_loads(sp['label'], 'text')['text']
-                 texts.append(text)
-         ct = Counter(''.join(texts))
-         return {k: v for k, v in ct.most_common()}
-
-     def check_char_set(self, refdict=None):
-         """ Check the text character set of this labelme dataset for characters not in paddleocr's recognition dictionary
-         """
-         from pyxllib.algo.specialist import DictCmper
-         from pyxlpr.ppocr.utils import get_dict_content
-
-         # 0 compute the dictionaries
-         if refdict is None:
-             d1 = get_dict_content('ppocr_keys_v1.txt')
-             refdict = {k: 1 for k in d1.split('\n')}
-
-         d2 = self.get_char_count_dict()
-
-         print('1 整体统计信息')
-         dc = DictCmper({'refdict': refdict, 'chars': d2})
-         print(dc.pair_summary())
-
-         print('2 新增字符及出现数量(如果只是多出空白字符,可以统一转空格处理)')
-         keys = set(list(d2.keys())) - set(list(refdict.keys()))
-         sorted(keys, key=lambda k: -d2[k])
-         for k in keys:
-             print(repr(k), d2[k])
-
-         # 3 return all new non-blank characters
-         return {k for k in keys if k.strip()}
-
-     def to_pprec(self, image_dir, txt_path, *, reset=False):
-         """ Convert to Paddle's text recognition format
-
-         :param image_dir: directory where the exported text-line data goes
-         :param txt_path: path of the annotation file
-         :param reset: reset the target directory if it already exists
-         """
-         pass
-
-
- class ItemFormula:
-     """ labelme data processing algorithms prepared for the m2302 Chinese Academy of Sciences question bank
-     If this were written into labelme itself, other scripts could not reuse it, so the core algorithm functionality lives here
-
-     line_id is computed dynamically by looking at the difference from the previous shape; no static algorithm is provided here for now.
-     """
-
-     @classmethod
-     def check_label(cls, shapes):
-         """ Check the data for anomalies """
-         # whether every "删除" (delete) item is handwritten with an empty text
-
-     @classmethod
-     def joint_label(cls, shapes):
-         """ Join the labels of the shapes into one document
-
-         :return: [line1, line2, ...]
-         """
-         last_line_id = 1
-         paper_text = []
-         line_text = []
-         for sp in shapes:
-             label = json.loads(sp['label'])
-             if label['line_id'] != last_line_id:
-                 last_line_id = label['line_id']
-                 paper_text.append(' '.join(line_text))
-                 line_text = []
-
-             if label['content_class'] == '公式':
-                 t = '$' + label['text'] + '$'
-             else:
-                 t = label['text']
-             line_text.append(t)
-
-         paper_text.append(' '.join(line_text))
-
-         return paper_text
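
For readers who relied on the removed pyxlpr/data/labelme.py, the labelme JSON it produced is plain data and can be rebuilt without the library. The sketch below is illustrative only: the helper names gen_labelme_dict and gen_shape, the demo.jpg file name and the 640x480 size are made up here, and the field values simply mirror what LabelmeDict.gen_data and LabelmeDict.gen_shape in the deleted code above wrote out.

import json

def gen_labelme_dict(image_name, width, height, shapes=()):
    # mirrors LabelmeDict.gen_data: the top-level labelme file skeleton
    return {'version': '5.1.7', 'flags': {}, 'shapes': list(shapes),
            'imagePath': image_name, 'imageData': None,
            'imageWidth': width, 'imageHeight': height}

def gen_shape(label, points, shape_type=None):
    # mirrors LabelmeDict.gen_shape: infer the shape type from the point count
    if shape_type is None:
        shape_type = {1: 'point', 2: 'rectangle'}.get(len(points), 'polygon')
    if isinstance(label, dict):  # dict labels are stored as JSON strings
        label = json.dumps(label, ensure_ascii=False)
    return {'flags': {}, 'group_id': None, 'label': str(label),
            'points': [[float(x), float(y)] for x, y in points],
            'shape_type': shape_type}

# hypothetical usage: one OCR-style rectangle annotation on a 640x480 image
record = gen_labelme_dict('demo.jpg', 640, 480,
                          [gen_shape({'text': 'hello', 'score': 0.98}, [[10, 20], [200, 60]])])
print(json.dumps(record, ensure_ascii=False, indent=2))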