pyxllib 0.3.96__py3-none-any.whl → 0.3.197__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (306) hide show
  1. pyxllib/algo/geo.py +12 -0
  2. pyxllib/algo/intervals.py +1 -1
  3. pyxllib/algo/matcher.py +78 -0
  4. pyxllib/algo/pupil.py +187 -19
  5. pyxllib/algo/specialist.py +2 -1
  6. pyxllib/algo/stat.py +38 -2
  7. {pyxlpr → pyxllib/autogui}/__init__.py +1 -1
  8. pyxllib/autogui/activewin.py +246 -0
  9. pyxllib/autogui/all.py +9 -0
  10. pyxllib/{ext/autogui → autogui}/autogui.py +40 -11
  11. pyxllib/autogui/uiautolib.py +362 -0
  12. pyxllib/autogui/wechat.py +827 -0
  13. pyxllib/autogui/wechat_msg.py +421 -0
  14. pyxllib/autogui/wxautolib.py +84 -0
  15. pyxllib/cv/slidercaptcha.py +137 -0
  16. pyxllib/data/echarts.py +123 -12
  17. pyxllib/data/jsonlib.py +89 -0
  18. pyxllib/data/pglib.py +514 -30
  19. pyxllib/data/sqlite.py +231 -4
  20. pyxllib/ext/JLineViewer.py +14 -1
  21. pyxllib/ext/drissionlib.py +277 -0
  22. pyxllib/ext/kq5034lib.py +0 -1594
  23. pyxllib/ext/robustprocfile.py +497 -0
  24. pyxllib/ext/unixlib.py +6 -5
  25. pyxllib/ext/utools.py +108 -95
  26. pyxllib/ext/webhook.py +32 -14
  27. pyxllib/ext/wjxlib.py +88 -0
  28. pyxllib/ext/wpsapi.py +124 -0
  29. pyxllib/ext/xlwork.py +9 -0
  30. pyxllib/ext/yuquelib.py +1003 -71
  31. pyxllib/file/docxlib.py +1 -1
  32. pyxllib/file/libreoffice.py +165 -0
  33. pyxllib/file/movielib.py +9 -0
  34. pyxllib/file/packlib/__init__.py +112 -75
  35. pyxllib/file/pdflib.py +1 -1
  36. pyxllib/file/pupil.py +1 -1
  37. pyxllib/file/specialist/dirlib.py +1 -1
  38. pyxllib/file/specialist/download.py +10 -3
  39. pyxllib/file/specialist/filelib.py +266 -55
  40. pyxllib/file/xlsxlib.py +205 -50
  41. pyxllib/file/xlsyncfile.py +341 -0
  42. pyxllib/prog/cachetools.py +64 -0
  43. pyxllib/prog/filelock.py +42 -0
  44. pyxllib/prog/multiprogs.py +940 -0
  45. pyxllib/prog/newbie.py +9 -2
  46. pyxllib/prog/pupil.py +129 -60
  47. pyxllib/prog/specialist/__init__.py +176 -2
  48. pyxllib/prog/specialist/bc.py +5 -2
  49. pyxllib/prog/specialist/browser.py +11 -2
  50. pyxllib/prog/specialist/datetime.py +68 -0
  51. pyxllib/prog/specialist/tictoc.py +12 -13
  52. pyxllib/prog/specialist/xllog.py +5 -5
  53. pyxllib/prog/xlosenv.py +7 -0
  54. pyxllib/text/airscript.js +744 -0
  55. pyxllib/text/charclasslib.py +17 -5
  56. pyxllib/text/jiebalib.py +6 -3
  57. pyxllib/text/jinjalib.py +32 -0
  58. pyxllib/text/jsa_ai_prompt.md +271 -0
  59. pyxllib/text/jscode.py +159 -4
  60. pyxllib/text/nestenv.py +1 -1
  61. pyxllib/text/newbie.py +12 -0
  62. pyxllib/text/pupil/common.py +26 -0
  63. pyxllib/text/specialist/ptag.py +2 -2
  64. pyxllib/text/templates/echart_base.html +11 -0
  65. pyxllib/text/templates/highlight_code.html +17 -0
  66. pyxllib/text/templates/latex_editor.html +103 -0
  67. pyxllib/text/xmllib.py +76 -14
  68. pyxllib/xl.py +2 -1
  69. pyxllib-0.3.197.dist-info/METADATA +48 -0
  70. pyxllib-0.3.197.dist-info/RECORD +126 -0
  71. {pyxllib-0.3.96.dist-info → pyxllib-0.3.197.dist-info}/WHEEL +1 -2
  72. pyxllib/ext/autogui/__init__.py +0 -8
  73. pyxllib-0.3.96.dist-info/METADATA +0 -51
  74. pyxllib-0.3.96.dist-info/RECORD +0 -333
  75. pyxllib-0.3.96.dist-info/top_level.txt +0 -2
  76. pyxlpr/ai/__init__.py +0 -5
  77. pyxlpr/ai/clientlib.py +0 -1281
  78. pyxlpr/ai/specialist.py +0 -286
  79. pyxlpr/ai/torch_app.py +0 -172
  80. pyxlpr/ai/xlpaddle.py +0 -655
  81. pyxlpr/ai/xltorch.py +0 -705
  82. pyxlpr/data/__init__.py +0 -11
  83. pyxlpr/data/coco.py +0 -1325
  84. pyxlpr/data/datacls.py +0 -365
  85. pyxlpr/data/datasets.py +0 -200
  86. pyxlpr/data/gptlib.py +0 -1291
  87. pyxlpr/data/icdar/__init__.py +0 -96
  88. pyxlpr/data/icdar/deteval.py +0 -377
  89. pyxlpr/data/icdar/icdar2013.py +0 -341
  90. pyxlpr/data/icdar/iou.py +0 -340
  91. pyxlpr/data/icdar/rrc_evaluation_funcs_1_1.py +0 -463
  92. pyxlpr/data/imtextline.py +0 -473
  93. pyxlpr/data/labelme.py +0 -866
  94. pyxlpr/data/removeline.py +0 -179
  95. pyxlpr/data/specialist.py +0 -57
  96. pyxlpr/eval/__init__.py +0 -85
  97. pyxlpr/paddleocr.py +0 -776
  98. pyxlpr/ppocr/__init__.py +0 -15
  99. pyxlpr/ppocr/configs/rec/multi_language/generate_multi_language_configs.py +0 -226
  100. pyxlpr/ppocr/data/__init__.py +0 -135
  101. pyxlpr/ppocr/data/imaug/ColorJitter.py +0 -26
  102. pyxlpr/ppocr/data/imaug/__init__.py +0 -67
  103. pyxlpr/ppocr/data/imaug/copy_paste.py +0 -170
  104. pyxlpr/ppocr/data/imaug/east_process.py +0 -437
  105. pyxlpr/ppocr/data/imaug/gen_table_mask.py +0 -244
  106. pyxlpr/ppocr/data/imaug/iaa_augment.py +0 -114
  107. pyxlpr/ppocr/data/imaug/label_ops.py +0 -789
  108. pyxlpr/ppocr/data/imaug/make_border_map.py +0 -184
  109. pyxlpr/ppocr/data/imaug/make_pse_gt.py +0 -106
  110. pyxlpr/ppocr/data/imaug/make_shrink_map.py +0 -126
  111. pyxlpr/ppocr/data/imaug/operators.py +0 -433
  112. pyxlpr/ppocr/data/imaug/pg_process.py +0 -906
  113. pyxlpr/ppocr/data/imaug/randaugment.py +0 -143
  114. pyxlpr/ppocr/data/imaug/random_crop_data.py +0 -239
  115. pyxlpr/ppocr/data/imaug/rec_img_aug.py +0 -533
  116. pyxlpr/ppocr/data/imaug/sast_process.py +0 -777
  117. pyxlpr/ppocr/data/imaug/text_image_aug/__init__.py +0 -17
  118. pyxlpr/ppocr/data/imaug/text_image_aug/augment.py +0 -120
  119. pyxlpr/ppocr/data/imaug/text_image_aug/warp_mls.py +0 -168
  120. pyxlpr/ppocr/data/lmdb_dataset.py +0 -115
  121. pyxlpr/ppocr/data/pgnet_dataset.py +0 -104
  122. pyxlpr/ppocr/data/pubtab_dataset.py +0 -107
  123. pyxlpr/ppocr/data/simple_dataset.py +0 -372
  124. pyxlpr/ppocr/losses/__init__.py +0 -61
  125. pyxlpr/ppocr/losses/ace_loss.py +0 -52
  126. pyxlpr/ppocr/losses/basic_loss.py +0 -135
  127. pyxlpr/ppocr/losses/center_loss.py +0 -88
  128. pyxlpr/ppocr/losses/cls_loss.py +0 -30
  129. pyxlpr/ppocr/losses/combined_loss.py +0 -67
  130. pyxlpr/ppocr/losses/det_basic_loss.py +0 -208
  131. pyxlpr/ppocr/losses/det_db_loss.py +0 -80
  132. pyxlpr/ppocr/losses/det_east_loss.py +0 -63
  133. pyxlpr/ppocr/losses/det_pse_loss.py +0 -149
  134. pyxlpr/ppocr/losses/det_sast_loss.py +0 -121
  135. pyxlpr/ppocr/losses/distillation_loss.py +0 -272
  136. pyxlpr/ppocr/losses/e2e_pg_loss.py +0 -140
  137. pyxlpr/ppocr/losses/kie_sdmgr_loss.py +0 -113
  138. pyxlpr/ppocr/losses/rec_aster_loss.py +0 -99
  139. pyxlpr/ppocr/losses/rec_att_loss.py +0 -39
  140. pyxlpr/ppocr/losses/rec_ctc_loss.py +0 -44
  141. pyxlpr/ppocr/losses/rec_enhanced_ctc_loss.py +0 -70
  142. pyxlpr/ppocr/losses/rec_nrtr_loss.py +0 -30
  143. pyxlpr/ppocr/losses/rec_sar_loss.py +0 -28
  144. pyxlpr/ppocr/losses/rec_srn_loss.py +0 -47
  145. pyxlpr/ppocr/losses/table_att_loss.py +0 -109
  146. pyxlpr/ppocr/metrics/__init__.py +0 -44
  147. pyxlpr/ppocr/metrics/cls_metric.py +0 -45
  148. pyxlpr/ppocr/metrics/det_metric.py +0 -82
  149. pyxlpr/ppocr/metrics/distillation_metric.py +0 -73
  150. pyxlpr/ppocr/metrics/e2e_metric.py +0 -86
  151. pyxlpr/ppocr/metrics/eval_det_iou.py +0 -274
  152. pyxlpr/ppocr/metrics/kie_metric.py +0 -70
  153. pyxlpr/ppocr/metrics/rec_metric.py +0 -75
  154. pyxlpr/ppocr/metrics/table_metric.py +0 -50
  155. pyxlpr/ppocr/modeling/architectures/__init__.py +0 -32
  156. pyxlpr/ppocr/modeling/architectures/base_model.py +0 -88
  157. pyxlpr/ppocr/modeling/architectures/distillation_model.py +0 -60
  158. pyxlpr/ppocr/modeling/backbones/__init__.py +0 -54
  159. pyxlpr/ppocr/modeling/backbones/det_mobilenet_v3.py +0 -268
  160. pyxlpr/ppocr/modeling/backbones/det_resnet_vd.py +0 -246
  161. pyxlpr/ppocr/modeling/backbones/det_resnet_vd_sast.py +0 -285
  162. pyxlpr/ppocr/modeling/backbones/e2e_resnet_vd_pg.py +0 -265
  163. pyxlpr/ppocr/modeling/backbones/kie_unet_sdmgr.py +0 -186
  164. pyxlpr/ppocr/modeling/backbones/rec_mobilenet_v3.py +0 -138
  165. pyxlpr/ppocr/modeling/backbones/rec_mv1_enhance.py +0 -258
  166. pyxlpr/ppocr/modeling/backbones/rec_nrtr_mtb.py +0 -48
  167. pyxlpr/ppocr/modeling/backbones/rec_resnet_31.py +0 -210
  168. pyxlpr/ppocr/modeling/backbones/rec_resnet_aster.py +0 -143
  169. pyxlpr/ppocr/modeling/backbones/rec_resnet_fpn.py +0 -307
  170. pyxlpr/ppocr/modeling/backbones/rec_resnet_vd.py +0 -286
  171. pyxlpr/ppocr/modeling/heads/__init__.py +0 -54
  172. pyxlpr/ppocr/modeling/heads/cls_head.py +0 -52
  173. pyxlpr/ppocr/modeling/heads/det_db_head.py +0 -118
  174. pyxlpr/ppocr/modeling/heads/det_east_head.py +0 -121
  175. pyxlpr/ppocr/modeling/heads/det_pse_head.py +0 -37
  176. pyxlpr/ppocr/modeling/heads/det_sast_head.py +0 -128
  177. pyxlpr/ppocr/modeling/heads/e2e_pg_head.py +0 -253
  178. pyxlpr/ppocr/modeling/heads/kie_sdmgr_head.py +0 -206
  179. pyxlpr/ppocr/modeling/heads/multiheadAttention.py +0 -163
  180. pyxlpr/ppocr/modeling/heads/rec_aster_head.py +0 -393
  181. pyxlpr/ppocr/modeling/heads/rec_att_head.py +0 -202
  182. pyxlpr/ppocr/modeling/heads/rec_ctc_head.py +0 -88
  183. pyxlpr/ppocr/modeling/heads/rec_nrtr_head.py +0 -826
  184. pyxlpr/ppocr/modeling/heads/rec_sar_head.py +0 -402
  185. pyxlpr/ppocr/modeling/heads/rec_srn_head.py +0 -280
  186. pyxlpr/ppocr/modeling/heads/self_attention.py +0 -406
  187. pyxlpr/ppocr/modeling/heads/table_att_head.py +0 -246
  188. pyxlpr/ppocr/modeling/necks/__init__.py +0 -32
  189. pyxlpr/ppocr/modeling/necks/db_fpn.py +0 -111
  190. pyxlpr/ppocr/modeling/necks/east_fpn.py +0 -188
  191. pyxlpr/ppocr/modeling/necks/fpn.py +0 -138
  192. pyxlpr/ppocr/modeling/necks/pg_fpn.py +0 -314
  193. pyxlpr/ppocr/modeling/necks/rnn.py +0 -92
  194. pyxlpr/ppocr/modeling/necks/sast_fpn.py +0 -284
  195. pyxlpr/ppocr/modeling/necks/table_fpn.py +0 -110
  196. pyxlpr/ppocr/modeling/transforms/__init__.py +0 -28
  197. pyxlpr/ppocr/modeling/transforms/stn.py +0 -135
  198. pyxlpr/ppocr/modeling/transforms/tps.py +0 -308
  199. pyxlpr/ppocr/modeling/transforms/tps_spatial_transformer.py +0 -156
  200. pyxlpr/ppocr/optimizer/__init__.py +0 -61
  201. pyxlpr/ppocr/optimizer/learning_rate.py +0 -228
  202. pyxlpr/ppocr/optimizer/lr_scheduler.py +0 -49
  203. pyxlpr/ppocr/optimizer/optimizer.py +0 -160
  204. pyxlpr/ppocr/optimizer/regularizer.py +0 -52
  205. pyxlpr/ppocr/postprocess/__init__.py +0 -55
  206. pyxlpr/ppocr/postprocess/cls_postprocess.py +0 -33
  207. pyxlpr/ppocr/postprocess/db_postprocess.py +0 -234
  208. pyxlpr/ppocr/postprocess/east_postprocess.py +0 -143
  209. pyxlpr/ppocr/postprocess/locality_aware_nms.py +0 -200
  210. pyxlpr/ppocr/postprocess/pg_postprocess.py +0 -52
  211. pyxlpr/ppocr/postprocess/pse_postprocess/__init__.py +0 -15
  212. pyxlpr/ppocr/postprocess/pse_postprocess/pse/__init__.py +0 -29
  213. pyxlpr/ppocr/postprocess/pse_postprocess/pse/setup.py +0 -14
  214. pyxlpr/ppocr/postprocess/pse_postprocess/pse_postprocess.py +0 -118
  215. pyxlpr/ppocr/postprocess/rec_postprocess.py +0 -654
  216. pyxlpr/ppocr/postprocess/sast_postprocess.py +0 -355
  217. pyxlpr/ppocr/tools/__init__.py +0 -14
  218. pyxlpr/ppocr/tools/eval.py +0 -83
  219. pyxlpr/ppocr/tools/export_center.py +0 -77
  220. pyxlpr/ppocr/tools/export_model.py +0 -129
  221. pyxlpr/ppocr/tools/infer/predict_cls.py +0 -151
  222. pyxlpr/ppocr/tools/infer/predict_det.py +0 -300
  223. pyxlpr/ppocr/tools/infer/predict_e2e.py +0 -169
  224. pyxlpr/ppocr/tools/infer/predict_rec.py +0 -414
  225. pyxlpr/ppocr/tools/infer/predict_system.py +0 -204
  226. pyxlpr/ppocr/tools/infer/utility.py +0 -629
  227. pyxlpr/ppocr/tools/infer_cls.py +0 -83
  228. pyxlpr/ppocr/tools/infer_det.py +0 -134
  229. pyxlpr/ppocr/tools/infer_e2e.py +0 -122
  230. pyxlpr/ppocr/tools/infer_kie.py +0 -153
  231. pyxlpr/ppocr/tools/infer_rec.py +0 -146
  232. pyxlpr/ppocr/tools/infer_table.py +0 -107
  233. pyxlpr/ppocr/tools/program.py +0 -596
  234. pyxlpr/ppocr/tools/test_hubserving.py +0 -117
  235. pyxlpr/ppocr/tools/train.py +0 -163
  236. pyxlpr/ppocr/tools/xlprog.py +0 -748
  237. pyxlpr/ppocr/utils/EN_symbol_dict.txt +0 -94
  238. pyxlpr/ppocr/utils/__init__.py +0 -24
  239. pyxlpr/ppocr/utils/dict/ar_dict.txt +0 -117
  240. pyxlpr/ppocr/utils/dict/arabic_dict.txt +0 -162
  241. pyxlpr/ppocr/utils/dict/be_dict.txt +0 -145
  242. pyxlpr/ppocr/utils/dict/bg_dict.txt +0 -140
  243. pyxlpr/ppocr/utils/dict/chinese_cht_dict.txt +0 -8421
  244. pyxlpr/ppocr/utils/dict/cyrillic_dict.txt +0 -163
  245. pyxlpr/ppocr/utils/dict/devanagari_dict.txt +0 -167
  246. pyxlpr/ppocr/utils/dict/en_dict.txt +0 -63
  247. pyxlpr/ppocr/utils/dict/fa_dict.txt +0 -136
  248. pyxlpr/ppocr/utils/dict/french_dict.txt +0 -136
  249. pyxlpr/ppocr/utils/dict/german_dict.txt +0 -143
  250. pyxlpr/ppocr/utils/dict/hi_dict.txt +0 -162
  251. pyxlpr/ppocr/utils/dict/it_dict.txt +0 -118
  252. pyxlpr/ppocr/utils/dict/japan_dict.txt +0 -4399
  253. pyxlpr/ppocr/utils/dict/ka_dict.txt +0 -153
  254. pyxlpr/ppocr/utils/dict/korean_dict.txt +0 -3688
  255. pyxlpr/ppocr/utils/dict/latin_dict.txt +0 -185
  256. pyxlpr/ppocr/utils/dict/mr_dict.txt +0 -153
  257. pyxlpr/ppocr/utils/dict/ne_dict.txt +0 -153
  258. pyxlpr/ppocr/utils/dict/oc_dict.txt +0 -96
  259. pyxlpr/ppocr/utils/dict/pu_dict.txt +0 -130
  260. pyxlpr/ppocr/utils/dict/rs_dict.txt +0 -91
  261. pyxlpr/ppocr/utils/dict/rsc_dict.txt +0 -134
  262. pyxlpr/ppocr/utils/dict/ru_dict.txt +0 -125
  263. pyxlpr/ppocr/utils/dict/ta_dict.txt +0 -128
  264. pyxlpr/ppocr/utils/dict/table_dict.txt +0 -277
  265. pyxlpr/ppocr/utils/dict/table_structure_dict.txt +0 -2759
  266. pyxlpr/ppocr/utils/dict/te_dict.txt +0 -151
  267. pyxlpr/ppocr/utils/dict/ug_dict.txt +0 -114
  268. pyxlpr/ppocr/utils/dict/uk_dict.txt +0 -142
  269. pyxlpr/ppocr/utils/dict/ur_dict.txt +0 -137
  270. pyxlpr/ppocr/utils/dict/xi_dict.txt +0 -110
  271. pyxlpr/ppocr/utils/dict90.txt +0 -90
  272. pyxlpr/ppocr/utils/e2e_metric/Deteval.py +0 -574
  273. pyxlpr/ppocr/utils/e2e_metric/polygon_fast.py +0 -83
  274. pyxlpr/ppocr/utils/e2e_utils/extract_batchsize.py +0 -87
  275. pyxlpr/ppocr/utils/e2e_utils/extract_textpoint_fast.py +0 -457
  276. pyxlpr/ppocr/utils/e2e_utils/extract_textpoint_slow.py +0 -592
  277. pyxlpr/ppocr/utils/e2e_utils/pgnet_pp_utils.py +0 -162
  278. pyxlpr/ppocr/utils/e2e_utils/visual.py +0 -162
  279. pyxlpr/ppocr/utils/en_dict.txt +0 -95
  280. pyxlpr/ppocr/utils/gen_label.py +0 -81
  281. pyxlpr/ppocr/utils/ic15_dict.txt +0 -36
  282. pyxlpr/ppocr/utils/iou.py +0 -54
  283. pyxlpr/ppocr/utils/logging.py +0 -69
  284. pyxlpr/ppocr/utils/network.py +0 -84
  285. pyxlpr/ppocr/utils/ppocr_keys_v1.txt +0 -6623
  286. pyxlpr/ppocr/utils/profiler.py +0 -110
  287. pyxlpr/ppocr/utils/save_load.py +0 -150
  288. pyxlpr/ppocr/utils/stats.py +0 -72
  289. pyxlpr/ppocr/utils/utility.py +0 -80
  290. pyxlpr/ppstructure/__init__.py +0 -13
  291. pyxlpr/ppstructure/predict_system.py +0 -187
  292. pyxlpr/ppstructure/table/__init__.py +0 -13
  293. pyxlpr/ppstructure/table/eval_table.py +0 -72
  294. pyxlpr/ppstructure/table/matcher.py +0 -192
  295. pyxlpr/ppstructure/table/predict_structure.py +0 -136
  296. pyxlpr/ppstructure/table/predict_table.py +0 -221
  297. pyxlpr/ppstructure/table/table_metric/__init__.py +0 -16
  298. pyxlpr/ppstructure/table/table_metric/parallel.py +0 -51
  299. pyxlpr/ppstructure/table/table_metric/table_metric.py +0 -247
  300. pyxlpr/ppstructure/table/tablepyxl/__init__.py +0 -13
  301. pyxlpr/ppstructure/table/tablepyxl/style.py +0 -283
  302. pyxlpr/ppstructure/table/tablepyxl/tablepyxl.py +0 -118
  303. pyxlpr/ppstructure/utility.py +0 -71
  304. pyxlpr/xlai.py +0 -10
  305. /pyxllib/{ext/autogui → autogui}/virtualkey.py +0 -0
  306. {pyxllib-0.3.96.dist-info → pyxllib-0.3.197.dist-info/licenses}/LICENSE +0 -0
pyxlpr/data/datacls.py DELETED
@@ -1,365 +0,0 @@
1
- #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
- # @Author : 陈坤泽
4
- # @Email : 877362867@qq.com
5
- # @Date : 2021/05/19 16:28
6
-
7
- """ 相关数据格式类 """
8
-
9
- from pathlib import Path
10
- from pyxlpr.ai import *
11
- from pyxlpr.data import *
12
-
13
- # from pyxlpr.data.imtextline import TextlineShape
14
-
15
- __1_zcdata = """
16
- """
17
-
18
-
19
- class ZcTextGt:
20
- def __init__(self, root, data=None, *, imdir='images', parts=None):
21
- """
22
- :param root: 数据根目录
23
- :param imdir: 所在图片子目录
24
- :param parts: ['test.txt', 'train.txt'] 等分块标记
25
-
26
- 备注:具体文本数据好像不太适合直接存到内存里,就先不存了。
27
- 但是这个类至少可以把相关功能整合在一起,不零散。
28
- """
29
- self.root = Dir(root)
30
- self.imdir = Dir(imdir, self.root)
31
- self.parts = parts or []
32
-
33
- if data is None:
34
- pass
35
- self.data = data
36
-
37
- def writes_from_coco(self, gt_dict, *, prt=False):
38
- """ coco标注 --> 智财偏好的文本标注格式
39
-
40
- :param gt_dict: coco 的 gt 字典
41
-
42
- TODO gt_dict可能是过量版,增设筛选规则?
43
- """
44
- items = list(CocoData(gt_dict).group_gt())
45
- for img, anns in tqdm(items, desc='生成ZcTextGt的txt标注文件', disable=not prt):
46
- content = []
47
- for ann in anns:
48
- ltrb = xywh2ltrb(ann['bbox'])
49
- ltrb = ','.join([str(int(v)) for v in ltrb])
50
- content.append('\t'.join([ltrb, ann['label']]))
51
- File(img['file_name'], self.imdir, suffix='.txt').write('\n'.join(content))
52
-
53
- def writes(self, *, max_workers=8, prt=False):
54
- """ 重新写入txt文件 """
55
-
56
- def write(x):
57
- file, data = x
58
- if file: # 如果文件存在,要遵循原有的编码规则
59
- with open(str(file), 'rb') as f:
60
- bstr = f.read()
61
- encoding = get_encoding(bstr)
62
- file.write(data, encoding=encoding, if_exists='replace')
63
- else: # 否则直接写入
64
- file.write(data)
65
-
66
- mtqdm(write, self.data, desc='写入labelme json数据', max_workers=max_workers, disable=not prt)
67
-
68
-
69
- class ZcKvGt(ZcTextGt):
70
-
71
- def writes_from_coco(self, gt_dict, *, prt=False):
72
- """ coco标注 -> 智财偏好的,带类别的文本标注格式 """
73
- items = list(CocoData(gt_dict).group_gt())
74
- for img, anns in tqdm(items, '生成ZcKvGt的txt标注文件', disable=not prt):
75
- content = []
76
- for ann in anns:
77
- ltrb = xywh2ltrb(ann['bbox'])
78
- ltrb = ','.join([str(int(v)) for v in ltrb])
79
- cat_id = ann['category_id']
80
- cat = (cat_id + 1) // 2
81
- if cat == 5:
82
- cat = 0
83
- kv = -1
84
- else:
85
- kv = (cat - 1) % 2
86
- content.append('\t'.join([ltrb, str(cat), str(kv), ann['label']]))
87
- File(img['file_name'], self.imdir, suffix='.txt').write('\n'.join(content))
88
-
89
-
90
- class ZcKvDtOld:
91
- """ 旧版本的解析器 """
92
-
93
- def __init__(self, data=None):
94
- """
95
- :param data:
96
- {filepath1: [{'logo': True, 'gt': 'address', 'pr': 'address', 'lb': 'LOT 3'},
97
- {...},
98
- ...]
99
- filepath2: ...,
100
- ...
101
- }
102
- """
103
- self.data = data
104
-
105
- @classmethod
106
- def init_from_zc_txt(cls, file, *, reverse_annos=False):
107
- """ 从文件解析出字典结构数据
108
-
109
- :return: {filepath1: [{'logo': True, 'gt': 'address', 'pr': 'address', 'lb': 'LOT 3'},
110
- {...},
111
- ...]
112
- filepath2: ...,
113
- ...
114
- }
115
- """
116
- content = File(file).read()
117
- content = re.sub(r'(.+?)(\n\n)([^\n]+\n)', r'\3\1\2', content, flags=re.DOTALL)
118
- parts = ContentPartSpliter.multi_blank_lines(content)
119
-
120
- data = dict()
121
- for pt in parts:
122
- lines = pt.splitlines()
123
- filepath = lines[0]
124
- annos = []
125
- for line in lines[1:]: # 第1行是文件名,删掉
126
- m = re.match(r'(.+?), GT: (.+?), PR: (.+?), LB: (.+)', line)
127
- logo, gt, pr, lb = m.groups()
128
- annos.append({'logo': logo == 'True', 'gt': gt, 'pr': pr, 'lb': lb})
129
- if reverse_annos:
130
- annos = list(reversed(annos))
131
- data[filepath] = annos
132
-
133
- return cls(data)
134
-
135
- def to_coco_dt(self, gt_dict, *, printf=True):
136
- """ 配合gt标注文件,做更精细的zc结果解析
137
- """
138
- cat2id = {c['name']: c['id'] for c in gt_dict['categories']}
139
- gt_annos = {x[0]['file_name']: x for x in CocoData(gt_dict).group_gt()}
140
- dt_list = []
141
- for file, dt_annos in self.data.items():
142
- file_name = pathlib.Path(file).name
143
- image, annos = gt_annos[file_name]
144
-
145
- # 如果gt的annos比dt_annos少,需要扩充下,默认按最后一个gt的an填充
146
- n, m = len(annos), len(dt_annos)
147
- assert n == m # 智财如果不带box出来,是要强制框数量相同的!否则box怎么一一对应?
148
- if n < m:
149
- annos += [annos[-1]] * (m - n)
150
-
151
- # TODO 有些不是按顺序匹配的,增加按文本匹配的功能
152
-
153
- for line, an in zip(dt_annos, annos):
154
- gt, pr, lb = an['gt'], an['pr'], an['lb']
155
- if printf and lb != an['label']:
156
- print(file_name, lb, '<=>', an['label']) # gt和dt的框没对应上,最好检查下问题
157
- # 附加值是识别错误的类别,没有则代表识别正确
158
- dt_list.append(
159
- {'image_id': an['image_id'], 'category_id': cat2id[pr],
160
- 'bbox': an['bbox'], 'score': 1, 'label': lb})
161
- return dt_list
162
-
163
-
164
- class ZcKvDt(ZcKvDtOld):
165
- """ 智财预测结果文件的通用解析类
166
-
167
- 这里是210510周一16:24新版的结果,文件顺序头写对了,并且增加了cs每个结果的置信度
168
- """
169
-
170
- @classmethod
171
- def init_from_zc_txt(cls, file, *, reverse_annos=False):
172
- """ 从文件解析出字典结构数据
173
-
174
- 有时候可能没有对应的 coco gt,则可以用这个直接把文件解析为内存数据处理
175
-
176
- :param reverse_annos: 是否对每个图片的标注结果,进行顺序反转
177
- :return: {filepath1: [{'logo': True, 'gt': 'address', 'pr': 'address', 'lb': 'LOT 3', 'cs': 1.0},
178
- {...},
179
- ...]
180
- filepath2: ...,
181
- ...
182
- }
183
- """
184
- content = File(file).read()
185
- parts = ContentPartSpliter.multi_blank_lines(content)
186
-
187
- data = dict()
188
- for pt in parts:
189
- lines = pt.splitlines()
190
- filepath = lines[0]
191
- annos = []
192
- for line in lines[1:]: # 第1行是文件名,删掉
193
- m = re.match(r'(.+?), GT: (.+?), PR: (.+?), LB: (.+?), CS: (.+)', line)
194
- logo, gt, pr, lb, cs = m.groups()
195
- annos.append({'logo': logo == 'True', 'gt': gt, 'pr': pr, 'lb': lb, 'cs': float(cs)})
196
- if reverse_annos:
197
- annos = list(reversed(annos))
198
- data[filepath] = annos
199
-
200
- return cls(data)
201
-
202
- def to_coco_dt(self, gt_dict, *, prt=True):
203
- """ 转coco的dt格式
204
-
205
- :param gt_dict: 需要有参照的gt文件,才能知道图片id,以及补充box位置信息
206
- """
207
- cat2id = {c['name']: c['id'] for c in gt_dict['categories']}
208
- gt_annos = {x[0]['file_name']: x for x in CocoData(gt_dict).group_gt()}
209
- dt_list = []
210
- for file, zc_annos in self.data.items():
211
- file_name = pathlib.Path(file).name
212
- image, im_annos = gt_annos[file_name]
213
-
214
- # 如果gt的annos比dt_annos少,需要扩充下,默认按最后一个gt的an填充
215
- n, m = len(im_annos), len(zc_annos)
216
- if n == m:
217
- # assert n == m # 智财如果不带box出来,是要强制框数量相同的!否则box怎么一一对应?
218
- # if n < m:
219
- # annos += [annos[-1]] * (m - n)
220
- # 要以顺序整体有依据,文本内容匹配为辅的策略配对
221
- im_annos.sort(key=lambda x: x['label'])
222
- zc_annos.sort(key=lambda x: x['lb'])
223
- dt_annos = []
224
-
225
- for a, b in zip(zc_annos, im_annos):
226
- gt, pr, lb, cs = a['gt'], a['pr'], a['lb'], a['cs']
227
- if prt and lb != b['label']:
228
- # gt和dt的框没对应上,最好检查下问题
229
- warn = ' '.join([file_name, lb, '<=>', b['label']])
230
- dprint(warn)
231
- # 附加值是识别错误的类别,没有则代表识别正确
232
- dt_annos.append(
233
- {'image_id': b['image_id'], 'category_id': cat2id[pr],
234
- 'bbox': b['bbox'], 'score': cs, 'label': lb})
235
-
236
- # 已经协调确定了空间顺序,但以防万一,可以再按空间排序下给到下游任务
237
- # 为了效率,也可以确保gt的annos有序,操作时annos顺序不动
238
- # 这里是前面为了匹配,已经把排序搞乱了,这里是必须要重排
239
- dt_annos.sort(key=lambda x: TextlineShape(xywh2ltrb(x['bbox']))) # 几何重排
240
- dt_list += dt_annos
241
- else: # 否则不匹配用不匹配的玩法
242
- raise NotImplementedError
243
-
244
- return dt_list
245
-
246
-
247
- __2_other = """
248
- """
249
-
250
-
251
- class SroieTextData:
252
- """ sroie task1、task2 的标注数据
253
-
254
- 72,25,326,25,326,64,72,64,TAN WOON YANN
255
- 50,82,440,82,440,121,50,121,BOOK TA .K(TAMAN DAYA) SDN BND
256
- 205,121,285,121,285,139,205,139,789417-W
257
- """
258
-
259
- def __init__(self, root, part=None):
260
- """
261
- :param root: 'data/task1+2'
262
- images,目录下有987份jpg图片、txt标注
263
- test.txt,361张jpg图片清单
264
- train.txt,626张jpg图片清单
265
- :param part:
266
- 'test', 只记录test的361张图书
267
- 'train' 等同理
268
- """
269
- pass
270
-
271
- def to_coco(self):
272
- pass
273
-
274
-
275
- class SroieClsData:
276
- """ sroie task3 关键信息提取的标注 """
277
- pass
278
-
279
-
280
- class BaiduOcrRes:
281
- def __init__(self, data=None):
282
- """
283
- :param data: dict (key 文件名 -> value 识别结果dict)
284
- {'words_result':
285
- [ {'words': 'xxxx', 'location': {'top': 130, 'left': ~, 'width': ~, 'height': ~}}, ... ]
286
- 'log_id': 136768...
287
- 'words_result_num': 39
288
- }
289
- """
290
- self.data = data
291
-
292
- @classmethod
293
- def init_from_hxl_txt(cls, file):
294
- """ 按训龙给的txt格式来初始化
295
-
296
- 数据格式:
297
- 一共22499行,大概是所有agreement,但可能有些空白图所以没结果
298
- 每行第一段是png完整路径,第二段是百度api返回的json读取为dict后直接print的结果
299
- """
300
- lines = File(file).read().splitlines()
301
- data = dict()
302
- for line in tqdm(lines, '解析txt中每张图对应的字典识别数据'):
303
- if line == '': continue
304
- # 切分路径和json数据
305
- # imfile, dictdata = line.split(maxsplit=1)
306
- imfile, dictdata = re.split(r'\s+', line, maxsplit=1)
307
- if 'debug' in str(imfile): continue # 有65张debug_的图片不要(后记:服务器上多余的65个debug图已删)
308
- data[imfile] = eval(dictdata)
309
-
310
- return cls(data)
311
-
312
- def check(self, imdir):
313
- """ 检查json数据的一些问题 """
314
- with TicToc('缺失的文件'):
315
- files = Path(str(imdir)).glob('*.jpg')
316
- files1 = {f.stem for f in files}
317
- files2 = {Path(f).stem for f in self.data.keys()}
318
- files3 = {Path(f).stem for f in self.data.keys() if ('debug' in str(f))}
319
- print(f'缺失{len(files1 - files2)}个文件的识别结果')
320
- print(f'多出{len(files2 - files1)}个文件的识别结果')
321
- print(f'{len(files3)}个debug文件')
322
- sys.stderr.flush()
323
-
324
- with TicToc('check errors'):
325
- ct = Counter()
326
- for k, v in self.data.items():
327
- if 'error_code' in v:
328
- ct[v['error_code']] += 1
329
- print(k, v)
330
- print(ct.most_common())
331
-
332
- def to_coco_gt(self, images, *, start_dt_id=0):
333
- """ 转成coco格式
334
-
335
- :param images: coco gt格式的参考images (可以train、val各生成一个返回)
336
- TODO 可以扩展支持输入图片所在目录的形式初始化的方法
337
- :param start_dt_id: annotation起始编号
338
- :return: coco gt格式的字典
339
- """
340
- # 辅助数组
341
- image_files = {Path(x['file_name']).stem: x['id'] for x in images}
342
- # 遍历判断要处理的文件
343
- annotations = []
344
- for k, v in tqdm(self.data.items(), '解析出每张图片识别结果对应的annotations'):
345
- stem = Path(k).stem
346
- if stem not in image_files or 'error_code' in v:
347
- continue
348
- image_id = image_files[stem]
349
- for item in v['words_result']:
350
- loc = item['location']
351
- bbox = [loc['left'], loc['top'], loc['width'], loc['height']]
352
- start_dt_id += 1
353
- an = CocoGtData.gen_annotation(id=start_dt_id, bbox=bbox, image_id=image_id, label=item['words'])
354
- annotations.append(an)
355
- return {'images': images,
356
- 'annotations': annotations,
357
- 'categories': CocoGtData.gen_categories(['text'])}
358
-
359
-
360
- if __name__ == '__main__':
361
- os.chdir('D:/home/datasets/textGroup/SROIE2019+/data/task3_testcrop')
362
- with TicToc(__name__):
363
- ld = LabelmeDataset.init_from_coco(r'test', 'data_crop.json')
364
- ld.writes()
365
- # print(ld)
pyxlpr/data/datasets.py DELETED
@@ -1,200 +0,0 @@
1
- #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
- # @Author : 陈坤泽
4
- # @Email : 877362867@qq.com
5
- # @Date : 2021/06/25 09:34
6
-
7
- import pathlib
8
-
9
- from pyxllib.xl import *
10
- from fvcore.common.registry import Registry
11
-
12
- ____basic = """
13
- 基础组件
14
- """
15
-
16
-
17
- class CommonPathBase:
18
- def __init__(self, prefix=None):
19
- if prefix is None:
20
- # 只要在这里设置服务器数据目录;本地目录则会将/根目录映射到D:/盘对应目录
21
- if os.getenv('PYXLPR_COMMONDIR'): # 可以使用PYXLPR_COMMONDIR='D:/'、'/'来自定义数据根目录
22
- prefix = os.getenv('PYXLPR_COMMONDIR')
23
- else:
24
- prefix = 'D:/' if sys.platform == 'win32' else '/'
25
- prefix = XlPath(prefix) # 默认是当前操作系统的文件类型;也可以显示输入PosixPath格式的prefix
26
-
27
- self.datasets = prefix / 'home/datasets'
28
- self.huangzhicai = prefix / 'home/huangzhicai'
29
- self.chenkunze = prefix / 'home/chenkunze'
30
- self.slns = prefix / 'home/chenkunze/slns'
31
-
32
- # slns 相关
33
- self.d2configs = self.slns / 'detectron2/configs'
34
- self.xlproject = self.slns / 'pyxlpr/xlproject'
35
-
36
- # datasets 相关
37
- self.realestate2020 = self.datasets / 'RealEstate2020'
38
- self.realestate_coco = self.datasets / 'RealEstate2020/coco_fmt'
39
-
40
- # textGroup 相关
41
- self.textGroup = self.datasets / 'textGroup'
42
- self.icdar2013 = self.textGroup / 'ICDAR2013'
43
- self.ic13loc = self.textGroup / 'ICDAR2013/Task2.1 Text Localization'
44
- self.publaynet = self.textGroup / 'PubLayNet/publaynet'
45
- self.AHCDB = self.textGroup / 'AHCDB'
46
- self.sroie = self.textGroup / 'SROIE2019' # 智财,占秋原来整理的数据
47
- self.sroie2 = self.textGroup / 'SROIE2019+' # 我重新整理过的数据,并且子目录data里有新的数据
48
- self.cipdf = self.textGroup / 'CIPDF' # 从cninfo下载的pdf文件数据
49
- self.cord = self.textGroup / 'CORD' # 从cninfo下载的pdf文件数据
50
- self.xeocr1 = self.textGroup / 'Xeon1OCR' # 从cninfo下载的pdf文件数据
51
-
52
- # chenkunze
53
- # 项目中一些太大的目录迁移到refdir存储;之前没有想过按chenkunze的目录同步;现在不太需要refdir了
54
- self.refdir = self.chenkunze / 'refdir'
55
-
56
- # huangzhicai
57
- self.zclogs = self.huangzhicai / 'workshop/ocrwork/uniocr/logs'
58
- self.voc2007 = self.huangzhicai / 'data/detec/voc2007/VOCdevkit/VOC2007'
59
-
60
-
61
- if sys.platform == 'win32':
62
- common_path = CommonPathBase()
63
- tp10_common_path = CommonPathBase(pathlib.PurePosixPath('/')) # 十卡服务器的常用目录
64
- else:
65
- common_path = CommonPathBase(XlPath('/'))
66
- tp10_common_path = common_path
67
-
68
- ____coco = """
69
- 普通的coco格式数据
70
-
71
- 目前需求这样的设计模式够了
72
- 1、但其实局限还不少,还有很多不便自定义的(不使用register_coco_instances,更灵活地修改底层)
73
- 2、以及在非d2场景的数据引用
74
-
75
- 不过现在也难想清楚,等后面切实需要的时候再扩展修改
76
- """
77
-
78
- COCO_INSTANCES_REGISTRY = Registry('COCO_INSTANCES')
79
- COCO_INSTANCES_REGISTRY.__doc__ = """
80
- 从数据集字符串名,映射到对应的初始化函数
81
- """
82
-
83
-
84
- class RegisterData:
85
- """ 旧版的数据集注册器,暂时不去修改优化了 """
86
-
87
- @classmethod
88
- def register_all(cls):
89
- with TicToc('RegisterData'):
90
- cls.register_by_annotations_dir(common_path.realestate_coco / 'agreement',
91
- common_path.realestate_coco / 'annotations')
92
-
93
- cls.register_by_annotations_dir(common_path.realestate_coco / 'agreement',
94
- common_path.realestate_coco / 'annotations_det')
95
-
96
- cls.register_by_annotations_dir(common_path.realestate_coco / 'agreement6_shade',
97
- common_path.realestate_coco / 'annotations',
98
- 'agreement_train6_shade.json')
99
-
100
- cls.register_by_annotations_dir(common_path.sroie2 / 'images',
101
- common_path.sroie2 / 'annotations')
102
-
103
- cls.register_by_annotations_dir(common_path.voc2007 / 'JPEGImages',
104
- common_path.voc2007 / 'coco_annotations')
105
-
106
- # 裁剪后的sroie数据
107
- cls.register_by_annotations_dir(common_path.sroie2 / 'data/task3_crop/images',
108
- common_path.sroie2 / 'data/task3_crop')
109
-
110
- cls.register_by_annotations_dir(common_path.cipdf / 'images',
111
- common_path.cipdf / 'annotations')
112
-
113
- @classmethod
114
- def register_by_annotations_dir(cls, imdir, andir,
115
- patter=re.compile(r'.+_(train|val|test|minival)\d{0,}\.json'),
116
- classes=None):
117
- r""" 注册coco类型的数据格式
118
-
119
- :param imdir: 图片所在目录
120
- :param andir: 标注所在目录
121
- 会注册目录下所有以 _[train|val|test]\d{0,}.json 为后缀的文件
122
- :param patter: 在andir下,要匹配分析的json文件
123
- :type patter: str | re.compile
124
- :param classes: 类别清单,例如 ['text', 'title', 'list', 'table', 'figure']
125
- 如果输入该参数,则这批patter匹配的所有文件都以这个classes为准
126
- 否则每个json读取自己的 categories 作为类清单
127
-
128
- 本函数用来简化detectron2中coco类型数据格式的注册过程,基本通用
129
- 但是由于要读取josn获取类别信息,在一些特别大的json读取
130
- """
131
- from detectron2.data.datasets import register_coco_instances
132
-
133
- # 1 标注文件
134
- files = Dir(andir).select(patter).subfiles()
135
-
136
- # 2 注册每一个json文件
137
- for f in files:
138
- if classes:
139
- cats = classes
140
- else:
141
- cats = f.read(encoding='utf8')['categories']
142
- cats = [x['name'] for x in cats]
143
- register_coco_instances(f.stem, {'thing_classes': cats}, str(f), str(imdir))
144
-
145
-
146
- class _DatasetRegister:
147
- ROOT = pathlib.Path('.')
148
- CLASSES = ('text',)
149
- META_DATA = {}
150
-
151
- @classmethod
152
- def coco_instances(cls, name, json, imdir):
153
- def func():
154
- return cls.META_DATA, cls.ROOT / json, cls.ROOT / imdir
155
-
156
- COCO_INSTANCES_REGISTRY._do_register(name, func) # noqa 不用装饰器就只能使用_do_register来注册了
157
-
158
-
159
- class Publaynet(_DatasetRegister):
160
- """ 论文版本分析的数据集 """
161
- ROOT = common_path.publaynet
162
- CLASSES = ['text', 'title', 'list', 'table', 'figure']
163
- META_DATA = {'thing_classes': CLASSES}
164
-
165
-
166
- Publaynet.coco_instances('publaynet_train', 'train_brief.json', 'train')
167
- Publaynet.coco_instances('publaynet_val', 'val.json', 'val')
168
- Publaynet.coco_instances('publaynet_val_mini', 'val_mini.json', 'val_mini')
169
- Publaynet.coco_instances('publaynet_test', 'test_ids.json', 'test')
170
-
171
- # # 也可以这样自定义函数注册数据,函数名就是数据名,然后返回 META_DATA, json, imdir 即可
172
- # @COCO_INSTANCES_Registry.register()
173
- # def publaynet_train():
174
- # return Publaynet.META_DATA, Publaynet.ROOT / 'train_brief.json', Publaynet.ROOT / 'train'
175
-
176
-
177
- ____register = """
178
- 数据集注册器
179
- """
180
-
181
-
182
- def register_d2dataset(name, *, error=None):
183
- """ 注册到 detectron2 的MetadataCatalog、DatasetCatalog中
184
-
185
- :param name: 数据集名称
186
- :return:
187
- """
188
- from detectron2.data import MetadataCatalog
189
- from detectron2.data.datasets import register_coco_instances
190
-
191
- if name in MetadataCatalog.keys():
192
- # 已有的数据,就不用注册了
193
- pass
194
- elif name in COCO_INSTANCES_REGISTRY:
195
- register_coco_instances(name, *(COCO_INSTANCES_REGISTRY.get(name)()))
196
- else:
197
- if error == 'ignore':
198
- pass
199
- else:
200
- raise ValueError(f'未预设的数据集名称 {name}')