pyxllib 0.3.96__py3-none-any.whl → 0.3.200__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (358)
  1. pyxllib/__init__.py +21 -21
  2. pyxllib/algo/__init__.py +8 -8
  3. pyxllib/algo/disjoint.py +54 -54
  4. pyxllib/algo/geo.py +541 -529
  5. pyxllib/algo/intervals.py +964 -964
  6. pyxllib/algo/matcher.py +389 -311
  7. pyxllib/algo/newbie.py +166 -166
  8. pyxllib/algo/pupil.py +629 -461
  9. pyxllib/algo/shapelylib.py +67 -67
  10. pyxllib/algo/specialist.py +241 -240
  11. pyxllib/algo/stat.py +494 -458
  12. pyxllib/algo/treelib.py +149 -149
  13. pyxllib/algo/unitlib.py +66 -66
  14. {pyxlpr → pyxllib/autogui}/__init__.py +5 -5
  15. pyxllib/autogui/activewin.py +246 -0
  16. pyxllib/autogui/all.py +9 -0
  17. pyxllib/{ext/autogui → autogui}/autogui.py +852 -823
  18. pyxllib/autogui/uiautolib.py +362 -0
  19. pyxllib/{ext/autogui → autogui}/virtualkey.py +102 -102
  20. pyxllib/autogui/wechat.py +827 -0
  21. pyxllib/autogui/wechat_msg.py +421 -0
  22. pyxllib/autogui/wxautolib.py +84 -0
  23. pyxllib/cv/__init__.py +5 -5
  24. pyxllib/cv/expert.py +267 -267
  25. pyxllib/cv/imfile.py +159 -159
  26. pyxllib/cv/imhash.py +39 -39
  27. pyxllib/cv/pupil.py +9 -9
  28. pyxllib/cv/rgbfmt.py +1525 -1525
  29. pyxllib/cv/slidercaptcha.py +137 -0
  30. pyxllib/cv/trackbartools.py +251 -251
  31. pyxllib/cv/xlcvlib.py +1040 -1040
  32. pyxllib/cv/xlpillib.py +423 -423
  33. pyxllib/data/echarts.py +240 -129
  34. pyxllib/data/jsonlib.py +89 -0
  35. pyxllib/data/oss.py +72 -72
  36. pyxllib/data/pglib.py +1127 -643
  37. pyxllib/data/sqlite.py +568 -341
  38. pyxllib/data/sqllib.py +297 -297
  39. pyxllib/ext/JLineViewer.py +505 -492
  40. pyxllib/ext/__init__.py +6 -6
  41. pyxllib/ext/demolib.py +246 -246
  42. pyxllib/ext/drissionlib.py +277 -0
  43. pyxllib/ext/kq5034lib.py +12 -1606
  44. pyxllib/ext/old.py +663 -663
  45. pyxllib/ext/qt.py +449 -449
  46. pyxllib/ext/robustprocfile.py +497 -0
  47. pyxllib/ext/seleniumlib.py +76 -76
  48. pyxllib/ext/tk.py +173 -173
  49. pyxllib/ext/unixlib.py +827 -826
  50. pyxllib/ext/utools.py +351 -338
  51. pyxllib/ext/webhook.py +124 -101
  52. pyxllib/ext/win32lib.py +40 -40
  53. pyxllib/ext/wjxlib.py +88 -0
  54. pyxllib/ext/wpsapi.py +124 -0
  55. pyxllib/ext/xlwork.py +9 -0
  56. pyxllib/ext/yuquelib.py +1105 -173
  57. pyxllib/file/__init__.py +17 -17
  58. pyxllib/file/docxlib.py +761 -761
  59. pyxllib/file/gitlib.py +309 -309
  60. pyxllib/file/libreoffice.py +165 -0
  61. pyxllib/file/movielib.py +148 -139
  62. pyxllib/file/newbie.py +10 -10
  63. pyxllib/file/onenotelib.py +1469 -1469
  64. pyxllib/file/packlib/__init__.py +330 -293
  65. pyxllib/file/packlib/zipfile.py +2441 -2441
  66. pyxllib/file/pdflib.py +426 -426
  67. pyxllib/file/pupil.py +185 -185
  68. pyxllib/file/specialist/__init__.py +685 -685
  69. pyxllib/file/specialist/dirlib.py +799 -799
  70. pyxllib/file/specialist/download.py +193 -186
  71. pyxllib/file/specialist/filelib.py +2829 -2618
  72. pyxllib/file/xlsxlib.py +3131 -2976
  73. pyxllib/file/xlsyncfile.py +341 -0
  74. pyxllib/prog/__init__.py +5 -5
  75. pyxllib/prog/cachetools.py +64 -0
  76. pyxllib/prog/deprecatedlib.py +233 -233
  77. pyxllib/prog/filelock.py +42 -0
  78. pyxllib/prog/ipyexec.py +253 -253
  79. pyxllib/prog/multiprogs.py +940 -0
  80. pyxllib/prog/newbie.py +451 -444
  81. pyxllib/prog/pupil.py +1197 -1128
  82. pyxllib/prog/sitepackages.py +33 -33
  83. pyxllib/prog/specialist/__init__.py +391 -217
  84. pyxllib/prog/specialist/bc.py +203 -200
  85. pyxllib/prog/specialist/browser.py +497 -488
  86. pyxllib/prog/specialist/common.py +347 -347
  87. pyxllib/prog/specialist/datetime.py +199 -131
  88. pyxllib/prog/specialist/tictoc.py +240 -241
  89. pyxllib/prog/specialist/xllog.py +180 -180
  90. pyxllib/prog/xlosenv.py +108 -101
  91. pyxllib/stdlib/__init__.py +17 -17
  92. pyxllib/stdlib/tablepyxl/__init__.py +10 -10
  93. pyxllib/stdlib/tablepyxl/style.py +303 -303
  94. pyxllib/stdlib/tablepyxl/tablepyxl.py +130 -130
  95. pyxllib/text/__init__.py +8 -8
  96. pyxllib/text/ahocorasick.py +39 -39
  97. pyxllib/text/airscript.js +744 -0
  98. pyxllib/text/charclasslib.py +121 -109
  99. pyxllib/text/jiebalib.py +267 -264
  100. pyxllib/text/jinjalib.py +32 -0
  101. pyxllib/text/jsa_ai_prompt.md +271 -0
  102. pyxllib/text/jscode.py +922 -767
  103. pyxllib/text/latex/__init__.py +158 -158
  104. pyxllib/text/levenshtein.py +303 -303
  105. pyxllib/text/nestenv.py +1215 -1215
  106. pyxllib/text/newbie.py +300 -288
  107. pyxllib/text/pupil/__init__.py +8 -8
  108. pyxllib/text/pupil/common.py +1121 -1095
  109. pyxllib/text/pupil/xlalign.py +326 -326
  110. pyxllib/text/pycode.py +47 -47
  111. pyxllib/text/specialist/__init__.py +8 -8
  112. pyxllib/text/specialist/common.py +112 -112
  113. pyxllib/text/specialist/ptag.py +186 -186
  114. pyxllib/text/spellchecker.py +172 -172
  115. pyxllib/text/templates/echart_base.html +11 -0
  116. pyxllib/text/templates/highlight_code.html +17 -0
  117. pyxllib/text/templates/latex_editor.html +103 -0
  118. pyxllib/text/vbacode.py +17 -17
  119. pyxllib/text/xmllib.py +747 -685
  120. pyxllib/xl.py +42 -38
  121. pyxllib/xlcv.py +17 -17
  122. pyxllib-0.3.200.dist-info/METADATA +48 -0
  123. pyxllib-0.3.200.dist-info/RECORD +126 -0
  124. {pyxllib-0.3.96.dist-info → pyxllib-0.3.200.dist-info}/WHEEL +1 -2
  125. {pyxllib-0.3.96.dist-info → pyxllib-0.3.200.dist-info/licenses}/LICENSE +190 -190
  126. pyxllib/ext/autogui/__init__.py +0 -8
  127. pyxllib-0.3.96.dist-info/METADATA +0 -51
  128. pyxllib-0.3.96.dist-info/RECORD +0 -333
  129. pyxllib-0.3.96.dist-info/top_level.txt +0 -2
  130. pyxlpr/ai/__init__.py +0 -5
  131. pyxlpr/ai/clientlib.py +0 -1281
  132. pyxlpr/ai/specialist.py +0 -286
  133. pyxlpr/ai/torch_app.py +0 -172
  134. pyxlpr/ai/xlpaddle.py +0 -655
  135. pyxlpr/ai/xltorch.py +0 -705
  136. pyxlpr/data/__init__.py +0 -11
  137. pyxlpr/data/coco.py +0 -1325
  138. pyxlpr/data/datacls.py +0 -365
  139. pyxlpr/data/datasets.py +0 -200
  140. pyxlpr/data/gptlib.py +0 -1291
  141. pyxlpr/data/icdar/__init__.py +0 -96
  142. pyxlpr/data/icdar/deteval.py +0 -377
  143. pyxlpr/data/icdar/icdar2013.py +0 -341
  144. pyxlpr/data/icdar/iou.py +0 -340
  145. pyxlpr/data/icdar/rrc_evaluation_funcs_1_1.py +0 -463
  146. pyxlpr/data/imtextline.py +0 -473
  147. pyxlpr/data/labelme.py +0 -866
  148. pyxlpr/data/removeline.py +0 -179
  149. pyxlpr/data/specialist.py +0 -57
  150. pyxlpr/eval/__init__.py +0 -85
  151. pyxlpr/paddleocr.py +0 -776
  152. pyxlpr/ppocr/__init__.py +0 -15
  153. pyxlpr/ppocr/configs/rec/multi_language/generate_multi_language_configs.py +0 -226
  154. pyxlpr/ppocr/data/__init__.py +0 -135
  155. pyxlpr/ppocr/data/imaug/ColorJitter.py +0 -26
  156. pyxlpr/ppocr/data/imaug/__init__.py +0 -67
  157. pyxlpr/ppocr/data/imaug/copy_paste.py +0 -170
  158. pyxlpr/ppocr/data/imaug/east_process.py +0 -437
  159. pyxlpr/ppocr/data/imaug/gen_table_mask.py +0 -244
  160. pyxlpr/ppocr/data/imaug/iaa_augment.py +0 -114
  161. pyxlpr/ppocr/data/imaug/label_ops.py +0 -789
  162. pyxlpr/ppocr/data/imaug/make_border_map.py +0 -184
  163. pyxlpr/ppocr/data/imaug/make_pse_gt.py +0 -106
  164. pyxlpr/ppocr/data/imaug/make_shrink_map.py +0 -126
  165. pyxlpr/ppocr/data/imaug/operators.py +0 -433
  166. pyxlpr/ppocr/data/imaug/pg_process.py +0 -906
  167. pyxlpr/ppocr/data/imaug/randaugment.py +0 -143
  168. pyxlpr/ppocr/data/imaug/random_crop_data.py +0 -239
  169. pyxlpr/ppocr/data/imaug/rec_img_aug.py +0 -533
  170. pyxlpr/ppocr/data/imaug/sast_process.py +0 -777
  171. pyxlpr/ppocr/data/imaug/text_image_aug/__init__.py +0 -17
  172. pyxlpr/ppocr/data/imaug/text_image_aug/augment.py +0 -120
  173. pyxlpr/ppocr/data/imaug/text_image_aug/warp_mls.py +0 -168
  174. pyxlpr/ppocr/data/lmdb_dataset.py +0 -115
  175. pyxlpr/ppocr/data/pgnet_dataset.py +0 -104
  176. pyxlpr/ppocr/data/pubtab_dataset.py +0 -107
  177. pyxlpr/ppocr/data/simple_dataset.py +0 -372
  178. pyxlpr/ppocr/losses/__init__.py +0 -61
  179. pyxlpr/ppocr/losses/ace_loss.py +0 -52
  180. pyxlpr/ppocr/losses/basic_loss.py +0 -135
  181. pyxlpr/ppocr/losses/center_loss.py +0 -88
  182. pyxlpr/ppocr/losses/cls_loss.py +0 -30
  183. pyxlpr/ppocr/losses/combined_loss.py +0 -67
  184. pyxlpr/ppocr/losses/det_basic_loss.py +0 -208
  185. pyxlpr/ppocr/losses/det_db_loss.py +0 -80
  186. pyxlpr/ppocr/losses/det_east_loss.py +0 -63
  187. pyxlpr/ppocr/losses/det_pse_loss.py +0 -149
  188. pyxlpr/ppocr/losses/det_sast_loss.py +0 -121
  189. pyxlpr/ppocr/losses/distillation_loss.py +0 -272
  190. pyxlpr/ppocr/losses/e2e_pg_loss.py +0 -140
  191. pyxlpr/ppocr/losses/kie_sdmgr_loss.py +0 -113
  192. pyxlpr/ppocr/losses/rec_aster_loss.py +0 -99
  193. pyxlpr/ppocr/losses/rec_att_loss.py +0 -39
  194. pyxlpr/ppocr/losses/rec_ctc_loss.py +0 -44
  195. pyxlpr/ppocr/losses/rec_enhanced_ctc_loss.py +0 -70
  196. pyxlpr/ppocr/losses/rec_nrtr_loss.py +0 -30
  197. pyxlpr/ppocr/losses/rec_sar_loss.py +0 -28
  198. pyxlpr/ppocr/losses/rec_srn_loss.py +0 -47
  199. pyxlpr/ppocr/losses/table_att_loss.py +0 -109
  200. pyxlpr/ppocr/metrics/__init__.py +0 -44
  201. pyxlpr/ppocr/metrics/cls_metric.py +0 -45
  202. pyxlpr/ppocr/metrics/det_metric.py +0 -82
  203. pyxlpr/ppocr/metrics/distillation_metric.py +0 -73
  204. pyxlpr/ppocr/metrics/e2e_metric.py +0 -86
  205. pyxlpr/ppocr/metrics/eval_det_iou.py +0 -274
  206. pyxlpr/ppocr/metrics/kie_metric.py +0 -70
  207. pyxlpr/ppocr/metrics/rec_metric.py +0 -75
  208. pyxlpr/ppocr/metrics/table_metric.py +0 -50
  209. pyxlpr/ppocr/modeling/architectures/__init__.py +0 -32
  210. pyxlpr/ppocr/modeling/architectures/base_model.py +0 -88
  211. pyxlpr/ppocr/modeling/architectures/distillation_model.py +0 -60
  212. pyxlpr/ppocr/modeling/backbones/__init__.py +0 -54
  213. pyxlpr/ppocr/modeling/backbones/det_mobilenet_v3.py +0 -268
  214. pyxlpr/ppocr/modeling/backbones/det_resnet_vd.py +0 -246
  215. pyxlpr/ppocr/modeling/backbones/det_resnet_vd_sast.py +0 -285
  216. pyxlpr/ppocr/modeling/backbones/e2e_resnet_vd_pg.py +0 -265
  217. pyxlpr/ppocr/modeling/backbones/kie_unet_sdmgr.py +0 -186
  218. pyxlpr/ppocr/modeling/backbones/rec_mobilenet_v3.py +0 -138
  219. pyxlpr/ppocr/modeling/backbones/rec_mv1_enhance.py +0 -258
  220. pyxlpr/ppocr/modeling/backbones/rec_nrtr_mtb.py +0 -48
  221. pyxlpr/ppocr/modeling/backbones/rec_resnet_31.py +0 -210
  222. pyxlpr/ppocr/modeling/backbones/rec_resnet_aster.py +0 -143
  223. pyxlpr/ppocr/modeling/backbones/rec_resnet_fpn.py +0 -307
  224. pyxlpr/ppocr/modeling/backbones/rec_resnet_vd.py +0 -286
  225. pyxlpr/ppocr/modeling/heads/__init__.py +0 -54
  226. pyxlpr/ppocr/modeling/heads/cls_head.py +0 -52
  227. pyxlpr/ppocr/modeling/heads/det_db_head.py +0 -118
  228. pyxlpr/ppocr/modeling/heads/det_east_head.py +0 -121
  229. pyxlpr/ppocr/modeling/heads/det_pse_head.py +0 -37
  230. pyxlpr/ppocr/modeling/heads/det_sast_head.py +0 -128
  231. pyxlpr/ppocr/modeling/heads/e2e_pg_head.py +0 -253
  232. pyxlpr/ppocr/modeling/heads/kie_sdmgr_head.py +0 -206
  233. pyxlpr/ppocr/modeling/heads/multiheadAttention.py +0 -163
  234. pyxlpr/ppocr/modeling/heads/rec_aster_head.py +0 -393
  235. pyxlpr/ppocr/modeling/heads/rec_att_head.py +0 -202
  236. pyxlpr/ppocr/modeling/heads/rec_ctc_head.py +0 -88
  237. pyxlpr/ppocr/modeling/heads/rec_nrtr_head.py +0 -826
  238. pyxlpr/ppocr/modeling/heads/rec_sar_head.py +0 -402
  239. pyxlpr/ppocr/modeling/heads/rec_srn_head.py +0 -280
  240. pyxlpr/ppocr/modeling/heads/self_attention.py +0 -406
  241. pyxlpr/ppocr/modeling/heads/table_att_head.py +0 -246
  242. pyxlpr/ppocr/modeling/necks/__init__.py +0 -32
  243. pyxlpr/ppocr/modeling/necks/db_fpn.py +0 -111
  244. pyxlpr/ppocr/modeling/necks/east_fpn.py +0 -188
  245. pyxlpr/ppocr/modeling/necks/fpn.py +0 -138
  246. pyxlpr/ppocr/modeling/necks/pg_fpn.py +0 -314
  247. pyxlpr/ppocr/modeling/necks/rnn.py +0 -92
  248. pyxlpr/ppocr/modeling/necks/sast_fpn.py +0 -284
  249. pyxlpr/ppocr/modeling/necks/table_fpn.py +0 -110
  250. pyxlpr/ppocr/modeling/transforms/__init__.py +0 -28
  251. pyxlpr/ppocr/modeling/transforms/stn.py +0 -135
  252. pyxlpr/ppocr/modeling/transforms/tps.py +0 -308
  253. pyxlpr/ppocr/modeling/transforms/tps_spatial_transformer.py +0 -156
  254. pyxlpr/ppocr/optimizer/__init__.py +0 -61
  255. pyxlpr/ppocr/optimizer/learning_rate.py +0 -228
  256. pyxlpr/ppocr/optimizer/lr_scheduler.py +0 -49
  257. pyxlpr/ppocr/optimizer/optimizer.py +0 -160
  258. pyxlpr/ppocr/optimizer/regularizer.py +0 -52
  259. pyxlpr/ppocr/postprocess/__init__.py +0 -55
  260. pyxlpr/ppocr/postprocess/cls_postprocess.py +0 -33
  261. pyxlpr/ppocr/postprocess/db_postprocess.py +0 -234
  262. pyxlpr/ppocr/postprocess/east_postprocess.py +0 -143
  263. pyxlpr/ppocr/postprocess/locality_aware_nms.py +0 -200
  264. pyxlpr/ppocr/postprocess/pg_postprocess.py +0 -52
  265. pyxlpr/ppocr/postprocess/pse_postprocess/__init__.py +0 -15
  266. pyxlpr/ppocr/postprocess/pse_postprocess/pse/__init__.py +0 -29
  267. pyxlpr/ppocr/postprocess/pse_postprocess/pse/setup.py +0 -14
  268. pyxlpr/ppocr/postprocess/pse_postprocess/pse_postprocess.py +0 -118
  269. pyxlpr/ppocr/postprocess/rec_postprocess.py +0 -654
  270. pyxlpr/ppocr/postprocess/sast_postprocess.py +0 -355
  271. pyxlpr/ppocr/tools/__init__.py +0 -14
  272. pyxlpr/ppocr/tools/eval.py +0 -83
  273. pyxlpr/ppocr/tools/export_center.py +0 -77
  274. pyxlpr/ppocr/tools/export_model.py +0 -129
  275. pyxlpr/ppocr/tools/infer/predict_cls.py +0 -151
  276. pyxlpr/ppocr/tools/infer/predict_det.py +0 -300
  277. pyxlpr/ppocr/tools/infer/predict_e2e.py +0 -169
  278. pyxlpr/ppocr/tools/infer/predict_rec.py +0 -414
  279. pyxlpr/ppocr/tools/infer/predict_system.py +0 -204
  280. pyxlpr/ppocr/tools/infer/utility.py +0 -629
  281. pyxlpr/ppocr/tools/infer_cls.py +0 -83
  282. pyxlpr/ppocr/tools/infer_det.py +0 -134
  283. pyxlpr/ppocr/tools/infer_e2e.py +0 -122
  284. pyxlpr/ppocr/tools/infer_kie.py +0 -153
  285. pyxlpr/ppocr/tools/infer_rec.py +0 -146
  286. pyxlpr/ppocr/tools/infer_table.py +0 -107
  287. pyxlpr/ppocr/tools/program.py +0 -596
  288. pyxlpr/ppocr/tools/test_hubserving.py +0 -117
  289. pyxlpr/ppocr/tools/train.py +0 -163
  290. pyxlpr/ppocr/tools/xlprog.py +0 -748
  291. pyxlpr/ppocr/utils/EN_symbol_dict.txt +0 -94
  292. pyxlpr/ppocr/utils/__init__.py +0 -24
  293. pyxlpr/ppocr/utils/dict/ar_dict.txt +0 -117
  294. pyxlpr/ppocr/utils/dict/arabic_dict.txt +0 -162
  295. pyxlpr/ppocr/utils/dict/be_dict.txt +0 -145
  296. pyxlpr/ppocr/utils/dict/bg_dict.txt +0 -140
  297. pyxlpr/ppocr/utils/dict/chinese_cht_dict.txt +0 -8421
  298. pyxlpr/ppocr/utils/dict/cyrillic_dict.txt +0 -163
  299. pyxlpr/ppocr/utils/dict/devanagari_dict.txt +0 -167
  300. pyxlpr/ppocr/utils/dict/en_dict.txt +0 -63
  301. pyxlpr/ppocr/utils/dict/fa_dict.txt +0 -136
  302. pyxlpr/ppocr/utils/dict/french_dict.txt +0 -136
  303. pyxlpr/ppocr/utils/dict/german_dict.txt +0 -143
  304. pyxlpr/ppocr/utils/dict/hi_dict.txt +0 -162
  305. pyxlpr/ppocr/utils/dict/it_dict.txt +0 -118
  306. pyxlpr/ppocr/utils/dict/japan_dict.txt +0 -4399
  307. pyxlpr/ppocr/utils/dict/ka_dict.txt +0 -153
  308. pyxlpr/ppocr/utils/dict/korean_dict.txt +0 -3688
  309. pyxlpr/ppocr/utils/dict/latin_dict.txt +0 -185
  310. pyxlpr/ppocr/utils/dict/mr_dict.txt +0 -153
  311. pyxlpr/ppocr/utils/dict/ne_dict.txt +0 -153
  312. pyxlpr/ppocr/utils/dict/oc_dict.txt +0 -96
  313. pyxlpr/ppocr/utils/dict/pu_dict.txt +0 -130
  314. pyxlpr/ppocr/utils/dict/rs_dict.txt +0 -91
  315. pyxlpr/ppocr/utils/dict/rsc_dict.txt +0 -134
  316. pyxlpr/ppocr/utils/dict/ru_dict.txt +0 -125
  317. pyxlpr/ppocr/utils/dict/ta_dict.txt +0 -128
  318. pyxlpr/ppocr/utils/dict/table_dict.txt +0 -277
  319. pyxlpr/ppocr/utils/dict/table_structure_dict.txt +0 -2759
  320. pyxlpr/ppocr/utils/dict/te_dict.txt +0 -151
  321. pyxlpr/ppocr/utils/dict/ug_dict.txt +0 -114
  322. pyxlpr/ppocr/utils/dict/uk_dict.txt +0 -142
  323. pyxlpr/ppocr/utils/dict/ur_dict.txt +0 -137
  324. pyxlpr/ppocr/utils/dict/xi_dict.txt +0 -110
  325. pyxlpr/ppocr/utils/dict90.txt +0 -90
  326. pyxlpr/ppocr/utils/e2e_metric/Deteval.py +0 -574
  327. pyxlpr/ppocr/utils/e2e_metric/polygon_fast.py +0 -83
  328. pyxlpr/ppocr/utils/e2e_utils/extract_batchsize.py +0 -87
  329. pyxlpr/ppocr/utils/e2e_utils/extract_textpoint_fast.py +0 -457
  330. pyxlpr/ppocr/utils/e2e_utils/extract_textpoint_slow.py +0 -592
  331. pyxlpr/ppocr/utils/e2e_utils/pgnet_pp_utils.py +0 -162
  332. pyxlpr/ppocr/utils/e2e_utils/visual.py +0 -162
  333. pyxlpr/ppocr/utils/en_dict.txt +0 -95
  334. pyxlpr/ppocr/utils/gen_label.py +0 -81
  335. pyxlpr/ppocr/utils/ic15_dict.txt +0 -36
  336. pyxlpr/ppocr/utils/iou.py +0 -54
  337. pyxlpr/ppocr/utils/logging.py +0 -69
  338. pyxlpr/ppocr/utils/network.py +0 -84
  339. pyxlpr/ppocr/utils/ppocr_keys_v1.txt +0 -6623
  340. pyxlpr/ppocr/utils/profiler.py +0 -110
  341. pyxlpr/ppocr/utils/save_load.py +0 -150
  342. pyxlpr/ppocr/utils/stats.py +0 -72
  343. pyxlpr/ppocr/utils/utility.py +0 -80
  344. pyxlpr/ppstructure/__init__.py +0 -13
  345. pyxlpr/ppstructure/predict_system.py +0 -187
  346. pyxlpr/ppstructure/table/__init__.py +0 -13
  347. pyxlpr/ppstructure/table/eval_table.py +0 -72
  348. pyxlpr/ppstructure/table/matcher.py +0 -192
  349. pyxlpr/ppstructure/table/predict_structure.py +0 -136
  350. pyxlpr/ppstructure/table/predict_table.py +0 -221
  351. pyxlpr/ppstructure/table/table_metric/__init__.py +0 -16
  352. pyxlpr/ppstructure/table/table_metric/parallel.py +0 -51
  353. pyxlpr/ppstructure/table/table_metric/table_metric.py +0 -247
  354. pyxlpr/ppstructure/table/tablepyxl/__init__.py +0 -13
  355. pyxlpr/ppstructure/table/tablepyxl/style.py +0 -283
  356. pyxlpr/ppstructure/table/tablepyxl/tablepyxl.py +0 -118
  357. pyxlpr/ppstructure/utility.py +0 -71
  358. pyxlpr/xlai.py +0 -10
pyxllib/text/levenshtein.py
@@ -1,303 +1,303 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @Author : 陈坤泽
# @Email : 877362867@qq.com
# @Date : 2021/06/06 17:01

from pyxllib.prog.pupil import check_install_package

# This requires a C++14 compiler: https://download.microsoft.com/download/5/f/7/5f7acaeb-8363-451f-9425-68a90f98b238/visualcppbuildtools_full.exe
# Install it only when needed, so that someone who just wants pyxllib's simple
# features is not burdened at pip install time
# MatchSimString needs it to compute edit distances
check_install_package('Levenshtein', 'python-Levenshtein')

from collections import defaultdict
from more_itertools import chunked
import warnings

import Levenshtein
import numpy as np
import pandas as pd

from pyxllib.prog.pupil import run_once
from pyxllib.prog.specialist import dataframe_str
from pyxllib.text.pupil import briefstr

# Ignore one specific warning
warnings.filterwarnings("ignore", category=FutureWarning,
                        module="sklearn.cluster._agglomerative",
                        lineno=1005)


@run_once('str')
def get_levenshtein_similar(x, y):
    """ Cache the edit-distance similarity between pairs of strings """
    return Levenshtein.ratio(x, y)


class MatchSimString:
    """ Fuzzy string matching

    mss = MatchSimString()

    # 1 Add candidates
    mss.append_candidate('福州+厦门2018初数暑假讲义-请录入-快乐学习\初一福厦培优-测试用')
    mss.append_candidate('2018_快乐数学_六年级_秋季_第01讲_圆柱与圆锥_教案(教师版)')
    mss.append_candidate('删除所有标签中间多余的空白')

    # 2 First query string
    s = '奕本初一福周厦门培油'

    idx, sim = mss.match(s)
    print('match:', mss[idx])  # match: 福州+厦门2018初数暑假讲义-请录入-快乐学习\初一福厦培优-测试用
    print('similarity:', sim)  # similarity: 0.22

    # 3 Second query string
    s = '圆柱与【圆锥】_教案空白版'

    idx, sim = mss.match(s)
    print('match:', mss[idx])  # 2018_快乐数学_六年级_秋季_第01讲_圆柱与圆锥_教案(教师版)
    print('similarity:', sim)  # similarity: 0.375

    If append_candidate was given a second, extra-info argument, it can be
    retrieved by index:
    mss.ext_value[idx]
    """
66
- def __init__(self, method=briefstr):
67
- self.preproc = method
68
- self.origin_str = [] # 原始字符串内容
69
- self.key_str = [] # 对原始字符串进行处理后的字符
70
- self.ext_value = [] # 扩展存储一些信息
71
-
72
- def __getitem__(self, item):
73
- return self.origin_str[item]
74
-
75
- def __delitem__(self, item):
76
- del self.origin_str[item]
77
- del self.key_str[item]
78
- del self.ext_value[item]
79
-
80
- def __len__(self):
81
- return len(self.key_str)
82
-
83
- def get_similarity(self, x, y):
84
- """ 计算两对数据之间的相似度 """
85
- pass
86
-
87
- def append_candidate(self, k, v=None):
88
- self.origin_str.append(k)
89
- if callable(self.preproc):
90
- k = self.preproc(k)
91
- self.key_str.append(k)
92
- self.ext_value.append(v)
93
-

    def match(self, s):
        """ Match s against the candidate strings and return the best match """
        idx, sim = -1, 0
        for i in range(len(self)):
            sim_ = Levenshtein.ratio(self.key_str[i], s)
            if sim_ > sim:
                sim = sim_
                idx = i
        return idx, sim

    def match_many(self, s, count=1):
        """ Match s against the candidate strings and return the top matches
        :param str s: the query string
        :param int count: how many matches to return
        :return: a list of (idx, sim) pairs
        """
        scores = [(i, Levenshtein.ratio(self.key_str[i], s)) for i in range(len(self))]
        # Sort by similarity and keep the top count results
        return sorted(scores, key=lambda x: x[1], reverse=True)[:count]
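
    # A minimal usage sketch for match_many (the data here is illustrative, not
    # from the package):
    #   mss = MatchSimString()
    #   for text in ('alpha', 'alphas', 'beta'):
    #       mss.append_candidate(text)
    #   mss.match_many('alpha1', count=2)  # -> [(idx, sim), ...], best match first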

    def match_test(self, s, count=-1, showstr=lambda x: x[:50]):
        """ Fuzzy-match a string s against the candidates and print a report

        :param s: the query string
        :param count: limit how much of the match list is printed
            -1: print all matches
            0 < count < 1: e.g. 0.4 prints only the top 40% of matches
            positive integer: print only the top count matches
        :param showstr: how candidate strings are rendered
        """
        # 1 Compute the similarities and collect them in res
        res = []
        n = len(self)
        for i in range(n):
            k, v = self.key_str[i], self.ext_value[i]
            sim = Levenshtein.ratio(k, s)
            res.append([i, v, sim, showstr(k)])  # rows are numbered from 0 in the output

        # 2 Sort and trim the results
        res = sorted(res, key=lambda x: -x[2])
        if 0 < count < 1:
            n = max(1, int(n * count))
        elif isinstance(count, int) and count > 0:
            n = min(count, n)
        res = res[:n]

        # 3 Print
        df = pd.DataFrame.from_records(res, columns=('no.', 'tag', 'similarity', 'content'))
        s = dataframe_str(df)
        s = s.replace('\u2022', '')  # strip a character texstudio cannot display
        print(s)

    def agglomerative_clustering(self, threshold=0.5):
        """ Hierarchically cluster the stored strings

        :param threshold: roughly a distance threshold; strings within this
            distance are clustered together
            the smaller the value, the more (and finer-grained) the clusters
        """
        check_install_package('sklearn', 'scikit-learn')
        from sklearn.cluster import AgglomerativeClustering

        # 1 Build the pairwise distance matrix
        distance_matrix = np.zeros((len(self), len(self)))
        for i in range(len(self)):
            for j in range(i + 1, len(self)):
                # We need a distance, so subtract the similarity from 1
                distance = 1 - Levenshtein.ratio(self.key_str[i], self.key_str[j])
                distance_matrix[i, j] = distance_matrix[j, i] = distance

        # Run the hierarchical clustering
        clustering = AgglomerativeClustering(n_clusters=None, affinity='precomputed',
                                             distance_threshold=threshold,
                                             linkage='complete')
        labels = clustering.fit_predict(distance_matrix)

        return labels

    def display_clusters(self, threshold=0.5):
        """ Show the members of each cluster found by agglomerative_clustering """

        labels = self.agglomerative_clustering(threshold=threshold)
        cluster_dict = defaultdict(list)

        # Group the original strings by cluster label
        for idx, label in enumerate(labels):
            cluster_dict[label].append(self.origin_str[idx])

        # Sort the clusters by size, largest first
        result = {}
        for label, items in sorted(cluster_dict.items(), key=lambda x: -len(x[1])):
            result[label] = items

        return result
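

# A minimal sketch of the clustering helpers above (data and threshold are
# illustrative; note that newer scikit-learn releases rename this parameter of
# AgglomerativeClustering from ``affinity`` to ``metric``, which is likely the
# FutureWarning this module filters at the top):
#   mss = MatchSimString()
#   for text in ('abc draft v1', 'abc draft v2', 'unrelated note'):
#       mss.append_candidate(text)
#   mss.display_clusters(threshold=0.5)  # -> {label: [strings in that cluster], ...}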
193
- class HierarchicalMatchSimString(MatchSimString):
194
- """ 在面对数据量很大的候选数据情况下,建议使用这个层次聚类后的匹配方法 """
195
-
196
- def __init__(self, method=briefstr):
197
- super().__init__(method)
198
- self.groups = dict()
199
-

    def get_center_sample(self, indices=None):
        """ Given a set of indices, find the center sample; with no argument,
        search across all samples """
        if indices is None:
            indices = range(len(self))

        # Cache previously computed results
        cached_results = {}

        def get_similarity(i, j):
            """ Similarity between two indices, cached to avoid recomputation """
            if (i, j) in cached_results:
                return cached_results[(i, j)]
            sim_val = Levenshtein.ratio(self.key_str[i], self.key_str[j])
            cached_results[(i, j)] = cached_results[(j, i)] = sim_val
            return sim_val

        center_idx = max(indices, key=lambda x: sum(get_similarity(x, y) for y in indices))
        return center_idx

    def merge_group(self, indices, threshold=0.5, strategy='center'):
        """ Merge the given list of indices into groups using the threshold
        Returns a dict whose keys are representative samples and whose values
        are the indices in each group

        :param strategy: how the representative sample is chosen
            center: the center sample
            first: the first sample
        """
        check_install_package('sklearn', 'scikit-learn')
        from sklearn.cluster import AgglomerativeClustering

        # 1 Build the pairwise distance matrix
        n = len(indices)
        distance_matrix = np.zeros((n, n))
        for i in range(n):
            for j in range(i + 1, n):
                # We need a distance, so subtract the similarity from 1
                distance = 1 - Levenshtein.ratio(self.key_str[indices[i]], self.key_str[indices[j]])
                distance_matrix[i, j] = distance_matrix[j, i] = distance

        # Run the hierarchical clustering
        clustering = AgglomerativeClustering(n_clusters=None, affinity='precomputed',
                                             distance_threshold=threshold,
                                             linkage='complete')
        labels = clustering.fit_predict(distance_matrix)

        # 2 Group the indices by cluster label
        cluster_dict = defaultdict(list)
        for i, label in enumerate(labels):
            cluster_dict[label].append(indices[i])

        # 3 Re-key each group by a representative sample, sorted by group size descending
        result = {}
        for label, items in sorted(cluster_dict.items(), key=lambda x: -len(x[1])):
            if strategy == 'first':
                representative = items[0]
            elif strategy == 'center':
                # Use local indices to compute the mean distances
                local_indices = [i for i, idx in enumerate(indices) if idx in items]
                sub_matrix = distance_matrix[np.ix_(local_indices, local_indices)]
                avg_distances = sub_matrix.mean(axis=1)
                representative_idx = np.argmin(avg_distances)
                representative = items[representative_idx]
            else:
                raise ValueError(f'Invalid strategy: {strategy}')
            result[representative] = items

        return result
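
    # Illustrative call (hypothetical indices): merge one batch of candidates,
    # keying each group by its center sample:
    #   hms.merge_group([0, 1, 2, 3], threshold=0.5, strategy='center')
    #   # -> {representative index: [member indices], ...}, largest group first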

    def init_groups(self, threshold=0.5, batch_size=1000):
        """
        :param threshold: grouping threshold; samples within this distance end
            up in one group
        :param batch_size: the data may be too large to compare all pairs at
            once, so it can be processed in batches; the result is less exact,
            but the amount of computation drops considerably
        """
        # 1 Initially, every sample is its own group
        groups = {i: [i] for i in range(len(self))}
        new_groups = {}

        # 2 Keep merging until the number of groups stops changing
        while len(groups) > 1:
            for indices in chunked(groups.keys(), batch_size):
                # The values of the original groups are concatenated to match the merged keys
                indices2 = self.merge_group(indices, threshold=threshold)
                for idx, idxs in indices2.items():
                    # Fetch the members of the original groups
                    original_idxs = [groups[original_idx] for original_idx in idxs]
                    # Flatten the lists into the new group
                    new_groups[idx] = [item for sublist in original_idxs for item in sublist]

            # Stop once the grouping no longer changes
            if len(new_groups) == len(groups):
                break

            groups = new_groups
            new_groups = {}

        # 3 Sort the groups by size, largest first
        new_groups = {}
        for label, items in sorted(groups.items(), key=lambda x: -len(x[1])):
            new_groups[label] = items  # for now the first occurrence serves as the representative

        self.groups = new_groups
        return self.groups
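
A minimal sketch of how the hierarchical matcher above might be driven (the import path follows the levenshtein.py module shown; the candidate strings and threshold are illustrative, not from the package):

    from pyxllib.text.levenshtein import HierarchicalMatchSimString

    hms = HierarchicalMatchSimString()
    for text in ('apple pie', 'aple pie', 'banana bread', 'banana loaf'):
        hms.append_candidate(text)
    groups = hms.init_groups(threshold=0.5, batch_size=1000)
    print(groups)  # {representative index: [member indices], ...}, largest group first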