pyxllib 0.3.96__py3-none-any.whl → 0.3.200__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (358)
  1. pyxllib/__init__.py +21 -21
  2. pyxllib/algo/__init__.py +8 -8
  3. pyxllib/algo/disjoint.py +54 -54
  4. pyxllib/algo/geo.py +541 -529
  5. pyxllib/algo/intervals.py +964 -964
  6. pyxllib/algo/matcher.py +389 -311
  7. pyxllib/algo/newbie.py +166 -166
  8. pyxllib/algo/pupil.py +629 -461
  9. pyxllib/algo/shapelylib.py +67 -67
  10. pyxllib/algo/specialist.py +241 -240
  11. pyxllib/algo/stat.py +494 -458
  12. pyxllib/algo/treelib.py +149 -149
  13. pyxllib/algo/unitlib.py +66 -66
  14. {pyxlpr → pyxllib/autogui}/__init__.py +5 -5
  15. pyxllib/autogui/activewin.py +246 -0
  16. pyxllib/autogui/all.py +9 -0
  17. pyxllib/{ext/autogui → autogui}/autogui.py +852 -823
  18. pyxllib/autogui/uiautolib.py +362 -0
  19. pyxllib/{ext/autogui → autogui}/virtualkey.py +102 -102
  20. pyxllib/autogui/wechat.py +827 -0
  21. pyxllib/autogui/wechat_msg.py +421 -0
  22. pyxllib/autogui/wxautolib.py +84 -0
  23. pyxllib/cv/__init__.py +5 -5
  24. pyxllib/cv/expert.py +267 -267
  25. pyxllib/cv/imfile.py +159 -159
  26. pyxllib/cv/imhash.py +39 -39
  27. pyxllib/cv/pupil.py +9 -9
  28. pyxllib/cv/rgbfmt.py +1525 -1525
  29. pyxllib/cv/slidercaptcha.py +137 -0
  30. pyxllib/cv/trackbartools.py +251 -251
  31. pyxllib/cv/xlcvlib.py +1040 -1040
  32. pyxllib/cv/xlpillib.py +423 -423
  33. pyxllib/data/echarts.py +240 -129
  34. pyxllib/data/jsonlib.py +89 -0
  35. pyxllib/data/oss.py +72 -72
  36. pyxllib/data/pglib.py +1127 -643
  37. pyxllib/data/sqlite.py +568 -341
  38. pyxllib/data/sqllib.py +297 -297
  39. pyxllib/ext/JLineViewer.py +505 -492
  40. pyxllib/ext/__init__.py +6 -6
  41. pyxllib/ext/demolib.py +246 -246
  42. pyxllib/ext/drissionlib.py +277 -0
  43. pyxllib/ext/kq5034lib.py +12 -1606
  44. pyxllib/ext/old.py +663 -663
  45. pyxllib/ext/qt.py +449 -449
  46. pyxllib/ext/robustprocfile.py +497 -0
  47. pyxllib/ext/seleniumlib.py +76 -76
  48. pyxllib/ext/tk.py +173 -173
  49. pyxllib/ext/unixlib.py +827 -826
  50. pyxllib/ext/utools.py +351 -338
  51. pyxllib/ext/webhook.py +124 -101
  52. pyxllib/ext/win32lib.py +40 -40
  53. pyxllib/ext/wjxlib.py +88 -0
  54. pyxllib/ext/wpsapi.py +124 -0
  55. pyxllib/ext/xlwork.py +9 -0
  56. pyxllib/ext/yuquelib.py +1105 -173
  57. pyxllib/file/__init__.py +17 -17
  58. pyxllib/file/docxlib.py +761 -761
  59. pyxllib/file/gitlib.py +309 -309
  60. pyxllib/file/libreoffice.py +165 -0
  61. pyxllib/file/movielib.py +148 -139
  62. pyxllib/file/newbie.py +10 -10
  63. pyxllib/file/onenotelib.py +1469 -1469
  64. pyxllib/file/packlib/__init__.py +330 -293
  65. pyxllib/file/packlib/zipfile.py +2441 -2441
  66. pyxllib/file/pdflib.py +426 -426
  67. pyxllib/file/pupil.py +185 -185
  68. pyxllib/file/specialist/__init__.py +685 -685
  69. pyxllib/file/specialist/dirlib.py +799 -799
  70. pyxllib/file/specialist/download.py +193 -186
  71. pyxllib/file/specialist/filelib.py +2829 -2618
  72. pyxllib/file/xlsxlib.py +3131 -2976
  73. pyxllib/file/xlsyncfile.py +341 -0
  74. pyxllib/prog/__init__.py +5 -5
  75. pyxllib/prog/cachetools.py +64 -0
  76. pyxllib/prog/deprecatedlib.py +233 -233
  77. pyxllib/prog/filelock.py +42 -0
  78. pyxllib/prog/ipyexec.py +253 -253
  79. pyxllib/prog/multiprogs.py +940 -0
  80. pyxllib/prog/newbie.py +451 -444
  81. pyxllib/prog/pupil.py +1197 -1128
  82. pyxllib/prog/sitepackages.py +33 -33
  83. pyxllib/prog/specialist/__init__.py +391 -217
  84. pyxllib/prog/specialist/bc.py +203 -200
  85. pyxllib/prog/specialist/browser.py +497 -488
  86. pyxllib/prog/specialist/common.py +347 -347
  87. pyxllib/prog/specialist/datetime.py +199 -131
  88. pyxllib/prog/specialist/tictoc.py +240 -241
  89. pyxllib/prog/specialist/xllog.py +180 -180
  90. pyxllib/prog/xlosenv.py +108 -101
  91. pyxllib/stdlib/__init__.py +17 -17
  92. pyxllib/stdlib/tablepyxl/__init__.py +10 -10
  93. pyxllib/stdlib/tablepyxl/style.py +303 -303
  94. pyxllib/stdlib/tablepyxl/tablepyxl.py +130 -130
  95. pyxllib/text/__init__.py +8 -8
  96. pyxllib/text/ahocorasick.py +39 -39
  97. pyxllib/text/airscript.js +744 -0
  98. pyxllib/text/charclasslib.py +121 -109
  99. pyxllib/text/jiebalib.py +267 -264
  100. pyxllib/text/jinjalib.py +32 -0
  101. pyxllib/text/jsa_ai_prompt.md +271 -0
  102. pyxllib/text/jscode.py +922 -767
  103. pyxllib/text/latex/__init__.py +158 -158
  104. pyxllib/text/levenshtein.py +303 -303
  105. pyxllib/text/nestenv.py +1215 -1215
  106. pyxllib/text/newbie.py +300 -288
  107. pyxllib/text/pupil/__init__.py +8 -8
  108. pyxllib/text/pupil/common.py +1121 -1095
  109. pyxllib/text/pupil/xlalign.py +326 -326
  110. pyxllib/text/pycode.py +47 -47
  111. pyxllib/text/specialist/__init__.py +8 -8
  112. pyxllib/text/specialist/common.py +112 -112
  113. pyxllib/text/specialist/ptag.py +186 -186
  114. pyxllib/text/spellchecker.py +172 -172
  115. pyxllib/text/templates/echart_base.html +11 -0
  116. pyxllib/text/templates/highlight_code.html +17 -0
  117. pyxllib/text/templates/latex_editor.html +103 -0
  118. pyxllib/text/vbacode.py +17 -17
  119. pyxllib/text/xmllib.py +747 -685
  120. pyxllib/xl.py +42 -38
  121. pyxllib/xlcv.py +17 -17
  122. pyxllib-0.3.200.dist-info/METADATA +48 -0
  123. pyxllib-0.3.200.dist-info/RECORD +126 -0
  124. {pyxllib-0.3.96.dist-info → pyxllib-0.3.200.dist-info}/WHEEL +1 -2
  125. {pyxllib-0.3.96.dist-info → pyxllib-0.3.200.dist-info/licenses}/LICENSE +190 -190
  126. pyxllib/ext/autogui/__init__.py +0 -8
  127. pyxllib-0.3.96.dist-info/METADATA +0 -51
  128. pyxllib-0.3.96.dist-info/RECORD +0 -333
  129. pyxllib-0.3.96.dist-info/top_level.txt +0 -2
  130. pyxlpr/ai/__init__.py +0 -5
  131. pyxlpr/ai/clientlib.py +0 -1281
  132. pyxlpr/ai/specialist.py +0 -286
  133. pyxlpr/ai/torch_app.py +0 -172
  134. pyxlpr/ai/xlpaddle.py +0 -655
  135. pyxlpr/ai/xltorch.py +0 -705
  136. pyxlpr/data/__init__.py +0 -11
  137. pyxlpr/data/coco.py +0 -1325
  138. pyxlpr/data/datacls.py +0 -365
  139. pyxlpr/data/datasets.py +0 -200
  140. pyxlpr/data/gptlib.py +0 -1291
  141. pyxlpr/data/icdar/__init__.py +0 -96
  142. pyxlpr/data/icdar/deteval.py +0 -377
  143. pyxlpr/data/icdar/icdar2013.py +0 -341
  144. pyxlpr/data/icdar/iou.py +0 -340
  145. pyxlpr/data/icdar/rrc_evaluation_funcs_1_1.py +0 -463
  146. pyxlpr/data/imtextline.py +0 -473
  147. pyxlpr/data/labelme.py +0 -866
  148. pyxlpr/data/removeline.py +0 -179
  149. pyxlpr/data/specialist.py +0 -57
  150. pyxlpr/eval/__init__.py +0 -85
  151. pyxlpr/paddleocr.py +0 -776
  152. pyxlpr/ppocr/__init__.py +0 -15
  153. pyxlpr/ppocr/configs/rec/multi_language/generate_multi_language_configs.py +0 -226
  154. pyxlpr/ppocr/data/__init__.py +0 -135
  155. pyxlpr/ppocr/data/imaug/ColorJitter.py +0 -26
  156. pyxlpr/ppocr/data/imaug/__init__.py +0 -67
  157. pyxlpr/ppocr/data/imaug/copy_paste.py +0 -170
  158. pyxlpr/ppocr/data/imaug/east_process.py +0 -437
  159. pyxlpr/ppocr/data/imaug/gen_table_mask.py +0 -244
  160. pyxlpr/ppocr/data/imaug/iaa_augment.py +0 -114
  161. pyxlpr/ppocr/data/imaug/label_ops.py +0 -789
  162. pyxlpr/ppocr/data/imaug/make_border_map.py +0 -184
  163. pyxlpr/ppocr/data/imaug/make_pse_gt.py +0 -106
  164. pyxlpr/ppocr/data/imaug/make_shrink_map.py +0 -126
  165. pyxlpr/ppocr/data/imaug/operators.py +0 -433
  166. pyxlpr/ppocr/data/imaug/pg_process.py +0 -906
  167. pyxlpr/ppocr/data/imaug/randaugment.py +0 -143
  168. pyxlpr/ppocr/data/imaug/random_crop_data.py +0 -239
  169. pyxlpr/ppocr/data/imaug/rec_img_aug.py +0 -533
  170. pyxlpr/ppocr/data/imaug/sast_process.py +0 -777
  171. pyxlpr/ppocr/data/imaug/text_image_aug/__init__.py +0 -17
  172. pyxlpr/ppocr/data/imaug/text_image_aug/augment.py +0 -120
  173. pyxlpr/ppocr/data/imaug/text_image_aug/warp_mls.py +0 -168
  174. pyxlpr/ppocr/data/lmdb_dataset.py +0 -115
  175. pyxlpr/ppocr/data/pgnet_dataset.py +0 -104
  176. pyxlpr/ppocr/data/pubtab_dataset.py +0 -107
  177. pyxlpr/ppocr/data/simple_dataset.py +0 -372
  178. pyxlpr/ppocr/losses/__init__.py +0 -61
  179. pyxlpr/ppocr/losses/ace_loss.py +0 -52
  180. pyxlpr/ppocr/losses/basic_loss.py +0 -135
  181. pyxlpr/ppocr/losses/center_loss.py +0 -88
  182. pyxlpr/ppocr/losses/cls_loss.py +0 -30
  183. pyxlpr/ppocr/losses/combined_loss.py +0 -67
  184. pyxlpr/ppocr/losses/det_basic_loss.py +0 -208
  185. pyxlpr/ppocr/losses/det_db_loss.py +0 -80
  186. pyxlpr/ppocr/losses/det_east_loss.py +0 -63
  187. pyxlpr/ppocr/losses/det_pse_loss.py +0 -149
  188. pyxlpr/ppocr/losses/det_sast_loss.py +0 -121
  189. pyxlpr/ppocr/losses/distillation_loss.py +0 -272
  190. pyxlpr/ppocr/losses/e2e_pg_loss.py +0 -140
  191. pyxlpr/ppocr/losses/kie_sdmgr_loss.py +0 -113
  192. pyxlpr/ppocr/losses/rec_aster_loss.py +0 -99
  193. pyxlpr/ppocr/losses/rec_att_loss.py +0 -39
  194. pyxlpr/ppocr/losses/rec_ctc_loss.py +0 -44
  195. pyxlpr/ppocr/losses/rec_enhanced_ctc_loss.py +0 -70
  196. pyxlpr/ppocr/losses/rec_nrtr_loss.py +0 -30
  197. pyxlpr/ppocr/losses/rec_sar_loss.py +0 -28
  198. pyxlpr/ppocr/losses/rec_srn_loss.py +0 -47
  199. pyxlpr/ppocr/losses/table_att_loss.py +0 -109
  200. pyxlpr/ppocr/metrics/__init__.py +0 -44
  201. pyxlpr/ppocr/metrics/cls_metric.py +0 -45
  202. pyxlpr/ppocr/metrics/det_metric.py +0 -82
  203. pyxlpr/ppocr/metrics/distillation_metric.py +0 -73
  204. pyxlpr/ppocr/metrics/e2e_metric.py +0 -86
  205. pyxlpr/ppocr/metrics/eval_det_iou.py +0 -274
  206. pyxlpr/ppocr/metrics/kie_metric.py +0 -70
  207. pyxlpr/ppocr/metrics/rec_metric.py +0 -75
  208. pyxlpr/ppocr/metrics/table_metric.py +0 -50
  209. pyxlpr/ppocr/modeling/architectures/__init__.py +0 -32
  210. pyxlpr/ppocr/modeling/architectures/base_model.py +0 -88
  211. pyxlpr/ppocr/modeling/architectures/distillation_model.py +0 -60
  212. pyxlpr/ppocr/modeling/backbones/__init__.py +0 -54
  213. pyxlpr/ppocr/modeling/backbones/det_mobilenet_v3.py +0 -268
  214. pyxlpr/ppocr/modeling/backbones/det_resnet_vd.py +0 -246
  215. pyxlpr/ppocr/modeling/backbones/det_resnet_vd_sast.py +0 -285
  216. pyxlpr/ppocr/modeling/backbones/e2e_resnet_vd_pg.py +0 -265
  217. pyxlpr/ppocr/modeling/backbones/kie_unet_sdmgr.py +0 -186
  218. pyxlpr/ppocr/modeling/backbones/rec_mobilenet_v3.py +0 -138
  219. pyxlpr/ppocr/modeling/backbones/rec_mv1_enhance.py +0 -258
  220. pyxlpr/ppocr/modeling/backbones/rec_nrtr_mtb.py +0 -48
  221. pyxlpr/ppocr/modeling/backbones/rec_resnet_31.py +0 -210
  222. pyxlpr/ppocr/modeling/backbones/rec_resnet_aster.py +0 -143
  223. pyxlpr/ppocr/modeling/backbones/rec_resnet_fpn.py +0 -307
  224. pyxlpr/ppocr/modeling/backbones/rec_resnet_vd.py +0 -286
  225. pyxlpr/ppocr/modeling/heads/__init__.py +0 -54
  226. pyxlpr/ppocr/modeling/heads/cls_head.py +0 -52
  227. pyxlpr/ppocr/modeling/heads/det_db_head.py +0 -118
  228. pyxlpr/ppocr/modeling/heads/det_east_head.py +0 -121
  229. pyxlpr/ppocr/modeling/heads/det_pse_head.py +0 -37
  230. pyxlpr/ppocr/modeling/heads/det_sast_head.py +0 -128
  231. pyxlpr/ppocr/modeling/heads/e2e_pg_head.py +0 -253
  232. pyxlpr/ppocr/modeling/heads/kie_sdmgr_head.py +0 -206
  233. pyxlpr/ppocr/modeling/heads/multiheadAttention.py +0 -163
  234. pyxlpr/ppocr/modeling/heads/rec_aster_head.py +0 -393
  235. pyxlpr/ppocr/modeling/heads/rec_att_head.py +0 -202
  236. pyxlpr/ppocr/modeling/heads/rec_ctc_head.py +0 -88
  237. pyxlpr/ppocr/modeling/heads/rec_nrtr_head.py +0 -826
  238. pyxlpr/ppocr/modeling/heads/rec_sar_head.py +0 -402
  239. pyxlpr/ppocr/modeling/heads/rec_srn_head.py +0 -280
  240. pyxlpr/ppocr/modeling/heads/self_attention.py +0 -406
  241. pyxlpr/ppocr/modeling/heads/table_att_head.py +0 -246
  242. pyxlpr/ppocr/modeling/necks/__init__.py +0 -32
  243. pyxlpr/ppocr/modeling/necks/db_fpn.py +0 -111
  244. pyxlpr/ppocr/modeling/necks/east_fpn.py +0 -188
  245. pyxlpr/ppocr/modeling/necks/fpn.py +0 -138
  246. pyxlpr/ppocr/modeling/necks/pg_fpn.py +0 -314
  247. pyxlpr/ppocr/modeling/necks/rnn.py +0 -92
  248. pyxlpr/ppocr/modeling/necks/sast_fpn.py +0 -284
  249. pyxlpr/ppocr/modeling/necks/table_fpn.py +0 -110
  250. pyxlpr/ppocr/modeling/transforms/__init__.py +0 -28
  251. pyxlpr/ppocr/modeling/transforms/stn.py +0 -135
  252. pyxlpr/ppocr/modeling/transforms/tps.py +0 -308
  253. pyxlpr/ppocr/modeling/transforms/tps_spatial_transformer.py +0 -156
  254. pyxlpr/ppocr/optimizer/__init__.py +0 -61
  255. pyxlpr/ppocr/optimizer/learning_rate.py +0 -228
  256. pyxlpr/ppocr/optimizer/lr_scheduler.py +0 -49
  257. pyxlpr/ppocr/optimizer/optimizer.py +0 -160
  258. pyxlpr/ppocr/optimizer/regularizer.py +0 -52
  259. pyxlpr/ppocr/postprocess/__init__.py +0 -55
  260. pyxlpr/ppocr/postprocess/cls_postprocess.py +0 -33
  261. pyxlpr/ppocr/postprocess/db_postprocess.py +0 -234
  262. pyxlpr/ppocr/postprocess/east_postprocess.py +0 -143
  263. pyxlpr/ppocr/postprocess/locality_aware_nms.py +0 -200
  264. pyxlpr/ppocr/postprocess/pg_postprocess.py +0 -52
  265. pyxlpr/ppocr/postprocess/pse_postprocess/__init__.py +0 -15
  266. pyxlpr/ppocr/postprocess/pse_postprocess/pse/__init__.py +0 -29
  267. pyxlpr/ppocr/postprocess/pse_postprocess/pse/setup.py +0 -14
  268. pyxlpr/ppocr/postprocess/pse_postprocess/pse_postprocess.py +0 -118
  269. pyxlpr/ppocr/postprocess/rec_postprocess.py +0 -654
  270. pyxlpr/ppocr/postprocess/sast_postprocess.py +0 -355
  271. pyxlpr/ppocr/tools/__init__.py +0 -14
  272. pyxlpr/ppocr/tools/eval.py +0 -83
  273. pyxlpr/ppocr/tools/export_center.py +0 -77
  274. pyxlpr/ppocr/tools/export_model.py +0 -129
  275. pyxlpr/ppocr/tools/infer/predict_cls.py +0 -151
  276. pyxlpr/ppocr/tools/infer/predict_det.py +0 -300
  277. pyxlpr/ppocr/tools/infer/predict_e2e.py +0 -169
  278. pyxlpr/ppocr/tools/infer/predict_rec.py +0 -414
  279. pyxlpr/ppocr/tools/infer/predict_system.py +0 -204
  280. pyxlpr/ppocr/tools/infer/utility.py +0 -629
  281. pyxlpr/ppocr/tools/infer_cls.py +0 -83
  282. pyxlpr/ppocr/tools/infer_det.py +0 -134
  283. pyxlpr/ppocr/tools/infer_e2e.py +0 -122
  284. pyxlpr/ppocr/tools/infer_kie.py +0 -153
  285. pyxlpr/ppocr/tools/infer_rec.py +0 -146
  286. pyxlpr/ppocr/tools/infer_table.py +0 -107
  287. pyxlpr/ppocr/tools/program.py +0 -596
  288. pyxlpr/ppocr/tools/test_hubserving.py +0 -117
  289. pyxlpr/ppocr/tools/train.py +0 -163
  290. pyxlpr/ppocr/tools/xlprog.py +0 -748
  291. pyxlpr/ppocr/utils/EN_symbol_dict.txt +0 -94
  292. pyxlpr/ppocr/utils/__init__.py +0 -24
  293. pyxlpr/ppocr/utils/dict/ar_dict.txt +0 -117
  294. pyxlpr/ppocr/utils/dict/arabic_dict.txt +0 -162
  295. pyxlpr/ppocr/utils/dict/be_dict.txt +0 -145
  296. pyxlpr/ppocr/utils/dict/bg_dict.txt +0 -140
  297. pyxlpr/ppocr/utils/dict/chinese_cht_dict.txt +0 -8421
  298. pyxlpr/ppocr/utils/dict/cyrillic_dict.txt +0 -163
  299. pyxlpr/ppocr/utils/dict/devanagari_dict.txt +0 -167
  300. pyxlpr/ppocr/utils/dict/en_dict.txt +0 -63
  301. pyxlpr/ppocr/utils/dict/fa_dict.txt +0 -136
  302. pyxlpr/ppocr/utils/dict/french_dict.txt +0 -136
  303. pyxlpr/ppocr/utils/dict/german_dict.txt +0 -143
  304. pyxlpr/ppocr/utils/dict/hi_dict.txt +0 -162
  305. pyxlpr/ppocr/utils/dict/it_dict.txt +0 -118
  306. pyxlpr/ppocr/utils/dict/japan_dict.txt +0 -4399
  307. pyxlpr/ppocr/utils/dict/ka_dict.txt +0 -153
  308. pyxlpr/ppocr/utils/dict/korean_dict.txt +0 -3688
  309. pyxlpr/ppocr/utils/dict/latin_dict.txt +0 -185
  310. pyxlpr/ppocr/utils/dict/mr_dict.txt +0 -153
  311. pyxlpr/ppocr/utils/dict/ne_dict.txt +0 -153
  312. pyxlpr/ppocr/utils/dict/oc_dict.txt +0 -96
  313. pyxlpr/ppocr/utils/dict/pu_dict.txt +0 -130
  314. pyxlpr/ppocr/utils/dict/rs_dict.txt +0 -91
  315. pyxlpr/ppocr/utils/dict/rsc_dict.txt +0 -134
  316. pyxlpr/ppocr/utils/dict/ru_dict.txt +0 -125
  317. pyxlpr/ppocr/utils/dict/ta_dict.txt +0 -128
  318. pyxlpr/ppocr/utils/dict/table_dict.txt +0 -277
  319. pyxlpr/ppocr/utils/dict/table_structure_dict.txt +0 -2759
  320. pyxlpr/ppocr/utils/dict/te_dict.txt +0 -151
  321. pyxlpr/ppocr/utils/dict/ug_dict.txt +0 -114
  322. pyxlpr/ppocr/utils/dict/uk_dict.txt +0 -142
  323. pyxlpr/ppocr/utils/dict/ur_dict.txt +0 -137
  324. pyxlpr/ppocr/utils/dict/xi_dict.txt +0 -110
  325. pyxlpr/ppocr/utils/dict90.txt +0 -90
  326. pyxlpr/ppocr/utils/e2e_metric/Deteval.py +0 -574
  327. pyxlpr/ppocr/utils/e2e_metric/polygon_fast.py +0 -83
  328. pyxlpr/ppocr/utils/e2e_utils/extract_batchsize.py +0 -87
  329. pyxlpr/ppocr/utils/e2e_utils/extract_textpoint_fast.py +0 -457
  330. pyxlpr/ppocr/utils/e2e_utils/extract_textpoint_slow.py +0 -592
  331. pyxlpr/ppocr/utils/e2e_utils/pgnet_pp_utils.py +0 -162
  332. pyxlpr/ppocr/utils/e2e_utils/visual.py +0 -162
  333. pyxlpr/ppocr/utils/en_dict.txt +0 -95
  334. pyxlpr/ppocr/utils/gen_label.py +0 -81
  335. pyxlpr/ppocr/utils/ic15_dict.txt +0 -36
  336. pyxlpr/ppocr/utils/iou.py +0 -54
  337. pyxlpr/ppocr/utils/logging.py +0 -69
  338. pyxlpr/ppocr/utils/network.py +0 -84
  339. pyxlpr/ppocr/utils/ppocr_keys_v1.txt +0 -6623
  340. pyxlpr/ppocr/utils/profiler.py +0 -110
  341. pyxlpr/ppocr/utils/save_load.py +0 -150
  342. pyxlpr/ppocr/utils/stats.py +0 -72
  343. pyxlpr/ppocr/utils/utility.py +0 -80
  344. pyxlpr/ppstructure/__init__.py +0 -13
  345. pyxlpr/ppstructure/predict_system.py +0 -187
  346. pyxlpr/ppstructure/table/__init__.py +0 -13
  347. pyxlpr/ppstructure/table/eval_table.py +0 -72
  348. pyxlpr/ppstructure/table/matcher.py +0 -192
  349. pyxlpr/ppstructure/table/predict_structure.py +0 -136
  350. pyxlpr/ppstructure/table/predict_table.py +0 -221
  351. pyxlpr/ppstructure/table/table_metric/__init__.py +0 -16
  352. pyxlpr/ppstructure/table/table_metric/parallel.py +0 -51
  353. pyxlpr/ppstructure/table/table_metric/table_metric.py +0 -247
  354. pyxlpr/ppstructure/table/tablepyxl/__init__.py +0 -13
  355. pyxlpr/ppstructure/table/tablepyxl/style.py +0 -283
  356. pyxlpr/ppstructure/table/tablepyxl/tablepyxl.py +0 -118
  357. pyxlpr/ppstructure/utility.py +0 -71
  358. pyxlpr/xlai.py +0 -10
pyxllib/text/levenshtein.py
@@ -1,303 +1,303 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @Author : 陈坤泽
# @Email : 877362867@qq.com
# @Date : 2021/06/06 17:01

from pyxllib.prog.pupil import check_install_package

# This requires a C++14 compiler: https://download.microsoft.com/download/5/f/7/5f7acaeb-8363-451f-9425-68a90f98b238/visualcppbuildtools_full.exe
# Install it only when needed, so that someone who just wants pyxllib's simple
# features is not burdened at pip install time
# MatchSimString needs it to compute edit distances
check_install_package('Levenshtein', 'python-Levenshtein')

from collections import defaultdict
from more_itertools import chunked
import warnings

import Levenshtein
import numpy as np
import pandas as pd

from pyxllib.prog.pupil import run_once
from pyxllib.prog.specialist import dataframe_str
from pyxllib.text.pupil import briefstr

# Ignore one specific warning
warnings.filterwarnings("ignore", category=FutureWarning,
                        module="sklearn.cluster._agglomerative",
                        lineno=1005)


@run_once('str')
def get_levenshtein_similar(x, y):
    """ Cache the edit-distance similarity between pairs of strings """
    return Levenshtein.ratio(x, y)


class MatchSimString:
    """ Fuzzy string matching

    mss = MatchSimString()

    # 1 Add candidates
    mss.append_candidate('福州+厦门2018初数暑假讲义-请录入-快乐学习\初一福厦培优-测试用')
    mss.append_candidate('2018_快乐数学_六年级_秋季_第01讲_圆柱与圆锥_教案(教师版)')
    mss.append_candidate('删除所有标签中间多余的空白')

    # 2 First query string
    s = '奕本初一福周厦门培油'

    idx, sim = mss.match(s)
    print('match:', mss[idx])  # match: 福州+厦门2018初数暑假讲义-请录入-快乐学习\初一福厦培优-测试用
    print('similarity:', sim)  # similarity: 0.22

    # 3 Second query string
    s = '圆柱与【圆锥】_教案空白版'

    idx, sim = mss.match(s)
    print('match:', mss[idx])  # 2018_快乐数学_六年级_秋季_第01讲_圆柱与圆锥_教案(教师版)
    print('similarity:', sim)  # similarity: 0.375

    If append_candidate was given a second, extra-info argument, it can be
    retrieved by index:
    mss.ext_value[idx]
    """
66
- def __init__(self, method=briefstr):
67
- self.preproc = method
68
- self.origin_str = [] # 原始字符串内容
69
- self.key_str = [] # 对原始字符串进行处理后的字符
70
- self.ext_value = [] # 扩展存储一些信息
71
-
72
- def __getitem__(self, item):
73
- return self.origin_str[item]
74
-
75
- def __delitem__(self, item):
76
- del self.origin_str[item]
77
- del self.key_str[item]
78
- del self.ext_value[item]
79
-
80
- def __len__(self):
81
- return len(self.key_str)
82
-
83
- def get_similarity(self, x, y):
84
- """ 计算两对数据之间的相似度 """
85
- pass
86
-
87
- def append_candidate(self, k, v=None):
88
- self.origin_str.append(k)
89
- if callable(self.preproc):
90
- k = self.preproc(k)
91
- self.key_str.append(k)
92
- self.ext_value.append(v)
93
-

    def match(self, s):
        """ Match s against the candidate strings and return the best match """
        idx, sim = -1, 0
        for i in range(len(self)):
            sim_ = Levenshtein.ratio(self.key_str[i], s)
            if sim_ > sim:
                sim = sim_
                idx = i
        return idx, sim

    def match_many(self, s, count=1):
        """ Match s against the candidate strings and return the top matches
        :param str s: the query string
        :param int count: how many matches to return
        :return: a list of (idx, sim) pairs
        """
        scores = [(i, Levenshtein.ratio(self.key_str[i], s)) for i in range(len(self))]
        # Sort by similarity and keep the top count results
        return sorted(scores, key=lambda x: x[1], reverse=True)[:count]
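
    # A minimal usage sketch for match_many (the data here is illustrative, not
    # from the package):
    #   mss = MatchSimString()
    #   for text in ('alpha', 'alphas', 'beta'):
    #       mss.append_candidate(text)
    #   mss.match_many('alpha1', count=2)  # -> [(idx, sim), ...], best match first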

    def match_test(self, s, count=-1, showstr=lambda x: x[:50]):
        """ Fuzzy-match a string s against the candidates and print a report

        :param s: the query string
        :param count: limit how much of the match list is printed
            -1: print all matches
            0 < count < 1: e.g. 0.4 prints only the top 40% of matches
            positive integer: print only the top count matches
        :param showstr: how candidate strings are rendered
        """
        # 1 Compute the similarities and collect them in res
        res = []
        n = len(self)
        for i in range(n):
            k, v = self.key_str[i], self.ext_value[i]
            sim = Levenshtein.ratio(k, s)
            res.append([i, v, sim, showstr(k)])  # rows are numbered from 0 in the output

        # 2 Sort and trim the results
        res = sorted(res, key=lambda x: -x[2])
        if 0 < count < 1:
            n = max(1, int(n * count))
        elif isinstance(count, int) and count > 0:
            n = min(count, n)
        res = res[:n]

        # 3 Print
        df = pd.DataFrame.from_records(res, columns=('no.', 'tag', 'similarity', 'content'))
        s = dataframe_str(df)
        s = s.replace('\u2022', '')  # strip a character texstudio cannot display
        print(s)

    def agglomerative_clustering(self, threshold=0.5):
        """ Hierarchically cluster the stored strings

        :param threshold: roughly a distance threshold; strings within this
            distance are clustered together
            the smaller the value, the more (and finer-grained) the clusters
        """
        check_install_package('sklearn', 'scikit-learn')
        from sklearn.cluster import AgglomerativeClustering

        # 1 Build the pairwise distance matrix
        distance_matrix = np.zeros((len(self), len(self)))
        for i in range(len(self)):
            for j in range(i + 1, len(self)):
                # We need a distance, so subtract the similarity from 1
                distance = 1 - Levenshtein.ratio(self.key_str[i], self.key_str[j])
                distance_matrix[i, j] = distance_matrix[j, i] = distance

        # Run the hierarchical clustering
        clustering = AgglomerativeClustering(n_clusters=None, affinity='precomputed',
                                             distance_threshold=threshold,
                                             linkage='complete')
        labels = clustering.fit_predict(distance_matrix)

        return labels

    def display_clusters(self, threshold=0.5):
        """ Show the members of each cluster found by agglomerative_clustering """

        labels = self.agglomerative_clustering(threshold=threshold)
        cluster_dict = defaultdict(list)

        # Group the original strings by cluster label
        for idx, label in enumerate(labels):
            cluster_dict[label].append(self.origin_str[idx])

        # Sort the clusters by size, largest first
        result = {}
        for label, items in sorted(cluster_dict.items(), key=lambda x: -len(x[1])):
            result[label] = items

        return result
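

# A minimal sketch of the clustering helpers above (data and threshold are
# illustrative; note that newer scikit-learn releases rename this parameter of
# AgglomerativeClustering from ``affinity`` to ``metric``, which is likely the
# FutureWarning this module filters at the top):
#   mss = MatchSimString()
#   for text in ('abc draft v1', 'abc draft v2', 'unrelated note'):
#       mss.append_candidate(text)
#   mss.display_clusters(threshold=0.5)  # -> {label: [strings in that cluster], ...}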
193
- class HierarchicalMatchSimString(MatchSimString):
194
- """ 在面对数据量很大的候选数据情况下,建议使用这个层次聚类后的匹配方法 """
195
-
196
- def __init__(self, method=briefstr):
197
- super().__init__(method)
198
- self.groups = dict()
199
-

    def get_center_sample(self, indices=None):
        """ Given a set of indices, find the center sample; with no argument,
        search across all samples """
        if indices is None:
            indices = range(len(self))

        # Cache previously computed results
        cached_results = {}

        def get_similarity(i, j):
            """ Similarity between two indices, cached to avoid recomputation """
            if (i, j) in cached_results:
                return cached_results[(i, j)]
            sim_val = Levenshtein.ratio(self.key_str[i], self.key_str[j])
            cached_results[(i, j)] = cached_results[(j, i)] = sim_val
            return sim_val

        center_idx = max(indices, key=lambda x: sum(get_similarity(x, y) for y in indices))
        return center_idx

    def merge_group(self, indices, threshold=0.5, strategy='center'):
        """ Merge the given list of indices into groups using the threshold
        Returns a dict whose keys are representative samples and whose values
        are the indices in each group

        :param strategy: how the representative sample is chosen
            center: the center sample
            first: the first sample
        """
        check_install_package('sklearn', 'scikit-learn')
        from sklearn.cluster import AgglomerativeClustering

        # 1 Build the pairwise distance matrix
        n = len(indices)
        distance_matrix = np.zeros((n, n))
        for i in range(n):
            for j in range(i + 1, n):
                # We need a distance, so subtract the similarity from 1
                distance = 1 - Levenshtein.ratio(self.key_str[indices[i]], self.key_str[indices[j]])
                distance_matrix[i, j] = distance_matrix[j, i] = distance

        # Run the hierarchical clustering
        clustering = AgglomerativeClustering(n_clusters=None, affinity='precomputed',
                                             distance_threshold=threshold,
                                             linkage='complete')
        labels = clustering.fit_predict(distance_matrix)

        # 2 Group the indices by cluster label
        cluster_dict = defaultdict(list)
        for i, label in enumerate(labels):
            cluster_dict[label].append(indices[i])

        # 3 Re-key each group by a representative sample, sorted by group size descending
        result = {}
        for label, items in sorted(cluster_dict.items(), key=lambda x: -len(x[1])):
            if strategy == 'first':
                representative = items[0]
            elif strategy == 'center':
                # Use local indices to compute the mean distances
                local_indices = [i for i, idx in enumerate(indices) if idx in items]
                sub_matrix = distance_matrix[np.ix_(local_indices, local_indices)]
                avg_distances = sub_matrix.mean(axis=1)
                representative_idx = np.argmin(avg_distances)
                representative = items[representative_idx]
            else:
                raise ValueError(f'Invalid strategy: {strategy}')
            result[representative] = items

        return result
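
    # Illustrative call (hypothetical indices): merge one batch of candidates,
    # keying each group by its center sample:
    #   hms.merge_group([0, 1, 2, 3], threshold=0.5, strategy='center')
    #   # -> {representative index: [member indices], ...}, largest group first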

    def init_groups(self, threshold=0.5, batch_size=1000):
        """
        :param threshold: grouping threshold; samples within this distance end
            up in one group
        :param batch_size: the data may be too large to compare all pairs at
            once, so it can be processed in batches; the result is less exact,
            but the amount of computation drops considerably
        """
        # 1 Initially, every sample is its own group
        groups = {i: [i] for i in range(len(self))}
        new_groups = {}

        # 2 Keep merging until the number of groups stops changing
        while len(groups) > 1:
            for indices in chunked(groups.keys(), batch_size):
                # The values of the original groups are concatenated to match the merged keys
                indices2 = self.merge_group(indices, threshold=threshold)
                for idx, idxs in indices2.items():
                    # Fetch the members of the original groups
                    original_idxs = [groups[original_idx] for original_idx in idxs]
                    # Flatten the lists into the new group
                    new_groups[idx] = [item for sublist in original_idxs for item in sublist]

            # Stop once the grouping no longer changes
            if len(new_groups) == len(groups):
                break

            groups = new_groups
            new_groups = {}

        # 3 Sort the groups by size, largest first
        new_groups = {}
        for label, items in sorted(groups.items(), key=lambda x: -len(x[1])):
            new_groups[label] = items  # for now the first occurrence serves as the representative

        self.groups = new_groups
        return self.groups
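
A minimal sketch of how the hierarchical matcher above might be driven (the import path follows the levenshtein.py module shown; the candidate strings and threshold are illustrative, not from the package):

    from pyxllib.text.levenshtein import HierarchicalMatchSimString

    hms = HierarchicalMatchSimString()
    for text in ('apple pie', 'aple pie', 'banana bread', 'banana loaf'):
        hms.append_candidate(text)
    groups = hms.init_groups(threshold=0.5, batch_size=1000)
    print(groups)  # {representative index: [member indices], ...}, largest group first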