pyxllib 0.3.96__py3-none-any.whl → 0.3.197__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyxllib/algo/geo.py +12 -0
- pyxllib/algo/intervals.py +1 -1
- pyxllib/algo/matcher.py +78 -0
- pyxllib/algo/pupil.py +187 -19
- pyxllib/algo/specialist.py +2 -1
- pyxllib/algo/stat.py +38 -2
- {pyxlpr → pyxllib/autogui}/__init__.py +1 -1
- pyxllib/autogui/activewin.py +246 -0
- pyxllib/autogui/all.py +9 -0
- pyxllib/{ext/autogui → autogui}/autogui.py +40 -11
- pyxllib/autogui/uiautolib.py +362 -0
- pyxllib/autogui/wechat.py +827 -0
- pyxllib/autogui/wechat_msg.py +421 -0
- pyxllib/autogui/wxautolib.py +84 -0
- pyxllib/cv/slidercaptcha.py +137 -0
- pyxllib/data/echarts.py +123 -12
- pyxllib/data/jsonlib.py +89 -0
- pyxllib/data/pglib.py +514 -30
- pyxllib/data/sqlite.py +231 -4
- pyxllib/ext/JLineViewer.py +14 -1
- pyxllib/ext/drissionlib.py +277 -0
- pyxllib/ext/kq5034lib.py +0 -1594
- pyxllib/ext/robustprocfile.py +497 -0
- pyxllib/ext/unixlib.py +6 -5
- pyxllib/ext/utools.py +108 -95
- pyxllib/ext/webhook.py +32 -14
- pyxllib/ext/wjxlib.py +88 -0
- pyxllib/ext/wpsapi.py +124 -0
- pyxllib/ext/xlwork.py +9 -0
- pyxllib/ext/yuquelib.py +1003 -71
- pyxllib/file/docxlib.py +1 -1
- pyxllib/file/libreoffice.py +165 -0
- pyxllib/file/movielib.py +9 -0
- pyxllib/file/packlib/__init__.py +112 -75
- pyxllib/file/pdflib.py +1 -1
- pyxllib/file/pupil.py +1 -1
- pyxllib/file/specialist/dirlib.py +1 -1
- pyxllib/file/specialist/download.py +10 -3
- pyxllib/file/specialist/filelib.py +266 -55
- pyxllib/file/xlsxlib.py +205 -50
- pyxllib/file/xlsyncfile.py +341 -0
- pyxllib/prog/cachetools.py +64 -0
- pyxllib/prog/filelock.py +42 -0
- pyxllib/prog/multiprogs.py +940 -0
- pyxllib/prog/newbie.py +9 -2
- pyxllib/prog/pupil.py +129 -60
- pyxllib/prog/specialist/__init__.py +176 -2
- pyxllib/prog/specialist/bc.py +5 -2
- pyxllib/prog/specialist/browser.py +11 -2
- pyxllib/prog/specialist/datetime.py +68 -0
- pyxllib/prog/specialist/tictoc.py +12 -13
- pyxllib/prog/specialist/xllog.py +5 -5
- pyxllib/prog/xlosenv.py +7 -0
- pyxllib/text/airscript.js +744 -0
- pyxllib/text/charclasslib.py +17 -5
- pyxllib/text/jiebalib.py +6 -3
- pyxllib/text/jinjalib.py +32 -0
- pyxllib/text/jsa_ai_prompt.md +271 -0
- pyxllib/text/jscode.py +159 -4
- pyxllib/text/nestenv.py +1 -1
- pyxllib/text/newbie.py +12 -0
- pyxllib/text/pupil/common.py +26 -0
- pyxllib/text/specialist/ptag.py +2 -2
- pyxllib/text/templates/echart_base.html +11 -0
- pyxllib/text/templates/highlight_code.html +17 -0
- pyxllib/text/templates/latex_editor.html +103 -0
- pyxllib/text/xmllib.py +76 -14
- pyxllib/xl.py +2 -1
- pyxllib-0.3.197.dist-info/METADATA +48 -0
- pyxllib-0.3.197.dist-info/RECORD +126 -0
- {pyxllib-0.3.96.dist-info → pyxllib-0.3.197.dist-info}/WHEEL +1 -2
- pyxllib/ext/autogui/__init__.py +0 -8
- pyxllib-0.3.96.dist-info/METADATA +0 -51
- pyxllib-0.3.96.dist-info/RECORD +0 -333
- pyxllib-0.3.96.dist-info/top_level.txt +0 -2
- pyxlpr/ai/__init__.py +0 -5
- pyxlpr/ai/clientlib.py +0 -1281
- pyxlpr/ai/specialist.py +0 -286
- pyxlpr/ai/torch_app.py +0 -172
- pyxlpr/ai/xlpaddle.py +0 -655
- pyxlpr/ai/xltorch.py +0 -705
- pyxlpr/data/__init__.py +0 -11
- pyxlpr/data/coco.py +0 -1325
- pyxlpr/data/datacls.py +0 -365
- pyxlpr/data/datasets.py +0 -200
- pyxlpr/data/gptlib.py +0 -1291
- pyxlpr/data/icdar/__init__.py +0 -96
- pyxlpr/data/icdar/deteval.py +0 -377
- pyxlpr/data/icdar/icdar2013.py +0 -341
- pyxlpr/data/icdar/iou.py +0 -340
- pyxlpr/data/icdar/rrc_evaluation_funcs_1_1.py +0 -463
- pyxlpr/data/imtextline.py +0 -473
- pyxlpr/data/labelme.py +0 -866
- pyxlpr/data/removeline.py +0 -179
- pyxlpr/data/specialist.py +0 -57
- pyxlpr/eval/__init__.py +0 -85
- pyxlpr/paddleocr.py +0 -776
- pyxlpr/ppocr/__init__.py +0 -15
- pyxlpr/ppocr/configs/rec/multi_language/generate_multi_language_configs.py +0 -226
- pyxlpr/ppocr/data/__init__.py +0 -135
- pyxlpr/ppocr/data/imaug/ColorJitter.py +0 -26
- pyxlpr/ppocr/data/imaug/__init__.py +0 -67
- pyxlpr/ppocr/data/imaug/copy_paste.py +0 -170
- pyxlpr/ppocr/data/imaug/east_process.py +0 -437
- pyxlpr/ppocr/data/imaug/gen_table_mask.py +0 -244
- pyxlpr/ppocr/data/imaug/iaa_augment.py +0 -114
- pyxlpr/ppocr/data/imaug/label_ops.py +0 -789
- pyxlpr/ppocr/data/imaug/make_border_map.py +0 -184
- pyxlpr/ppocr/data/imaug/make_pse_gt.py +0 -106
- pyxlpr/ppocr/data/imaug/make_shrink_map.py +0 -126
- pyxlpr/ppocr/data/imaug/operators.py +0 -433
- pyxlpr/ppocr/data/imaug/pg_process.py +0 -906
- pyxlpr/ppocr/data/imaug/randaugment.py +0 -143
- pyxlpr/ppocr/data/imaug/random_crop_data.py +0 -239
- pyxlpr/ppocr/data/imaug/rec_img_aug.py +0 -533
- pyxlpr/ppocr/data/imaug/sast_process.py +0 -777
- pyxlpr/ppocr/data/imaug/text_image_aug/__init__.py +0 -17
- pyxlpr/ppocr/data/imaug/text_image_aug/augment.py +0 -120
- pyxlpr/ppocr/data/imaug/text_image_aug/warp_mls.py +0 -168
- pyxlpr/ppocr/data/lmdb_dataset.py +0 -115
- pyxlpr/ppocr/data/pgnet_dataset.py +0 -104
- pyxlpr/ppocr/data/pubtab_dataset.py +0 -107
- pyxlpr/ppocr/data/simple_dataset.py +0 -372
- pyxlpr/ppocr/losses/__init__.py +0 -61
- pyxlpr/ppocr/losses/ace_loss.py +0 -52
- pyxlpr/ppocr/losses/basic_loss.py +0 -135
- pyxlpr/ppocr/losses/center_loss.py +0 -88
- pyxlpr/ppocr/losses/cls_loss.py +0 -30
- pyxlpr/ppocr/losses/combined_loss.py +0 -67
- pyxlpr/ppocr/losses/det_basic_loss.py +0 -208
- pyxlpr/ppocr/losses/det_db_loss.py +0 -80
- pyxlpr/ppocr/losses/det_east_loss.py +0 -63
- pyxlpr/ppocr/losses/det_pse_loss.py +0 -149
- pyxlpr/ppocr/losses/det_sast_loss.py +0 -121
- pyxlpr/ppocr/losses/distillation_loss.py +0 -272
- pyxlpr/ppocr/losses/e2e_pg_loss.py +0 -140
- pyxlpr/ppocr/losses/kie_sdmgr_loss.py +0 -113
- pyxlpr/ppocr/losses/rec_aster_loss.py +0 -99
- pyxlpr/ppocr/losses/rec_att_loss.py +0 -39
- pyxlpr/ppocr/losses/rec_ctc_loss.py +0 -44
- pyxlpr/ppocr/losses/rec_enhanced_ctc_loss.py +0 -70
- pyxlpr/ppocr/losses/rec_nrtr_loss.py +0 -30
- pyxlpr/ppocr/losses/rec_sar_loss.py +0 -28
- pyxlpr/ppocr/losses/rec_srn_loss.py +0 -47
- pyxlpr/ppocr/losses/table_att_loss.py +0 -109
- pyxlpr/ppocr/metrics/__init__.py +0 -44
- pyxlpr/ppocr/metrics/cls_metric.py +0 -45
- pyxlpr/ppocr/metrics/det_metric.py +0 -82
- pyxlpr/ppocr/metrics/distillation_metric.py +0 -73
- pyxlpr/ppocr/metrics/e2e_metric.py +0 -86
- pyxlpr/ppocr/metrics/eval_det_iou.py +0 -274
- pyxlpr/ppocr/metrics/kie_metric.py +0 -70
- pyxlpr/ppocr/metrics/rec_metric.py +0 -75
- pyxlpr/ppocr/metrics/table_metric.py +0 -50
- pyxlpr/ppocr/modeling/architectures/__init__.py +0 -32
- pyxlpr/ppocr/modeling/architectures/base_model.py +0 -88
- pyxlpr/ppocr/modeling/architectures/distillation_model.py +0 -60
- pyxlpr/ppocr/modeling/backbones/__init__.py +0 -54
- pyxlpr/ppocr/modeling/backbones/det_mobilenet_v3.py +0 -268
- pyxlpr/ppocr/modeling/backbones/det_resnet_vd.py +0 -246
- pyxlpr/ppocr/modeling/backbones/det_resnet_vd_sast.py +0 -285
- pyxlpr/ppocr/modeling/backbones/e2e_resnet_vd_pg.py +0 -265
- pyxlpr/ppocr/modeling/backbones/kie_unet_sdmgr.py +0 -186
- pyxlpr/ppocr/modeling/backbones/rec_mobilenet_v3.py +0 -138
- pyxlpr/ppocr/modeling/backbones/rec_mv1_enhance.py +0 -258
- pyxlpr/ppocr/modeling/backbones/rec_nrtr_mtb.py +0 -48
- pyxlpr/ppocr/modeling/backbones/rec_resnet_31.py +0 -210
- pyxlpr/ppocr/modeling/backbones/rec_resnet_aster.py +0 -143
- pyxlpr/ppocr/modeling/backbones/rec_resnet_fpn.py +0 -307
- pyxlpr/ppocr/modeling/backbones/rec_resnet_vd.py +0 -286
- pyxlpr/ppocr/modeling/heads/__init__.py +0 -54
- pyxlpr/ppocr/modeling/heads/cls_head.py +0 -52
- pyxlpr/ppocr/modeling/heads/det_db_head.py +0 -118
- pyxlpr/ppocr/modeling/heads/det_east_head.py +0 -121
- pyxlpr/ppocr/modeling/heads/det_pse_head.py +0 -37
- pyxlpr/ppocr/modeling/heads/det_sast_head.py +0 -128
- pyxlpr/ppocr/modeling/heads/e2e_pg_head.py +0 -253
- pyxlpr/ppocr/modeling/heads/kie_sdmgr_head.py +0 -206
- pyxlpr/ppocr/modeling/heads/multiheadAttention.py +0 -163
- pyxlpr/ppocr/modeling/heads/rec_aster_head.py +0 -393
- pyxlpr/ppocr/modeling/heads/rec_att_head.py +0 -202
- pyxlpr/ppocr/modeling/heads/rec_ctc_head.py +0 -88
- pyxlpr/ppocr/modeling/heads/rec_nrtr_head.py +0 -826
- pyxlpr/ppocr/modeling/heads/rec_sar_head.py +0 -402
- pyxlpr/ppocr/modeling/heads/rec_srn_head.py +0 -280
- pyxlpr/ppocr/modeling/heads/self_attention.py +0 -406
- pyxlpr/ppocr/modeling/heads/table_att_head.py +0 -246
- pyxlpr/ppocr/modeling/necks/__init__.py +0 -32
- pyxlpr/ppocr/modeling/necks/db_fpn.py +0 -111
- pyxlpr/ppocr/modeling/necks/east_fpn.py +0 -188
- pyxlpr/ppocr/modeling/necks/fpn.py +0 -138
- pyxlpr/ppocr/modeling/necks/pg_fpn.py +0 -314
- pyxlpr/ppocr/modeling/necks/rnn.py +0 -92
- pyxlpr/ppocr/modeling/necks/sast_fpn.py +0 -284
- pyxlpr/ppocr/modeling/necks/table_fpn.py +0 -110
- pyxlpr/ppocr/modeling/transforms/__init__.py +0 -28
- pyxlpr/ppocr/modeling/transforms/stn.py +0 -135
- pyxlpr/ppocr/modeling/transforms/tps.py +0 -308
- pyxlpr/ppocr/modeling/transforms/tps_spatial_transformer.py +0 -156
- pyxlpr/ppocr/optimizer/__init__.py +0 -61
- pyxlpr/ppocr/optimizer/learning_rate.py +0 -228
- pyxlpr/ppocr/optimizer/lr_scheduler.py +0 -49
- pyxlpr/ppocr/optimizer/optimizer.py +0 -160
- pyxlpr/ppocr/optimizer/regularizer.py +0 -52
- pyxlpr/ppocr/postprocess/__init__.py +0 -55
- pyxlpr/ppocr/postprocess/cls_postprocess.py +0 -33
- pyxlpr/ppocr/postprocess/db_postprocess.py +0 -234
- pyxlpr/ppocr/postprocess/east_postprocess.py +0 -143
- pyxlpr/ppocr/postprocess/locality_aware_nms.py +0 -200
- pyxlpr/ppocr/postprocess/pg_postprocess.py +0 -52
- pyxlpr/ppocr/postprocess/pse_postprocess/__init__.py +0 -15
- pyxlpr/ppocr/postprocess/pse_postprocess/pse/__init__.py +0 -29
- pyxlpr/ppocr/postprocess/pse_postprocess/pse/setup.py +0 -14
- pyxlpr/ppocr/postprocess/pse_postprocess/pse_postprocess.py +0 -118
- pyxlpr/ppocr/postprocess/rec_postprocess.py +0 -654
- pyxlpr/ppocr/postprocess/sast_postprocess.py +0 -355
- pyxlpr/ppocr/tools/__init__.py +0 -14
- pyxlpr/ppocr/tools/eval.py +0 -83
- pyxlpr/ppocr/tools/export_center.py +0 -77
- pyxlpr/ppocr/tools/export_model.py +0 -129
- pyxlpr/ppocr/tools/infer/predict_cls.py +0 -151
- pyxlpr/ppocr/tools/infer/predict_det.py +0 -300
- pyxlpr/ppocr/tools/infer/predict_e2e.py +0 -169
- pyxlpr/ppocr/tools/infer/predict_rec.py +0 -414
- pyxlpr/ppocr/tools/infer/predict_system.py +0 -204
- pyxlpr/ppocr/tools/infer/utility.py +0 -629
- pyxlpr/ppocr/tools/infer_cls.py +0 -83
- pyxlpr/ppocr/tools/infer_det.py +0 -134
- pyxlpr/ppocr/tools/infer_e2e.py +0 -122
- pyxlpr/ppocr/tools/infer_kie.py +0 -153
- pyxlpr/ppocr/tools/infer_rec.py +0 -146
- pyxlpr/ppocr/tools/infer_table.py +0 -107
- pyxlpr/ppocr/tools/program.py +0 -596
- pyxlpr/ppocr/tools/test_hubserving.py +0 -117
- pyxlpr/ppocr/tools/train.py +0 -163
- pyxlpr/ppocr/tools/xlprog.py +0 -748
- pyxlpr/ppocr/utils/EN_symbol_dict.txt +0 -94
- pyxlpr/ppocr/utils/__init__.py +0 -24
- pyxlpr/ppocr/utils/dict/ar_dict.txt +0 -117
- pyxlpr/ppocr/utils/dict/arabic_dict.txt +0 -162
- pyxlpr/ppocr/utils/dict/be_dict.txt +0 -145
- pyxlpr/ppocr/utils/dict/bg_dict.txt +0 -140
- pyxlpr/ppocr/utils/dict/chinese_cht_dict.txt +0 -8421
- pyxlpr/ppocr/utils/dict/cyrillic_dict.txt +0 -163
- pyxlpr/ppocr/utils/dict/devanagari_dict.txt +0 -167
- pyxlpr/ppocr/utils/dict/en_dict.txt +0 -63
- pyxlpr/ppocr/utils/dict/fa_dict.txt +0 -136
- pyxlpr/ppocr/utils/dict/french_dict.txt +0 -136
- pyxlpr/ppocr/utils/dict/german_dict.txt +0 -143
- pyxlpr/ppocr/utils/dict/hi_dict.txt +0 -162
- pyxlpr/ppocr/utils/dict/it_dict.txt +0 -118
- pyxlpr/ppocr/utils/dict/japan_dict.txt +0 -4399
- pyxlpr/ppocr/utils/dict/ka_dict.txt +0 -153
- pyxlpr/ppocr/utils/dict/korean_dict.txt +0 -3688
- pyxlpr/ppocr/utils/dict/latin_dict.txt +0 -185
- pyxlpr/ppocr/utils/dict/mr_dict.txt +0 -153
- pyxlpr/ppocr/utils/dict/ne_dict.txt +0 -153
- pyxlpr/ppocr/utils/dict/oc_dict.txt +0 -96
- pyxlpr/ppocr/utils/dict/pu_dict.txt +0 -130
- pyxlpr/ppocr/utils/dict/rs_dict.txt +0 -91
- pyxlpr/ppocr/utils/dict/rsc_dict.txt +0 -134
- pyxlpr/ppocr/utils/dict/ru_dict.txt +0 -125
- pyxlpr/ppocr/utils/dict/ta_dict.txt +0 -128
- pyxlpr/ppocr/utils/dict/table_dict.txt +0 -277
- pyxlpr/ppocr/utils/dict/table_structure_dict.txt +0 -2759
- pyxlpr/ppocr/utils/dict/te_dict.txt +0 -151
- pyxlpr/ppocr/utils/dict/ug_dict.txt +0 -114
- pyxlpr/ppocr/utils/dict/uk_dict.txt +0 -142
- pyxlpr/ppocr/utils/dict/ur_dict.txt +0 -137
- pyxlpr/ppocr/utils/dict/xi_dict.txt +0 -110
- pyxlpr/ppocr/utils/dict90.txt +0 -90
- pyxlpr/ppocr/utils/e2e_metric/Deteval.py +0 -574
- pyxlpr/ppocr/utils/e2e_metric/polygon_fast.py +0 -83
- pyxlpr/ppocr/utils/e2e_utils/extract_batchsize.py +0 -87
- pyxlpr/ppocr/utils/e2e_utils/extract_textpoint_fast.py +0 -457
- pyxlpr/ppocr/utils/e2e_utils/extract_textpoint_slow.py +0 -592
- pyxlpr/ppocr/utils/e2e_utils/pgnet_pp_utils.py +0 -162
- pyxlpr/ppocr/utils/e2e_utils/visual.py +0 -162
- pyxlpr/ppocr/utils/en_dict.txt +0 -95
- pyxlpr/ppocr/utils/gen_label.py +0 -81
- pyxlpr/ppocr/utils/ic15_dict.txt +0 -36
- pyxlpr/ppocr/utils/iou.py +0 -54
- pyxlpr/ppocr/utils/logging.py +0 -69
- pyxlpr/ppocr/utils/network.py +0 -84
- pyxlpr/ppocr/utils/ppocr_keys_v1.txt +0 -6623
- pyxlpr/ppocr/utils/profiler.py +0 -110
- pyxlpr/ppocr/utils/save_load.py +0 -150
- pyxlpr/ppocr/utils/stats.py +0 -72
- pyxlpr/ppocr/utils/utility.py +0 -80
- pyxlpr/ppstructure/__init__.py +0 -13
- pyxlpr/ppstructure/predict_system.py +0 -187
- pyxlpr/ppstructure/table/__init__.py +0 -13
- pyxlpr/ppstructure/table/eval_table.py +0 -72
- pyxlpr/ppstructure/table/matcher.py +0 -192
- pyxlpr/ppstructure/table/predict_structure.py +0 -136
- pyxlpr/ppstructure/table/predict_table.py +0 -221
- pyxlpr/ppstructure/table/table_metric/__init__.py +0 -16
- pyxlpr/ppstructure/table/table_metric/parallel.py +0 -51
- pyxlpr/ppstructure/table/table_metric/table_metric.py +0 -247
- pyxlpr/ppstructure/table/tablepyxl/__init__.py +0 -13
- pyxlpr/ppstructure/table/tablepyxl/style.py +0 -283
- pyxlpr/ppstructure/table/tablepyxl/tablepyxl.py +0 -118
- pyxlpr/ppstructure/utility.py +0 -71
- pyxlpr/xlai.py +0 -10
- /pyxllib/{ext/autogui → autogui}/virtualkey.py +0 -0
- {pyxllib-0.3.96.dist-info → pyxllib-0.3.197.dist-info/licenses}/LICENSE +0 -0
pyxllib/algo/geo.py
CHANGED
@@ -33,6 +33,18 @@ def ltrb2xywh(p):
|
|
33
33
|
return [p[0], p[1], p[2] - p[0], p[3] - p[1]]
|
34
34
|
|
35
35
|
|
36
|
+
def ltrb2polygon(p):
    """ Convert an ltrb box into the four corner points of its polygon.

    The corners are returned in the order left-top, right-top,
    right-bottom, left-bottom. Every corner is a fresh ``[x, y]`` list, no
    matter whether ``p`` is a list or a tuple, and only the first four
    items of ``p`` are read.

    :param list|tuple p: [left, top, right, bottom]
    :rtype: list

    >>> ltrb2polygon([100, 50, 200, 150])
    [[100, 50], [200, 50], [200, 150], [100, 150]]
    >>> ltrb2polygon((100, 50, 200, 150))
    [[100, 50], [200, 50], [200, 150], [100, 150]]
    """
    # Index explicitly instead of slicing: slices keep the input's type
    # (tuple in -> tuple out) and would drag along any items beyond the 4th.
    left, top, right, bottom = p[0], p[1], p[2], p[3]
    return [[left, top], [right, top], [right, bottom], [left, bottom]]
|
46
|
+
|
47
|
+
|
36
48
|
def rect2polygon(src_pts):
|
37
49
|
""" 矩形对角线两个点,转成四边形四个点的模式来表达
|
38
50
|
(输入左上、右下两个顶点坐标)
|
pyxllib/algo/intervals.py
CHANGED
pyxllib/algo/matcher.py
CHANGED
@@ -13,6 +13,8 @@ from pyxllib.prog.pupil import check_install_package
|
|
13
13
|
# check_install_package('Levenshtein', 'python-Levenshtein')
|
14
14
|
|
15
15
|
from collections import defaultdict
|
16
|
+
import heapq
|
17
|
+
import math
|
16
18
|
import warnings
|
17
19
|
|
18
20
|
warnings.filterwarnings("ignore", message="loaded more than 1 DLL from .libs:")
|
@@ -40,6 +42,53 @@ except ModuleNotFoundError:
|
|
40
42
|
pass
|
41
43
|
|
42
44
|
|
45
|
+
def calculate_coeff_favoring_length(length1, length2, baseline=100, scale=10000):
    """
    Compute a similarity adjustment coefficient from the lengths of two texts.

    Short texts that share (or fully consist of) an identical fragment tend
    to get an overly high similarity score. The coefficient returned here is
    meant to be multiplied into such a score: it lowers the score of short
    texts and of text pairs whose lengths differ a lot.

    :param length1: length of the first text
    :param length2: length of the second text
    :param baseline: baseline length; combined lengths below it get a
        coefficient between 0.5 and 1 (before the length-ratio factor)
    :param scale: combined length at which the coefficient (before the
        length-ratio factor) reaches 2
    :return: the adjustment coefficient; two empty texts give 0.5
    """
    total_length = length1 + length2
    longest = max(length1, length2)
    # Two empty texts have identical lengths, so treat their ratio as 1
    # instead of dividing by zero.
    length_ratio = min(length1, length2) / longest if longest else 1.0

    if total_length < baseline:
        coefficient = 0.5 + 0.5 * (total_length / baseline)
    else:
        # NOTE: log1p(x + 1) == log(x + 2), so the coefficient is slightly
        # above 1 exactly at the baseline; kept as is to preserve scores.
        coefficient = 1 + (math.log1p(total_length - baseline + 1) / math.log1p(scale - baseline + 1))

    # Penalise pairs whose lengths differ.
    return coefficient * length_ratio
|
70
|
+
|
71
|
+
|
72
|
+
def compute_text_similarity_favoring_length(text1, text2, baseline=100, scale=10000):
    """
    Score the similarity of two texts, damped by a length-based coefficient.

    The raw score is ``Levenshtein.ratio``; it is multiplied by the
    coefficient from ``calculate_coeff_favoring_length`` so that short texts
    do not get an overly high score.

    :param text1: first text
    :param text2: second text
    :param baseline: baseline length forwarded to the coefficient function
    :param scale: scale length forwarded to the coefficient function
    :return: the weighted similarity, capped at 1.0
    """
    raw_ratio = Levenshtein.ratio(text1, text2)
    length_coeff = calculate_coeff_favoring_length(len(text1), len(text2), baseline, scale)

    weighted = raw_ratio * length_coeff
    # The coefficient may exceed 1, so cap the final score at 1.0.
    if 1.0 < weighted:
        return 1.0
    return weighted
|
90
|
+
|
91
|
+
|
43
92
|
class DataMatcher:
|
44
93
|
""" 泛化的匹配类,对任何类型的数据进行匹配 """
|
45
94
|
|
@@ -161,6 +210,35 @@ class DataMatcher:
|
|
161
210
|
center_idx = max(indices, key=lambda x: sum(get_similarity(x, y) for y in indices))
|
162
211
|
return center_idx
|
163
212
|
|
213
|
+
def find_top_similar_pairs(self, top_n=1):
    """ Find the ``top_n`` most similar pairs among ``self.data``.

    Every unordered pair of items is scored with ``self.compute_similarity``
    and only the best ``top_n`` scores are kept in a bounded min-heap.

    :param top_n: number of most similar pairs to return; a value <= 0
        yields an empty list
    :return: a list of ``((i, j), similarity)`` tuples, where ``i < j`` are
        indices into ``self.data``, sorted by similarity in descending order
    """
    # Nothing to report: no pairs requested, or fewer than two items.
    # (Without the top_n guard the empty heap would be indexed below.)
    if top_n <= 0 or len(self.data) < 2:
        return []

    # Min-heap of (similarity, (i, j)); its root is the weakest kept pair.
    top_pairs = []
    count = len(self.data)

    for i in tqdm(range(count)):
        for j in range(i + 1, count):
            similarity = self.compute_similarity(self.data[i], self.data[j])
            entry = (similarity, (i, j))

            if len(top_pairs) < top_n:
                # Heap not full yet: keep every pair.
                heapq.heappush(top_pairs, entry)
            elif similarity > top_pairs[0][0]:
                # Better than the weakest kept pair: replace it.
                heapq.heapreplace(top_pairs, entry)

    # Turn the heap into a list ordered from most to least similar.
    top_pairs.sort(reverse=True, key=lambda x: x[0])
    return [(pair, similarity) for similarity, pair in top_pairs]
|
241
|
+
|
164
242
|
|
165
243
|
class GroupedDataMatcher(DataMatcher):
|
166
244
|
""" 对数据量特别大的情况,我们可以先对数据进行分组,然后再对每个分组进行匹配 """
|
pyxllib/algo/pupil.py
CHANGED
@@ -4,11 +4,13 @@
|
|
4
4
|
# @Email : 877362867@qq.com
|
5
5
|
# @Date : 2021/06/03 14:22
|
6
6
|
|
7
|
-
import
|
7
|
+
from bisect import bisect_right
|
8
8
|
from collections import defaultdict, Counter
|
9
|
-
import
|
9
|
+
import datetime
|
10
10
|
import re
|
11
|
+
from statistics import quantiles
|
11
12
|
import sys
|
13
|
+
import textwrap
|
12
14
|
|
13
15
|
from pyxllib.prog.newbie import typename, human_readable_number
|
14
16
|
from pyxllib.text.pupil import listalign, int2myalphaenum
|
@@ -152,6 +154,189 @@ class ValuesStat:
|
|
152
154
|
raise ValueError("无效的数据数量")
|
153
155
|
|
154
156
|
|
157
|
+
class ValuesStat2:
|
158
|
+
""" 240509周四17:33,第2代统计器
|
159
|
+
|
160
|
+
240628周五14:05 todo 关于各种特殊格式数据,怎么计算是个问题
|
161
|
+
这问题可能有些复杂,近期估计没空折腾,留以后有空折腾的一个大坑了
|
162
|
+
"""
|
163
|
+
|
164
|
+
def __init__(self, values=None, raw_values=None, data_type=None):
|
165
|
+
from statistics import pstdev, mean
|
166
|
+
|
167
|
+
# 支持输入可能带有非数值类型的raw_values
|
168
|
+
data_type = data_type or ''
|
169
|
+
if raw_values:
|
170
|
+
if 'timestamp' in data_type:
|
171
|
+
values = [x.timestamp() for x in raw_values if hasattr(x, 'timestamp')]
|
172
|
+
else:
|
173
|
+
values = [x for x in raw_values if isinstance(x, (int, float))] # todo 可能需要更泛用的判断数值的方法
|
174
|
+
|
175
|
+
self.date_type = data_type
|
176
|
+
self.raw_values = raw_values
|
177
|
+
values = values or []
|
178
|
+
self.values = sorted(values)
|
179
|
+
if self.raw_values:
|
180
|
+
self.raw_n = len(self.raw_values)
|
181
|
+
else:
|
182
|
+
self.raw_n = 0
|
183
|
+
self.n = len(values)
|
184
|
+
|
185
|
+
if 'timestamp' in data_type:
|
186
|
+
self.sum = None
|
187
|
+
else:
|
188
|
+
self.sum = sum(values)
|
189
|
+
|
190
|
+
if self.n:
|
191
|
+
self.mean = mean(self.values)
|
192
|
+
self.std = pstdev(self.values)
|
193
|
+
self.min, self.max = self.values[0], self.values[-1]
|
194
|
+
else:
|
195
|
+
self.mean = self.std = self.min = self.max = None
|
196
|
+
|
197
|
+
self.dist = None
|
198
|
+
|
199
|
+
def __len__(self):
|
200
|
+
return self.n
|
201
|
+
|
202
|
+
def _summary(self, unit=None, precision=4, percentile_count=5):
|
203
|
+
""" 返回字典结构的总结 """
|
204
|
+
""" 文本汇总性的报告
|
205
|
+
|
206
|
+
:param percentile_count: 包括两个极值端点的切分点数,
|
207
|
+
设置2,就是不设置分位数,就是只展示最小、最大值
|
208
|
+
如果设置了3,就表示"中位数、二分位数",在展示的时候,会显示50%位置的分位数值
|
209
|
+
如果设置了5,就相当于"四分位数",会显示25%、50%、75%位置的分位数值
|
210
|
+
:param unit: 展示数值时使用的单位
|
211
|
+
:param precision: 展示数值时的精度
|
212
|
+
"""
|
213
|
+
|
214
|
+
# 1 各种细分的格式化方法
|
215
|
+
def fmt0(v):
|
216
|
+
# 数量类整数的格式
|
217
|
+
return human_readable_number(v, '万')
|
218
|
+
|
219
|
+
def fmt1(v):
|
220
|
+
if isinstance(v, str):
|
221
|
+
return v
|
222
|
+
return human_readable_number(v, unit or 'K', precision)
|
223
|
+
|
224
|
+
def fmt2(v):
|
225
|
+
# 日期类数据的格式化
|
226
|
+
# todo 这个应该数据的具体格式来设置的,但是这个现在有点难写,先写死
|
227
|
+
if isinstance(v, str):
|
228
|
+
return v
|
229
|
+
elif isinstance(v, (int, float)):
|
230
|
+
v = datetime.datetime.fromtimestamp(v)
|
231
|
+
|
232
|
+
return v.strftime(unit or '%Y-%m-%d %H:%M:%S')
|
233
|
+
|
234
|
+
def fmt2b(v):
|
235
|
+
# 时间长度类数据的格式化
|
236
|
+
return human_readable_number(v, '秒')
|
237
|
+
|
238
|
+
if 'timestamp' in self.date_type:
|
239
|
+
fmt = fmt2
|
240
|
+
fmtb = fmt2b
|
241
|
+
else:
|
242
|
+
fmt = fmtb = fmt1
|
243
|
+
|
244
|
+
# 2 生成统计报告
|
245
|
+
desc = {}
|
246
|
+
if self.raw_n and self.raw_n > self.n:
|
247
|
+
desc["总数"] = f"{fmt0(self.n)}/{fmt0(self.raw_n)}≈{self.n / self.raw_n:.2%}"
|
248
|
+
else:
|
249
|
+
desc["总数"] = f"{fmt0(self.n)}"
|
250
|
+
|
251
|
+
if self.sum is not None:
|
252
|
+
desc["总和"] = f"{fmt(self.sum)}"
|
253
|
+
if self.mean is not None and self.std is not None:
|
254
|
+
desc["均值±标准差"] = f"{fmt(self.mean)}±{fmtb(self.std)}"
|
255
|
+
elif self.mean is not None:
|
256
|
+
desc["均值"] = f"{fmt(self.mean)}"
|
257
|
+
elif self.std is not None:
|
258
|
+
desc["标准差"] = f"{fmtb(self.std)}"
|
259
|
+
|
260
|
+
if self.values:
|
261
|
+
dist = [self.values[0]]
|
262
|
+
if percentile_count > 2:
|
263
|
+
quartiles = quantiles(self.values, n=percentile_count - 1)
|
264
|
+
dist += quartiles
|
265
|
+
dist.append(self.values[-1])
|
266
|
+
|
267
|
+
desc["分布"] = '/'.join([fmt(v) for v in dist])
|
268
|
+
elif self.dist:
|
269
|
+
desc["分布"] = '/'.join([fmt(v) for v in self.dist])
|
270
|
+
|
271
|
+
return desc
|
272
|
+
|
273
|
+
def summary(self, unit=None, precision=4, percentile_count=5):
|
274
|
+
""" 文本汇总性的报告
|
275
|
+
|
276
|
+
:param unit: 展示数值时使用的单位
|
277
|
+
:param precision: 展示数值时的精度
|
278
|
+
:param percentile_count: 包括两个极值端点的切分点数,
|
279
|
+
设置2,就是不设置分位数,就是只展示最小、最大值
|
280
|
+
如果设置了3,就表示"中位数、二分位数",在展示的时候,会显示50%位置的分位数值
|
281
|
+
如果设置了5,就相当于"四分位数",会显示25%、50%、75%位置的分位数值
|
282
|
+
"""
|
283
|
+
desc = self._summary(unit, precision, percentile_count)
|
284
|
+
return '\t'.join([f"{key}: {value}" for key, value in desc.items()])
|
285
|
+
|
286
|
+
def calculate_ratios(self, x_values, fmt=False, unit=False):
|
287
|
+
""" 计算并返回一个字典,其中包含每个 x_values 中的值与其小于等于该值的元素的比例
|
288
|
+
|
289
|
+
:param x_values: 一个数值列表,用来计算每个数值小于等于它的元素的比例
|
290
|
+
:param fmt: 直接将值格式化好
|
291
|
+
:return: 一个字典,键为输入的数值,值为对应的比例(百分比)
|
292
|
+
"""
|
293
|
+
ratio_dict = {}
|
294
|
+
for x in x_values:
|
295
|
+
position = bisect_right(self.values, x)
|
296
|
+
if self.n > 0:
|
297
|
+
ratio = (position / self.n)
|
298
|
+
else:
|
299
|
+
ratio = 0
|
300
|
+
ratio_dict[x] = ratio
|
301
|
+
|
302
|
+
def unit_func(x):
|
303
|
+
if unit:
|
304
|
+
return human_readable_number(x, unit, 4)
|
305
|
+
return x
|
306
|
+
|
307
|
+
if fmt:
|
308
|
+
ratio_dict = {unit_func(x): f'{ratio:.2%}' for x, ratio in ratio_dict.items()}
|
309
|
+
|
310
|
+
return ratio_dict
|
311
|
+
|
312
|
+
def group_count(self, max_entries=None, min_count=None):
|
313
|
+
""" 统计每种取值出现的次数,并根据条件过滤结果
|
314
|
+
|
315
|
+
:param max_entries: 最多显示的条目数
|
316
|
+
:param min_count: 显示的条目至少出现的次数
|
317
|
+
"""
|
318
|
+
from collections import Counter
|
319
|
+
|
320
|
+
# 使用Counter来计数每个值出现的次数
|
321
|
+
counts = Counter(self.values or self.raw_values)
|
322
|
+
|
323
|
+
# 根据min_count过滤计数结果
|
324
|
+
if min_count is not None:
|
325
|
+
counts = {k: v for k, v in counts.items() if v >= min_count}
|
326
|
+
|
327
|
+
# 根据max_entries限制结果数量
|
328
|
+
if max_entries is not None:
|
329
|
+
# 按出现次数降序排列,然后选取前max_entries项
|
330
|
+
most_common = counts.most_common(max_entries)
|
331
|
+
# 转换回字典形式
|
332
|
+
counts = dict(most_common)
|
333
|
+
else:
|
334
|
+
# 如果没有指定max_entries,则保持所有满足min_count的结果
|
335
|
+
counts = dict(sorted(counts.items(), key=lambda item: item[1], reverse=True))
|
336
|
+
|
337
|
+
return counts
|
338
|
+
|
339
|
+
|
155
340
|
class Groups:
|
156
341
|
def __init__(self, data):
|
157
342
|
""" 分组
|
@@ -296,23 +481,6 @@ def matchpairs(xs, ys, cmp_func, least_score=sys.float_info.epsilon, *,
|
|
296
481
|
return pairs
|
297
482
|
|
298
483
|
|
299
|
-
def get_number_width(n):
    """ Return the number of decimal digits of the positive number n.

    >>> get_number_width(0)
    Traceback (most recent call last):
    AssertionError
    >>> get_number_width(9)
    1
    >>> get_number_width(10)
    2
    >>> get_number_width(97)
    2
    >>> get_number_width(10 ** 17)
    18
    """
    assert n > 0
    if isinstance(n, int):
        # Counting characters is exact for any int; log10 works on floats
        # and loses precision for large values (10 ** 17 + 1 rounds to 1e17).
        try:
            return len(str(n))
        except ValueError:
            # int -> str conversion limit exceeded; fall back to the formula
            pass
    return math.ceil(math.log10(n + 1))
|
314
|
-
|
315
|
-
|
316
484
|
class SearchBase:
|
317
485
|
""" 一个dfs、bfs模板类 """
|
318
486
|
|
pyxllib/algo/specialist.py
CHANGED
@@ -11,7 +11,8 @@ import numpy as np
|
|
11
11
|
import pandas as pd
|
12
12
|
|
13
13
|
from pyxllib.prog.pupil import DictTool
|
14
|
-
from pyxllib.prog.deprecatedlib import deprecated
|
14
|
+
# from pyxllib.prog.deprecatedlib import deprecated
|
15
|
+
from deprecated import deprecated
|
15
16
|
|
16
17
|
|
17
18
|
@deprecated(reason='这个实现方式不佳,请参考 make_index_function')
|
pyxllib/algo/stat.py
CHANGED
@@ -18,6 +18,10 @@ from pyxllib.prog.pupil import dprint, typename
|
|
18
18
|
from pyxllib.file.specialist import XlPath
|
19
19
|
|
20
20
|
pd.options.display.unicode.east_asian_width = True # 优化中文输出对齐问题
|
21
|
+
try:
    # Opt in to the 'future.no_silent_downcasting' behaviour where pandas supports it.
    pd.set_option('future.no_silent_downcasting', True)
except KeyError:
    # pandas releases without this option raise OptionError, which is a
    # KeyError subclass; nothing to configure there.
    pass
|
21
25
|
|
22
26
|
|
23
27
|
def treetable(childreds, parents, arg3=None, nodename_colname=None):
|
@@ -250,13 +254,18 @@ def xlpivot(df, index=None, columns=None, values=None):
|
|
250
254
|
:param columns: 列划分方式
|
251
255
|
:param values: 显示的值
|
252
256
|
Callable[items, value]:输出一个函数
|
257
|
+
list[str]: 支持输入属性列表,表示显示原始值的意思。如果原始值不唯一,则逗号分开拼接后显示。但这种用法就不太算是传统意义的数据透视表了
|
253
258
|
:return: 数据透视表的表格
|
254
259
|
|
255
260
|
使用示例:
|
256
261
|
def func(items): # 输入匹配的多行数据
|
257
262
|
x = items.iloc[0]
|
258
263
|
return f'{x["precision"]:.0f},{x["recall"]:.0f},{x["hmean"]:.2f},{x["fps"]}' # 返回显示的值
|
259
|
-
|
264
|
+
|
265
|
+
>> df2 = xlpivot(df, ['model_type'], ['dataset', 'total_frame'],
|
266
|
+
{'precision,recall,hmean,fps': func})
|
267
|
+
|
268
|
+
注意技巧:如果要在分组后约束特定顺序,可以使用特殊前缀进行编号对齐
|
260
269
|
"""
|
261
270
|
|
262
271
|
# 1 将分组的格式标准化
|
@@ -271,6 +280,17 @@ def xlpivot(df, index=None, columns=None, values=None):
|
|
271
280
|
index_, columns_ = reset_groups(index), reset_groups(columns)
|
272
281
|
|
273
282
|
# 2 目标值的格式标准化
|
283
|
+
def make_col_func(col):
    """ Build a cell function that displays the raw values of column ``col``.

    The returned function takes the rows matched for one cell and joins the
    stringified values of ``col`` with ``', '``; no rows give an empty string.
    """

    def func(rows):
        # No matching rows -> empty cell.
        if not len(rows):
            return ''
        return ', '.join(str(item) for item in rows[col].values)

    return func
|
290
|
+
|
291
|
+
if isinstance(values, (list, tuple)):
|
292
|
+
values = {v: make_col_func(v) for v in values}
|
293
|
+
|
274
294
|
if callable(values):
|
275
295
|
values_ = {'values': values}
|
276
296
|
elif isinstance(values, dict):
|
@@ -279,6 +299,8 @@ def xlpivot(df, index=None, columns=None, values=None):
|
|
279
299
|
raise TypeError
|
280
300
|
|
281
301
|
# 3 分组
|
302
|
+
assert len(df), 'df是空的'
|
303
|
+
|
282
304
|
keys = index_ + columns_
|
283
305
|
dfgp = df.groupby(keys)
|
284
306
|
data = defaultdict(list)
|
@@ -429,7 +451,7 @@ def print_full_dataframe(df):
|
|
429
451
|
'display.max_colwidth', None):
|
430
452
|
print(df)
|
431
453
|
|
432
|
-
pd.options
|
454
|
+
pd.options.display.max_rows = 60
|
433
455
|
|
434
456
|
|
435
457
|
def custom_fillna(df, default_fill_value='', numeric_fill_value=None, specific_fill=None):
|
@@ -456,3 +478,17 @@ def custom_fillna(df, default_fill_value='', numeric_fill_value=None, specific_f
|
|
456
478
|
df[column] = df[column].fillna(default_fill_value)
|
457
479
|
# 可以在这里添加更多条件,以处理其他数据类型,如datetime。
|
458
480
|
return df
|
481
|
+
|
482
|
+
|
483
|
+
def dataframe_to_list(df):
    """ Convert a DataFrame into a list of rows; the first row is the header. """
    # Start with the column names as the header row ...
    table = [df.columns.tolist()]
    # ... then append every data row as its own list.
    table.extend(df.values.tolist())
    return table
|