pyxllib 0.3.96__py3-none-any.whl → 0.3.197__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package versions exactly as they appear in their public registries.
- pyxllib/algo/geo.py +12 -0
- pyxllib/algo/intervals.py +1 -1
- pyxllib/algo/matcher.py +78 -0
- pyxllib/algo/pupil.py +187 -19
- pyxllib/algo/specialist.py +2 -1
- pyxllib/algo/stat.py +38 -2
- {pyxlpr → pyxllib/autogui}/__init__.py +1 -1
- pyxllib/autogui/activewin.py +246 -0
- pyxllib/autogui/all.py +9 -0
- pyxllib/{ext/autogui → autogui}/autogui.py +40 -11
- pyxllib/autogui/uiautolib.py +362 -0
- pyxllib/autogui/wechat.py +827 -0
- pyxllib/autogui/wechat_msg.py +421 -0
- pyxllib/autogui/wxautolib.py +84 -0
- pyxllib/cv/slidercaptcha.py +137 -0
- pyxllib/data/echarts.py +123 -12
- pyxllib/data/jsonlib.py +89 -0
- pyxllib/data/pglib.py +514 -30
- pyxllib/data/sqlite.py +231 -4
- pyxllib/ext/JLineViewer.py +14 -1
- pyxllib/ext/drissionlib.py +277 -0
- pyxllib/ext/kq5034lib.py +0 -1594
- pyxllib/ext/robustprocfile.py +497 -0
- pyxllib/ext/unixlib.py +6 -5
- pyxllib/ext/utools.py +108 -95
- pyxllib/ext/webhook.py +32 -14
- pyxllib/ext/wjxlib.py +88 -0
- pyxllib/ext/wpsapi.py +124 -0
- pyxllib/ext/xlwork.py +9 -0
- pyxllib/ext/yuquelib.py +1003 -71
- pyxllib/file/docxlib.py +1 -1
- pyxllib/file/libreoffice.py +165 -0
- pyxllib/file/movielib.py +9 -0
- pyxllib/file/packlib/__init__.py +112 -75
- pyxllib/file/pdflib.py +1 -1
- pyxllib/file/pupil.py +1 -1
- pyxllib/file/specialist/dirlib.py +1 -1
- pyxllib/file/specialist/download.py +10 -3
- pyxllib/file/specialist/filelib.py +266 -55
- pyxllib/file/xlsxlib.py +205 -50
- pyxllib/file/xlsyncfile.py +341 -0
- pyxllib/prog/cachetools.py +64 -0
- pyxllib/prog/filelock.py +42 -0
- pyxllib/prog/multiprogs.py +940 -0
- pyxllib/prog/newbie.py +9 -2
- pyxllib/prog/pupil.py +129 -60
- pyxllib/prog/specialist/__init__.py +176 -2
- pyxllib/prog/specialist/bc.py +5 -2
- pyxllib/prog/specialist/browser.py +11 -2
- pyxllib/prog/specialist/datetime.py +68 -0
- pyxllib/prog/specialist/tictoc.py +12 -13
- pyxllib/prog/specialist/xllog.py +5 -5
- pyxllib/prog/xlosenv.py +7 -0
- pyxllib/text/airscript.js +744 -0
- pyxllib/text/charclasslib.py +17 -5
- pyxllib/text/jiebalib.py +6 -3
- pyxllib/text/jinjalib.py +32 -0
- pyxllib/text/jsa_ai_prompt.md +271 -0
- pyxllib/text/jscode.py +159 -4
- pyxllib/text/nestenv.py +1 -1
- pyxllib/text/newbie.py +12 -0
- pyxllib/text/pupil/common.py +26 -0
- pyxllib/text/specialist/ptag.py +2 -2
- pyxllib/text/templates/echart_base.html +11 -0
- pyxllib/text/templates/highlight_code.html +17 -0
- pyxllib/text/templates/latex_editor.html +103 -0
- pyxllib/text/xmllib.py +76 -14
- pyxllib/xl.py +2 -1
- pyxllib-0.3.197.dist-info/METADATA +48 -0
- pyxllib-0.3.197.dist-info/RECORD +126 -0
- {pyxllib-0.3.96.dist-info → pyxllib-0.3.197.dist-info}/WHEEL +1 -2
- pyxllib/ext/autogui/__init__.py +0 -8
- pyxllib-0.3.96.dist-info/METADATA +0 -51
- pyxllib-0.3.96.dist-info/RECORD +0 -333
- pyxllib-0.3.96.dist-info/top_level.txt +0 -2
- pyxlpr/ai/__init__.py +0 -5
- pyxlpr/ai/clientlib.py +0 -1281
- pyxlpr/ai/specialist.py +0 -286
- pyxlpr/ai/torch_app.py +0 -172
- pyxlpr/ai/xlpaddle.py +0 -655
- pyxlpr/ai/xltorch.py +0 -705
- pyxlpr/data/__init__.py +0 -11
- pyxlpr/data/coco.py +0 -1325
- pyxlpr/data/datacls.py +0 -365
- pyxlpr/data/datasets.py +0 -200
- pyxlpr/data/gptlib.py +0 -1291
- pyxlpr/data/icdar/__init__.py +0 -96
- pyxlpr/data/icdar/deteval.py +0 -377
- pyxlpr/data/icdar/icdar2013.py +0 -341
- pyxlpr/data/icdar/iou.py +0 -340
- pyxlpr/data/icdar/rrc_evaluation_funcs_1_1.py +0 -463
- pyxlpr/data/imtextline.py +0 -473
- pyxlpr/data/labelme.py +0 -866
- pyxlpr/data/removeline.py +0 -179
- pyxlpr/data/specialist.py +0 -57
- pyxlpr/eval/__init__.py +0 -85
- pyxlpr/paddleocr.py +0 -776
- pyxlpr/ppocr/__init__.py +0 -15
- pyxlpr/ppocr/configs/rec/multi_language/generate_multi_language_configs.py +0 -226
- pyxlpr/ppocr/data/__init__.py +0 -135
- pyxlpr/ppocr/data/imaug/ColorJitter.py +0 -26
- pyxlpr/ppocr/data/imaug/__init__.py +0 -67
- pyxlpr/ppocr/data/imaug/copy_paste.py +0 -170
- pyxlpr/ppocr/data/imaug/east_process.py +0 -437
- pyxlpr/ppocr/data/imaug/gen_table_mask.py +0 -244
- pyxlpr/ppocr/data/imaug/iaa_augment.py +0 -114
- pyxlpr/ppocr/data/imaug/label_ops.py +0 -789
- pyxlpr/ppocr/data/imaug/make_border_map.py +0 -184
- pyxlpr/ppocr/data/imaug/make_pse_gt.py +0 -106
- pyxlpr/ppocr/data/imaug/make_shrink_map.py +0 -126
- pyxlpr/ppocr/data/imaug/operators.py +0 -433
- pyxlpr/ppocr/data/imaug/pg_process.py +0 -906
- pyxlpr/ppocr/data/imaug/randaugment.py +0 -143
- pyxlpr/ppocr/data/imaug/random_crop_data.py +0 -239
- pyxlpr/ppocr/data/imaug/rec_img_aug.py +0 -533
- pyxlpr/ppocr/data/imaug/sast_process.py +0 -777
- pyxlpr/ppocr/data/imaug/text_image_aug/__init__.py +0 -17
- pyxlpr/ppocr/data/imaug/text_image_aug/augment.py +0 -120
- pyxlpr/ppocr/data/imaug/text_image_aug/warp_mls.py +0 -168
- pyxlpr/ppocr/data/lmdb_dataset.py +0 -115
- pyxlpr/ppocr/data/pgnet_dataset.py +0 -104
- pyxlpr/ppocr/data/pubtab_dataset.py +0 -107
- pyxlpr/ppocr/data/simple_dataset.py +0 -372
- pyxlpr/ppocr/losses/__init__.py +0 -61
- pyxlpr/ppocr/losses/ace_loss.py +0 -52
- pyxlpr/ppocr/losses/basic_loss.py +0 -135
- pyxlpr/ppocr/losses/center_loss.py +0 -88
- pyxlpr/ppocr/losses/cls_loss.py +0 -30
- pyxlpr/ppocr/losses/combined_loss.py +0 -67
- pyxlpr/ppocr/losses/det_basic_loss.py +0 -208
- pyxlpr/ppocr/losses/det_db_loss.py +0 -80
- pyxlpr/ppocr/losses/det_east_loss.py +0 -63
- pyxlpr/ppocr/losses/det_pse_loss.py +0 -149
- pyxlpr/ppocr/losses/det_sast_loss.py +0 -121
- pyxlpr/ppocr/losses/distillation_loss.py +0 -272
- pyxlpr/ppocr/losses/e2e_pg_loss.py +0 -140
- pyxlpr/ppocr/losses/kie_sdmgr_loss.py +0 -113
- pyxlpr/ppocr/losses/rec_aster_loss.py +0 -99
- pyxlpr/ppocr/losses/rec_att_loss.py +0 -39
- pyxlpr/ppocr/losses/rec_ctc_loss.py +0 -44
- pyxlpr/ppocr/losses/rec_enhanced_ctc_loss.py +0 -70
- pyxlpr/ppocr/losses/rec_nrtr_loss.py +0 -30
- pyxlpr/ppocr/losses/rec_sar_loss.py +0 -28
- pyxlpr/ppocr/losses/rec_srn_loss.py +0 -47
- pyxlpr/ppocr/losses/table_att_loss.py +0 -109
- pyxlpr/ppocr/metrics/__init__.py +0 -44
- pyxlpr/ppocr/metrics/cls_metric.py +0 -45
- pyxlpr/ppocr/metrics/det_metric.py +0 -82
- pyxlpr/ppocr/metrics/distillation_metric.py +0 -73
- pyxlpr/ppocr/metrics/e2e_metric.py +0 -86
- pyxlpr/ppocr/metrics/eval_det_iou.py +0 -274
- pyxlpr/ppocr/metrics/kie_metric.py +0 -70
- pyxlpr/ppocr/metrics/rec_metric.py +0 -75
- pyxlpr/ppocr/metrics/table_metric.py +0 -50
- pyxlpr/ppocr/modeling/architectures/__init__.py +0 -32
- pyxlpr/ppocr/modeling/architectures/base_model.py +0 -88
- pyxlpr/ppocr/modeling/architectures/distillation_model.py +0 -60
- pyxlpr/ppocr/modeling/backbones/__init__.py +0 -54
- pyxlpr/ppocr/modeling/backbones/det_mobilenet_v3.py +0 -268
- pyxlpr/ppocr/modeling/backbones/det_resnet_vd.py +0 -246
- pyxlpr/ppocr/modeling/backbones/det_resnet_vd_sast.py +0 -285
- pyxlpr/ppocr/modeling/backbones/e2e_resnet_vd_pg.py +0 -265
- pyxlpr/ppocr/modeling/backbones/kie_unet_sdmgr.py +0 -186
- pyxlpr/ppocr/modeling/backbones/rec_mobilenet_v3.py +0 -138
- pyxlpr/ppocr/modeling/backbones/rec_mv1_enhance.py +0 -258
- pyxlpr/ppocr/modeling/backbones/rec_nrtr_mtb.py +0 -48
- pyxlpr/ppocr/modeling/backbones/rec_resnet_31.py +0 -210
- pyxlpr/ppocr/modeling/backbones/rec_resnet_aster.py +0 -143
- pyxlpr/ppocr/modeling/backbones/rec_resnet_fpn.py +0 -307
- pyxlpr/ppocr/modeling/backbones/rec_resnet_vd.py +0 -286
- pyxlpr/ppocr/modeling/heads/__init__.py +0 -54
- pyxlpr/ppocr/modeling/heads/cls_head.py +0 -52
- pyxlpr/ppocr/modeling/heads/det_db_head.py +0 -118
- pyxlpr/ppocr/modeling/heads/det_east_head.py +0 -121
- pyxlpr/ppocr/modeling/heads/det_pse_head.py +0 -37
- pyxlpr/ppocr/modeling/heads/det_sast_head.py +0 -128
- pyxlpr/ppocr/modeling/heads/e2e_pg_head.py +0 -253
- pyxlpr/ppocr/modeling/heads/kie_sdmgr_head.py +0 -206
- pyxlpr/ppocr/modeling/heads/multiheadAttention.py +0 -163
- pyxlpr/ppocr/modeling/heads/rec_aster_head.py +0 -393
- pyxlpr/ppocr/modeling/heads/rec_att_head.py +0 -202
- pyxlpr/ppocr/modeling/heads/rec_ctc_head.py +0 -88
- pyxlpr/ppocr/modeling/heads/rec_nrtr_head.py +0 -826
- pyxlpr/ppocr/modeling/heads/rec_sar_head.py +0 -402
- pyxlpr/ppocr/modeling/heads/rec_srn_head.py +0 -280
- pyxlpr/ppocr/modeling/heads/self_attention.py +0 -406
- pyxlpr/ppocr/modeling/heads/table_att_head.py +0 -246
- pyxlpr/ppocr/modeling/necks/__init__.py +0 -32
- pyxlpr/ppocr/modeling/necks/db_fpn.py +0 -111
- pyxlpr/ppocr/modeling/necks/east_fpn.py +0 -188
- pyxlpr/ppocr/modeling/necks/fpn.py +0 -138
- pyxlpr/ppocr/modeling/necks/pg_fpn.py +0 -314
- pyxlpr/ppocr/modeling/necks/rnn.py +0 -92
- pyxlpr/ppocr/modeling/necks/sast_fpn.py +0 -284
- pyxlpr/ppocr/modeling/necks/table_fpn.py +0 -110
- pyxlpr/ppocr/modeling/transforms/__init__.py +0 -28
- pyxlpr/ppocr/modeling/transforms/stn.py +0 -135
- pyxlpr/ppocr/modeling/transforms/tps.py +0 -308
- pyxlpr/ppocr/modeling/transforms/tps_spatial_transformer.py +0 -156
- pyxlpr/ppocr/optimizer/__init__.py +0 -61
- pyxlpr/ppocr/optimizer/learning_rate.py +0 -228
- pyxlpr/ppocr/optimizer/lr_scheduler.py +0 -49
- pyxlpr/ppocr/optimizer/optimizer.py +0 -160
- pyxlpr/ppocr/optimizer/regularizer.py +0 -52
- pyxlpr/ppocr/postprocess/__init__.py +0 -55
- pyxlpr/ppocr/postprocess/cls_postprocess.py +0 -33
- pyxlpr/ppocr/postprocess/db_postprocess.py +0 -234
- pyxlpr/ppocr/postprocess/east_postprocess.py +0 -143
- pyxlpr/ppocr/postprocess/locality_aware_nms.py +0 -200
- pyxlpr/ppocr/postprocess/pg_postprocess.py +0 -52
- pyxlpr/ppocr/postprocess/pse_postprocess/__init__.py +0 -15
- pyxlpr/ppocr/postprocess/pse_postprocess/pse/__init__.py +0 -29
- pyxlpr/ppocr/postprocess/pse_postprocess/pse/setup.py +0 -14
- pyxlpr/ppocr/postprocess/pse_postprocess/pse_postprocess.py +0 -118
- pyxlpr/ppocr/postprocess/rec_postprocess.py +0 -654
- pyxlpr/ppocr/postprocess/sast_postprocess.py +0 -355
- pyxlpr/ppocr/tools/__init__.py +0 -14
- pyxlpr/ppocr/tools/eval.py +0 -83
- pyxlpr/ppocr/tools/export_center.py +0 -77
- pyxlpr/ppocr/tools/export_model.py +0 -129
- pyxlpr/ppocr/tools/infer/predict_cls.py +0 -151
- pyxlpr/ppocr/tools/infer/predict_det.py +0 -300
- pyxlpr/ppocr/tools/infer/predict_e2e.py +0 -169
- pyxlpr/ppocr/tools/infer/predict_rec.py +0 -414
- pyxlpr/ppocr/tools/infer/predict_system.py +0 -204
- pyxlpr/ppocr/tools/infer/utility.py +0 -629
- pyxlpr/ppocr/tools/infer_cls.py +0 -83
- pyxlpr/ppocr/tools/infer_det.py +0 -134
- pyxlpr/ppocr/tools/infer_e2e.py +0 -122
- pyxlpr/ppocr/tools/infer_kie.py +0 -153
- pyxlpr/ppocr/tools/infer_rec.py +0 -146
- pyxlpr/ppocr/tools/infer_table.py +0 -107
- pyxlpr/ppocr/tools/program.py +0 -596
- pyxlpr/ppocr/tools/test_hubserving.py +0 -117
- pyxlpr/ppocr/tools/train.py +0 -163
- pyxlpr/ppocr/tools/xlprog.py +0 -748
- pyxlpr/ppocr/utils/EN_symbol_dict.txt +0 -94
- pyxlpr/ppocr/utils/__init__.py +0 -24
- pyxlpr/ppocr/utils/dict/ar_dict.txt +0 -117
- pyxlpr/ppocr/utils/dict/arabic_dict.txt +0 -162
- pyxlpr/ppocr/utils/dict/be_dict.txt +0 -145
- pyxlpr/ppocr/utils/dict/bg_dict.txt +0 -140
- pyxlpr/ppocr/utils/dict/chinese_cht_dict.txt +0 -8421
- pyxlpr/ppocr/utils/dict/cyrillic_dict.txt +0 -163
- pyxlpr/ppocr/utils/dict/devanagari_dict.txt +0 -167
- pyxlpr/ppocr/utils/dict/en_dict.txt +0 -63
- pyxlpr/ppocr/utils/dict/fa_dict.txt +0 -136
- pyxlpr/ppocr/utils/dict/french_dict.txt +0 -136
- pyxlpr/ppocr/utils/dict/german_dict.txt +0 -143
- pyxlpr/ppocr/utils/dict/hi_dict.txt +0 -162
- pyxlpr/ppocr/utils/dict/it_dict.txt +0 -118
- pyxlpr/ppocr/utils/dict/japan_dict.txt +0 -4399
- pyxlpr/ppocr/utils/dict/ka_dict.txt +0 -153
- pyxlpr/ppocr/utils/dict/korean_dict.txt +0 -3688
- pyxlpr/ppocr/utils/dict/latin_dict.txt +0 -185
- pyxlpr/ppocr/utils/dict/mr_dict.txt +0 -153
- pyxlpr/ppocr/utils/dict/ne_dict.txt +0 -153
- pyxlpr/ppocr/utils/dict/oc_dict.txt +0 -96
- pyxlpr/ppocr/utils/dict/pu_dict.txt +0 -130
- pyxlpr/ppocr/utils/dict/rs_dict.txt +0 -91
- pyxlpr/ppocr/utils/dict/rsc_dict.txt +0 -134
- pyxlpr/ppocr/utils/dict/ru_dict.txt +0 -125
- pyxlpr/ppocr/utils/dict/ta_dict.txt +0 -128
- pyxlpr/ppocr/utils/dict/table_dict.txt +0 -277
- pyxlpr/ppocr/utils/dict/table_structure_dict.txt +0 -2759
- pyxlpr/ppocr/utils/dict/te_dict.txt +0 -151
- pyxlpr/ppocr/utils/dict/ug_dict.txt +0 -114
- pyxlpr/ppocr/utils/dict/uk_dict.txt +0 -142
- pyxlpr/ppocr/utils/dict/ur_dict.txt +0 -137
- pyxlpr/ppocr/utils/dict/xi_dict.txt +0 -110
- pyxlpr/ppocr/utils/dict90.txt +0 -90
- pyxlpr/ppocr/utils/e2e_metric/Deteval.py +0 -574
- pyxlpr/ppocr/utils/e2e_metric/polygon_fast.py +0 -83
- pyxlpr/ppocr/utils/e2e_utils/extract_batchsize.py +0 -87
- pyxlpr/ppocr/utils/e2e_utils/extract_textpoint_fast.py +0 -457
- pyxlpr/ppocr/utils/e2e_utils/extract_textpoint_slow.py +0 -592
- pyxlpr/ppocr/utils/e2e_utils/pgnet_pp_utils.py +0 -162
- pyxlpr/ppocr/utils/e2e_utils/visual.py +0 -162
- pyxlpr/ppocr/utils/en_dict.txt +0 -95
- pyxlpr/ppocr/utils/gen_label.py +0 -81
- pyxlpr/ppocr/utils/ic15_dict.txt +0 -36
- pyxlpr/ppocr/utils/iou.py +0 -54
- pyxlpr/ppocr/utils/logging.py +0 -69
- pyxlpr/ppocr/utils/network.py +0 -84
- pyxlpr/ppocr/utils/ppocr_keys_v1.txt +0 -6623
- pyxlpr/ppocr/utils/profiler.py +0 -110
- pyxlpr/ppocr/utils/save_load.py +0 -150
- pyxlpr/ppocr/utils/stats.py +0 -72
- pyxlpr/ppocr/utils/utility.py +0 -80
- pyxlpr/ppstructure/__init__.py +0 -13
- pyxlpr/ppstructure/predict_system.py +0 -187
- pyxlpr/ppstructure/table/__init__.py +0 -13
- pyxlpr/ppstructure/table/eval_table.py +0 -72
- pyxlpr/ppstructure/table/matcher.py +0 -192
- pyxlpr/ppstructure/table/predict_structure.py +0 -136
- pyxlpr/ppstructure/table/predict_table.py +0 -221
- pyxlpr/ppstructure/table/table_metric/__init__.py +0 -16
- pyxlpr/ppstructure/table/table_metric/parallel.py +0 -51
- pyxlpr/ppstructure/table/table_metric/table_metric.py +0 -247
- pyxlpr/ppstructure/table/tablepyxl/__init__.py +0 -13
- pyxlpr/ppstructure/table/tablepyxl/style.py +0 -283
- pyxlpr/ppstructure/table/tablepyxl/tablepyxl.py +0 -118
- pyxlpr/ppstructure/utility.py +0 -71
- pyxlpr/xlai.py +0 -10
- /pyxllib/{ext/autogui → autogui}/virtualkey.py +0 -0
- {pyxllib-0.3.96.dist-info → pyxllib-0.3.197.dist-info/licenses}/LICENSE +0 -0
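Two of the changes above break import paths for downstream code: the `pyxllib/ext/autogui` package moves to `pyxllib/autogui`, and the entire `pyxlpr` top-level package is removed. A minimal sketch of how a caller could absorb the autogui move, assuming the moved module keeps its public names (this try/except shim is illustrative, not part of the package):

```python
# Version-tolerant import for the ext/autogui -> autogui move (illustrative shim).
try:
    from pyxllib.autogui import autogui  # layout in pyxllib 0.3.197
except ImportError:
    from pyxllib.ext.autogui import autogui  # layout in pyxllib 0.3.96
```

Code that imported anything under `pyxlpr` has no such fallback; it must pin `pyxllib<=0.3.96`.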
pyxlpr/ppocr/tools/xlprog.py
DELETED
@@ -1,748 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-# @Author : 陈坤泽
-# @Email : 877362867@qq.com
-# @Date : 2022/02/21 11:07
-
-"""
-A higher-level engineering wrapper around PaddleOcr
-"""
-import collections
-import os
-import sys
-import re
-
-import pandas as pd
-import yaml
-import shutil
-import copy
-import inspect
-import math
-import json
-
-import numpy as np
-from tqdm import tqdm
-
-from pyxlpr.ppocr.tools.program import preprocess
-from pyxlpr.ppocr.data import build_dataloader
-
-from pyxllib.algo.geo import rect_bounds, ltrb2xywh
-from pyxllib.file.specialist import XlPath, ensure_localfile, ensure_localdir
-from pyxllib.cv.xlcvlib import xlcv
-from pyxllib.prog.newbie import round_int
-
-
-class PaddleOcrBaseConfig:
-    """ A wrapper around the standard paddle(ocr) config files. To simplify configuration
-    for my own use, this is a middle layer that makes it convenient to set and
-    override parameters uniformly.
-    """
-
-    def __init__(self):
-        self.cfg = {}
-
-    def __1_config(self):
-        """ Config-file-related functionality """
-        pass
-
-    def autoset(self):
-        """ A hook for common, general-purpose settings """
-
-        x = self.cfg['Global']
-        x['use_visualdl'] = True
-        x['print_batch_step'] = 1000  # the unit is iters; the original default of 2 was tiny, so I raised it. With few epochs, output still appears once per epoch.
-        x['pretrained_model'] = None
-        # How many epochs between model saves. The default was 1200; it is deliberately set huge here,
-        # which effectively disables periodic saving; set it manually when needed.
-        # Even without periodic saves, the best model is still saved based on eval.
-        x['save_epoch_step'] = 100000
-
-        self.set_save_dir('models/' + inspect.stack()[3].function)
-
-    def resume(self, train=False):
-        """ If no checkpoints are set, try to load the best_accuracy or latest model
-
-        Which model is loaded by default depends on whether this is Train mode:
-        train should load latest, other situations prefer best_accuracy.
-        """
-        if train:  # when training, prefer resuming the previous run
-            candidates = ['latest', 'best_accuracy']
-        else:  # in other situations, default to the best model
-            candidates = ['best_accuracy', 'latest']
-
-        for name in candidates:
-            f = XlPath(self.cfg['Global']['save_model_dir']) / name
-            if f.with_suffix('.pdparams').exists():
-                self.cfg['Global']['checkpoints'] = f
-                return
-
-    def config_from_content(self, content):
-        self.cfg = yaml.safe_load(content)
-        self.autoset()
-        return self.cfg
-
-    def config_from_template(self, subpath):
-        """
-        :param subpath: e.g. 'det/det_mv3_db'
-        """
-        f = os.path.join(sys.modules['pyxlpr.ppocr'].__path__[0], 'configs', subpath + '.yml')
-        return self.config_from_content(XlPath(f).read_text())
-
-    def set_save_dir(self, save_dir):
-        """ Several runtime output paths can be unified under one directory, and set only once """
-        # self.d['Global']
-        save_dir = XlPath(save_dir)
-        x = self.cfg['Global']
-        x['save_model_dir'] = save_dir  # model directory during train
-        x['save_inference_dir'] = save_dir / 'infer'  # directory used by export_model
-        # Not yet sure what this option does exactly; it may be db-specific
-        x['save_res_path'] = save_dir / 'predicts.txt'
-
-    def set_simpledataset(self, mode, data_dir, label_file_list, ratio_list=None):
-        """ Paddle's official standard SimpleDataset data format
-
-        :param str mode: Train or Eval, i.e. configure the training or the validation set
-        :param PathLike data_dir: root directory of the data
-        :param list label_file_list: list of label files [txtfile1, textfile2, ...]
-            in each txt file, every line is the annotation of one image:
-            column 1 is the image path relative to data_dir, tab-separated from column 2,
-            which is json.dumps annotation data;
-            the json has a transcription field for the text content and points for the quad box
-        :param list ratio_list: with a single label_file_list entry a plain number also works,
-            but passing a list is preferred; a decimal in 0~1.0, the proportion of samples to take
-            (the official paddle implementation samples randomly, without any order)
-        """
-        # Note that some field formats differ between SimpleDataSet and XlSimpleDataSet,
-        # so to be safe, self.cfg[mode]['dataset'] is reset wholesale.
-        node = self.cfg[mode]['dataset']
-        x = {'name': 'SimpleDataSet',
-             'data_dir': XlPath(data_dir),
-             'label_file_list': label_file_list}
-        if ratio_list:
-            x['ratio_list'] = ratio_list
-        x['transforms'] = node['transforms']
-        self.cfg[mode]['dataset'] = x
-
-    def set_xlsimpledataset(self, mode, data_dir, data_list):
-        """ Configure my own XlSampleDataSet data format
-
-        Converts various native annotation formats into paddle's in-memory format at runtime,
-        so no redundant intermediate data files need to be generated.
-        The main extension is support for the xllabelme annotation format, e.g. labelme_det.
-
-        :param str mode: Train or Eval, i.e. configure the training or the validation set
-        :param PathLike data_dir: root directory of the data
-        :param list data_list: the concrete data manifest; every entry is a dict
-            [required] type: the concrete data format; currently labelme_det, icdar2015, refineAgree
-                see the from_-prefixed member methods of the XlSimpleDataSet class for what is supported
-            the other fields are optional; see the from_ definitions, common ones include:
-            [ratio] a decimal proportion, may be negative to take samples from the end
-                typically used to split train/eval in code instead of physically separating the datasets
-
-        """
-        node = self.cfg[mode]['dataset']
-        x = {'name': 'XlSimpleDataSet',
-             'data_dir': XlPath(data_dir),
-             'data_list': data_list}
-        x['transforms'] = node['transforms']
-        self.cfg[mode]['dataset'] = x
-
-    @classmethod
-    def _rset_posix_path(cls, d):
-        from pathlib import Path
-
-        if isinstance(d, list):
-            for i, x in enumerate(d):
-                if isinstance(x, (Path, XlPath)):
-                    d[i] = x.as_posix()
-                else:
-                    cls._rset_posix_path(x)
-        elif isinstance(d, dict):
-            for k, v in d.items():
-                if isinstance(v, (Path, XlPath)):
-                    d[k] = v.as_posix()
-                else:
-                    cls._rset_posix_path(v)
-
-    def rset_posix_path(self):
-        """ The config dict may contain XlPath/Path objects, which must be recursively
-        converted to str for storage
-
-        rset means recursive set
-        """
-        d = copy.deepcopy(self.cfg)
-        self._rset_posix_path(d)
-        return d
-
-    def write_cfg_tempfile(self):
-        """ Write the config to a temp file and return its path """
-        p = XlPath.tempfile('.yml')
-        # TODO before writing, convert every XlPath in the config to an as_posix str
-        self._rset_posix_path(self.cfg)
-        p.write_yaml(self.cfg)
-        return str(p)
-
-    def add_config_to_cmd_argv(self):
-        """ Append the config parameters to the command line's -c option """
-        sys.argv = sys.argv + ['-c', self.write_cfg_tempfile()]
-
-    def set_iter_num(self, num):
-        """ Set the training length by iteration count
-
-        Paddle configs cannot natively measure training length in iters,
-        so epoch_num has to be back-computed from batch_size_per_card and the dataset size.
-
-        Note: set up the dataset first, then the iteration count!
-        """
-        config, device, logger, _ = preprocess(from_dict=self.rset_posix_path(), use_visualdl=False)
-        train_dataloader = build_dataloader(config, 'Train', device, logger)
-        per_epoch_iter_num = len(train_dataloader)  # iters per epoch
-        self.cfg['Global']['epoch_num'] = math.ceil(num / per_epoch_iter_num)
-
-    def __2_main(self):
-        """ Script utilities """
-        pass
-
-    def train(self, resume=False):
-        from pyxlpr.ppocr.tools.train import main
-
-        if resume:
-            self.resume(True)
-        config, device, logger, vdl_writer = preprocess(is_train=True, from_dict=self.rset_posix_path())
-        main(config, device, logger, vdl_writer)
-
-    def eval(self, resume=True, *, dataset_mode='Eval'):
-        """
-        :param dataset_mode: the dataset to use, Eval by default; Train also works
-        """
-        from pyxlpr.ppocr.tools.eval import main
-
-        if resume:
-            self.resume()
-
-        config, device, logger, vdl_writer = preprocess(from_dict=self.rset_posix_path())
-        for k in ['name', 'data_dir', 'data_list']:
-            config['Eval']['dataset'][k] = config[dataset_mode]['dataset'][k]
-        metric = main(config, device, logger)
-        return metric
-
-    def infer_det(self, resume=True):
-        from pyxlpr.ppocr.tools.infer_det import main
-
-        if resume:
-            self.resume()
-        config, device, logger, vdl_writer = preprocess(from_dict=self.rset_posix_path())
-        main(config, logger)
-
-    def export_model(self, resume=True):
-        from pyxlpr.ppocr.tools.export_model import main
-
-        if resume:
-            self.resume()
-        config, device, logger, vdl_writer = preprocess(from_dict=self.rset_posix_path())
-        main(config, logger)
-
-    def __3_pretrained(self):
-        """ Wrappers for configs that use pretrained models """
-
-    @classmethod
-    def get_pretrained_model_backbone(cls, name):
-        """ Take only the backbone network weights """
-        local_file = XlPath.userdir() / f'.paddleocr/pretrained/{name}.pdparams'
-        url = f'https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/{name}.pdparams'
-        ensure_localfile(local_file, url)
-        return local_file.parent / local_file.stem  # omit the .pdparams suffix
-
-    @classmethod
-    def get_pretrained_model_ppocr(cls, name):
-        local_dir = XlPath.userdir() / f'.paddleocr/pretrained/{name}'
-        url = f'https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/{name}.tar'
-        ensure_localdir(local_dir, url, wrap=-1)
-        return local_dir / 'best_accuracy'  # the ocr model trained by ppocr
-
-    def set_pretrained_model_backbone(self, name):
-        path = self.get_pretrained_model_backbone(name)
-        self.cfg['Global']['pretrained_model'] = path
-
-    def set_pretrained_model_ppocr(self, name):
-        path = self.get_pretrained_model_ppocr(name)
-        self.cfg['Global']['pretrained_model'] = path
-
-    def set_pretrained_infer_model(self, local_dir, url):
-        """ A config parameter I added myself, used during metric """
-        local_dir = XlPath.userdir() / f'.paddleocr/pretrained_infer/{local_dir}'
-        path = ensure_localdir(local_dir, url, wrap=-1)
-        self.cfg['Global']['pretrained_infer_model'] = path
-
-    def set_pretrained_model(self, pretrained, models):
-        """ A further wrapper over the above, to keep higher-level configuration code simple
-
-        :param bool|int pretrained:
-            0 do not use pretrained weights
-            1 use the backbone network weights
-            2 use the full ppocr weights
-            3 the best model from previous custom training
-        :param models: the models loaded when pretrained is 1 or 2
-        """
-        if pretrained == 1:
-            self.set_pretrained_model_backbone(models[0])
-        elif pretrained == 2:
-            self.set_pretrained_model_ppocr(models[1])
-        elif pretrained == 3:
-            self.cfg['Global']['pretrained_model'] = self.cfg['Global']['save_model_dir'] / 'best_accuracy'
-
-    def __call__(self, *args, **kwargs):
-        # let the fire library work with `return self` without raising
-        pass
-
-
-class XlDetText(PaddleOcrBaseConfig):
-    """ Configuration dedicated to detection models
-    """
-
-    def autolabel(self, datadir, *, model_type=0, **kwargs):
-        """ Pre-annotate detection and recognition
-
-        TODO should model_type default to 2 in the det1_mobile case?
-
-        """
-        pocr = self.build_ppocr(model_type, **kwargs)
-        pocr.ocr2labelme(datadir, det=True, rec=True)
-
-    def set_deploy_args_det(self):
-        """ The detection model's deploy-time parameters, which are not necessarily the same as eval's.
-        Put differently, eval ought to stay as close as possible to the real deployment configuration.
-
-        Many text-detection config files use eval settings that differ from deployment,
-        so this adjusts them automatically to match the deploy situation.
-
-        Of course, if some eval settings genuinely perform better than deploy's,
-        deploy could adopt the eval configuration instead.
-        """
-        for x in self.cfg['Eval']['dataset']['transforms']:
-            if 'DetResizeForTest' in x:
-                x['DetResizeForTest'] = {'limit_side_len': 960, 'limit_type': 'max'}
-
-    def det1_mobile_init(self, *, pretrained=2):
-        """
-        Official experiment: ic15, train 1000 + val 500 images, batch_size_per_card=8, epoch=1200.
-        That is 1.2M training samples in total, or 150k iters after dividing by the batch size.
-        In the hesuan (nucleic-acid project) experiments each iter took about 0.4s,
-        so the whole run is roughly 150000 * 0.4 / 3600, about 17 hours.
-
-        With batchsize=8, the hesuan training run used 6.7G of GPU memory.
-        Will try to record numbers from other experiments too, but from memory they are all about the same.
-
-        The deploy files total 3M.
-
-        TODO the multi-entry datalist functionality has not been tested, but should mostly be fine.
-        """
-        # 1 load the base configuration
-        cfg = self.config_from_template('det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0')
-        self.set_pretrained_model(pretrained, ['MobileNetV3_large_x0_5_pretrained', 'ch_ppocr_mobile_v2.0_det_train'])
-        self.set_deploy_args_det()
-
-        # 2 the pretrained weights also get a deploy model, for later metric analysis
-        infer_model_url = 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar'
-        self.set_pretrained_infer_model('ch_ppocr_mobile_v2.0_det_infer', infer_model_url)
-
-    def det1_server_init(self, *, pretrained=2):
-        """
-        Training uses 10.2G of GPU memory.
-
-        The deploy files total 47M.
-        """
-        # 1 load the base configuration
-        cfg = self.config_from_template('det/ch_ppocr_v2.0/ch_det_res18_db_v2.0')
-        self.set_pretrained_model(pretrained, ['ResNet18_vd_pretrained', 'ch_ppocr_server_v2.0_det_train'])
-        self.set_deploy_args_det()
-
-        # 2 the pretrained weights also get a deploy model, for later metric analysis
-        infer_model_url = 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_infer.tar'
-        self.set_pretrained_infer_model('ch_ppocr_server_v2.0_det_infer', infer_model_url)
-
-    def det2_init(self, *, pretrained=1):
-        """ PP-OCRv2, released 2021-09-07.
-        I have not test-run it yet, so this configuration may not be right.
-
-        2022-02-23 (Wed) 18:11: it runs now, but something is still off; the metric results are puzzling.
-        """
-        cfg = self.config_from_template('det/ch_PP-OCRv2/ch_PP-OCRv2_det_distill')
-        if pretrained:
-            x = cfg['Architecture']['Models']
-
-            # self.set_pretrained_model_ppocr('ch_PP-OCRv2_det_distill_train')
-            x['Student']['pretrained'] = self.get_pretrained_model_backbone('MobileNetV3_large_x0_5_pretrained')
-            # x['Student']['pretrained'] = self.get_pretrained_model_ppocr('ch_PP-OCRv2_det_distill_train')
-            x['Teacher']['pretrained'] = self.get_pretrained_model_ppocr('ch_ppocr_server_v2.0_det_train')
-
-        self.set_deploy_args_det()
-
-        infer_model_url = 'https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar'
-        self.set_pretrained_infer_model('ch_PP-OCRv2_det_infer', infer_model_url)
-
-        return self
-
-    def build_ppocr(self, model_type=2, **kwargs):
-        """ Get the interface class used for deployment.
-        Exports the deploy model and loads it.
-
-        :param model_type:
-            0, the original PaddleOcr
-            1, the deploy files shipped with the config (requires the Global.pretrained_infer_model parameter)
-            2, the fine-tuned model
-        :param kwargs: extra detection parameters, e.g. the common det_db_unclip_ratio=1.5
-        """
-        from pyxlpr.paddleocr import PaddleOCR
-
-        if model_type == 0:
-            ppocr = PaddleOCR.build_ppocr(**kwargs)
-        elif model_type == 1:
-            d = self.cfg['Global']['pretrained_infer_model']
-            if not d:
-                return {}
-            ppocr = PaddleOCR.build_ppocr(det_model_dir=d, **kwargs)
-        else:
-            self.export_model(True)
-            ppocr = PaddleOCR.build_ppocr(det_model_dir=self.cfg['Global']['save_inference_dir'], **kwargs)
-
-        return ppocr
-
-    def _build_dataset(self, config, logger, dataset_mode='Eval'):
-        from pyxlpr.ppocr.data import build_dataset
-        # Note the dataset switch here differs a bit from PaddleOCRConfig.eval,
-        # because deployment must replace the transforms as well
-        src = config[dataset_mode]['dataset']
-        config['Eval']['dataset'] = {'name': src['name'],
-                                     'data_dir': src['data_dir'],
-                                     'data_list': src['data_list'],
-                                     'transforms': [{'DetLabelEncode': None}]}
-        dataset = build_dataset(config, 'Eval', logger)
-        return config, dataset
-
-    def eval_deploy(self, model_type=2, dataset_mode='Eval', **kwargs):
-        ppocr = self.build_ppocr(model_type, **kwargs)
-        config, device, logger, vdl_writer = preprocess(from_dict=self.rset_posix_path())
-        config, dataset = self._build_dataset(config, logger, dataset_mode)
-        metric = ppocr.det_metric(dataset)
-        logger.info(str(metric))
-        return metric
-
-    def metric(self, *, print_mode=False):
-        """ Produce a combined evaluation result, typically like:
-                type             train_dataset   eval_dataset
-                ①PaddleOCR*     32.35*56        100.0*190
-                ②pretrained     17.57*43        50.0*22
-                ③pretrained*    17.57*184       50.0*192
-                ④finetune       93.05*49        100.0*20
-                ⑤finetune*      93.05*173       100.0*164
-
-        A few regularities:
-        1. Accuracy ②=③, speed ③>②. If the accuracies differ, the official pretrained model or deploy files are likely faulty.
-        2. Accuracy ④=⑤, speed ⑤>④. If the accuracies differ, eval and deploy probably process the images differently.
-            det had exactly this problem (the processed image sizes differed); fixed via set_deploy_args_det.
-        3. Dropping the two eval-stage rows leaves a comparison between models ①③⑤:
-            ①, PaddleOCR's ready-made end-to-end model, usually beats ③, training from the open weights,
-            and loses to ⑤, the customized model. That is, accuracy: ③ < ① < ⑤.
-        """
-        import pandas as pd
-        from pyxllib.algo.stat import xlpivot
-
-        # 1 collect each model's results
-        eval_list = []
-
-        def core(title, eval_func):
-            for dataset in ['a、Train', 'b、Eval']:
-                m = eval_func(dataset[2:])  # m, metric
-                m = {k: (round_int(v) if k in ('fps', 'total_frame') else round(v * 100, 2)) for k, v in m.items()}
-                m2 = {'model_type': title, 'dataset': dataset}
-                m2.update(m)
-                eval_list.append(m2)
-
-        core('①PaddleOCR*', lambda m: self.eval_deploy(model_type=0, dataset_mode=m))
-        core('②pretrained', lambda m: self.eval(resume=False, dataset_mode=m))
-        core('③pretrained*', lambda m: self.eval_deploy(model_type=1, dataset_mode=m))
-        core('④finetune', lambda m: self.eval(resume=True, dataset_mode=m))
-        core('⑤finetune*', lambda m: self.eval_deploy(model_type=2, dataset_mode=m))
-
-        # 2 the final summary table
-        df = pd.DataFrame.from_records(eval_list)
-        outfile = self.cfg['Global']['save_model_dir'] / f'results/metric.html'
-        os.makedirs(outfile.parent, exist_ok=True)
-
-        def func(items):
-            x = items.iloc[0]
-            return f'{x["precision"]:.0f},{x["recall"]:.0f},{x["hmean"]:.2f},{x["fps"]}'
-
-        df2 = xlpivot(df, ['model_type'], ['dataset', 'total_frame'], {'precision,recall,hmean,fps': func})
-        stat_html = df2.to_html()
-        stat_html = stat_html.replace('<th></th>', f'<th>{sys.argv[2]}</th>', 1)
-        outfile.write_text(stat_html)
-
-        if 'metric' in sys.argv:
-            print(df2)
-            return
-
-        if print_mode:
-            print(df2)
-
-        return df
-
-    def create_visual_results(self, *, model_type=2, max_samples=None, **kwargs):
-        """ Generate visualization results into a directory
-
-        :param max_samples: cap the number of visualization images; sometimes a few samples are enough
-
-        [Algorithm] The basic idea is to convert the data to coco format, then use the coco interfaces;
-            time to find out how usable my earlier interfaces really are.
-        1. Initialize the specified ppocr
-        2. Generate a set of detection results with ppocr
-        3. Compare against gt to produce a coco dataset
-        4. Generate the coco visualization
-        5. Generate the coco analysis spreadsheet
-        """
-        import PIL.Image
-        from pyxlpr.data.coco import CocoGtData, CocoMatch
-
-        ppocr = self.build_ppocr(model_type, **kwargs)
-        for dataset_mode in ['Train', 'Eval']:  # generate results for both sets, in two separate directories
-            gt = {'images': [],
-                  'annotations': [],
-                  'categories': CocoGtData.gen_categories(['text'])}
-            dt = []
-            k = 1
-
-            config, device, logger, vdl_writer = preprocess(from_dict=self.rset_posix_path())
-            config, dataset = self._build_dataset(config, logger, dataset_mode)
-            out_dir = self.cfg['Global']['save_model_dir'] / f'results/{dataset_mode}'
-            data_dir = self.cfg['Eval']['dataset']['data_dir']
-            for img_id, x in enumerate(dataset, start=1):
-                if max_samples and img_id > max_samples:
-                    break
-
-                # 1 copy the image into the relative directory
-                src_img_path = x['img_path']
-                rel_img_path = XlPath(src_img_path).relpath(data_dir)
-                dst_img_path = out_dir / rel_img_path
-                os.makedirs(dst_img_path.parent, exist_ok=True)
-                if not dst_img_path.is_file():
-                    shutil.copy2(src_img_path, dst_img_path)
-
-                # 2 generate the image's gt annotations
-                w, h = PIL.Image.open(str(dst_img_path)).size
-                gt['images'].append(CocoGtData.gen_image(img_id, rel_img_path, h, w))
-                for p in x['polys']:
-                    gt['annotations'].append(
-                        CocoGtData.gen_annotation(id=k, image_id=img_id, points=p, text=x['texts']))
-                    k += 1
-
-                # 3 generate the dt annotations
-                img = xlcv.read_from_buffer(x['image'])
-                for p in ppocr.ocr(img, rec=False):
-                    dt.append({'image_id': img_id, 'category_id': 1, 'segmentation': np.array(p).reshape([1, -1]),
-                               'bbox': ltrb2xywh(rect_bounds(p)), 'score': 1.0})
-
-            cm = CocoMatch(gt, dt)
-            cm.to_labelme_match(out_dir, segmentation=True)
-            cm.to_excel(out_dir / 'cocomatch.xlsx')
-
-    def __config_demo(self):
-        """ Common configuration examples """
-
-    def set_xllabelme_dataset(self, data_dir, ratio_list):
-        """ Configure text-detection annotation data in xllabelme format
-
-        A simple dataset convention of my own design.
-
-        :param data_dir: data root directory
-        :param list[float, float] ratio_list: the proportions wanted for the training and validation sets
-            Values may be negative, meaning take from the end; a fixed random seed underneath makes
-            the chosen files deterministic on every run.
-            For smaller datasets, [0.9, -0.1] is generally recommended; for larger ones, [0.8, -0.2].
-        """
-        self.set_xlsimpledataset('Train', data_dir, [{'type': 'labelme_det', 'ratio': ratio_list[0]}])
-        self.set_xlsimpledataset('Eval', data_dir, [{'type': 'labelme_det', 'ratio': ratio_list[1]}])
-
-    def det1_mobile_raw(self):
-        """ Example configuration in paddle's native format """
-        self.det1_mobile_init(pretrained=2)  # base configuration
-        self.set_save_dir('train/det1_mobile_raw')  # model save location
-        self.set_simpledataset('Train', 'data', ['data/ppdet_train.txt'])
-        self.set_simpledataset('Eval', 'data', ['data/ppdet_val.txt'])
-        self.set_iter_num(150000)
-        return self
-
-    def det1_mobile(self):
-        """ Detection training on labelme-format annotations """
-        self.det1_mobile_init(pretrained=2)  # base configuration
-        self.set_save_dir('train/det1_mobile')  # model save location
-        self.set_xllabelme_dataset('data', [0.9, -0.1])  # dataset
-        self.set_iter_num(150000)  # iteration count
-        return self
-
-    def det1_server(self):
-        self.det1_server_init(pretrained=2)  # base configuration
-        self.set_save_dir('train/det1_server')  # model save location
-        self.set_xllabelme_dataset('data', [0.9, -0.1])  # dataset
-        self.set_iter_num(150000)  # iteration count
-        return self
-
-
-class XlRec(PaddleOcrBaseConfig):
-    """ Configuration dedicated to recognition models
-    """
-
-    def stat_texts(self, xllabelme_data_dir, *, ref_dict='ppocr_keys_v1.txt'):
-        """ Inspect how the annotated sentences and characters are distributed (statistics texts)
-
-        :param xllabelme_data_dir: directory of the xllabelme-format annotation data
-        :param ref_dict: reference dictionary file
-        """
-        from collections import Counter
-        from pyxllib.algo.pupil import ValuesStat
-        from pyxllib.algo.stat import dataframes_to_excel
-        from pyxlpr.ppocr.utils import get_dict_content
-
-        root = XlPath(xllabelme_data_dir)
-        outfile = root.parent / 'stat_texts.xlsx'
-
-        # 1 read the data
-        sentances_counter = Counter()  # each sentence's content, and how often identical sentences occur
-        for f in root.rglob('*.json'):
-            for sp in f.read_json()['shapes']:
-                attr = json.loads(sp['label'])
-                if 'text' in attr:
-                    text = attr['text']
-                    sentances_counter[text] += 1
-
-        # 2 count frequencies: sentances per sentence, words per word, chars per character
-        chars_counter = Counter()
-        words_counter = Counter()
-        for sentance, cnt in sentances_counter.items():
-            for word in sentance.split():  # split on whitespace for now; strictly, Chinese needs jieba segmentation to be accurate
-                words_counter[word] += cnt
-            for ch in sentance:  # count every character's occurrences, spaces included
-                chars_counter[ch] += cnt
-
-        # 3 to DataFrames
-        char_dict = set(get_dict_content(ref_dict).splitlines())
-        ls = []
-        new_chars = []
-        for char, cnt in chars_counter.most_common():
-            ls.append([char, cnt, '' if char in char_dict else 'True'])
-            if char not in char_dict and char != ' ':
-                new_chars.append(char)
-        chars_df = pd.DataFrame.from_records(ls, columns=['char', 'count', 'new_char'])
-
-        words_df = pd.DataFrame.from_records(words_counter.most_common(), columns=['word', 'count'])
-        sentances_df = pd.DataFrame.from_records([[sentance, cnt, len(sentance)]
-                                                  for sentance, cnt in sentances_counter.most_common()],
-                                                 columns=['sentance', 'count', 'length'])
-
-        # compute the distribution of sentence lengths
-        ct = Counter()
-        lengths = []
-        for _, row in sentances_df.iterrows():
-            ct[row['length']] += row['count']
-            lengths += [row['length']] * row['count']  # not elegant, but needed for compatibility with ValuesStat
-        # ct = sentances_df.groupby('length').sum().to_dict()['count']
-        max_len = max(sentances_df['length'])
-        sentances_length_df = pd.DataFrame.from_records([[i, ct.get(i, 0)] for i in range(max_len + 1)],
-                                                        columns=['length', 'count'])
-
-        # 4 frequency statistics
-        def summary(title, vals):
-            msg = ValuesStat(vals).summary(['g', '.2f', '.2f', 'g', 'g'])
-            # print(msg)
-            return [title] + re.findall(r':\s+(\S+)', msg)
-
-        print('【stat_texts】')
-        print(f'输出文件:{outfile.as_posix()}')
-
-        print(f'不在字典中的{len(new_chars)}个字符:' + ''.join(new_chars))
-
-        ls = [
-            summary('字符频数', chars_df['count']),
-            summary('词组频数', words_df['count']),
-            summary('句子频数', sentances_df['count']),
-            summary('句子长度', lengths),
-        ]
-        df = pd.DataFrame.from_records(ls, columns=['title', '总和', '均值标准差', '总数', '最小值', '最大值'])
-        print(df)
-
-        # 5 save the analysis spreadsheet
-        sheets = {'字符': chars_df, '词组': words_df,
-                  '句子': sentances_df, '句子长度': sentances_length_df}
-        dataframes_to_excel(outfile, sheets)
-
-    def create_recdata(self, src, dst, *, print_mode=True, recreate=False):
-        """ From xllabelme-format annotations, generate paddle's recognition data format,
-        i.e. extract the text-line data used to train a text recognition model
-
-        :param src: xllabelme_data_dir
-        :param dst: root directory of the destination
-        :param recreate: if the destination directory exists, delete it and regenerate
-
-        Note: this generation method is for reference only; it is not very general-purpose yet
-        """
-        # 0
-        src, dst = XlPath(src), XlPath(dst)
-        if recreate and dst.is_dir():
-            dst.delete()  # delete it if already present
-
-        # 1 generate the images
-        chars = set()
-        labels1, labels2 = [], []
-        for f in tqdm(list(src.rglob('*.json')), desc='提取文本行数据', disable=not print_mode):
-            data = f.read_json()
-            impath = f.parent / data['imagePath']
-            im = xlcv.read(impath)
-            for i, sp in enumerate(data['shapes'], start=1):
-                # group a: crop text lines by their bounding rectangle
-                name = f'imgs/{f.stem}_r{i:03}.jpg'
-                text = json.loads(sp['label'])['text']
-                chars |= set(text)
-                xlcv.write(xlcv.get_sub(im, sp['points']), dst / name)
-                labels1.append(f'{name}\t{text}')
-
-                # group b: crop text lines with an affine warp to rectify them
-                name = f'imgs/{f.stem}_w{i:03}.jpg'
-                xlcv.write(xlcv.get_sub(im, sp['points'], warp_quad=True), dst / name)
-                labels2.append(f'{name}\t{text}')
-
-        # 2 dictionary file
-        chars -= set(' \n\t')  # whitespace characters must be removed
-        (dst / 'char_dict.txt').write_text('\n'.join(sorted(chars)))
-
-        # 3 label data
-        (dst / 'labels_rect.txt').write_text('\n'.join(labels1))
-        (dst / 'labels_warp.txt').write_text('\n'.join(labels2))
-        (dst / 'labels_total.txt').write_text('\n'.join(labels1 + labels2))
-
-        return self
-
-    def set_rec_dataset(self, data_dir, label_file_list):
-        """ Configure the recognition dataset
-
-        :param data_dir: root directory of the data
-        :param list[str|list] label_file_list: list of label files
-            str, the relative path of a label file
-            list[str, float], besides the label file path, a ratio value for the proportion of samples to take
-
-        TODO would like an integrated setup, but no good design yet; splitting the data by hand
-            and configuring it in autoset also works and is not much trouble
-        """
-
-        # self.cfg['Train']['dataset']['data_dir'] = Paths.eleclabel / 'recdata'
-        # self.cfg['Train']['dataset']['label_file_list'] = [Paths.eleclabel / 'recdata/labels_ab.txt']
-        # self.cfg['Eval']['dataset']['data_dir'] = Paths.eleclabel / 'recdata'
-        # self.cfg['Eval']['dataset']['label_file_list'] = [Paths.eleclabel / 'recdata/labels_ab.txt']
-
-        raise NotImplementedError
-
-
-class XlCls:
-    """ Classification model; this basically uses native paddlepaddle, not the stronger paddleclas """
-
-
-class XlOcr:
-    """ Wraps the text-technology stack: standardized workflows for detection and recognition """
-
-    def __init__(self, root):
-        self.root = XlPath(root)  # project root directory
-
-    def step1_autolabel(self):
-        """ Pre-annotate detection and recognition """
-
-    def step2_refinelabel(self):
-        """ Manually refine the label annotations """
-
-    def step3_det(self):
-        """ Train the detection model """