pyxllib 0.3.96__py3-none-any.whl → 0.3.197__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyxllib/algo/geo.py +12 -0
- pyxllib/algo/intervals.py +1 -1
- pyxllib/algo/matcher.py +78 -0
- pyxllib/algo/pupil.py +187 -19
- pyxllib/algo/specialist.py +2 -1
- pyxllib/algo/stat.py +38 -2
- {pyxlpr → pyxllib/autogui}/__init__.py +1 -1
- pyxllib/autogui/activewin.py +246 -0
- pyxllib/autogui/all.py +9 -0
- pyxllib/{ext/autogui → autogui}/autogui.py +40 -11
- pyxllib/autogui/uiautolib.py +362 -0
- pyxllib/autogui/wechat.py +827 -0
- pyxllib/autogui/wechat_msg.py +421 -0
- pyxllib/autogui/wxautolib.py +84 -0
- pyxllib/cv/slidercaptcha.py +137 -0
- pyxllib/data/echarts.py +123 -12
- pyxllib/data/jsonlib.py +89 -0
- pyxllib/data/pglib.py +514 -30
- pyxllib/data/sqlite.py +231 -4
- pyxllib/ext/JLineViewer.py +14 -1
- pyxllib/ext/drissionlib.py +277 -0
- pyxllib/ext/kq5034lib.py +0 -1594
- pyxllib/ext/robustprocfile.py +497 -0
- pyxllib/ext/unixlib.py +6 -5
- pyxllib/ext/utools.py +108 -95
- pyxllib/ext/webhook.py +32 -14
- pyxllib/ext/wjxlib.py +88 -0
- pyxllib/ext/wpsapi.py +124 -0
- pyxllib/ext/xlwork.py +9 -0
- pyxllib/ext/yuquelib.py +1003 -71
- pyxllib/file/docxlib.py +1 -1
- pyxllib/file/libreoffice.py +165 -0
- pyxllib/file/movielib.py +9 -0
- pyxllib/file/packlib/__init__.py +112 -75
- pyxllib/file/pdflib.py +1 -1
- pyxllib/file/pupil.py +1 -1
- pyxllib/file/specialist/dirlib.py +1 -1
- pyxllib/file/specialist/download.py +10 -3
- pyxllib/file/specialist/filelib.py +266 -55
- pyxllib/file/xlsxlib.py +205 -50
- pyxllib/file/xlsyncfile.py +341 -0
- pyxllib/prog/cachetools.py +64 -0
- pyxllib/prog/filelock.py +42 -0
- pyxllib/prog/multiprogs.py +940 -0
- pyxllib/prog/newbie.py +9 -2
- pyxllib/prog/pupil.py +129 -60
- pyxllib/prog/specialist/__init__.py +176 -2
- pyxllib/prog/specialist/bc.py +5 -2
- pyxllib/prog/specialist/browser.py +11 -2
- pyxllib/prog/specialist/datetime.py +68 -0
- pyxllib/prog/specialist/tictoc.py +12 -13
- pyxllib/prog/specialist/xllog.py +5 -5
- pyxllib/prog/xlosenv.py +7 -0
- pyxllib/text/airscript.js +744 -0
- pyxllib/text/charclasslib.py +17 -5
- pyxllib/text/jiebalib.py +6 -3
- pyxllib/text/jinjalib.py +32 -0
- pyxllib/text/jsa_ai_prompt.md +271 -0
- pyxllib/text/jscode.py +159 -4
- pyxllib/text/nestenv.py +1 -1
- pyxllib/text/newbie.py +12 -0
- pyxllib/text/pupil/common.py +26 -0
- pyxllib/text/specialist/ptag.py +2 -2
- pyxllib/text/templates/echart_base.html +11 -0
- pyxllib/text/templates/highlight_code.html +17 -0
- pyxllib/text/templates/latex_editor.html +103 -0
- pyxllib/text/xmllib.py +76 -14
- pyxllib/xl.py +2 -1
- pyxllib-0.3.197.dist-info/METADATA +48 -0
- pyxllib-0.3.197.dist-info/RECORD +126 -0
- {pyxllib-0.3.96.dist-info → pyxllib-0.3.197.dist-info}/WHEEL +1 -2
- pyxllib/ext/autogui/__init__.py +0 -8
- pyxllib-0.3.96.dist-info/METADATA +0 -51
- pyxllib-0.3.96.dist-info/RECORD +0 -333
- pyxllib-0.3.96.dist-info/top_level.txt +0 -2
- pyxlpr/ai/__init__.py +0 -5
- pyxlpr/ai/clientlib.py +0 -1281
- pyxlpr/ai/specialist.py +0 -286
- pyxlpr/ai/torch_app.py +0 -172
- pyxlpr/ai/xlpaddle.py +0 -655
- pyxlpr/ai/xltorch.py +0 -705
- pyxlpr/data/__init__.py +0 -11
- pyxlpr/data/coco.py +0 -1325
- pyxlpr/data/datacls.py +0 -365
- pyxlpr/data/datasets.py +0 -200
- pyxlpr/data/gptlib.py +0 -1291
- pyxlpr/data/icdar/__init__.py +0 -96
- pyxlpr/data/icdar/deteval.py +0 -377
- pyxlpr/data/icdar/icdar2013.py +0 -341
- pyxlpr/data/icdar/iou.py +0 -340
- pyxlpr/data/icdar/rrc_evaluation_funcs_1_1.py +0 -463
- pyxlpr/data/imtextline.py +0 -473
- pyxlpr/data/labelme.py +0 -866
- pyxlpr/data/removeline.py +0 -179
- pyxlpr/data/specialist.py +0 -57
- pyxlpr/eval/__init__.py +0 -85
- pyxlpr/paddleocr.py +0 -776
- pyxlpr/ppocr/__init__.py +0 -15
- pyxlpr/ppocr/configs/rec/multi_language/generate_multi_language_configs.py +0 -226
- pyxlpr/ppocr/data/__init__.py +0 -135
- pyxlpr/ppocr/data/imaug/ColorJitter.py +0 -26
- pyxlpr/ppocr/data/imaug/__init__.py +0 -67
- pyxlpr/ppocr/data/imaug/copy_paste.py +0 -170
- pyxlpr/ppocr/data/imaug/east_process.py +0 -437
- pyxlpr/ppocr/data/imaug/gen_table_mask.py +0 -244
- pyxlpr/ppocr/data/imaug/iaa_augment.py +0 -114
- pyxlpr/ppocr/data/imaug/label_ops.py +0 -789
- pyxlpr/ppocr/data/imaug/make_border_map.py +0 -184
- pyxlpr/ppocr/data/imaug/make_pse_gt.py +0 -106
- pyxlpr/ppocr/data/imaug/make_shrink_map.py +0 -126
- pyxlpr/ppocr/data/imaug/operators.py +0 -433
- pyxlpr/ppocr/data/imaug/pg_process.py +0 -906
- pyxlpr/ppocr/data/imaug/randaugment.py +0 -143
- pyxlpr/ppocr/data/imaug/random_crop_data.py +0 -239
- pyxlpr/ppocr/data/imaug/rec_img_aug.py +0 -533
- pyxlpr/ppocr/data/imaug/sast_process.py +0 -777
- pyxlpr/ppocr/data/imaug/text_image_aug/__init__.py +0 -17
- pyxlpr/ppocr/data/imaug/text_image_aug/augment.py +0 -120
- pyxlpr/ppocr/data/imaug/text_image_aug/warp_mls.py +0 -168
- pyxlpr/ppocr/data/lmdb_dataset.py +0 -115
- pyxlpr/ppocr/data/pgnet_dataset.py +0 -104
- pyxlpr/ppocr/data/pubtab_dataset.py +0 -107
- pyxlpr/ppocr/data/simple_dataset.py +0 -372
- pyxlpr/ppocr/losses/__init__.py +0 -61
- pyxlpr/ppocr/losses/ace_loss.py +0 -52
- pyxlpr/ppocr/losses/basic_loss.py +0 -135
- pyxlpr/ppocr/losses/center_loss.py +0 -88
- pyxlpr/ppocr/losses/cls_loss.py +0 -30
- pyxlpr/ppocr/losses/combined_loss.py +0 -67
- pyxlpr/ppocr/losses/det_basic_loss.py +0 -208
- pyxlpr/ppocr/losses/det_db_loss.py +0 -80
- pyxlpr/ppocr/losses/det_east_loss.py +0 -63
- pyxlpr/ppocr/losses/det_pse_loss.py +0 -149
- pyxlpr/ppocr/losses/det_sast_loss.py +0 -121
- pyxlpr/ppocr/losses/distillation_loss.py +0 -272
- pyxlpr/ppocr/losses/e2e_pg_loss.py +0 -140
- pyxlpr/ppocr/losses/kie_sdmgr_loss.py +0 -113
- pyxlpr/ppocr/losses/rec_aster_loss.py +0 -99
- pyxlpr/ppocr/losses/rec_att_loss.py +0 -39
- pyxlpr/ppocr/losses/rec_ctc_loss.py +0 -44
- pyxlpr/ppocr/losses/rec_enhanced_ctc_loss.py +0 -70
- pyxlpr/ppocr/losses/rec_nrtr_loss.py +0 -30
- pyxlpr/ppocr/losses/rec_sar_loss.py +0 -28
- pyxlpr/ppocr/losses/rec_srn_loss.py +0 -47
- pyxlpr/ppocr/losses/table_att_loss.py +0 -109
- pyxlpr/ppocr/metrics/__init__.py +0 -44
- pyxlpr/ppocr/metrics/cls_metric.py +0 -45
- pyxlpr/ppocr/metrics/det_metric.py +0 -82
- pyxlpr/ppocr/metrics/distillation_metric.py +0 -73
- pyxlpr/ppocr/metrics/e2e_metric.py +0 -86
- pyxlpr/ppocr/metrics/eval_det_iou.py +0 -274
- pyxlpr/ppocr/metrics/kie_metric.py +0 -70
- pyxlpr/ppocr/metrics/rec_metric.py +0 -75
- pyxlpr/ppocr/metrics/table_metric.py +0 -50
- pyxlpr/ppocr/modeling/architectures/__init__.py +0 -32
- pyxlpr/ppocr/modeling/architectures/base_model.py +0 -88
- pyxlpr/ppocr/modeling/architectures/distillation_model.py +0 -60
- pyxlpr/ppocr/modeling/backbones/__init__.py +0 -54
- pyxlpr/ppocr/modeling/backbones/det_mobilenet_v3.py +0 -268
- pyxlpr/ppocr/modeling/backbones/det_resnet_vd.py +0 -246
- pyxlpr/ppocr/modeling/backbones/det_resnet_vd_sast.py +0 -285
- pyxlpr/ppocr/modeling/backbones/e2e_resnet_vd_pg.py +0 -265
- pyxlpr/ppocr/modeling/backbones/kie_unet_sdmgr.py +0 -186
- pyxlpr/ppocr/modeling/backbones/rec_mobilenet_v3.py +0 -138
- pyxlpr/ppocr/modeling/backbones/rec_mv1_enhance.py +0 -258
- pyxlpr/ppocr/modeling/backbones/rec_nrtr_mtb.py +0 -48
- pyxlpr/ppocr/modeling/backbones/rec_resnet_31.py +0 -210
- pyxlpr/ppocr/modeling/backbones/rec_resnet_aster.py +0 -143
- pyxlpr/ppocr/modeling/backbones/rec_resnet_fpn.py +0 -307
- pyxlpr/ppocr/modeling/backbones/rec_resnet_vd.py +0 -286
- pyxlpr/ppocr/modeling/heads/__init__.py +0 -54
- pyxlpr/ppocr/modeling/heads/cls_head.py +0 -52
- pyxlpr/ppocr/modeling/heads/det_db_head.py +0 -118
- pyxlpr/ppocr/modeling/heads/det_east_head.py +0 -121
- pyxlpr/ppocr/modeling/heads/det_pse_head.py +0 -37
- pyxlpr/ppocr/modeling/heads/det_sast_head.py +0 -128
- pyxlpr/ppocr/modeling/heads/e2e_pg_head.py +0 -253
- pyxlpr/ppocr/modeling/heads/kie_sdmgr_head.py +0 -206
- pyxlpr/ppocr/modeling/heads/multiheadAttention.py +0 -163
- pyxlpr/ppocr/modeling/heads/rec_aster_head.py +0 -393
- pyxlpr/ppocr/modeling/heads/rec_att_head.py +0 -202
- pyxlpr/ppocr/modeling/heads/rec_ctc_head.py +0 -88
- pyxlpr/ppocr/modeling/heads/rec_nrtr_head.py +0 -826
- pyxlpr/ppocr/modeling/heads/rec_sar_head.py +0 -402
- pyxlpr/ppocr/modeling/heads/rec_srn_head.py +0 -280
- pyxlpr/ppocr/modeling/heads/self_attention.py +0 -406
- pyxlpr/ppocr/modeling/heads/table_att_head.py +0 -246
- pyxlpr/ppocr/modeling/necks/__init__.py +0 -32
- pyxlpr/ppocr/modeling/necks/db_fpn.py +0 -111
- pyxlpr/ppocr/modeling/necks/east_fpn.py +0 -188
- pyxlpr/ppocr/modeling/necks/fpn.py +0 -138
- pyxlpr/ppocr/modeling/necks/pg_fpn.py +0 -314
- pyxlpr/ppocr/modeling/necks/rnn.py +0 -92
- pyxlpr/ppocr/modeling/necks/sast_fpn.py +0 -284
- pyxlpr/ppocr/modeling/necks/table_fpn.py +0 -110
- pyxlpr/ppocr/modeling/transforms/__init__.py +0 -28
- pyxlpr/ppocr/modeling/transforms/stn.py +0 -135
- pyxlpr/ppocr/modeling/transforms/tps.py +0 -308
- pyxlpr/ppocr/modeling/transforms/tps_spatial_transformer.py +0 -156
- pyxlpr/ppocr/optimizer/__init__.py +0 -61
- pyxlpr/ppocr/optimizer/learning_rate.py +0 -228
- pyxlpr/ppocr/optimizer/lr_scheduler.py +0 -49
- pyxlpr/ppocr/optimizer/optimizer.py +0 -160
- pyxlpr/ppocr/optimizer/regularizer.py +0 -52
- pyxlpr/ppocr/postprocess/__init__.py +0 -55
- pyxlpr/ppocr/postprocess/cls_postprocess.py +0 -33
- pyxlpr/ppocr/postprocess/db_postprocess.py +0 -234
- pyxlpr/ppocr/postprocess/east_postprocess.py +0 -143
- pyxlpr/ppocr/postprocess/locality_aware_nms.py +0 -200
- pyxlpr/ppocr/postprocess/pg_postprocess.py +0 -52
- pyxlpr/ppocr/postprocess/pse_postprocess/__init__.py +0 -15
- pyxlpr/ppocr/postprocess/pse_postprocess/pse/__init__.py +0 -29
- pyxlpr/ppocr/postprocess/pse_postprocess/pse/setup.py +0 -14
- pyxlpr/ppocr/postprocess/pse_postprocess/pse_postprocess.py +0 -118
- pyxlpr/ppocr/postprocess/rec_postprocess.py +0 -654
- pyxlpr/ppocr/postprocess/sast_postprocess.py +0 -355
- pyxlpr/ppocr/tools/__init__.py +0 -14
- pyxlpr/ppocr/tools/eval.py +0 -83
- pyxlpr/ppocr/tools/export_center.py +0 -77
- pyxlpr/ppocr/tools/export_model.py +0 -129
- pyxlpr/ppocr/tools/infer/predict_cls.py +0 -151
- pyxlpr/ppocr/tools/infer/predict_det.py +0 -300
- pyxlpr/ppocr/tools/infer/predict_e2e.py +0 -169
- pyxlpr/ppocr/tools/infer/predict_rec.py +0 -414
- pyxlpr/ppocr/tools/infer/predict_system.py +0 -204
- pyxlpr/ppocr/tools/infer/utility.py +0 -629
- pyxlpr/ppocr/tools/infer_cls.py +0 -83
- pyxlpr/ppocr/tools/infer_det.py +0 -134
- pyxlpr/ppocr/tools/infer_e2e.py +0 -122
- pyxlpr/ppocr/tools/infer_kie.py +0 -153
- pyxlpr/ppocr/tools/infer_rec.py +0 -146
- pyxlpr/ppocr/tools/infer_table.py +0 -107
- pyxlpr/ppocr/tools/program.py +0 -596
- pyxlpr/ppocr/tools/test_hubserving.py +0 -117
- pyxlpr/ppocr/tools/train.py +0 -163
- pyxlpr/ppocr/tools/xlprog.py +0 -748
- pyxlpr/ppocr/utils/EN_symbol_dict.txt +0 -94
- pyxlpr/ppocr/utils/__init__.py +0 -24
- pyxlpr/ppocr/utils/dict/ar_dict.txt +0 -117
- pyxlpr/ppocr/utils/dict/arabic_dict.txt +0 -162
- pyxlpr/ppocr/utils/dict/be_dict.txt +0 -145
- pyxlpr/ppocr/utils/dict/bg_dict.txt +0 -140
- pyxlpr/ppocr/utils/dict/chinese_cht_dict.txt +0 -8421
- pyxlpr/ppocr/utils/dict/cyrillic_dict.txt +0 -163
- pyxlpr/ppocr/utils/dict/devanagari_dict.txt +0 -167
- pyxlpr/ppocr/utils/dict/en_dict.txt +0 -63
- pyxlpr/ppocr/utils/dict/fa_dict.txt +0 -136
- pyxlpr/ppocr/utils/dict/french_dict.txt +0 -136
- pyxlpr/ppocr/utils/dict/german_dict.txt +0 -143
- pyxlpr/ppocr/utils/dict/hi_dict.txt +0 -162
- pyxlpr/ppocr/utils/dict/it_dict.txt +0 -118
- pyxlpr/ppocr/utils/dict/japan_dict.txt +0 -4399
- pyxlpr/ppocr/utils/dict/ka_dict.txt +0 -153
- pyxlpr/ppocr/utils/dict/korean_dict.txt +0 -3688
- pyxlpr/ppocr/utils/dict/latin_dict.txt +0 -185
- pyxlpr/ppocr/utils/dict/mr_dict.txt +0 -153
- pyxlpr/ppocr/utils/dict/ne_dict.txt +0 -153
- pyxlpr/ppocr/utils/dict/oc_dict.txt +0 -96
- pyxlpr/ppocr/utils/dict/pu_dict.txt +0 -130
- pyxlpr/ppocr/utils/dict/rs_dict.txt +0 -91
- pyxlpr/ppocr/utils/dict/rsc_dict.txt +0 -134
- pyxlpr/ppocr/utils/dict/ru_dict.txt +0 -125
- pyxlpr/ppocr/utils/dict/ta_dict.txt +0 -128
- pyxlpr/ppocr/utils/dict/table_dict.txt +0 -277
- pyxlpr/ppocr/utils/dict/table_structure_dict.txt +0 -2759
- pyxlpr/ppocr/utils/dict/te_dict.txt +0 -151
- pyxlpr/ppocr/utils/dict/ug_dict.txt +0 -114
- pyxlpr/ppocr/utils/dict/uk_dict.txt +0 -142
- pyxlpr/ppocr/utils/dict/ur_dict.txt +0 -137
- pyxlpr/ppocr/utils/dict/xi_dict.txt +0 -110
- pyxlpr/ppocr/utils/dict90.txt +0 -90
- pyxlpr/ppocr/utils/e2e_metric/Deteval.py +0 -574
- pyxlpr/ppocr/utils/e2e_metric/polygon_fast.py +0 -83
- pyxlpr/ppocr/utils/e2e_utils/extract_batchsize.py +0 -87
- pyxlpr/ppocr/utils/e2e_utils/extract_textpoint_fast.py +0 -457
- pyxlpr/ppocr/utils/e2e_utils/extract_textpoint_slow.py +0 -592
- pyxlpr/ppocr/utils/e2e_utils/pgnet_pp_utils.py +0 -162
- pyxlpr/ppocr/utils/e2e_utils/visual.py +0 -162
- pyxlpr/ppocr/utils/en_dict.txt +0 -95
- pyxlpr/ppocr/utils/gen_label.py +0 -81
- pyxlpr/ppocr/utils/ic15_dict.txt +0 -36
- pyxlpr/ppocr/utils/iou.py +0 -54
- pyxlpr/ppocr/utils/logging.py +0 -69
- pyxlpr/ppocr/utils/network.py +0 -84
- pyxlpr/ppocr/utils/ppocr_keys_v1.txt +0 -6623
- pyxlpr/ppocr/utils/profiler.py +0 -110
- pyxlpr/ppocr/utils/save_load.py +0 -150
- pyxlpr/ppocr/utils/stats.py +0 -72
- pyxlpr/ppocr/utils/utility.py +0 -80
- pyxlpr/ppstructure/__init__.py +0 -13
- pyxlpr/ppstructure/predict_system.py +0 -187
- pyxlpr/ppstructure/table/__init__.py +0 -13
- pyxlpr/ppstructure/table/eval_table.py +0 -72
- pyxlpr/ppstructure/table/matcher.py +0 -192
- pyxlpr/ppstructure/table/predict_structure.py +0 -136
- pyxlpr/ppstructure/table/predict_table.py +0 -221
- pyxlpr/ppstructure/table/table_metric/__init__.py +0 -16
- pyxlpr/ppstructure/table/table_metric/parallel.py +0 -51
- pyxlpr/ppstructure/table/table_metric/table_metric.py +0 -247
- pyxlpr/ppstructure/table/tablepyxl/__init__.py +0 -13
- pyxlpr/ppstructure/table/tablepyxl/style.py +0 -283
- pyxlpr/ppstructure/table/tablepyxl/tablepyxl.py +0 -118
- pyxlpr/ppstructure/utility.py +0 -71
- pyxlpr/xlai.py +0 -10
- /pyxllib/{ext/autogui → autogui}/virtualkey.py +0 -0
- {pyxllib-0.3.96.dist-info → pyxllib-0.3.197.dist-info/licenses}/LICENSE +0 -0
pyxlpr/data/datacls.py
DELETED
@@ -1,365 +0,0 @@
|
|
1
|
-
#!/usr/bin/env python3
|
2
|
-
# -*- coding: utf-8 -*-
|
3
|
-
# @Author : 陈坤泽
|
4
|
-
# @Email : 877362867@qq.com
|
5
|
-
# @Date : 2021/05/19 16:28
|
6
|
-
|
7
|
-
""" 相关数据格式类 """
|
8
|
-
|
9
|
-
from pathlib import Path
|
10
|
-
from pyxlpr.ai import *
|
11
|
-
from pyxlpr.data import *
|
12
|
-
|
13
|
-
# from pyxlpr.data.imtextline import TextlineShape
|
14
|
-
|
15
|
-
__1_zcdata = """
|
16
|
-
"""
|
17
|
-
|
18
|
-
|
19
|
-
class ZcTextGt:
|
20
|
-
def __init__(self, root, data=None, *, imdir='images', parts=None):
|
21
|
-
"""
|
22
|
-
:param root: 数据根目录
|
23
|
-
:param imdir: 所在图片子目录
|
24
|
-
:param parts: ['test.txt', 'train.txt'] 等分块标记
|
25
|
-
|
26
|
-
备注:具体文本数据好像不太适合直接存到内存里,就先不存了。
|
27
|
-
但是这个类至少可以把相关功能整合在一起,不零散。
|
28
|
-
"""
|
29
|
-
self.root = Dir(root)
|
30
|
-
self.imdir = Dir(imdir, self.root)
|
31
|
-
self.parts = parts or []
|
32
|
-
|
33
|
-
if data is None:
|
34
|
-
pass
|
35
|
-
self.data = data
|
36
|
-
|
37
|
-
def writes_from_coco(self, gt_dict, *, prt=False):
|
38
|
-
""" coco标注 --> 智财偏好的文本标注格式
|
39
|
-
|
40
|
-
:param gt_dict: coco 的 gt 字典
|
41
|
-
|
42
|
-
TODO gt_dict可能是过量版,增设筛选规则?
|
43
|
-
"""
|
44
|
-
items = list(CocoData(gt_dict).group_gt())
|
45
|
-
for img, anns in tqdm(items, desc='生成ZcTextGt的txt标注文件', disable=not prt):
|
46
|
-
content = []
|
47
|
-
for ann in anns:
|
48
|
-
ltrb = xywh2ltrb(ann['bbox'])
|
49
|
-
ltrb = ','.join([str(int(v)) for v in ltrb])
|
50
|
-
content.append('\t'.join([ltrb, ann['label']]))
|
51
|
-
File(img['file_name'], self.imdir, suffix='.txt').write('\n'.join(content))
|
52
|
-
|
53
|
-
def writes(self, *, max_workers=8, prt=False):
|
54
|
-
""" 重新写入txt文件 """
|
55
|
-
|
56
|
-
def write(x):
|
57
|
-
file, data = x
|
58
|
-
if file: # 如果文件存在,要遵循原有的编码规则
|
59
|
-
with open(str(file), 'rb') as f:
|
60
|
-
bstr = f.read()
|
61
|
-
encoding = get_encoding(bstr)
|
62
|
-
file.write(data, encoding=encoding, if_exists='replace')
|
63
|
-
else: # 否则直接写入
|
64
|
-
file.write(data)
|
65
|
-
|
66
|
-
mtqdm(write, self.data, desc='写入labelme json数据', max_workers=max_workers, disable=not prt)
|
67
|
-
|
68
|
-
|
69
|
-
class ZcKvGt(ZcTextGt):
|
70
|
-
|
71
|
-
def writes_from_coco(self, gt_dict, *, prt=False):
|
72
|
-
""" coco标注 -> 智财偏好的,带类别的文本标注格式 """
|
73
|
-
items = list(CocoData(gt_dict).group_gt())
|
74
|
-
for img, anns in tqdm(items, '生成ZcKvGt的txt标注文件', disable=not prt):
|
75
|
-
content = []
|
76
|
-
for ann in anns:
|
77
|
-
ltrb = xywh2ltrb(ann['bbox'])
|
78
|
-
ltrb = ','.join([str(int(v)) for v in ltrb])
|
79
|
-
cat_id = ann['category_id']
|
80
|
-
cat = (cat_id + 1) // 2
|
81
|
-
if cat == 5:
|
82
|
-
cat = 0
|
83
|
-
kv = -1
|
84
|
-
else:
|
85
|
-
kv = (cat - 1) % 2
|
86
|
-
content.append('\t'.join([ltrb, str(cat), str(kv), ann['label']]))
|
87
|
-
File(img['file_name'], self.imdir, suffix='.txt').write('\n'.join(content))
|
88
|
-
|
89
|
-
|
90
|
-
class ZcKvDtOld:
|
91
|
-
""" 旧版本的解析器 """
|
92
|
-
|
93
|
-
def __init__(self, data=None):
|
94
|
-
"""
|
95
|
-
:param data:
|
96
|
-
{filepath1: [{'logo': True, 'gt': 'address', 'pr': 'address', 'lb': 'LOT 3'},
|
97
|
-
{...},
|
98
|
-
...]
|
99
|
-
filepath2: ...,
|
100
|
-
...
|
101
|
-
}
|
102
|
-
"""
|
103
|
-
self.data = data
|
104
|
-
|
105
|
-
@classmethod
|
106
|
-
def init_from_zc_txt(cls, file, *, reverse_annos=False):
|
107
|
-
""" 从文件解析出字典结构数据
|
108
|
-
|
109
|
-
:return: {filepath1: [{'logo': True, 'gt': 'address', 'pr': 'address', 'lb': 'LOT 3'},
|
110
|
-
{...},
|
111
|
-
...]
|
112
|
-
filepath2: ...,
|
113
|
-
...
|
114
|
-
}
|
115
|
-
"""
|
116
|
-
content = File(file).read()
|
117
|
-
content = re.sub(r'(.+?)(\n\n)([^\n]+\n)', r'\3\1\2', content, flags=re.DOTALL)
|
118
|
-
parts = ContentPartSpliter.multi_blank_lines(content)
|
119
|
-
|
120
|
-
data = dict()
|
121
|
-
for pt in parts:
|
122
|
-
lines = pt.splitlines()
|
123
|
-
filepath = lines[0]
|
124
|
-
annos = []
|
125
|
-
for line in lines[1:]: # 第1行是文件名,删掉
|
126
|
-
m = re.match(r'(.+?), GT: (.+?), PR: (.+?), LB: (.+)', line)
|
127
|
-
logo, gt, pr, lb = m.groups()
|
128
|
-
annos.append({'logo': logo == 'True', 'gt': gt, 'pr': pr, 'lb': lb})
|
129
|
-
if reverse_annos:
|
130
|
-
annos = list(reversed(annos))
|
131
|
-
data[filepath] = annos
|
132
|
-
|
133
|
-
return cls(data)
|
134
|
-
|
135
|
-
def to_coco_dt(self, gt_dict, *, printf=True):
|
136
|
-
""" 配合gt标注文件,做更精细的zc结果解析
|
137
|
-
"""
|
138
|
-
cat2id = {c['name']: c['id'] for c in gt_dict['categories']}
|
139
|
-
gt_annos = {x[0]['file_name']: x for x in CocoData(gt_dict).group_gt()}
|
140
|
-
dt_list = []
|
141
|
-
for file, dt_annos in self.data.items():
|
142
|
-
file_name = pathlib.Path(file).name
|
143
|
-
image, annos = gt_annos[file_name]
|
144
|
-
|
145
|
-
# 如果gt的annos比dt_annos少,需要扩充下,默认按最后一个gt的an填充
|
146
|
-
n, m = len(annos), len(dt_annos)
|
147
|
-
assert n == m # 智财如果不带box出来,是要强制框数量相同的!否则box怎么一一对应?
|
148
|
-
if n < m:
|
149
|
-
annos += [annos[-1]] * (m - n)
|
150
|
-
|
151
|
-
# TODO 有些不是按顺序匹配的,增加按文本匹配的功能
|
152
|
-
|
153
|
-
for line, an in zip(dt_annos, annos):
|
154
|
-
gt, pr, lb = an['gt'], an['pr'], an['lb']
|
155
|
-
if printf and lb != an['label']:
|
156
|
-
print(file_name, lb, '<=>', an['label']) # gt和dt的框没对应上,最好检查下问题
|
157
|
-
# 附加值是识别错误的类别,没有则代表识别正确
|
158
|
-
dt_list.append(
|
159
|
-
{'image_id': an['image_id'], 'category_id': cat2id[pr],
|
160
|
-
'bbox': an['bbox'], 'score': 1, 'label': lb})
|
161
|
-
return dt_list
|
162
|
-
|
163
|
-
|
164
|
-
class ZcKvDt(ZcKvDtOld):
|
165
|
-
""" 智财预测结果文件的通用解析类
|
166
|
-
|
167
|
-
这里是210510周一16:24新版的结果,文件顺序头写对了,并且增加了cs每个结果的置信度
|
168
|
-
"""
|
169
|
-
|
170
|
-
@classmethod
|
171
|
-
def init_from_zc_txt(cls, file, *, reverse_annos=False):
|
172
|
-
""" 从文件解析出字典结构数据
|
173
|
-
|
174
|
-
有时候可能没有对应的 coco gt,则可以用这个直接把文件解析为内存数据处理
|
175
|
-
|
176
|
-
:param reverse_annos: 是否对每个图片的标注结果,进行顺序反转
|
177
|
-
:return: {filepath1: [{'logo': True, 'gt': 'address', 'pr': 'address', 'lb': 'LOT 3', 'cs': 1.0},
|
178
|
-
{...},
|
179
|
-
...]
|
180
|
-
filepath2: ...,
|
181
|
-
...
|
182
|
-
}
|
183
|
-
"""
|
184
|
-
content = File(file).read()
|
185
|
-
parts = ContentPartSpliter.multi_blank_lines(content)
|
186
|
-
|
187
|
-
data = dict()
|
188
|
-
for pt in parts:
|
189
|
-
lines = pt.splitlines()
|
190
|
-
filepath = lines[0]
|
191
|
-
annos = []
|
192
|
-
for line in lines[1:]: # 第1行是文件名,删掉
|
193
|
-
m = re.match(r'(.+?), GT: (.+?), PR: (.+?), LB: (.+?), CS: (.+)', line)
|
194
|
-
logo, gt, pr, lb, cs = m.groups()
|
195
|
-
annos.append({'logo': logo == 'True', 'gt': gt, 'pr': pr, 'lb': lb, 'cs': float(cs)})
|
196
|
-
if reverse_annos:
|
197
|
-
annos = list(reversed(annos))
|
198
|
-
data[filepath] = annos
|
199
|
-
|
200
|
-
return cls(data)
|
201
|
-
|
202
|
-
def to_coco_dt(self, gt_dict, *, prt=True):
|
203
|
-
""" 转coco的dt格式
|
204
|
-
|
205
|
-
:param gt_dict: 需要有参照的gt文件,才能知道图片id,以及补充box位置信息
|
206
|
-
"""
|
207
|
-
cat2id = {c['name']: c['id'] for c in gt_dict['categories']}
|
208
|
-
gt_annos = {x[0]['file_name']: x for x in CocoData(gt_dict).group_gt()}
|
209
|
-
dt_list = []
|
210
|
-
for file, zc_annos in self.data.items():
|
211
|
-
file_name = pathlib.Path(file).name
|
212
|
-
image, im_annos = gt_annos[file_name]
|
213
|
-
|
214
|
-
# 如果gt的annos比dt_annos少,需要扩充下,默认按最后一个gt的an填充
|
215
|
-
n, m = len(im_annos), len(zc_annos)
|
216
|
-
if n == m:
|
217
|
-
# assert n == m # 智财如果不带box出来,是要强制框数量相同的!否则box怎么一一对应?
|
218
|
-
# if n < m:
|
219
|
-
# annos += [annos[-1]] * (m - n)
|
220
|
-
# 要以顺序整体有依据,文本内容匹配为辅的策略配对
|
221
|
-
im_annos.sort(key=lambda x: x['label'])
|
222
|
-
zc_annos.sort(key=lambda x: x['lb'])
|
223
|
-
dt_annos = []
|
224
|
-
|
225
|
-
for a, b in zip(zc_annos, im_annos):
|
226
|
-
gt, pr, lb, cs = a['gt'], a['pr'], a['lb'], a['cs']
|
227
|
-
if prt and lb != b['label']:
|
228
|
-
# gt和dt的框没对应上,最好检查下问题
|
229
|
-
warn = ' '.join([file_name, lb, '<=>', b['label']])
|
230
|
-
dprint(warn)
|
231
|
-
# 附加值是识别错误的类别,没有则代表识别正确
|
232
|
-
dt_annos.append(
|
233
|
-
{'image_id': b['image_id'], 'category_id': cat2id[pr],
|
234
|
-
'bbox': b['bbox'], 'score': cs, 'label': lb})
|
235
|
-
|
236
|
-
# 已经协调确定了空间顺序,但以防万一,可以再按空间排序下给到下游任务
|
237
|
-
# 为了效率,也可以确保gt的annos有序,操作时annos顺序不动
|
238
|
-
# 这里是前面为了匹配,已经把排序搞乱了,这里是必须要重排
|
239
|
-
dt_annos.sort(key=lambda x: TextlineShape(xywh2ltrb(x['bbox']))) # 几何重排
|
240
|
-
dt_list += dt_annos
|
241
|
-
else: # 否则不匹配用不匹配的玩法
|
242
|
-
raise NotImplementedError
|
243
|
-
|
244
|
-
return dt_list
|
245
|
-
|
246
|
-
|
247
|
-
__2_other = """
|
248
|
-
"""
|
249
|
-
|
250
|
-
|
251
|
-
class SroieTextData:
|
252
|
-
""" sroie task1、task2 的标注数据
|
253
|
-
|
254
|
-
72,25,326,25,326,64,72,64,TAN WOON YANN
|
255
|
-
50,82,440,82,440,121,50,121,BOOK TA .K(TAMAN DAYA) SDN BND
|
256
|
-
205,121,285,121,285,139,205,139,789417-W
|
257
|
-
"""
|
258
|
-
|
259
|
-
def __init__(self, root, part=None):
|
260
|
-
"""
|
261
|
-
:param root: 'data/task1+2'
|
262
|
-
images,目录下有987份jpg图片、txt标注
|
263
|
-
test.txt,361张jpg图片清单
|
264
|
-
train.txt,626张jpg图片清单
|
265
|
-
:param part:
|
266
|
-
'test', 只记录test的361张图书
|
267
|
-
'train' 等同理
|
268
|
-
"""
|
269
|
-
pass
|
270
|
-
|
271
|
-
def to_coco(self):
|
272
|
-
pass
|
273
|
-
|
274
|
-
|
275
|
-
class SroieClsData:
|
276
|
-
""" sroie task3 关键信息提取的标注 """
|
277
|
-
pass
|
278
|
-
|
279
|
-
|
280
|
-
class BaiduOcrRes:
|
281
|
-
def __init__(self, data=None):
|
282
|
-
"""
|
283
|
-
:param data: dict (key 文件名 -> value 识别结果dict)
|
284
|
-
{'words_result':
|
285
|
-
[ {'words': 'xxxx', 'location': {'top': 130, 'left': ~, 'width': ~, 'height': ~}}, ... ]
|
286
|
-
'log_id': 136768...
|
287
|
-
'words_result_num': 39
|
288
|
-
}
|
289
|
-
"""
|
290
|
-
self.data = data
|
291
|
-
|
292
|
-
@classmethod
|
293
|
-
def init_from_hxl_txt(cls, file):
|
294
|
-
""" 按训龙给的txt格式来初始化
|
295
|
-
|
296
|
-
数据格式:
|
297
|
-
一共22499行,大概是所有agreement,但可能有些空白图所以没结果
|
298
|
-
每行第一段是png完整路径,第二段是百度api返回的json读取为dict后直接print的结果
|
299
|
-
"""
|
300
|
-
lines = File(file).read().splitlines()
|
301
|
-
data = dict()
|
302
|
-
for line in tqdm(lines, '解析txt中每张图对应的字典识别数据'):
|
303
|
-
if line == '': continue
|
304
|
-
# 切分路径和json数据
|
305
|
-
# imfile, dictdata = line.split(maxsplit=1)
|
306
|
-
imfile, dictdata = re.split(r'\s+', line, maxsplit=1)
|
307
|
-
if 'debug' in str(imfile): continue # 有65张debug_的图片不要(后记:服务器上多余的65个debug图已删)
|
308
|
-
data[imfile] = eval(dictdata)
|
309
|
-
|
310
|
-
return cls(data)
|
311
|
-
|
312
|
-
def check(self, imdir):
|
313
|
-
""" 检查json数据的一些问题 """
|
314
|
-
with TicToc('缺失的文件'):
|
315
|
-
files = Path(str(imdir)).glob('*.jpg')
|
316
|
-
files1 = {f.stem for f in files}
|
317
|
-
files2 = {Path(f).stem for f in self.data.keys()}
|
318
|
-
files3 = {Path(f).stem for f in self.data.keys() if ('debug' in str(f))}
|
319
|
-
print(f'缺失{len(files1 - files2)}个文件的识别结果')
|
320
|
-
print(f'多出{len(files2 - files1)}个文件的识别结果')
|
321
|
-
print(f'{len(files3)}个debug文件')
|
322
|
-
sys.stderr.flush()
|
323
|
-
|
324
|
-
with TicToc('check errors'):
|
325
|
-
ct = Counter()
|
326
|
-
for k, v in self.data.items():
|
327
|
-
if 'error_code' in v:
|
328
|
-
ct[v['error_code']] += 1
|
329
|
-
print(k, v)
|
330
|
-
print(ct.most_common())
|
331
|
-
|
332
|
-
def to_coco_gt(self, images, *, start_dt_id=0):
|
333
|
-
""" 转成coco格式
|
334
|
-
|
335
|
-
:param images: coco gt格式的参考images (可以train、val各生成一个返回)
|
336
|
-
TODO 可以扩展支持输入图片所在目录的形式初始化的方法
|
337
|
-
:param start_dt_id: annotation起始编号
|
338
|
-
:return: coco gt格式的字典
|
339
|
-
"""
|
340
|
-
# 辅助数组
|
341
|
-
image_files = {Path(x['file_name']).stem: x['id'] for x in images}
|
342
|
-
# 遍历判断要处理的文件
|
343
|
-
annotations = []
|
344
|
-
for k, v in tqdm(self.data.items(), '解析出每张图片识别结果对应的annotations'):
|
345
|
-
stem = Path(k).stem
|
346
|
-
if stem not in image_files or 'error_code' in v:
|
347
|
-
continue
|
348
|
-
image_id = image_files[stem]
|
349
|
-
for item in v['words_result']:
|
350
|
-
loc = item['location']
|
351
|
-
bbox = [loc['left'], loc['top'], loc['width'], loc['height']]
|
352
|
-
start_dt_id += 1
|
353
|
-
an = CocoGtData.gen_annotation(id=start_dt_id, bbox=bbox, image_id=image_id, label=item['words'])
|
354
|
-
annotations.append(an)
|
355
|
-
return {'images': images,
|
356
|
-
'annotations': annotations,
|
357
|
-
'categories': CocoGtData.gen_categories(['text'])}
|
358
|
-
|
359
|
-
|
360
|
-
if __name__ == '__main__':
|
361
|
-
os.chdir('D:/home/datasets/textGroup/SROIE2019+/data/task3_testcrop')
|
362
|
-
with TicToc(__name__):
|
363
|
-
ld = LabelmeDataset.init_from_coco(r'test', 'data_crop.json')
|
364
|
-
ld.writes()
|
365
|
-
# print(ld)
|
pyxlpr/data/datasets.py
DELETED
@@ -1,200 +0,0 @@
|
|
1
|
-
#!/usr/bin/env python3
|
2
|
-
# -*- coding: utf-8 -*-
|
3
|
-
# @Author : 陈坤泽
|
4
|
-
# @Email : 877362867@qq.com
|
5
|
-
# @Date : 2021/06/25 09:34
|
6
|
-
|
7
|
-
import pathlib
|
8
|
-
|
9
|
-
from pyxllib.xl import *
|
10
|
-
from fvcore.common.registry import Registry
|
11
|
-
|
12
|
-
____basic = """
|
13
|
-
基础组件
|
14
|
-
"""
|
15
|
-
|
16
|
-
|
17
|
-
class CommonPathBase:
|
18
|
-
def __init__(self, prefix=None):
|
19
|
-
if prefix is None:
|
20
|
-
# 只要在这里设置服务器数据目录;本地目录则会将/根目录映射到D:/盘对应目录
|
21
|
-
if os.getenv('PYXLPR_COMMONDIR'): # 可以使用PYXLPR_COMMONDIR='D:/'、'/'来自定义数据根目录
|
22
|
-
prefix = os.getenv('PYXLPR_COMMONDIR')
|
23
|
-
else:
|
24
|
-
prefix = 'D:/' if sys.platform == 'win32' else '/'
|
25
|
-
prefix = XlPath(prefix) # 默认是当前操作系统的文件类型;也可以显示输入PosixPath格式的prefix
|
26
|
-
|
27
|
-
self.datasets = prefix / 'home/datasets'
|
28
|
-
self.huangzhicai = prefix / 'home/huangzhicai'
|
29
|
-
self.chenkunze = prefix / 'home/chenkunze'
|
30
|
-
self.slns = prefix / 'home/chenkunze/slns'
|
31
|
-
|
32
|
-
# slns 相关
|
33
|
-
self.d2configs = self.slns / 'detectron2/configs'
|
34
|
-
self.xlproject = self.slns / 'pyxlpr/xlproject'
|
35
|
-
|
36
|
-
# datasets 相关
|
37
|
-
self.realestate2020 = self.datasets / 'RealEstate2020'
|
38
|
-
self.realestate_coco = self.datasets / 'RealEstate2020/coco_fmt'
|
39
|
-
|
40
|
-
# textGroup 相关
|
41
|
-
self.textGroup = self.datasets / 'textGroup'
|
42
|
-
self.icdar2013 = self.textGroup / 'ICDAR2013'
|
43
|
-
self.ic13loc = self.textGroup / 'ICDAR2013/Task2.1 Text Localization'
|
44
|
-
self.publaynet = self.textGroup / 'PubLayNet/publaynet'
|
45
|
-
self.AHCDB = self.textGroup / 'AHCDB'
|
46
|
-
self.sroie = self.textGroup / 'SROIE2019' # 智财,占秋原来整理的数据
|
47
|
-
self.sroie2 = self.textGroup / 'SROIE2019+' # 我重新整理过的数据,并且子目录data里有新的数据
|
48
|
-
self.cipdf = self.textGroup / 'CIPDF' # 从cninfo下载的pdf文件数据
|
49
|
-
self.cord = self.textGroup / 'CORD' # 从cninfo下载的pdf文件数据
|
50
|
-
self.xeocr1 = self.textGroup / 'Xeon1OCR' # 从cninfo下载的pdf文件数据
|
51
|
-
|
52
|
-
# chenkunze
|
53
|
-
# 项目中一些太大的目录迁移到refdir存储;之前没有想过按chenkunze的目录同步;现在不太需要refdir了
|
54
|
-
self.refdir = self.chenkunze / 'refdir'
|
55
|
-
|
56
|
-
# huangzhicai
|
57
|
-
self.zclogs = self.huangzhicai / 'workshop/ocrwork/uniocr/logs'
|
58
|
-
self.voc2007 = self.huangzhicai / 'data/detec/voc2007/VOCdevkit/VOC2007'
|
59
|
-
|
60
|
-
|
61
|
-
if sys.platform == 'win32':
|
62
|
-
common_path = CommonPathBase()
|
63
|
-
tp10_common_path = CommonPathBase(pathlib.PurePosixPath('/')) # 十卡服务器的常用目录
|
64
|
-
else:
|
65
|
-
common_path = CommonPathBase(XlPath('/'))
|
66
|
-
tp10_common_path = common_path
|
67
|
-
|
68
|
-
____coco = """
|
69
|
-
普通的coco格式数据
|
70
|
-
|
71
|
-
目前需求这样的设计模式够了
|
72
|
-
1、但其实局限还不少,还有很多不便自定义的(不使用register_coco_instances,更灵活地修改底层)
|
73
|
-
2、以及在非d2场景的数据引用
|
74
|
-
|
75
|
-
不过现在也难想清楚,等后面切实需要的时候再扩展修改
|
76
|
-
"""
|
77
|
-
|
78
|
-
COCO_INSTANCES_REGISTRY = Registry('COCO_INSTANCES')
|
79
|
-
COCO_INSTANCES_REGISTRY.__doc__ = """
|
80
|
-
从数据集字符串名,映射到对应的初始化函数
|
81
|
-
"""
|
82
|
-
|
83
|
-
|
84
|
-
class RegisterData:
|
85
|
-
""" 旧版的数据集注册器,暂时不去修改优化了 """
|
86
|
-
|
87
|
-
@classmethod
|
88
|
-
def register_all(cls):
|
89
|
-
with TicToc('RegisterData'):
|
90
|
-
cls.register_by_annotations_dir(common_path.realestate_coco / 'agreement',
|
91
|
-
common_path.realestate_coco / 'annotations')
|
92
|
-
|
93
|
-
cls.register_by_annotations_dir(common_path.realestate_coco / 'agreement',
|
94
|
-
common_path.realestate_coco / 'annotations_det')
|
95
|
-
|
96
|
-
cls.register_by_annotations_dir(common_path.realestate_coco / 'agreement6_shade',
|
97
|
-
common_path.realestate_coco / 'annotations',
|
98
|
-
'agreement_train6_shade.json')
|
99
|
-
|
100
|
-
cls.register_by_annotations_dir(common_path.sroie2 / 'images',
|
101
|
-
common_path.sroie2 / 'annotations')
|
102
|
-
|
103
|
-
cls.register_by_annotations_dir(common_path.voc2007 / 'JPEGImages',
|
104
|
-
common_path.voc2007 / 'coco_annotations')
|
105
|
-
|
106
|
-
# 裁剪后的sroie数据
|
107
|
-
cls.register_by_annotations_dir(common_path.sroie2 / 'data/task3_crop/images',
|
108
|
-
common_path.sroie2 / 'data/task3_crop')
|
109
|
-
|
110
|
-
cls.register_by_annotations_dir(common_path.cipdf / 'images',
|
111
|
-
common_path.cipdf / 'annotations')
|
112
|
-
|
113
|
-
@classmethod
|
114
|
-
def register_by_annotations_dir(cls, imdir, andir,
|
115
|
-
patter=re.compile(r'.+_(train|val|test|minival)\d{0,}\.json'),
|
116
|
-
classes=None):
|
117
|
-
r""" 注册coco类型的数据格式
|
118
|
-
|
119
|
-
:param imdir: 图片所在目录
|
120
|
-
:param andir: 标注所在目录
|
121
|
-
会注册目录下所有以 _[train|val|test]\d{0,}.json 为后缀的文件
|
122
|
-
:param patter: 在andir下,要匹配分析的json文件
|
123
|
-
:type patter: str | re.compile
|
124
|
-
:param classes: 类别清单,例如 ['text', 'title', 'list', 'table', 'figure']
|
125
|
-
如果输入该参数,则这批patter匹配的所有文件都以这个classes为准
|
126
|
-
否则每个json读取自己的 categories 作为类清单
|
127
|
-
|
128
|
-
本函数用来简化detectron2中coco类型数据格式的注册过程,基本通用
|
129
|
-
但是由于要读取josn获取类别信息,在一些特别大的json读取
|
130
|
-
"""
|
131
|
-
from detectron2.data.datasets import register_coco_instances
|
132
|
-
|
133
|
-
# 1 标注文件
|
134
|
-
files = Dir(andir).select(patter).subfiles()
|
135
|
-
|
136
|
-
# 2 注册每一个json文件
|
137
|
-
for f in files:
|
138
|
-
if classes:
|
139
|
-
cats = classes
|
140
|
-
else:
|
141
|
-
cats = f.read(encoding='utf8')['categories']
|
142
|
-
cats = [x['name'] for x in cats]
|
143
|
-
register_coco_instances(f.stem, {'thing_classes': cats}, str(f), str(imdir))
|
144
|
-
|
145
|
-
|
146
|
-
class _DatasetRegister:
|
147
|
-
ROOT = pathlib.Path('.')
|
148
|
-
CLASSES = ('text',)
|
149
|
-
META_DATA = {}
|
150
|
-
|
151
|
-
@classmethod
|
152
|
-
def coco_instances(cls, name, json, imdir):
|
153
|
-
def func():
|
154
|
-
return cls.META_DATA, cls.ROOT / json, cls.ROOT / imdir
|
155
|
-
|
156
|
-
COCO_INSTANCES_REGISTRY._do_register(name, func) # noqa 不用装饰器就只能使用_do_register来注册了
|
157
|
-
|
158
|
-
|
159
|
-
class Publaynet(_DatasetRegister):
|
160
|
-
""" 论文版本分析的数据集 """
|
161
|
-
ROOT = common_path.publaynet
|
162
|
-
CLASSES = ['text', 'title', 'list', 'table', 'figure']
|
163
|
-
META_DATA = {'thing_classes': CLASSES}
|
164
|
-
|
165
|
-
|
166
|
-
Publaynet.coco_instances('publaynet_train', 'train_brief.json', 'train')
|
167
|
-
Publaynet.coco_instances('publaynet_val', 'val.json', 'val')
|
168
|
-
Publaynet.coco_instances('publaynet_val_mini', 'val_mini.json', 'val_mini')
|
169
|
-
Publaynet.coco_instances('publaynet_test', 'test_ids.json', 'test')
|
170
|
-
|
171
|
-
# # 也可以这样自定义函数注册数据,函数名就是数据名,然后返回 META_DATA, json, imdir 即可
|
172
|
-
# @COCO_INSTANCES_Registry.register()
|
173
|
-
# def publaynet_train():
|
174
|
-
# return Publaynet.META_DATA, Publaynet.ROOT / 'train_brief.json', Publaynet.ROOT / 'train'
|
175
|
-
|
176
|
-
|
177
|
-
____register = """
|
178
|
-
数据集注册器
|
179
|
-
"""
|
180
|
-
|
181
|
-
|
182
|
-
def register_d2dataset(name, *, error=None):
|
183
|
-
""" 注册到 detectron2 的MetadataCatalog、DatasetCatalog中
|
184
|
-
|
185
|
-
:param name: 数据集名称
|
186
|
-
:return:
|
187
|
-
"""
|
188
|
-
from detectron2.data import MetadataCatalog
|
189
|
-
from detectron2.data.datasets import register_coco_instances
|
190
|
-
|
191
|
-
if name in MetadataCatalog.keys():
|
192
|
-
# 已有的数据,就不用注册了
|
193
|
-
pass
|
194
|
-
elif name in COCO_INSTANCES_REGISTRY:
|
195
|
-
register_coco_instances(name, *(COCO_INSTANCES_REGISTRY.get(name)()))
|
196
|
-
else:
|
197
|
-
if error == 'ignore':
|
198
|
-
pass
|
199
|
-
else:
|
200
|
-
raise ValueError(f'未预设的数据集名称 {name}')
|