pyxllib 0.3.96__py3-none-any.whl → 0.3.197__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyxllib/algo/geo.py +12 -0
- pyxllib/algo/intervals.py +1 -1
- pyxllib/algo/matcher.py +78 -0
- pyxllib/algo/pupil.py +187 -19
- pyxllib/algo/specialist.py +2 -1
- pyxllib/algo/stat.py +38 -2
- {pyxlpr → pyxllib/autogui}/__init__.py +1 -1
- pyxllib/autogui/activewin.py +246 -0
- pyxllib/autogui/all.py +9 -0
- pyxllib/{ext/autogui → autogui}/autogui.py +40 -11
- pyxllib/autogui/uiautolib.py +362 -0
- pyxllib/autogui/wechat.py +827 -0
- pyxllib/autogui/wechat_msg.py +421 -0
- pyxllib/autogui/wxautolib.py +84 -0
- pyxllib/cv/slidercaptcha.py +137 -0
- pyxllib/data/echarts.py +123 -12
- pyxllib/data/jsonlib.py +89 -0
- pyxllib/data/pglib.py +514 -30
- pyxllib/data/sqlite.py +231 -4
- pyxllib/ext/JLineViewer.py +14 -1
- pyxllib/ext/drissionlib.py +277 -0
- pyxllib/ext/kq5034lib.py +0 -1594
- pyxllib/ext/robustprocfile.py +497 -0
- pyxllib/ext/unixlib.py +6 -5
- pyxllib/ext/utools.py +108 -95
- pyxllib/ext/webhook.py +32 -14
- pyxllib/ext/wjxlib.py +88 -0
- pyxllib/ext/wpsapi.py +124 -0
- pyxllib/ext/xlwork.py +9 -0
- pyxllib/ext/yuquelib.py +1003 -71
- pyxllib/file/docxlib.py +1 -1
- pyxllib/file/libreoffice.py +165 -0
- pyxllib/file/movielib.py +9 -0
- pyxllib/file/packlib/__init__.py +112 -75
- pyxllib/file/pdflib.py +1 -1
- pyxllib/file/pupil.py +1 -1
- pyxllib/file/specialist/dirlib.py +1 -1
- pyxllib/file/specialist/download.py +10 -3
- pyxllib/file/specialist/filelib.py +266 -55
- pyxllib/file/xlsxlib.py +205 -50
- pyxllib/file/xlsyncfile.py +341 -0
- pyxllib/prog/cachetools.py +64 -0
- pyxllib/prog/filelock.py +42 -0
- pyxllib/prog/multiprogs.py +940 -0
- pyxllib/prog/newbie.py +9 -2
- pyxllib/prog/pupil.py +129 -60
- pyxllib/prog/specialist/__init__.py +176 -2
- pyxllib/prog/specialist/bc.py +5 -2
- pyxllib/prog/specialist/browser.py +11 -2
- pyxllib/prog/specialist/datetime.py +68 -0
- pyxllib/prog/specialist/tictoc.py +12 -13
- pyxllib/prog/specialist/xllog.py +5 -5
- pyxllib/prog/xlosenv.py +7 -0
- pyxllib/text/airscript.js +744 -0
- pyxllib/text/charclasslib.py +17 -5
- pyxllib/text/jiebalib.py +6 -3
- pyxllib/text/jinjalib.py +32 -0
- pyxllib/text/jsa_ai_prompt.md +271 -0
- pyxllib/text/jscode.py +159 -4
- pyxllib/text/nestenv.py +1 -1
- pyxllib/text/newbie.py +12 -0
- pyxllib/text/pupil/common.py +26 -0
- pyxllib/text/specialist/ptag.py +2 -2
- pyxllib/text/templates/echart_base.html +11 -0
- pyxllib/text/templates/highlight_code.html +17 -0
- pyxllib/text/templates/latex_editor.html +103 -0
- pyxllib/text/xmllib.py +76 -14
- pyxllib/xl.py +2 -1
- pyxllib-0.3.197.dist-info/METADATA +48 -0
- pyxllib-0.3.197.dist-info/RECORD +126 -0
- {pyxllib-0.3.96.dist-info → pyxllib-0.3.197.dist-info}/WHEEL +1 -2
- pyxllib/ext/autogui/__init__.py +0 -8
- pyxllib-0.3.96.dist-info/METADATA +0 -51
- pyxllib-0.3.96.dist-info/RECORD +0 -333
- pyxllib-0.3.96.dist-info/top_level.txt +0 -2
- pyxlpr/ai/__init__.py +0 -5
- pyxlpr/ai/clientlib.py +0 -1281
- pyxlpr/ai/specialist.py +0 -286
- pyxlpr/ai/torch_app.py +0 -172
- pyxlpr/ai/xlpaddle.py +0 -655
- pyxlpr/ai/xltorch.py +0 -705
- pyxlpr/data/__init__.py +0 -11
- pyxlpr/data/coco.py +0 -1325
- pyxlpr/data/datacls.py +0 -365
- pyxlpr/data/datasets.py +0 -200
- pyxlpr/data/gptlib.py +0 -1291
- pyxlpr/data/icdar/__init__.py +0 -96
- pyxlpr/data/icdar/deteval.py +0 -377
- pyxlpr/data/icdar/icdar2013.py +0 -341
- pyxlpr/data/icdar/iou.py +0 -340
- pyxlpr/data/icdar/rrc_evaluation_funcs_1_1.py +0 -463
- pyxlpr/data/imtextline.py +0 -473
- pyxlpr/data/labelme.py +0 -866
- pyxlpr/data/removeline.py +0 -179
- pyxlpr/data/specialist.py +0 -57
- pyxlpr/eval/__init__.py +0 -85
- pyxlpr/paddleocr.py +0 -776
- pyxlpr/ppocr/__init__.py +0 -15
- pyxlpr/ppocr/configs/rec/multi_language/generate_multi_language_configs.py +0 -226
- pyxlpr/ppocr/data/__init__.py +0 -135
- pyxlpr/ppocr/data/imaug/ColorJitter.py +0 -26
- pyxlpr/ppocr/data/imaug/__init__.py +0 -67
- pyxlpr/ppocr/data/imaug/copy_paste.py +0 -170
- pyxlpr/ppocr/data/imaug/east_process.py +0 -437
- pyxlpr/ppocr/data/imaug/gen_table_mask.py +0 -244
- pyxlpr/ppocr/data/imaug/iaa_augment.py +0 -114
- pyxlpr/ppocr/data/imaug/label_ops.py +0 -789
- pyxlpr/ppocr/data/imaug/make_border_map.py +0 -184
- pyxlpr/ppocr/data/imaug/make_pse_gt.py +0 -106
- pyxlpr/ppocr/data/imaug/make_shrink_map.py +0 -126
- pyxlpr/ppocr/data/imaug/operators.py +0 -433
- pyxlpr/ppocr/data/imaug/pg_process.py +0 -906
- pyxlpr/ppocr/data/imaug/randaugment.py +0 -143
- pyxlpr/ppocr/data/imaug/random_crop_data.py +0 -239
- pyxlpr/ppocr/data/imaug/rec_img_aug.py +0 -533
- pyxlpr/ppocr/data/imaug/sast_process.py +0 -777
- pyxlpr/ppocr/data/imaug/text_image_aug/__init__.py +0 -17
- pyxlpr/ppocr/data/imaug/text_image_aug/augment.py +0 -120
- pyxlpr/ppocr/data/imaug/text_image_aug/warp_mls.py +0 -168
- pyxlpr/ppocr/data/lmdb_dataset.py +0 -115
- pyxlpr/ppocr/data/pgnet_dataset.py +0 -104
- pyxlpr/ppocr/data/pubtab_dataset.py +0 -107
- pyxlpr/ppocr/data/simple_dataset.py +0 -372
- pyxlpr/ppocr/losses/__init__.py +0 -61
- pyxlpr/ppocr/losses/ace_loss.py +0 -52
- pyxlpr/ppocr/losses/basic_loss.py +0 -135
- pyxlpr/ppocr/losses/center_loss.py +0 -88
- pyxlpr/ppocr/losses/cls_loss.py +0 -30
- pyxlpr/ppocr/losses/combined_loss.py +0 -67
- pyxlpr/ppocr/losses/det_basic_loss.py +0 -208
- pyxlpr/ppocr/losses/det_db_loss.py +0 -80
- pyxlpr/ppocr/losses/det_east_loss.py +0 -63
- pyxlpr/ppocr/losses/det_pse_loss.py +0 -149
- pyxlpr/ppocr/losses/det_sast_loss.py +0 -121
- pyxlpr/ppocr/losses/distillation_loss.py +0 -272
- pyxlpr/ppocr/losses/e2e_pg_loss.py +0 -140
- pyxlpr/ppocr/losses/kie_sdmgr_loss.py +0 -113
- pyxlpr/ppocr/losses/rec_aster_loss.py +0 -99
- pyxlpr/ppocr/losses/rec_att_loss.py +0 -39
- pyxlpr/ppocr/losses/rec_ctc_loss.py +0 -44
- pyxlpr/ppocr/losses/rec_enhanced_ctc_loss.py +0 -70
- pyxlpr/ppocr/losses/rec_nrtr_loss.py +0 -30
- pyxlpr/ppocr/losses/rec_sar_loss.py +0 -28
- pyxlpr/ppocr/losses/rec_srn_loss.py +0 -47
- pyxlpr/ppocr/losses/table_att_loss.py +0 -109
- pyxlpr/ppocr/metrics/__init__.py +0 -44
- pyxlpr/ppocr/metrics/cls_metric.py +0 -45
- pyxlpr/ppocr/metrics/det_metric.py +0 -82
- pyxlpr/ppocr/metrics/distillation_metric.py +0 -73
- pyxlpr/ppocr/metrics/e2e_metric.py +0 -86
- pyxlpr/ppocr/metrics/eval_det_iou.py +0 -274
- pyxlpr/ppocr/metrics/kie_metric.py +0 -70
- pyxlpr/ppocr/metrics/rec_metric.py +0 -75
- pyxlpr/ppocr/metrics/table_metric.py +0 -50
- pyxlpr/ppocr/modeling/architectures/__init__.py +0 -32
- pyxlpr/ppocr/modeling/architectures/base_model.py +0 -88
- pyxlpr/ppocr/modeling/architectures/distillation_model.py +0 -60
- pyxlpr/ppocr/modeling/backbones/__init__.py +0 -54
- pyxlpr/ppocr/modeling/backbones/det_mobilenet_v3.py +0 -268
- pyxlpr/ppocr/modeling/backbones/det_resnet_vd.py +0 -246
- pyxlpr/ppocr/modeling/backbones/det_resnet_vd_sast.py +0 -285
- pyxlpr/ppocr/modeling/backbones/e2e_resnet_vd_pg.py +0 -265
- pyxlpr/ppocr/modeling/backbones/kie_unet_sdmgr.py +0 -186
- pyxlpr/ppocr/modeling/backbones/rec_mobilenet_v3.py +0 -138
- pyxlpr/ppocr/modeling/backbones/rec_mv1_enhance.py +0 -258
- pyxlpr/ppocr/modeling/backbones/rec_nrtr_mtb.py +0 -48
- pyxlpr/ppocr/modeling/backbones/rec_resnet_31.py +0 -210
- pyxlpr/ppocr/modeling/backbones/rec_resnet_aster.py +0 -143
- pyxlpr/ppocr/modeling/backbones/rec_resnet_fpn.py +0 -307
- pyxlpr/ppocr/modeling/backbones/rec_resnet_vd.py +0 -286
- pyxlpr/ppocr/modeling/heads/__init__.py +0 -54
- pyxlpr/ppocr/modeling/heads/cls_head.py +0 -52
- pyxlpr/ppocr/modeling/heads/det_db_head.py +0 -118
- pyxlpr/ppocr/modeling/heads/det_east_head.py +0 -121
- pyxlpr/ppocr/modeling/heads/det_pse_head.py +0 -37
- pyxlpr/ppocr/modeling/heads/det_sast_head.py +0 -128
- pyxlpr/ppocr/modeling/heads/e2e_pg_head.py +0 -253
- pyxlpr/ppocr/modeling/heads/kie_sdmgr_head.py +0 -206
- pyxlpr/ppocr/modeling/heads/multiheadAttention.py +0 -163
- pyxlpr/ppocr/modeling/heads/rec_aster_head.py +0 -393
- pyxlpr/ppocr/modeling/heads/rec_att_head.py +0 -202
- pyxlpr/ppocr/modeling/heads/rec_ctc_head.py +0 -88
- pyxlpr/ppocr/modeling/heads/rec_nrtr_head.py +0 -826
- pyxlpr/ppocr/modeling/heads/rec_sar_head.py +0 -402
- pyxlpr/ppocr/modeling/heads/rec_srn_head.py +0 -280
- pyxlpr/ppocr/modeling/heads/self_attention.py +0 -406
- pyxlpr/ppocr/modeling/heads/table_att_head.py +0 -246
- pyxlpr/ppocr/modeling/necks/__init__.py +0 -32
- pyxlpr/ppocr/modeling/necks/db_fpn.py +0 -111
- pyxlpr/ppocr/modeling/necks/east_fpn.py +0 -188
- pyxlpr/ppocr/modeling/necks/fpn.py +0 -138
- pyxlpr/ppocr/modeling/necks/pg_fpn.py +0 -314
- pyxlpr/ppocr/modeling/necks/rnn.py +0 -92
- pyxlpr/ppocr/modeling/necks/sast_fpn.py +0 -284
- pyxlpr/ppocr/modeling/necks/table_fpn.py +0 -110
- pyxlpr/ppocr/modeling/transforms/__init__.py +0 -28
- pyxlpr/ppocr/modeling/transforms/stn.py +0 -135
- pyxlpr/ppocr/modeling/transforms/tps.py +0 -308
- pyxlpr/ppocr/modeling/transforms/tps_spatial_transformer.py +0 -156
- pyxlpr/ppocr/optimizer/__init__.py +0 -61
- pyxlpr/ppocr/optimizer/learning_rate.py +0 -228
- pyxlpr/ppocr/optimizer/lr_scheduler.py +0 -49
- pyxlpr/ppocr/optimizer/optimizer.py +0 -160
- pyxlpr/ppocr/optimizer/regularizer.py +0 -52
- pyxlpr/ppocr/postprocess/__init__.py +0 -55
- pyxlpr/ppocr/postprocess/cls_postprocess.py +0 -33
- pyxlpr/ppocr/postprocess/db_postprocess.py +0 -234
- pyxlpr/ppocr/postprocess/east_postprocess.py +0 -143
- pyxlpr/ppocr/postprocess/locality_aware_nms.py +0 -200
- pyxlpr/ppocr/postprocess/pg_postprocess.py +0 -52
- pyxlpr/ppocr/postprocess/pse_postprocess/__init__.py +0 -15
- pyxlpr/ppocr/postprocess/pse_postprocess/pse/__init__.py +0 -29
- pyxlpr/ppocr/postprocess/pse_postprocess/pse/setup.py +0 -14
- pyxlpr/ppocr/postprocess/pse_postprocess/pse_postprocess.py +0 -118
- pyxlpr/ppocr/postprocess/rec_postprocess.py +0 -654
- pyxlpr/ppocr/postprocess/sast_postprocess.py +0 -355
- pyxlpr/ppocr/tools/__init__.py +0 -14
- pyxlpr/ppocr/tools/eval.py +0 -83
- pyxlpr/ppocr/tools/export_center.py +0 -77
- pyxlpr/ppocr/tools/export_model.py +0 -129
- pyxlpr/ppocr/tools/infer/predict_cls.py +0 -151
- pyxlpr/ppocr/tools/infer/predict_det.py +0 -300
- pyxlpr/ppocr/tools/infer/predict_e2e.py +0 -169
- pyxlpr/ppocr/tools/infer/predict_rec.py +0 -414
- pyxlpr/ppocr/tools/infer/predict_system.py +0 -204
- pyxlpr/ppocr/tools/infer/utility.py +0 -629
- pyxlpr/ppocr/tools/infer_cls.py +0 -83
- pyxlpr/ppocr/tools/infer_det.py +0 -134
- pyxlpr/ppocr/tools/infer_e2e.py +0 -122
- pyxlpr/ppocr/tools/infer_kie.py +0 -153
- pyxlpr/ppocr/tools/infer_rec.py +0 -146
- pyxlpr/ppocr/tools/infer_table.py +0 -107
- pyxlpr/ppocr/tools/program.py +0 -596
- pyxlpr/ppocr/tools/test_hubserving.py +0 -117
- pyxlpr/ppocr/tools/train.py +0 -163
- pyxlpr/ppocr/tools/xlprog.py +0 -748
- pyxlpr/ppocr/utils/EN_symbol_dict.txt +0 -94
- pyxlpr/ppocr/utils/__init__.py +0 -24
- pyxlpr/ppocr/utils/dict/ar_dict.txt +0 -117
- pyxlpr/ppocr/utils/dict/arabic_dict.txt +0 -162
- pyxlpr/ppocr/utils/dict/be_dict.txt +0 -145
- pyxlpr/ppocr/utils/dict/bg_dict.txt +0 -140
- pyxlpr/ppocr/utils/dict/chinese_cht_dict.txt +0 -8421
- pyxlpr/ppocr/utils/dict/cyrillic_dict.txt +0 -163
- pyxlpr/ppocr/utils/dict/devanagari_dict.txt +0 -167
- pyxlpr/ppocr/utils/dict/en_dict.txt +0 -63
- pyxlpr/ppocr/utils/dict/fa_dict.txt +0 -136
- pyxlpr/ppocr/utils/dict/french_dict.txt +0 -136
- pyxlpr/ppocr/utils/dict/german_dict.txt +0 -143
- pyxlpr/ppocr/utils/dict/hi_dict.txt +0 -162
- pyxlpr/ppocr/utils/dict/it_dict.txt +0 -118
- pyxlpr/ppocr/utils/dict/japan_dict.txt +0 -4399
- pyxlpr/ppocr/utils/dict/ka_dict.txt +0 -153
- pyxlpr/ppocr/utils/dict/korean_dict.txt +0 -3688
- pyxlpr/ppocr/utils/dict/latin_dict.txt +0 -185
- pyxlpr/ppocr/utils/dict/mr_dict.txt +0 -153
- pyxlpr/ppocr/utils/dict/ne_dict.txt +0 -153
- pyxlpr/ppocr/utils/dict/oc_dict.txt +0 -96
- pyxlpr/ppocr/utils/dict/pu_dict.txt +0 -130
- pyxlpr/ppocr/utils/dict/rs_dict.txt +0 -91
- pyxlpr/ppocr/utils/dict/rsc_dict.txt +0 -134
- pyxlpr/ppocr/utils/dict/ru_dict.txt +0 -125
- pyxlpr/ppocr/utils/dict/ta_dict.txt +0 -128
- pyxlpr/ppocr/utils/dict/table_dict.txt +0 -277
- pyxlpr/ppocr/utils/dict/table_structure_dict.txt +0 -2759
- pyxlpr/ppocr/utils/dict/te_dict.txt +0 -151
- pyxlpr/ppocr/utils/dict/ug_dict.txt +0 -114
- pyxlpr/ppocr/utils/dict/uk_dict.txt +0 -142
- pyxlpr/ppocr/utils/dict/ur_dict.txt +0 -137
- pyxlpr/ppocr/utils/dict/xi_dict.txt +0 -110
- pyxlpr/ppocr/utils/dict90.txt +0 -90
- pyxlpr/ppocr/utils/e2e_metric/Deteval.py +0 -574
- pyxlpr/ppocr/utils/e2e_metric/polygon_fast.py +0 -83
- pyxlpr/ppocr/utils/e2e_utils/extract_batchsize.py +0 -87
- pyxlpr/ppocr/utils/e2e_utils/extract_textpoint_fast.py +0 -457
- pyxlpr/ppocr/utils/e2e_utils/extract_textpoint_slow.py +0 -592
- pyxlpr/ppocr/utils/e2e_utils/pgnet_pp_utils.py +0 -162
- pyxlpr/ppocr/utils/e2e_utils/visual.py +0 -162
- pyxlpr/ppocr/utils/en_dict.txt +0 -95
- pyxlpr/ppocr/utils/gen_label.py +0 -81
- pyxlpr/ppocr/utils/ic15_dict.txt +0 -36
- pyxlpr/ppocr/utils/iou.py +0 -54
- pyxlpr/ppocr/utils/logging.py +0 -69
- pyxlpr/ppocr/utils/network.py +0 -84
- pyxlpr/ppocr/utils/ppocr_keys_v1.txt +0 -6623
- pyxlpr/ppocr/utils/profiler.py +0 -110
- pyxlpr/ppocr/utils/save_load.py +0 -150
- pyxlpr/ppocr/utils/stats.py +0 -72
- pyxlpr/ppocr/utils/utility.py +0 -80
- pyxlpr/ppstructure/__init__.py +0 -13
- pyxlpr/ppstructure/predict_system.py +0 -187
- pyxlpr/ppstructure/table/__init__.py +0 -13
- pyxlpr/ppstructure/table/eval_table.py +0 -72
- pyxlpr/ppstructure/table/matcher.py +0 -192
- pyxlpr/ppstructure/table/predict_structure.py +0 -136
- pyxlpr/ppstructure/table/predict_table.py +0 -221
- pyxlpr/ppstructure/table/table_metric/__init__.py +0 -16
- pyxlpr/ppstructure/table/table_metric/parallel.py +0 -51
- pyxlpr/ppstructure/table/table_metric/table_metric.py +0 -247
- pyxlpr/ppstructure/table/tablepyxl/__init__.py +0 -13
- pyxlpr/ppstructure/table/tablepyxl/style.py +0 -283
- pyxlpr/ppstructure/table/tablepyxl/tablepyxl.py +0 -118
- pyxlpr/ppstructure/utility.py +0 -71
- pyxlpr/xlai.py +0 -10
- /pyxllib/{ext/autogui → autogui}/virtualkey.py +0 -0
- {pyxllib-0.3.96.dist-info → pyxllib-0.3.197.dist-info/licenses}/LICENSE +0 -0
--- pyxlpr/ppocr/data/imaug/text_image_aug/__init__.py
+++ /dev/null
@@ -1,17 +0,0 @@
-# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from .augment import tia_perspective, tia_distort, tia_stretch
-
-__all__ = ['tia_distort', 'tia_stretch', 'tia_perspective']
--- pyxlpr/ppocr/data/imaug/text_image_aug/augment.py
+++ /dev/null
@@ -1,120 +0,0 @@
-# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-This code is refer from:
-https://github.com/RubanSeven/Text-Image-Augmentation-python/blob/master/augment.py
-"""
-
-import numpy as np
-from .warp_mls import WarpMLS
-
-
-def tia_distort(src, segment=4):
-    img_h, img_w = src.shape[:2]
-
-    cut = img_w // segment
-    thresh = cut // 3
-
-    src_pts = list()
-    dst_pts = list()
-
-    src_pts.append([0, 0])
-    src_pts.append([img_w, 0])
-    src_pts.append([img_w, img_h])
-    src_pts.append([0, img_h])
-
-    dst_pts.append([np.random.randint(thresh), np.random.randint(thresh)])
-    dst_pts.append(
-        [img_w - np.random.randint(thresh), np.random.randint(thresh)])
-    dst_pts.append(
-        [img_w - np.random.randint(thresh), img_h - np.random.randint(thresh)])
-    dst_pts.append(
-        [np.random.randint(thresh), img_h - np.random.randint(thresh)])
-
-    half_thresh = thresh * 0.5
-
-    for cut_idx in np.arange(1, segment, 1):
-        src_pts.append([cut * cut_idx, 0])
-        src_pts.append([cut * cut_idx, img_h])
-        dst_pts.append([
-            cut * cut_idx + np.random.randint(thresh) - half_thresh,
-            np.random.randint(thresh) - half_thresh
-        ])
-        dst_pts.append([
-            cut * cut_idx + np.random.randint(thresh) - half_thresh,
-            img_h + np.random.randint(thresh) - half_thresh
-        ])
-
-    trans = WarpMLS(src, src_pts, dst_pts, img_w, img_h)
-    dst = trans.generate()
-
-    return dst
-
-
-def tia_stretch(src, segment=4):
-    img_h, img_w = src.shape[:2]
-
-    cut = img_w // segment
-    thresh = cut * 4 // 5
-
-    src_pts = list()
-    dst_pts = list()
-
-    src_pts.append([0, 0])
-    src_pts.append([img_w, 0])
-    src_pts.append([img_w, img_h])
-    src_pts.append([0, img_h])
-
-    dst_pts.append([0, 0])
-    dst_pts.append([img_w, 0])
-    dst_pts.append([img_w, img_h])
-    dst_pts.append([0, img_h])
-
-    half_thresh = thresh * 0.5
-
-    for cut_idx in np.arange(1, segment, 1):
-        move = np.random.randint(thresh) - half_thresh
-        src_pts.append([cut * cut_idx, 0])
-        src_pts.append([cut * cut_idx, img_h])
-        dst_pts.append([cut * cut_idx + move, 0])
-        dst_pts.append([cut * cut_idx + move, img_h])
-
-    trans = WarpMLS(src, src_pts, dst_pts, img_w, img_h)
-    dst = trans.generate()
-
-    return dst
-
-
-def tia_perspective(src):
-    img_h, img_w = src.shape[:2]
-
-    thresh = img_h // 2
-
-    src_pts = list()
-    dst_pts = list()
-
-    src_pts.append([0, 0])
-    src_pts.append([img_w, 0])
-    src_pts.append([img_w, img_h])
-    src_pts.append([0, img_h])
-
-    dst_pts.append([0, np.random.randint(thresh)])
-    dst_pts.append([img_w, np.random.randint(thresh)])
-    dst_pts.append([img_w, img_h - np.random.randint(thresh)])
-    dst_pts.append([0, img_h - np.random.randint(thresh)])
-
-    trans = WarpMLS(src, src_pts, dst_pts, img_w, img_h)
-    dst = trans.generate()
-
-    return dst
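For reference, the removed `tia_*` helpers above are plain NumPy functions that operate on a cropped text-line image. A minimal sketch of how they were invoked, assuming pyxllib 0.3.96 (where the `pyxlpr` subpackage and its Paddle-side dependencies are still installed):

```python
import numpy as np
# Import path taken from the deleted files above; only valid on pyxllib <= 0.3.96.
from pyxlpr.ppocr.data.imaug.text_image_aug import tia_distort, tia_stretch, tia_perspective

# Dummy 48x320 text-line crop; real callers pass a decoded image of shape (H, W) or (H, W, 3).
src = np.full((48, 320), 255, dtype=np.uint8)

warped = tia_distort(src, segment=4)     # randomly jitters the corner and interior cut points
stretched = tia_stretch(src, segment=4)  # shifts the vertical cut lines horizontally
tilted = tia_perspective(src)            # randomly moves the top/bottom edges inward
print(warped.shape, stretched.shape, tilted.shape)
```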
--- pyxlpr/ppocr/data/imaug/text_image_aug/warp_mls.py
+++ /dev/null
@@ -1,168 +0,0 @@
-# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-This code is refer from:
-https://github.com/RubanSeven/Text-Image-Augmentation-python/blob/master/warp_mls.py
-"""
-
-import numpy as np
-
-
-class WarpMLS:
-    def __init__(self, src, src_pts, dst_pts, dst_w, dst_h, trans_ratio=1.):
-        self.src = src
-        self.src_pts = src_pts
-        self.dst_pts = dst_pts
-        self.pt_count = len(self.dst_pts)
-        self.dst_w = dst_w
-        self.dst_h = dst_h
-        self.trans_ratio = trans_ratio
-        self.grid_size = 100
-        self.rdx = np.zeros((self.dst_h, self.dst_w))
-        self.rdy = np.zeros((self.dst_h, self.dst_w))
-
-    @staticmethod
-    def __bilinear_interp(x, y, v11, v12, v21, v22):
-        return (v11 * (1 - y) + v12 * y) * (1 - x) + (v21 *
-                                                      (1 - y) + v22 * y) * x
-
-    def generate(self):
-        self.calc_delta()
-        return self.gen_img()
-
-    def calc_delta(self):
-        w = np.zeros(self.pt_count, dtype=np.float32)
-
-        if self.pt_count < 2:
-            return
-
-        i = 0
-        while 1:
-            if self.dst_w <= i < self.dst_w + self.grid_size - 1:
-                i = self.dst_w - 1
-            elif i >= self.dst_w:
-                break
-
-            j = 0
-            while 1:
-                if self.dst_h <= j < self.dst_h + self.grid_size - 1:
-                    j = self.dst_h - 1
-                elif j >= self.dst_h:
-                    break
-
-                sw = 0
-                swp = np.zeros(2, dtype=np.float32)
-                swq = np.zeros(2, dtype=np.float32)
-                new_pt = np.zeros(2, dtype=np.float32)
-                cur_pt = np.array([i, j], dtype=np.float32)
-
-                k = 0
-                for k in range(self.pt_count):
-                    if i == self.dst_pts[k][0] and j == self.dst_pts[k][1]:
-                        break
-
-                    w[k] = 1. / (
-                        (i - self.dst_pts[k][0]) * (i - self.dst_pts[k][0]) +
-                        (j - self.dst_pts[k][1]) * (j - self.dst_pts[k][1]))
-
-                    sw += w[k]
-                    swp = swp + w[k] * np.array(self.dst_pts[k])
-                    swq = swq + w[k] * np.array(self.src_pts[k])
-
-                if k == self.pt_count - 1:
-                    pstar = 1 / sw * swp
-                    qstar = 1 / sw * swq
-
-                    miu_s = 0
-                    for k in range(self.pt_count):
-                        if i == self.dst_pts[k][0] and j == self.dst_pts[k][1]:
-                            continue
-                        pt_i = self.dst_pts[k] - pstar
-                        miu_s += w[k] * np.sum(pt_i * pt_i)
-
-                    cur_pt -= pstar
-                    cur_pt_j = np.array([-cur_pt[1], cur_pt[0]])
-
-                    for k in range(self.pt_count):
-                        if i == self.dst_pts[k][0] and j == self.dst_pts[k][1]:
-                            continue
-
-                        pt_i = self.dst_pts[k] - pstar
-                        pt_j = np.array([-pt_i[1], pt_i[0]])
-
-                        tmp_pt = np.zeros(2, dtype=np.float32)
-                        tmp_pt[0] = np.sum(pt_i * cur_pt) * self.src_pts[k][0] - \
-                                    np.sum(pt_j * cur_pt) * self.src_pts[k][1]
-                        tmp_pt[1] = -np.sum(pt_i * cur_pt_j) * self.src_pts[k][0] + \
-                                    np.sum(pt_j * cur_pt_j) * self.src_pts[k][1]
-                        tmp_pt *= (w[k] / miu_s)
-                        new_pt += tmp_pt
-
-                    new_pt += qstar
-                else:
-                    new_pt = self.src_pts[k]
-
-                self.rdx[j, i] = new_pt[0] - i
-                self.rdy[j, i] = new_pt[1] - j
-
-                j += self.grid_size
-            i += self.grid_size
-
-    def gen_img(self):
-        src_h, src_w = self.src.shape[:2]
-        dst = np.zeros_like(self.src, dtype=np.float32)
-
-        for i in np.arange(0, self.dst_h, self.grid_size):
-            for j in np.arange(0, self.dst_w, self.grid_size):
-                ni = i + self.grid_size
-                nj = j + self.grid_size
-                w = h = self.grid_size
-                if ni >= self.dst_h:
-                    ni = self.dst_h - 1
-                    h = ni - i + 1
-                if nj >= self.dst_w:
-                    nj = self.dst_w - 1
-                    w = nj - j + 1
-
-                di = np.reshape(np.arange(h), (-1, 1))
-                dj = np.reshape(np.arange(w), (1, -1))
-                delta_x = self.__bilinear_interp(
-                    di / h, dj / w, self.rdx[i, j], self.rdx[i, nj],
-                    self.rdx[ni, j], self.rdx[ni, nj])
-                delta_y = self.__bilinear_interp(
-                    di / h, dj / w, self.rdy[i, j], self.rdy[i, nj],
-                    self.rdy[ni, j], self.rdy[ni, nj])
-                nx = j + dj + delta_x * self.trans_ratio
-                ny = i + di + delta_y * self.trans_ratio
-                nx = np.clip(nx, 0, src_w - 1)
-                ny = np.clip(ny, 0, src_h - 1)
-                nxi = np.array(np.floor(nx), dtype=np.int32)
-                nyi = np.array(np.floor(ny), dtype=np.int32)
-                nxi1 = np.array(np.ceil(nx), dtype=np.int32)
-                nyi1 = np.array(np.ceil(ny), dtype=np.int32)
-
-                if len(self.src.shape) == 3:
-                    x = np.tile(np.expand_dims(ny - nyi, axis=-1), (1, 1, 3))
-                    y = np.tile(np.expand_dims(nx - nxi, axis=-1), (1, 1, 3))
-                else:
-                    x = ny - nyi
-                    y = nx - nxi
-                dst[i:i + h, j:j + w] = self.__bilinear_interp(
-                    x, y, self.src[nyi, nxi], self.src[nyi, nxi1],
-                    self.src[nyi1, nxi], self.src[nyi1, nxi1])
-
-        dst = np.clip(dst, 0, 255)
-        dst = np.array(dst, dtype=np.uint8)
-
-        return dst
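The `WarpMLS` class above is a moving-least-squares image warp: given paired (x, y) control points, it interpolates a displacement field on a coarse grid (`grid_size = 100`) and resamples the source bilinearly. A hedged sketch of calling it directly, again assuming the 0.3.96 layout is still installed:

```python
import numpy as np
from pyxlpr.ppocr.data.imaug.text_image_aug.warp_mls import WarpMLS  # 0.3.96 path

h, w = 48, 320
src = np.zeros((h, w, 3), dtype=np.uint8)

# Control points are (x, y) pairs; here the four corners are nudged inward slightly.
src_pts = [[0, 0], [w, 0], [w, h], [0, h]]
dst_pts = [[5, 3], [w - 4, 2], [w - 6, h - 5], [3, h - 2]]

dst = WarpMLS(src, src_pts, dst_pts, w, h, trans_ratio=1.).generate()
print(dst.shape, dst.dtype)  # (48, 320, 3) uint8
```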
--- pyxlpr/ppocr/data/lmdb_dataset.py
+++ /dev/null
@@ -1,115 +0,0 @@
-# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import numpy as np
-import os
-from paddle.io import Dataset
-import lmdb
-import cv2
-
-from .imaug import transform, create_operators
-
-
-class LMDBDataSet(Dataset):
-    def __init__(self, config, mode, logger, seed=None):
-        super(LMDBDataSet, self).__init__()
-
-        global_config = config['Global']
-        dataset_config = config[mode]['dataset']
-        loader_config = config[mode]['loader']
-        batch_size = loader_config['batch_size_per_card']
-        data_dir = dataset_config['data_dir']
-        self.do_shuffle = loader_config['shuffle']
-
-        self.lmdb_sets = self.load_hierarchical_lmdb_dataset(data_dir)
-        logger.info("Initialize indexs of datasets:%s" % data_dir)
-        self.data_idx_order_list = self.dataset_traversal()
-        if self.do_shuffle:
-            np.random.shuffle(self.data_idx_order_list)
-        self.ops = create_operators(dataset_config['transforms'], global_config)
-
-    def load_hierarchical_lmdb_dataset(self, data_dir):
-        lmdb_sets = {}
-        dataset_idx = 0
-        for dirpath, dirnames, filenames in os.walk(data_dir + '/'):
-            if not dirnames:
-                env = lmdb.open(
-                    dirpath,
-                    max_readers=32,
-                    readonly=True,
-                    lock=False,
-                    readahead=False,
-                    meminit=False)
-                txn = env.begin(write=False)
-                num_samples = int(txn.get('num-samples'.encode()))
-                lmdb_sets[dataset_idx] = {"dirpath":dirpath, "env":env, \
-                    "txn":txn, "num_samples":num_samples}
-                dataset_idx += 1
-        return lmdb_sets
-
-    def dataset_traversal(self):
-        lmdb_num = len(self.lmdb_sets)
-        total_sample_num = 0
-        for lno in range(lmdb_num):
-            total_sample_num += self.lmdb_sets[lno]['num_samples']
-        data_idx_order_list = np.zeros((total_sample_num, 2))
-        beg_idx = 0
-        for lno in range(lmdb_num):
-            tmp_sample_num = self.lmdb_sets[lno]['num_samples']
-            end_idx = beg_idx + tmp_sample_num
-            data_idx_order_list[beg_idx:end_idx, 0] = lno
-            data_idx_order_list[beg_idx:end_idx, 1] \
-                = list(range(tmp_sample_num))
-            data_idx_order_list[beg_idx:end_idx, 1] += 1
-            beg_idx = beg_idx + tmp_sample_num
-        return data_idx_order_list
-
-    def get_img_data(self, value):
-        """get_img_data"""
-        if not value:
-            return None
-        imgdata = np.frombuffer(value, dtype='uint8')
-        if imgdata is None:
-            return None
-        imgori = cv2.imdecode(imgdata, 1)
-        if imgori is None:
-            return None
-        return imgori
-
-    def get_lmdb_sample_info(self, txn, index):
-        label_key = 'label-%09d'.encode() % index
-        label = txn.get(label_key)
-        if label is None:
-            return None
-        label = label.decode('utf-8')
-        img_key = 'image-%09d'.encode() % index
-        imgbuf = txn.get(img_key)
-        return imgbuf, label
-
-    def __getitem__(self, idx):
-        lmdb_idx, file_idx = self.data_idx_order_list[idx]
-        lmdb_idx = int(lmdb_idx)
-        file_idx = int(file_idx)
-        sample_info = self.get_lmdb_sample_info(self.lmdb_sets[lmdb_idx]['txn'],
-                                                file_idx)
-        if sample_info is None:
-            return self.__getitem__(np.random.randint(self.__len__()))
-        img, label = sample_info
-        data = {'image': img, 'label': label}
-        outs = transform(data, self.ops)
-        if outs is None:
-            return self.__getitem__(np.random.randint(self.__len__()))
-        return outs
-
-    def __len__(self):
-        return self.data_idx_order_list.shape[0]
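The removed `LMDBDataSet.__init__` above only reads a handful of keys from the PaddleOCR-style config dict it is given. A sketch of the minimal config shape it expects; key names are taken from the code, while the path, transform list, and the "Train" mode key are placeholders:

```python
# Minimal config consumed by LMDBDataSet.__init__ above; values are illustrative only.
config = {
    "Global": {},                                 # forwarded to create_operators()
    "Train": {                                    # any key matching the `mode` argument
        "dataset": {
            "data_dir": "./train_data/rec_lmdb",  # walked for leaf dirs containing LMDB envs
            "transforms": [],                     # operator list for create_operators()
        },
        "loader": {
            "shuffle": True,                      # shuffles data_idx_order_list
            "batch_size_per_card": 256,           # read here but not used further in this class
        },
    },
}
# dataset = LMDBDataSet(config, mode="Train", logger=logger)  # needs paddle, lmdb, cv2
```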
--- pyxlpr/ppocr/data/pgnet_dataset.py
+++ /dev/null
@@ -1,104 +0,0 @@
-# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import numpy as np
-import os
-from paddle.io import Dataset
-from .imaug import transform, create_operators
-import random
-
-
-class PGDataSet(Dataset):
-    def __init__(self, config, mode, logger, seed=None):
-        super(PGDataSet, self).__init__()
-
-        self.logger = logger
-        self.seed = seed
-        self.mode = mode
-        global_config = config['Global']
-        dataset_config = config[mode]['dataset']
-        loader_config = config[mode]['loader']
-
-        self.delimiter = dataset_config.get('delimiter', '\t')
-        label_file_list = dataset_config.pop('label_file_list')
-        data_source_num = len(label_file_list)
-        ratio_list = dataset_config.get("ratio_list", [1.0])
-        if isinstance(ratio_list, (float, int)):
-            ratio_list = [float(ratio_list)] * int(data_source_num)
-        assert len(
-            ratio_list
-        ) == data_source_num, "The length of ratio_list should be the same as the file_list."
-        self.data_dir = dataset_config['data_dir']
-        self.do_shuffle = loader_config['shuffle']
-
-        logger.info("Initialize indexs of datasets:%s" % label_file_list)
-        self.data_lines = self.get_image_info_list(label_file_list, ratio_list)
-        self.data_idx_order_list = list(range(len(self.data_lines)))
-        if mode.lower() == "train":
-            self.shuffle_data_random()
-
-        self.ops = create_operators(dataset_config['transforms'], global_config)
-
-    def shuffle_data_random(self):
-        if self.do_shuffle:
-            random.seed(self.seed)
-            random.shuffle(self.data_lines)
-        return
-
-    def get_image_info_list(self, file_list, ratio_list):
-        if isinstance(file_list, str):
-            file_list = [file_list]
-        data_lines = []
-        for idx, file in enumerate(file_list):
-            with open(file, "rb") as f:
-                lines = f.readlines()
-                if self.mode == "train" or ratio_list[idx] < 1.0:
-                    random.seed(self.seed)
-                    lines = random.sample(lines,
-                                          round(len(lines) * ratio_list[idx]))
-                data_lines.extend(lines)
-        return data_lines
-
-    def __getitem__(self, idx):
-        file_idx = self.data_idx_order_list[idx]
-        data_line = self.data_lines[file_idx]
-        img_id = 0
-        try:
-            data_line = data_line.decode('utf-8')
-            substr = data_line.strip("\n").split(self.delimiter)
-            file_name = substr[0]
-            label = substr[1]
-            img_path = os.path.join(self.data_dir, file_name)
-            if self.mode.lower() == 'eval':
-                try:
-                    img_id = int(data_line.split(".")[0][7:])
-                except:
-                    img_id = 0
-            data = {'img_path': img_path, 'label': label, 'img_id': img_id}
-            if not os.path.exists(img_path):
-                raise Exception("{} does not exist!".format(img_path))
-            with open(data['img_path'], 'rb') as f:
-                img = f.read()
-                data['image'] = img
-            outs = transform(data, self.ops)
-        except Exception as e:
-            self.logger.error(
-                "When parsing line {}, error happened with msg: {}".format(
-                    self.data_idx_order_list[idx], e))
-            outs = None
-        if outs is None:
-            return self.__getitem__(np.random.randint(self.__len__()))
-        return outs
-
-    def __len__(self):
-        return len(self.data_idx_order_list)
--- pyxlpr/ppocr/data/pubtab_dataset.py
+++ /dev/null
@@ -1,107 +0,0 @@
-# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import numpy as np
-import os
-import random
-from paddle.io import Dataset
-import json
-
-from .imaug import transform, create_operators
-
-
-class PubTabDataSet(Dataset):
-    def __init__(self, config, mode, logger, seed=None):
-        super(PubTabDataSet, self).__init__()
-        self.logger = logger
-
-        global_config = config['Global']
-        dataset_config = config[mode]['dataset']
-        loader_config = config[mode]['loader']
-
-        label_file_path = dataset_config.pop('label_file_path')
-
-        self.data_dir = dataset_config['data_dir']
-        self.do_shuffle = loader_config['shuffle']
-        self.do_hard_select = False
-        if 'hard_select' in loader_config:
-            self.do_hard_select = loader_config['hard_select']
-            self.hard_prob = loader_config['hard_prob']
-        if self.do_hard_select:
-            self.img_select_prob = self.load_hard_select_prob()
-        self.table_select_type = None
-        if 'table_select_type' in loader_config:
-            self.table_select_type = loader_config['table_select_type']
-            self.table_select_prob = loader_config['table_select_prob']
-
-        self.seed = seed
-        logger.info("Initialize indexs of datasets:%s" % label_file_path)
-        with open(label_file_path, "rb") as f:
-            self.data_lines = f.readlines()
-        self.data_idx_order_list = list(range(len(self.data_lines)))
-        if mode.lower() == "train":
-            self.shuffle_data_random()
-        self.ops = create_operators(dataset_config['transforms'], global_config)
-
-    def shuffle_data_random(self):
-        if self.do_shuffle:
-            random.seed(self.seed)
-            random.shuffle(self.data_lines)
-        return
-
-    def __getitem__(self, idx):
-        try:
-            data_line = self.data_lines[idx]
-            data_line = data_line.decode('utf-8').strip("\n")
-            info = json.loads(data_line)
-            file_name = info['filename']
-            select_flag = True
-            if self.do_hard_select:
-                prob = self.img_select_prob[file_name]
-                if prob < random.uniform(0, 1):
-                    select_flag = False
-
-            if self.table_select_type:
-                structure = info['html']['structure']['tokens'].copy()
-                structure_str = ''.join(structure)
-                table_type = "simple"
-                if 'colspan' in structure_str or 'rowspan' in structure_str:
-                    table_type = "complex"
-                if table_type == "complex":
-                    if self.table_select_prob < random.uniform(0, 1):
-                        select_flag = False
-
-            if select_flag:
-                cells = info['html']['cells'].copy()
-                structure = info['html']['structure'].copy()
-                img_path = os.path.join(self.data_dir, file_name)
-                data = {'img_path': img_path, 'cells': cells, 'structure':structure}
-                if not os.path.exists(img_path):
-                    raise Exception("{} does not exist!".format(img_path))
-                with open(data['img_path'], 'rb') as f:
-                    img = f.read()
-                    data['image'] = img
-                outs = transform(data, self.ops)
-            else:
-                outs = None
-        except Exception as e:
-            self.logger.error(
-                "When parsing line {}, error happened with msg: {}".format(
-                    data_line, e))
-            outs = None
-        if outs is None:
-            return self.__getitem__(np.random.randint(self.__len__()))
-        return outs
-
-    def __len__(self):
-        return len(self.data_idx_order_list)
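The removed `PubTabDataSet.__getitem__` above parses one JSON object per line from `label_file_path`, reading the keys `filename`, `html.structure.tokens`, and `html.cells`. A hedged sketch of one such label line; the key layout mirrors what the code reads, but the concrete values (file name, tokens, cell contents) are illustrative only:

```python
import json

# Keys mirror what the deleted __getitem__ reads; the values below are made up.
label_line = {
    "filename": "table_00001.png",  # joined with dataset_config['data_dir']
    "html": {
        # 'colspan'/'rowspan' in the joined tokens marks the table as "complex"
        "structure": {"tokens": ["<tr>", "<td>", "</td>", "<td colspan=\"2\">", "</td>", "</tr>"]},
        "cells": [{"tokens": ["1"]}, {"tokens": ["2", "3"]}],
    },
}
print(json.dumps(label_line))  # one such object per line in label_file_path
```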