pyxllib 0.3.96__py3-none-any.whl → 0.3.197__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyxllib/algo/geo.py +12 -0
- pyxllib/algo/intervals.py +1 -1
- pyxllib/algo/matcher.py +78 -0
- pyxllib/algo/pupil.py +187 -19
- pyxllib/algo/specialist.py +2 -1
- pyxllib/algo/stat.py +38 -2
- {pyxlpr → pyxllib/autogui}/__init__.py +1 -1
- pyxllib/autogui/activewin.py +246 -0
- pyxllib/autogui/all.py +9 -0
- pyxllib/{ext/autogui → autogui}/autogui.py +40 -11
- pyxllib/autogui/uiautolib.py +362 -0
- pyxllib/autogui/wechat.py +827 -0
- pyxllib/autogui/wechat_msg.py +421 -0
- pyxllib/autogui/wxautolib.py +84 -0
- pyxllib/cv/slidercaptcha.py +137 -0
- pyxllib/data/echarts.py +123 -12
- pyxllib/data/jsonlib.py +89 -0
- pyxllib/data/pglib.py +514 -30
- pyxllib/data/sqlite.py +231 -4
- pyxllib/ext/JLineViewer.py +14 -1
- pyxllib/ext/drissionlib.py +277 -0
- pyxllib/ext/kq5034lib.py +0 -1594
- pyxllib/ext/robustprocfile.py +497 -0
- pyxllib/ext/unixlib.py +6 -5
- pyxllib/ext/utools.py +108 -95
- pyxllib/ext/webhook.py +32 -14
- pyxllib/ext/wjxlib.py +88 -0
- pyxllib/ext/wpsapi.py +124 -0
- pyxllib/ext/xlwork.py +9 -0
- pyxllib/ext/yuquelib.py +1003 -71
- pyxllib/file/docxlib.py +1 -1
- pyxllib/file/libreoffice.py +165 -0
- pyxllib/file/movielib.py +9 -0
- pyxllib/file/packlib/__init__.py +112 -75
- pyxllib/file/pdflib.py +1 -1
- pyxllib/file/pupil.py +1 -1
- pyxllib/file/specialist/dirlib.py +1 -1
- pyxllib/file/specialist/download.py +10 -3
- pyxllib/file/specialist/filelib.py +266 -55
- pyxllib/file/xlsxlib.py +205 -50
- pyxllib/file/xlsyncfile.py +341 -0
- pyxllib/prog/cachetools.py +64 -0
- pyxllib/prog/filelock.py +42 -0
- pyxllib/prog/multiprogs.py +940 -0
- pyxllib/prog/newbie.py +9 -2
- pyxllib/prog/pupil.py +129 -60
- pyxllib/prog/specialist/__init__.py +176 -2
- pyxllib/prog/specialist/bc.py +5 -2
- pyxllib/prog/specialist/browser.py +11 -2
- pyxllib/prog/specialist/datetime.py +68 -0
- pyxllib/prog/specialist/tictoc.py +12 -13
- pyxllib/prog/specialist/xllog.py +5 -5
- pyxllib/prog/xlosenv.py +7 -0
- pyxllib/text/airscript.js +744 -0
- pyxllib/text/charclasslib.py +17 -5
- pyxllib/text/jiebalib.py +6 -3
- pyxllib/text/jinjalib.py +32 -0
- pyxllib/text/jsa_ai_prompt.md +271 -0
- pyxllib/text/jscode.py +159 -4
- pyxllib/text/nestenv.py +1 -1
- pyxllib/text/newbie.py +12 -0
- pyxllib/text/pupil/common.py +26 -0
- pyxllib/text/specialist/ptag.py +2 -2
- pyxllib/text/templates/echart_base.html +11 -0
- pyxllib/text/templates/highlight_code.html +17 -0
- pyxllib/text/templates/latex_editor.html +103 -0
- pyxllib/text/xmllib.py +76 -14
- pyxllib/xl.py +2 -1
- pyxllib-0.3.197.dist-info/METADATA +48 -0
- pyxllib-0.3.197.dist-info/RECORD +126 -0
- {pyxllib-0.3.96.dist-info → pyxllib-0.3.197.dist-info}/WHEEL +1 -2
- pyxllib/ext/autogui/__init__.py +0 -8
- pyxllib-0.3.96.dist-info/METADATA +0 -51
- pyxllib-0.3.96.dist-info/RECORD +0 -333
- pyxllib-0.3.96.dist-info/top_level.txt +0 -2
- pyxlpr/ai/__init__.py +0 -5
- pyxlpr/ai/clientlib.py +0 -1281
- pyxlpr/ai/specialist.py +0 -286
- pyxlpr/ai/torch_app.py +0 -172
- pyxlpr/ai/xlpaddle.py +0 -655
- pyxlpr/ai/xltorch.py +0 -705
- pyxlpr/data/__init__.py +0 -11
- pyxlpr/data/coco.py +0 -1325
- pyxlpr/data/datacls.py +0 -365
- pyxlpr/data/datasets.py +0 -200
- pyxlpr/data/gptlib.py +0 -1291
- pyxlpr/data/icdar/__init__.py +0 -96
- pyxlpr/data/icdar/deteval.py +0 -377
- pyxlpr/data/icdar/icdar2013.py +0 -341
- pyxlpr/data/icdar/iou.py +0 -340
- pyxlpr/data/icdar/rrc_evaluation_funcs_1_1.py +0 -463
- pyxlpr/data/imtextline.py +0 -473
- pyxlpr/data/labelme.py +0 -866
- pyxlpr/data/removeline.py +0 -179
- pyxlpr/data/specialist.py +0 -57
- pyxlpr/eval/__init__.py +0 -85
- pyxlpr/paddleocr.py +0 -776
- pyxlpr/ppocr/__init__.py +0 -15
- pyxlpr/ppocr/configs/rec/multi_language/generate_multi_language_configs.py +0 -226
- pyxlpr/ppocr/data/__init__.py +0 -135
- pyxlpr/ppocr/data/imaug/ColorJitter.py +0 -26
- pyxlpr/ppocr/data/imaug/__init__.py +0 -67
- pyxlpr/ppocr/data/imaug/copy_paste.py +0 -170
- pyxlpr/ppocr/data/imaug/east_process.py +0 -437
- pyxlpr/ppocr/data/imaug/gen_table_mask.py +0 -244
- pyxlpr/ppocr/data/imaug/iaa_augment.py +0 -114
- pyxlpr/ppocr/data/imaug/label_ops.py +0 -789
- pyxlpr/ppocr/data/imaug/make_border_map.py +0 -184
- pyxlpr/ppocr/data/imaug/make_pse_gt.py +0 -106
- pyxlpr/ppocr/data/imaug/make_shrink_map.py +0 -126
- pyxlpr/ppocr/data/imaug/operators.py +0 -433
- pyxlpr/ppocr/data/imaug/pg_process.py +0 -906
- pyxlpr/ppocr/data/imaug/randaugment.py +0 -143
- pyxlpr/ppocr/data/imaug/random_crop_data.py +0 -239
- pyxlpr/ppocr/data/imaug/rec_img_aug.py +0 -533
- pyxlpr/ppocr/data/imaug/sast_process.py +0 -777
- pyxlpr/ppocr/data/imaug/text_image_aug/__init__.py +0 -17
- pyxlpr/ppocr/data/imaug/text_image_aug/augment.py +0 -120
- pyxlpr/ppocr/data/imaug/text_image_aug/warp_mls.py +0 -168
- pyxlpr/ppocr/data/lmdb_dataset.py +0 -115
- pyxlpr/ppocr/data/pgnet_dataset.py +0 -104
- pyxlpr/ppocr/data/pubtab_dataset.py +0 -107
- pyxlpr/ppocr/data/simple_dataset.py +0 -372
- pyxlpr/ppocr/losses/__init__.py +0 -61
- pyxlpr/ppocr/losses/ace_loss.py +0 -52
- pyxlpr/ppocr/losses/basic_loss.py +0 -135
- pyxlpr/ppocr/losses/center_loss.py +0 -88
- pyxlpr/ppocr/losses/cls_loss.py +0 -30
- pyxlpr/ppocr/losses/combined_loss.py +0 -67
- pyxlpr/ppocr/losses/det_basic_loss.py +0 -208
- pyxlpr/ppocr/losses/det_db_loss.py +0 -80
- pyxlpr/ppocr/losses/det_east_loss.py +0 -63
- pyxlpr/ppocr/losses/det_pse_loss.py +0 -149
- pyxlpr/ppocr/losses/det_sast_loss.py +0 -121
- pyxlpr/ppocr/losses/distillation_loss.py +0 -272
- pyxlpr/ppocr/losses/e2e_pg_loss.py +0 -140
- pyxlpr/ppocr/losses/kie_sdmgr_loss.py +0 -113
- pyxlpr/ppocr/losses/rec_aster_loss.py +0 -99
- pyxlpr/ppocr/losses/rec_att_loss.py +0 -39
- pyxlpr/ppocr/losses/rec_ctc_loss.py +0 -44
- pyxlpr/ppocr/losses/rec_enhanced_ctc_loss.py +0 -70
- pyxlpr/ppocr/losses/rec_nrtr_loss.py +0 -30
- pyxlpr/ppocr/losses/rec_sar_loss.py +0 -28
- pyxlpr/ppocr/losses/rec_srn_loss.py +0 -47
- pyxlpr/ppocr/losses/table_att_loss.py +0 -109
- pyxlpr/ppocr/metrics/__init__.py +0 -44
- pyxlpr/ppocr/metrics/cls_metric.py +0 -45
- pyxlpr/ppocr/metrics/det_metric.py +0 -82
- pyxlpr/ppocr/metrics/distillation_metric.py +0 -73
- pyxlpr/ppocr/metrics/e2e_metric.py +0 -86
- pyxlpr/ppocr/metrics/eval_det_iou.py +0 -274
- pyxlpr/ppocr/metrics/kie_metric.py +0 -70
- pyxlpr/ppocr/metrics/rec_metric.py +0 -75
- pyxlpr/ppocr/metrics/table_metric.py +0 -50
- pyxlpr/ppocr/modeling/architectures/__init__.py +0 -32
- pyxlpr/ppocr/modeling/architectures/base_model.py +0 -88
- pyxlpr/ppocr/modeling/architectures/distillation_model.py +0 -60
- pyxlpr/ppocr/modeling/backbones/__init__.py +0 -54
- pyxlpr/ppocr/modeling/backbones/det_mobilenet_v3.py +0 -268
- pyxlpr/ppocr/modeling/backbones/det_resnet_vd.py +0 -246
- pyxlpr/ppocr/modeling/backbones/det_resnet_vd_sast.py +0 -285
- pyxlpr/ppocr/modeling/backbones/e2e_resnet_vd_pg.py +0 -265
- pyxlpr/ppocr/modeling/backbones/kie_unet_sdmgr.py +0 -186
- pyxlpr/ppocr/modeling/backbones/rec_mobilenet_v3.py +0 -138
- pyxlpr/ppocr/modeling/backbones/rec_mv1_enhance.py +0 -258
- pyxlpr/ppocr/modeling/backbones/rec_nrtr_mtb.py +0 -48
- pyxlpr/ppocr/modeling/backbones/rec_resnet_31.py +0 -210
- pyxlpr/ppocr/modeling/backbones/rec_resnet_aster.py +0 -143
- pyxlpr/ppocr/modeling/backbones/rec_resnet_fpn.py +0 -307
- pyxlpr/ppocr/modeling/backbones/rec_resnet_vd.py +0 -286
- pyxlpr/ppocr/modeling/heads/__init__.py +0 -54
- pyxlpr/ppocr/modeling/heads/cls_head.py +0 -52
- pyxlpr/ppocr/modeling/heads/det_db_head.py +0 -118
- pyxlpr/ppocr/modeling/heads/det_east_head.py +0 -121
- pyxlpr/ppocr/modeling/heads/det_pse_head.py +0 -37
- pyxlpr/ppocr/modeling/heads/det_sast_head.py +0 -128
- pyxlpr/ppocr/modeling/heads/e2e_pg_head.py +0 -253
- pyxlpr/ppocr/modeling/heads/kie_sdmgr_head.py +0 -206
- pyxlpr/ppocr/modeling/heads/multiheadAttention.py +0 -163
- pyxlpr/ppocr/modeling/heads/rec_aster_head.py +0 -393
- pyxlpr/ppocr/modeling/heads/rec_att_head.py +0 -202
- pyxlpr/ppocr/modeling/heads/rec_ctc_head.py +0 -88
- pyxlpr/ppocr/modeling/heads/rec_nrtr_head.py +0 -826
- pyxlpr/ppocr/modeling/heads/rec_sar_head.py +0 -402
- pyxlpr/ppocr/modeling/heads/rec_srn_head.py +0 -280
- pyxlpr/ppocr/modeling/heads/self_attention.py +0 -406
- pyxlpr/ppocr/modeling/heads/table_att_head.py +0 -246
- pyxlpr/ppocr/modeling/necks/__init__.py +0 -32
- pyxlpr/ppocr/modeling/necks/db_fpn.py +0 -111
- pyxlpr/ppocr/modeling/necks/east_fpn.py +0 -188
- pyxlpr/ppocr/modeling/necks/fpn.py +0 -138
- pyxlpr/ppocr/modeling/necks/pg_fpn.py +0 -314
- pyxlpr/ppocr/modeling/necks/rnn.py +0 -92
- pyxlpr/ppocr/modeling/necks/sast_fpn.py +0 -284
- pyxlpr/ppocr/modeling/necks/table_fpn.py +0 -110
- pyxlpr/ppocr/modeling/transforms/__init__.py +0 -28
- pyxlpr/ppocr/modeling/transforms/stn.py +0 -135
- pyxlpr/ppocr/modeling/transforms/tps.py +0 -308
- pyxlpr/ppocr/modeling/transforms/tps_spatial_transformer.py +0 -156
- pyxlpr/ppocr/optimizer/__init__.py +0 -61
- pyxlpr/ppocr/optimizer/learning_rate.py +0 -228
- pyxlpr/ppocr/optimizer/lr_scheduler.py +0 -49
- pyxlpr/ppocr/optimizer/optimizer.py +0 -160
- pyxlpr/ppocr/optimizer/regularizer.py +0 -52
- pyxlpr/ppocr/postprocess/__init__.py +0 -55
- pyxlpr/ppocr/postprocess/cls_postprocess.py +0 -33
- pyxlpr/ppocr/postprocess/db_postprocess.py +0 -234
- pyxlpr/ppocr/postprocess/east_postprocess.py +0 -143
- pyxlpr/ppocr/postprocess/locality_aware_nms.py +0 -200
- pyxlpr/ppocr/postprocess/pg_postprocess.py +0 -52
- pyxlpr/ppocr/postprocess/pse_postprocess/__init__.py +0 -15
- pyxlpr/ppocr/postprocess/pse_postprocess/pse/__init__.py +0 -29
- pyxlpr/ppocr/postprocess/pse_postprocess/pse/setup.py +0 -14
- pyxlpr/ppocr/postprocess/pse_postprocess/pse_postprocess.py +0 -118
- pyxlpr/ppocr/postprocess/rec_postprocess.py +0 -654
- pyxlpr/ppocr/postprocess/sast_postprocess.py +0 -355
- pyxlpr/ppocr/tools/__init__.py +0 -14
- pyxlpr/ppocr/tools/eval.py +0 -83
- pyxlpr/ppocr/tools/export_center.py +0 -77
- pyxlpr/ppocr/tools/export_model.py +0 -129
- pyxlpr/ppocr/tools/infer/predict_cls.py +0 -151
- pyxlpr/ppocr/tools/infer/predict_det.py +0 -300
- pyxlpr/ppocr/tools/infer/predict_e2e.py +0 -169
- pyxlpr/ppocr/tools/infer/predict_rec.py +0 -414
- pyxlpr/ppocr/tools/infer/predict_system.py +0 -204
- pyxlpr/ppocr/tools/infer/utility.py +0 -629
- pyxlpr/ppocr/tools/infer_cls.py +0 -83
- pyxlpr/ppocr/tools/infer_det.py +0 -134
- pyxlpr/ppocr/tools/infer_e2e.py +0 -122
- pyxlpr/ppocr/tools/infer_kie.py +0 -153
- pyxlpr/ppocr/tools/infer_rec.py +0 -146
- pyxlpr/ppocr/tools/infer_table.py +0 -107
- pyxlpr/ppocr/tools/program.py +0 -596
- pyxlpr/ppocr/tools/test_hubserving.py +0 -117
- pyxlpr/ppocr/tools/train.py +0 -163
- pyxlpr/ppocr/tools/xlprog.py +0 -748
- pyxlpr/ppocr/utils/EN_symbol_dict.txt +0 -94
- pyxlpr/ppocr/utils/__init__.py +0 -24
- pyxlpr/ppocr/utils/dict/ar_dict.txt +0 -117
- pyxlpr/ppocr/utils/dict/arabic_dict.txt +0 -162
- pyxlpr/ppocr/utils/dict/be_dict.txt +0 -145
- pyxlpr/ppocr/utils/dict/bg_dict.txt +0 -140
- pyxlpr/ppocr/utils/dict/chinese_cht_dict.txt +0 -8421
- pyxlpr/ppocr/utils/dict/cyrillic_dict.txt +0 -163
- pyxlpr/ppocr/utils/dict/devanagari_dict.txt +0 -167
- pyxlpr/ppocr/utils/dict/en_dict.txt +0 -63
- pyxlpr/ppocr/utils/dict/fa_dict.txt +0 -136
- pyxlpr/ppocr/utils/dict/french_dict.txt +0 -136
- pyxlpr/ppocr/utils/dict/german_dict.txt +0 -143
- pyxlpr/ppocr/utils/dict/hi_dict.txt +0 -162
- pyxlpr/ppocr/utils/dict/it_dict.txt +0 -118
- pyxlpr/ppocr/utils/dict/japan_dict.txt +0 -4399
- pyxlpr/ppocr/utils/dict/ka_dict.txt +0 -153
- pyxlpr/ppocr/utils/dict/korean_dict.txt +0 -3688
- pyxlpr/ppocr/utils/dict/latin_dict.txt +0 -185
- pyxlpr/ppocr/utils/dict/mr_dict.txt +0 -153
- pyxlpr/ppocr/utils/dict/ne_dict.txt +0 -153
- pyxlpr/ppocr/utils/dict/oc_dict.txt +0 -96
- pyxlpr/ppocr/utils/dict/pu_dict.txt +0 -130
- pyxlpr/ppocr/utils/dict/rs_dict.txt +0 -91
- pyxlpr/ppocr/utils/dict/rsc_dict.txt +0 -134
- pyxlpr/ppocr/utils/dict/ru_dict.txt +0 -125
- pyxlpr/ppocr/utils/dict/ta_dict.txt +0 -128
- pyxlpr/ppocr/utils/dict/table_dict.txt +0 -277
- pyxlpr/ppocr/utils/dict/table_structure_dict.txt +0 -2759
- pyxlpr/ppocr/utils/dict/te_dict.txt +0 -151
- pyxlpr/ppocr/utils/dict/ug_dict.txt +0 -114
- pyxlpr/ppocr/utils/dict/uk_dict.txt +0 -142
- pyxlpr/ppocr/utils/dict/ur_dict.txt +0 -137
- pyxlpr/ppocr/utils/dict/xi_dict.txt +0 -110
- pyxlpr/ppocr/utils/dict90.txt +0 -90
- pyxlpr/ppocr/utils/e2e_metric/Deteval.py +0 -574
- pyxlpr/ppocr/utils/e2e_metric/polygon_fast.py +0 -83
- pyxlpr/ppocr/utils/e2e_utils/extract_batchsize.py +0 -87
- pyxlpr/ppocr/utils/e2e_utils/extract_textpoint_fast.py +0 -457
- pyxlpr/ppocr/utils/e2e_utils/extract_textpoint_slow.py +0 -592
- pyxlpr/ppocr/utils/e2e_utils/pgnet_pp_utils.py +0 -162
- pyxlpr/ppocr/utils/e2e_utils/visual.py +0 -162
- pyxlpr/ppocr/utils/en_dict.txt +0 -95
- pyxlpr/ppocr/utils/gen_label.py +0 -81
- pyxlpr/ppocr/utils/ic15_dict.txt +0 -36
- pyxlpr/ppocr/utils/iou.py +0 -54
- pyxlpr/ppocr/utils/logging.py +0 -69
- pyxlpr/ppocr/utils/network.py +0 -84
- pyxlpr/ppocr/utils/ppocr_keys_v1.txt +0 -6623
- pyxlpr/ppocr/utils/profiler.py +0 -110
- pyxlpr/ppocr/utils/save_load.py +0 -150
- pyxlpr/ppocr/utils/stats.py +0 -72
- pyxlpr/ppocr/utils/utility.py +0 -80
- pyxlpr/ppstructure/__init__.py +0 -13
- pyxlpr/ppstructure/predict_system.py +0 -187
- pyxlpr/ppstructure/table/__init__.py +0 -13
- pyxlpr/ppstructure/table/eval_table.py +0 -72
- pyxlpr/ppstructure/table/matcher.py +0 -192
- pyxlpr/ppstructure/table/predict_structure.py +0 -136
- pyxlpr/ppstructure/table/predict_table.py +0 -221
- pyxlpr/ppstructure/table/table_metric/__init__.py +0 -16
- pyxlpr/ppstructure/table/table_metric/parallel.py +0 -51
- pyxlpr/ppstructure/table/table_metric/table_metric.py +0 -247
- pyxlpr/ppstructure/table/tablepyxl/__init__.py +0 -13
- pyxlpr/ppstructure/table/tablepyxl/style.py +0 -283
- pyxlpr/ppstructure/table/tablepyxl/tablepyxl.py +0 -118
- pyxlpr/ppstructure/utility.py +0 -71
- pyxlpr/xlai.py +0 -10
- /pyxllib/{ext/autogui → autogui}/virtualkey.py +0 -0
- {pyxllib-0.3.96.dist-info → pyxllib-0.3.197.dist-info/licenses}/LICENSE +0 -0
@@ -1,247 +0,0 @@
|
|
1
|
-
# Copyright 2020 IBM
|
2
|
-
# Author: peter.zhong@au1.ibm.com
|
3
|
-
#
|
4
|
-
# This is free software; you can redistribute it and/or modify
|
5
|
-
# it under the terms of the Apache 2.0 License.
|
6
|
-
#
|
7
|
-
# This software is distributed in the hope that it will be useful,
|
8
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
9
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
10
|
-
# Apache 2.0 License for more details.
|
11
|
-
|
12
|
-
import distance
|
13
|
-
from apted import APTED, Config
|
14
|
-
from apted.helpers import Tree
|
15
|
-
from lxml import etree, html
|
16
|
-
from collections import deque
|
17
|
-
from .parallel import parallel_process
|
18
|
-
from tqdm import tqdm
|
19
|
-
|
20
|
-
|
21
|
-
class TableTree(Tree):
    """Tree node describing one HTML table element for apted.

    A node stores the element tag, the column/row span and content handed in
    by the caller, and a mutable list of child nodes.
    """

    def __init__(self, tag, colspan=None, rowspan=None, content=None, *children):
        self.tag, self.colspan, self.rowspan = tag, colspan, rowspan
        self.content = content
        # Always a fresh list so callers can append children afterwards.
        self.children = [*children]

    def bracket(self):
        """Return this subtree rendered in bracket notation."""
        if self.tag != 'td':
            head = '"tag": %s' % self.tag
        else:
            head = '"tag": %s, "colspan": %d, "rowspan": %d, "text": %s' % (
                self.tag, self.colspan, self.rowspan, self.content)
        nested = ''.join(node.bracket() for node in self.children)
        return "{{{}}}".format(head + nested)
|
39
|
-
|
40
|
-
|
41
|
-
class CustomConfig(Config):
    """Edit-cost configuration comparing table nodes by tag, spans and content."""

    @staticmethod
    def maximum(*sequences):
        """Return the length of the longest of the given sequences."""
        return max(len(seq) for seq in sequences)

    def normalized_distance(self, *sequences):
        """Return the Levenshtein distance divided by the longest sequence length."""
        raw = distance.levenshtein(*sequences)
        return float(raw) / self.maximum(*sequences)

    def rename(self, node1, node2):
        """Return the cost of relabelling ``node1`` as ``node2``.

        Nodes differing in tag or span cost 1. Matching ``td`` nodes cost the
        normalized distance of their contents when either has content.
        Everything else costs 0.
        """
        same_shape = (node1.tag == node2.tag
                      and node1.colspan == node2.colspan
                      and node1.rowspan == node2.rowspan)
        if not same_shape:
            return 1.
        if node1.tag == 'td' and (node1.content or node2.content):
            return self.normalized_distance(node1.content, node2.content)
        return 0.
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
class CustomConfig_del_short(Config):
    """Edit-cost configuration that masks very short cell contents.

    Identical to the plain comparison except that any ``td`` content with
    fewer than three tokens is replaced by the placeholder ``['####']``
    before the distance is computed.
    """

    @staticmethod
    def maximum(*sequences):
        """Return the length of the longest of the given sequences."""
        return max(len(seq) for seq in sequences)

    def normalized_distance(self, *sequences):
        """Return the Levenshtein distance divided by the longest sequence length."""
        raw = distance.levenshtein(*sequences)
        return float(raw) / self.maximum(*sequences)

    def rename(self, node1, node2):
        """Return the cost of relabelling ``node1`` as ``node2``."""
        same_shape = (node1.tag == node2.tag
                      and node1.colspan == node2.colspan
                      and node1.rowspan == node2.rowspan)
        if not same_shape:
            return 1.
        if node1.tag == 'td' and (node1.content or node2.content):
            # Contents shorter than three tokens are compared as a fixed placeholder.
            left = node1.content if len(node1.content) >= 3 else ['####']
            right = node2.content if len(node2.content) >= 3 else ['####']
            return self.normalized_distance(left, right)
        return 0.
|
95
|
-
|
96
|
-
class CustomConfig_del_block(Config):
    """Edit-cost configuration that ignores single-space tokens in cell content."""

    @staticmethod
    def maximum(*sequences):
        """Return the length of the longest of the given sequences."""
        return max(map(len, sequences))

    def normalized_distance(self, *sequences):
        """Return the Levenshtein distance divided by the longest sequence length.

        Returns 0.0 when every sequence is empty: there is nothing to edit,
        and dividing by the (zero) maximum length would raise
        ZeroDivisionError.
        """
        longest = self.maximum(*sequences)
        if longest == 0:
            return 0.
        return float(distance.levenshtein(*sequences)) / longest

    def rename(self, node1, node2):
        """Return the cost of relabelling ``node1`` as ``node2``.

        Nodes differing in tag or span cost 1. Matching ``td`` nodes cost the
        normalized distance of their contents with ``' '`` tokens removed.
        Everything else costs 0.
        """
        if (node1.tag != node2.tag) or (node1.colspan != node2.colspan) or (node1.rowspan != node2.rowspan):
            return 1.
        if node1.tag == 'td':
            if node1.content or node2.content:
                # Filter into new lists instead of popping from the nodes'
                # own content: the comparison must not alter the trees, and
                # a single pass replaces the repeated index()/pop() scan.
                node1_content = [token for token in node1.content if token != ' ']
                node2_content = [token for token in node2.content if token != ' ']
                return self.normalized_distance(node1_content, node2_content)
        return 0.
|
125
|
-
|
126
|
-
class TEDS(object):
    """Tree-Edit-Distance-based Similarity (TEDS) between HTML tables.

    Args:
        structure_only: when true, ``td`` cell contents are ignored and only
            the table structure is compared.
        n_jobs: number of jobs for the batch methods; must be an int >= 1.
        ignore_nodes: optional iterable of tag names stripped from both
            tables (their text and children are kept) before comparison.
    """

    def __init__(self, structure_only=False, n_jobs=1, ignore_nodes=None):
        assert isinstance(n_jobs, int) and (
            n_jobs >= 1), 'n_jobs must be an integer greater than or equal to 1'
        self.structure_only = structure_only
        self.n_jobs = n_jobs
        self.ignore_nodes = ignore_nodes
        self.__tokens__ = []

    def tokenize(self, node):
        """Append the tokens of ``node`` and its descendants to ``self.__tokens__``.

        Tags become ``<tag>`` / ``</tag>`` tokens and text is split into
        single characters. An ``unk`` element gets no closing token, and the
        tail text of a ``td`` element is not included.
        """
        self.__tokens__.append('<%s>' % node.tag)
        if node.text is not None:
            self.__tokens__ += list(node.text)
        # Iterating the element yields its children; getchildren() is deprecated.
        for child in node:
            self.tokenize(child)
        if node.tag != 'unk':
            self.__tokens__.append('</%s>' % node.tag)
        if node.tag != 'td' and node.tail is not None:
            self.__tokens__ += list(node.tail)

    def load_html_tree(self, node, parent=None):
        """Convert an HTML element tree to the TableTree format required by apted.

        Returns the new root node when ``parent`` is None. Otherwise the new
        node is appended to ``parent.children`` and None is returned.
        """
        if node.tag == 'td':
            if self.structure_only:
                cell = []
            else:
                self.__tokens__ = []
                self.tokenize(node)
                # Drop the enclosing <td> ... </td> tokens, keep the cell body.
                cell = self.__tokens__[1:-1].copy()
            new_node = TableTree(node.tag,
                                 int(node.attrib.get('colspan', '1')),
                                 int(node.attrib.get('rowspan', '1')),
                                 cell)
        else:
            new_node = TableTree(node.tag, None, None, None)
        if parent is not None:
            parent.children.append(new_node)
        # A td is a leaf of the converted tree; its markup lives in `cell`.
        if node.tag != 'td':
            for child in node:
                self.load_html_tree(child, new_node)
        if parent is None:
            return new_node
        return None

    def evaluate(self, pred, true):
        """Compute the TEDS score between a predicted and a ground-truth table.

        Args:
            pred: HTML source of the prediction.
            true: HTML source of the ground truth.

        Returns:
            ``1 - edit_distance / max(node counts)`` as a float, or 0.0 when
            either input is empty or lacks a ``body/table`` element.
        """
        if (not pred) or (not true):
            return 0.0
        parser = html.HTMLParser(remove_comments=True, encoding='utf-8')
        pred_doc = html.fromstring(pred, parser=parser)
        true_doc = html.fromstring(true, parser=parser)
        pred_tables = pred_doc.xpath('body/table')
        true_tables = true_doc.xpath('body/table')
        if not (pred_tables and true_tables):
            return 0.0
        pred_table = pred_tables[0]
        true_table = true_tables[0]
        if self.ignore_nodes:
            etree.strip_tags(pred_table, *self.ignore_nodes)
            etree.strip_tags(true_table, *self.ignore_nodes)
        n_nodes = max(len(pred_table.xpath(".//*")),
                      len(true_table.xpath(".//*")))
        if n_nodes == 0:
            # Both tables are bare <table> elements: nothing differs, and the
            # normalization below would divide by zero.
            return 1.0
        tree_pred = self.load_html_tree(pred_table)
        tree_true = self.load_html_tree(true_table)
        # Named edit_distance so the imported `distance` module is not shadowed.
        edit_distance = APTED(tree_pred, tree_true,
                              CustomConfig()).compute_edit_distance()
        return 1.0 - (float(edit_distance) / n_nodes)

    def batch_evaluate(self, pred_json, true_json):
        """Compute TEDS scores for a batch of samples keyed by file name.

        @params pred_json: {'FILENAME': 'HTML CODE', ...}
        @params true_json: {'FILENAME': {'html': 'HTML CODE'}, ...}
        @output: {'FILENAME': 'TEDS SCORE', ...}

        A file missing from ``pred_json`` is evaluated as an empty prediction.
        """
        samples = true_json.keys()
        if self.n_jobs == 1:
            scores = [self.evaluate(pred_json.get(filename, ''),
                                    true_json[filename]['html'])
                      for filename in tqdm(samples)]
        else:
            inputs = [{'pred': pred_json.get(filename, ''),
                       'true': true_json[filename]['html']}
                      for filename in samples]
            scores = parallel_process(
                inputs, self.evaluate, use_kwargs=True, n_jobs=self.n_jobs, front_num=1)
        return dict(zip(samples, scores))

    def batch_evaluate_html(self, pred_htmls, true_htmls):
        """Compute TEDS scores for paired sequences of HTML strings.

        Returns the scores in input order. Pairs are formed with ``zip``, so
        extra items in the longer sequence are ignored.
        """
        if self.n_jobs == 1:
            scores = [self.evaluate(pred_html, true_html)
                      for pred_html, true_html in zip(pred_htmls, true_htmls)]
        else:
            inputs = [{"pred": pred_html, "true": true_html}
                      for pred_html, true_html in zip(pred_htmls, true_htmls)]
            scores = parallel_process(
                inputs, self.evaluate, use_kwargs=True, n_jobs=self.n_jobs, front_num=1)
        return scores
|
235
|
-
|
236
|
-
|
237
|
-
if __name__ == '__main__':
    # Demo: score the sample predictions against the sample ground truth
    # and pretty-print the per-file TEDS scores.
    import json
    import pprint

    with open('sample_pred.json') as fp:
        pred_json = json.load(fp)
    with open('sample_gt.json') as fp:
        true_json = json.load(fp)

    scores = TEDS(n_jobs=4).batch_evaluate(pred_json, true_json)
    pprint.PrettyPrinter().pprint(scores)
|
@@ -1,13 +0,0 @@
|
|
1
|
-
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
2
|
-
#
|
3
|
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
-
# you may not use this file except in compliance with the License.
|
5
|
-
# You may obtain a copy of the License at
|
6
|
-
#
|
7
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
8
|
-
#
|
9
|
-
# Unless required by applicable law or agreed to in writing, software
|
10
|
-
# distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
-
# See the License for the specific language governing permissions and
|
13
|
-
# limitations under the License.
|
@@ -1,283 +0,0 @@
|
|
1
|
-
# This is where we handle translating css styles into openpyxl styles
|
2
|
-
# and cascading those from parent to child in the dom.
|
3
|
-
|
4
|
-
from openpyxl.cell import cell
|
5
|
-
from openpyxl.styles import Font, Alignment, PatternFill, NamedStyle, Border, Side, Color
|
6
|
-
from openpyxl.styles.fills import FILL_SOLID
|
7
|
-
from openpyxl.styles.numbers import FORMAT_CURRENCY_USD_SIMPLE, FORMAT_PERCENTAGE
|
8
|
-
from openpyxl.styles.colors import BLACK
|
9
|
-
|
10
|
-
# Month/day/year date format string; presumably used as a cell number format
# alongside the openpyxl FORMAT_* constants imported above (verify at call sites).
FORMAT_DATE_MMDDYYYY = 'mm/dd/yyyy'
|
11
|
-
|
12
|
-
|
13
|
-
def colormap(color):
    """Translate a known color name to its constant; return anything else unchanged."""
    known = {'black': BLACK}
    if color in known:
        return known[color]
    return color
|
19
|
-
|
20
|
-
|
21
|
-
def style_string_to_dict(style):
    """Convert a CSS style string to a dictionary of property -> value.

    Declarations are separated by ``;``. Each one is split on its FIRST
    ``:`` only, so values that contain colons themselves (for example
    ``url(http://...)``) stay intact instead of raising ValueError.
    Fragments without a ``:`` are ignored, names and values are stripped of
    surrounding whitespace, and a repeated property keeps its last value.
    """
    declarations = {}
    for declaration in style.split(";"):
        name, separator, value = declaration.partition(":")
        if separator:
            declarations[name.strip()] = value.strip()
    return declarations
|
29
|
-
|
30
|
-
|
31
|
-
def get_side(style, name):
    """Build the ``border_style`` / ``color`` keyword dict for one border side.

    Reads ``border-<name>-style`` and ``border-<name>-color`` from ``style``.
    The color is passed through ``colormap``.
    """
    style_key = 'border-{}-style'.format(name)
    color_key = 'border-{}-color'.format(name)
    return {
        'border_style': style.get(style_key),
        'color': colormap(style.get(color_key)),
    }
|
34
|
-
|
35
|
-
# Cache of NamedStyle objects already built, keyed by the string form of
# (style dict, its parent, number format).
known_styles = {}


def style_dict_to_named_style(style_dict, number_format=None):
    """Return the openpyxl NamedStyle for a CSS style dictionary.

    Styles are cached in ``known_styles``. A new one is built, and named
    ``Style <n>``, only the first time a given combination of style dict,
    parent and number format is seen.
    """
    cache_key = str({
        'style_dict': style_dict,
        'parent': style_dict.parent,
        'number_format': number_format,
    })
    if cache_key in known_styles:
        return known_styles[cache_key]

    # Font
    font = Font(bold=style_dict.get('font-weight') == 'bold',
                color=style_dict.get_color('color', None),
                size=style_dict.get('font-size'))

    # Alignment
    alignment = Alignment(horizontal=style_dict.get('text-align', 'general'),
                          vertical=style_dict.get('vertical-align'),
                          wrap_text=style_dict.get('white-space', 'nowrap') == 'normal')

    # Fill: only a real (non-transparent) background produces a pattern fill.
    bg_color = style_dict.get_color('background-color')
    fg_color = style_dict.get_color('foreground-color', Color())
    fill_type = style_dict.get('fill-type')
    has_background = bool(bg_color) and bg_color != 'transparent'
    if not has_background:
        fill = PatternFill()
    else:
        fill = PatternFill(fill_type=fill_type or FILL_SOLID,
                           start_color=bg_color,
                           end_color=fg_color)

    # Border: one Side per edge, built in the same order as the keywords.
    edges = ('left', 'right', 'top', 'bottom', 'diagonal', 'outline')
    sides = {edge: Side(**get_side(style_dict, edge)) for edge in edges}
    border = Border(diagonal_direction=None, vertical=None, horizontal=None, **sides)

    pyxl_style = NamedStyle(name='Style {}'.format(len(known_styles) + 1),
                            font=font, fill=fill, alignment=alignment, border=border,
                            number_format=number_format)
    known_styles[cache_key] = pyxl_style
    return pyxl_style
|
90
|
-
|
91
|
-
|
92
|
-
class StyleDict(dict):
    """A dictionary that falls back to a parent dictionary for missing keys.

    Pass the parent with the ``parent=`` keyword. All other constructor
    arguments are forwarded to ``dict``. ``in`` and ``keys()`` only see the
    dictionary's own entries; ``[]``, ``get`` and ``_keys`` also consult the
    parent chain.
    """

    def __init__(self, *args, **kwargs):
        self.parent = kwargs.pop('parent', None)
        super(StyleDict, self).__init__(*args, **kwargs)

    def __getitem__(self, item):
        if item in self:
            return super(StyleDict, self).__getitem__(item)
        # Compare against None, not truthiness: a parent with no entries of
        # its own is falsy but may still have ancestors that define the key.
        if self.parent is not None:
            return self.parent[item]
        raise KeyError('{} not found'.format(item))

    def __hash__(self):
        return hash(tuple([(k, self.get(k)) for k in self._keys()]))

    # Yielding the keys avoids creating unnecessary data structures
    # and happily works with both python2 and python3 where the
    # .keys() method is a dictionary_view in python3 and a list in python2.
    def _keys(self):
        """Yield own keys first, then inherited keys not already yielded."""
        yielded = set()
        for k in self.keys():
            yielded.add(k)
            yield k
        # Same None check as __getitem__ so empty parents are not skipped.
        if self.parent is not None:
            for k in self.parent._keys():
                if k not in yielded:
                    yielded.add(k)
                    yield k

    def get(self, k, d=None):
        """Return the (possibly inherited) value for ``k``, or ``d`` if absent."""
        try:
            return self[k]
        except KeyError:
            return d

    def get_color(self, k, d=None):
        """Return the color stored under ``k`` without its leading ``#``.

        A three-character shorthand such as ``0f0`` is expanded to ``00ff00``.
        Values that are not ``#``-prefixed strings are returned unchanged.
        """
        color = self.get(k, d)
        if hasattr(color, 'startswith') and color.startswith('#'):
            color = color[1:]
            if len(color) == 3:  # Premailers reduces colors like #00ff00 to #0f0, openpyxl doesn't like that
                color = ''.join(2 * c for c in color)
        return color
|
141
|
-
|
142
|
-
|
143
|
-
class Element(object):
    """Base wrapper pairing an html element with its cascading style.

    The element's inline ``style`` attribute is parsed into a StyleDict whose
    parent is the parent Element's StyleDict, so lookups cascade upwards.
    """

    def __init__(self, element, parent=None):
        inline_css = element.get('style', '')
        inherited = parent.style_dict if parent else None
        self.element = element
        self.number_format = None
        self.style_dict = StyleDict(style_string_to_dict(inline_css), parent=inherited)
        self._style_cache = None

    def style(self):
        """Return the openpyxl NamedStyle for this element, building it on first use."""
        if self._style_cache:
            return self._style_cache
        self._style_cache = style_dict_to_named_style(
            self.style_dict, number_format=self.number_format)
        return self._style_cache

    def get_dimension(self, dimension_key):
        """Return the style value under ``dimension_key`` as a float.

        A trailing ``px``, ``em``, ``pt``, ``in`` or ``cm`` unit is dropped
        before conversion. A missing or empty value is returned as is.
        """
        raw = self.style_dict.get(dimension_key)
        if not raw:
            return raw
        if raw[-2:] in ('px', 'em', 'pt', 'in', 'cm'):
            raw = raw[:-2]
        return float(raw)
|
174
|
-
|
175
|
-
|
176
|
-
class Table(Element):
    """Root of the concrete element tree built for an html table.

    The subclasses of Element are named after the table parts they wrap,
    giving a fixed Table -> head/body -> rows -> cells structure rather than
    an abstract tree with arbitrary children.
    """

    def __init__(self, table):
        """Wrap an html table object (from lxml) and its head and body."""
        super(Table, self).__init__(table)

        head_el = table.find('thead')
        if head_el is None:
            self.head = None
        else:
            self.head = TableHead(head_el, parent=self)

        # Without an explicit tbody, the table element itself acts as the body.
        body_el = table.find('tbody')
        if body_el is None:
            body_el = table
        self.body = TableBody(body_el, parent=self)
|
191
|
-
|
192
|
-
|
193
|
-
class TableHead(Element):
    """
    This class maps to the `<thead>` element of the html table and holds one
    TableRow for every `<tr>` found directly under it.
    """
    def __init__(self, head, parent=None):
        super(TableHead, self).__init__(head, parent=parent)
        row_elements = head.findall('tr')
        self.rows = [TableRow(row_el, parent=self) for row_el in row_elements]
|
200
|
-
|
201
|
-
|
202
|
-
class TableBody(Element):
    """
    This class maps to the `<tbody>` element of the html table and holds one
    TableRow for every `<tr>` found directly under it.
    """
    def __init__(self, body, parent=None):
        super(TableBody, self).__init__(body, parent=parent)
        row_elements = body.findall('tr')
        self.rows = [TableRow(row_el, parent=self) for row_el in row_elements]
|
209
|
-
|
210
|
-
|
211
|
-
class TableRow(Element):
    """
    This class maps to the `<tr>` element of the html table.

    ``cells`` holds one TableCell per direct `<th>` / `<td>` child, in the order
    the cells appear in the row.
    """
    def __init__(self, tr, parent=None):
        super(TableRow, self).__init__(tr, parent=parent)
        # Walk the children once so header and data cells keep their relative
        # order; concatenating findall('th') and findall('td') would move every
        # <th> in front of every <td>.
        self.cells = [TableCell(child, parent=self) for child in tr if child.tag in ('th', 'td')]
|
218
|
-
|
219
|
-
|
220
|
-
def element_to_string(el):
    """
    Return the text held by ``el`` and its descendants, with leading and
    trailing whitespace removed.
    """
    collected = _element_to_string(el)
    return collected.strip()


def _element_to_string(el):
    """
    Recursive helper for element_to_string.

    The result is the stripped ``text`` of ``el``, then each child's string
    prefixed by a newline, then a newline and the stripped ``tail`` of ``el``.
    """
    own_text = ''
    if el.text:
        own_text = el.text.strip()

    children_text = ''.join('\n' + _element_to_string(child) for child in el.iterchildren())

    own_tail = ''
    if el.tail:
        own_tail = el.tail.strip()

    return own_text + children_text + '\n' + own_tail
|
234
|
-
|
235
|
-
|
236
|
-
class TableCell(Element):
    """
    This class maps to the `<td>` element of the html table.

    The cell's text becomes ``value``; class names on the element listed in
    CELL_TYPES select the data type, and a few class names select the number format.
    """
    CELL_TYPES = {'TYPE_STRING', 'TYPE_FORMULA', 'TYPE_NUMERIC', 'TYPE_BOOL', 'TYPE_CURRENCY', 'TYPE_PERCENTAGE',
                  'TYPE_NULL', 'TYPE_INLINE', 'TYPE_ERROR', 'TYPE_FORMULA_CACHE_STRING', 'TYPE_INTEGER'}

    def __init__(self, cell, parent=None):
        super(TableCell, self).__init__(cell, parent=parent)
        self.value = element_to_string(cell)
        # Needs self.value and self.element, so it is computed last.
        self.number_format = self.get_number_format()

    def data_type(self):
        """
        Return the attribute of the module-level ``cell`` object that is named by
        the cell's type class (``TYPE_STRING`` when no known class is present).
        """
        # NOTE(review): `cell` is not a local here, so it resolves to a module-level
        # name; presumably openpyxl's cell module - confirm against the file imports.
        cell_types = self.CELL_TYPES & set(self.element.get('class', '').split())
        if not cell_types:
            cell_type = 'TYPE_STRING'
        elif 'TYPE_FORMULA' in cell_types:
            # Make sure TYPE_FORMULA takes precedence over the other classes in the set.
            cell_type = 'TYPE_FORMULA'
        elif cell_types & {'TYPE_CURRENCY', 'TYPE_INTEGER', 'TYPE_PERCENTAGE'}:
            cell_type = 'TYPE_NUMERIC'
        else:
            cell_type = cell_types.pop()
        return getattr(cell, cell_type)

    def get_number_format(self):
        """
        Return the number format for this cell, or None when none applies.

        Class names are checked first (currency, integer, percentage, date, in
        that order). Otherwise a numeric cell gets '#,##0' when its value parses
        as an int and '#,##0.##' when it does not.
        """
        class_names = self.element.get('class', '').split()
        if 'TYPE_CURRENCY' in class_names:
            return FORMAT_CURRENCY_USD_SIMPLE
        if 'TYPE_INTEGER' in class_names:
            return '#,##0'
        if 'TYPE_PERCENTAGE' in class_names:
            return FORMAT_PERCENTAGE
        if 'TYPE_DATE' in class_names:
            return FORMAT_DATE_MMDDYYYY
        if self.data_type() != cell.TYPE_NUMERIC:
            return None
        try:
            int(self.value)
        except ValueError:
            return '#,##0.##'
        return '#,##0'

    def format(self, cell):
        """
        Apply this element's style to ``cell`` and, when data_type() returns a
        truthy value, set it as the cell's data type.
        """
        cell.style = self.style()
        resolved_type = self.data_type()
        if resolved_type:
            cell.data_type = resolved_type
|
@@ -1,118 +0,0 @@
|
|
1
|
-
# Do imports like python3 so our package works for 2 and 3
|
2
|
-
from __future__ import absolute_import
|
3
|
-
|
4
|
-
from lxml import html
|
5
|
-
from openpyxl import Workbook
|
6
|
-
from openpyxl.utils import get_column_letter
|
7
|
-
from premailer import Premailer
|
8
|
-
from tablepyxl.style import Table
|
9
|
-
|
10
|
-
|
11
|
-
def string_to_int(s):
    """
    Return the non-negative integer written in ``s``, or 0 when ``s`` is not one.

    Surrounding whitespace is ignored. Signs, decimals and empty strings give 0.
    Characters that ``str.isdigit`` accepts but ``int`` cannot parse (such as
    superscript digits) also give 0 instead of raising ValueError.
    """
    s = s.strip()
    if not s.isdigit():
        return 0
    try:
        return int(s)
    except ValueError:
        # isdigit() is broader than what int() understands, e.g. u'\xb2'.
        return 0
|
15
|
-
|
16
|
-
|
17
|
-
def get_Tables(doc):
    """
    Parse the html string ``doc`` and return a Table for every table element in it.

    Comment nodes are removed from the parsed tree with drop_tag() before the
    tables are collected.
    """
    root = html.fromstring(doc)
    for comment_node in root.xpath('//comment()'):
        comment_node.drop_tag()

    tables = []
    for table_el in root.xpath('//table'):
        tables.append(Table(table_el))
    return tables
|
23
|
-
|
24
|
-
|
25
|
-
def write_rows(worksheet, elem, row, column=1):
    """
    Writes every tr child element of elem to a row in the worksheet
    returns the next row after all rows are written

    ``row`` and ``column`` give the worksheet position of the first cell. Cells
    that are already part of a merged range are skipped over. A cell whose
    colspan or rowspan is greater than 1 is merged over that range. Span values
    that are missing, invalid or 0 count as 1, so every table cell advances the
    column by at least one.
    """
    from openpyxl.cell.cell import MergedCell

    initial_column = column
    for table_row in elem.rows:
        for table_cell in table_row.cells:
            cell = worksheet.cell(row=row, column=column)
            # Positions covered by an earlier merge cannot take a value; move right.
            while isinstance(cell, MergedCell):
                column += 1
                cell = worksheet.cell(row=row, column=column)

            # string_to_int gives 0 for unusable values; a span below 1 would leave
            # `column` where it is and let the next cell overwrite this one.
            colspan = max(1, string_to_int(table_cell.element.get("colspan", "1")))
            rowspan = max(1, string_to_int(table_cell.element.get("rowspan", "1")))
            if rowspan > 1 or colspan > 1:
                worksheet.merge_cells(start_row=row, start_column=column,
                                      end_row=row + rowspan - 1, end_column=column + colspan - 1)

            cell.value = table_cell.value
            table_cell.format(cell)
            min_width = table_cell.get_dimension('min-width')
            max_width = table_cell.get_dimension('max-width')

            if colspan == 1:
                # Initially, when iterating for the first time through the loop, the width of all the cells is None.
                # As we start filling in contents, the initial width of the cell (which can be retrieved by:
                # worksheet.column_dimensions[get_column_letter(column)].width) is equal to the width of the previous
                # cell in the same column (i.e. width of A2 = width of A1)
                column_letter = get_column_letter(column)
                width = max(worksheet.column_dimensions[column_letter].width or 0, len(table_cell.value) + 2)
                if max_width and width > max_width:
                    width = max_width
                elif min_width and width < min_width:
                    width = min_width
                worksheet.column_dimensions[column_letter].width = width
            column += colspan
        row += 1
        column = initial_column
    return row
|
66
|
-
|
67
|
-
|
68
|
-
def table_to_sheet(table, wb):
    """
    Write ``table`` to a newly created sheet of the workbook ``wb``.

    The value of the table element's ``name`` attribute is passed as the sheet
    title, and the table is inserted starting at column 1, row 1.
    """
    sheet_title = table.element.get('name')
    ws = wb.create_sheet(title=sheet_title)
    insert_table(table, ws, 1, 1)
|
75
|
-
|
76
|
-
|
77
|
-
def document_to_workbook(doc, wb=None, base_url=None):
    """
    Write one sheet per table found in the html document string ``doc``.

    When ``wb`` is falsy a new Workbook is created and its active sheet removed.
    Styles are inlined with Premailer (classes are kept) before the tables are
    extracted. The workbook is returned.
    """
    if not wb:
        wb = Workbook()
        wb.remove(wb.active)

    inliner = Premailer(doc, base_url=base_url, remove_classes=False)
    inline_styles_doc = inliner.transform()

    for table in get_Tables(inline_styles_doc):
        table_to_sheet(table, wb)

    return wb
|
94
|
-
|
95
|
-
|
96
|
-
def document_to_xl(doc, filename, base_url=None):
    """
    Convert the html document string ``doc`` to a workbook with one sheet per
    table (via document_to_workbook) and save that workbook to ``filename``.
    """
    document_to_workbook(doc, base_url=base_url).save(filename)
|
103
|
-
|
104
|
-
|
105
|
-
def insert_table(table, worksheet, column, row):
    """
    Write the head of ``table`` and then its body into ``worksheet``, starting
    at the given column and row. A section that is falsy is skipped, and the
    body starts on the row after the last head row.
    """
    for section in (table.head, table.body):
        if section:
            row = write_rows(worksheet, section, row, column)
|
110
|
-
|
111
|
-
|
112
|
-
def insert_table_at_cell(table, cell):
    """
    Insert ``table`` into the worksheet that owns ``cell`` (its ``parent``),
    using the column and row of ``cell`` as the starting position.
    """
    insert_table(table, cell.parent, cell.column, cell.row)
|