pyxllib 0.3.96__py3-none-any.whl → 0.3.200__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (358) hide show
  1. pyxllib/__init__.py +21 -21
  2. pyxllib/algo/__init__.py +8 -8
  3. pyxllib/algo/disjoint.py +54 -54
  4. pyxllib/algo/geo.py +541 -529
  5. pyxllib/algo/intervals.py +964 -964
  6. pyxllib/algo/matcher.py +389 -311
  7. pyxllib/algo/newbie.py +166 -166
  8. pyxllib/algo/pupil.py +629 -461
  9. pyxllib/algo/shapelylib.py +67 -67
  10. pyxllib/algo/specialist.py +241 -240
  11. pyxllib/algo/stat.py +494 -458
  12. pyxllib/algo/treelib.py +149 -149
  13. pyxllib/algo/unitlib.py +66 -66
  14. {pyxlpr → pyxllib/autogui}/__init__.py +5 -5
  15. pyxllib/autogui/activewin.py +246 -0
  16. pyxllib/autogui/all.py +9 -0
  17. pyxllib/{ext/autogui → autogui}/autogui.py +852 -823
  18. pyxllib/autogui/uiautolib.py +362 -0
  19. pyxllib/{ext/autogui → autogui}/virtualkey.py +102 -102
  20. pyxllib/autogui/wechat.py +827 -0
  21. pyxllib/autogui/wechat_msg.py +421 -0
  22. pyxllib/autogui/wxautolib.py +84 -0
  23. pyxllib/cv/__init__.py +5 -5
  24. pyxllib/cv/expert.py +267 -267
  25. pyxllib/cv/imfile.py +159 -159
  26. pyxllib/cv/imhash.py +39 -39
  27. pyxllib/cv/pupil.py +9 -9
  28. pyxllib/cv/rgbfmt.py +1525 -1525
  29. pyxllib/cv/slidercaptcha.py +137 -0
  30. pyxllib/cv/trackbartools.py +251 -251
  31. pyxllib/cv/xlcvlib.py +1040 -1040
  32. pyxllib/cv/xlpillib.py +423 -423
  33. pyxllib/data/echarts.py +240 -129
  34. pyxllib/data/jsonlib.py +89 -0
  35. pyxllib/data/oss.py +72 -72
  36. pyxllib/data/pglib.py +1127 -643
  37. pyxllib/data/sqlite.py +568 -341
  38. pyxllib/data/sqllib.py +297 -297
  39. pyxllib/ext/JLineViewer.py +505 -492
  40. pyxllib/ext/__init__.py +6 -6
  41. pyxllib/ext/demolib.py +246 -246
  42. pyxllib/ext/drissionlib.py +277 -0
  43. pyxllib/ext/kq5034lib.py +12 -1606
  44. pyxllib/ext/old.py +663 -663
  45. pyxllib/ext/qt.py +449 -449
  46. pyxllib/ext/robustprocfile.py +497 -0
  47. pyxllib/ext/seleniumlib.py +76 -76
  48. pyxllib/ext/tk.py +173 -173
  49. pyxllib/ext/unixlib.py +827 -826
  50. pyxllib/ext/utools.py +351 -338
  51. pyxllib/ext/webhook.py +124 -101
  52. pyxllib/ext/win32lib.py +40 -40
  53. pyxllib/ext/wjxlib.py +88 -0
  54. pyxllib/ext/wpsapi.py +124 -0
  55. pyxllib/ext/xlwork.py +9 -0
  56. pyxllib/ext/yuquelib.py +1105 -173
  57. pyxllib/file/__init__.py +17 -17
  58. pyxllib/file/docxlib.py +761 -761
  59. pyxllib/file/gitlib.py +309 -309
  60. pyxllib/file/libreoffice.py +165 -0
  61. pyxllib/file/movielib.py +148 -139
  62. pyxllib/file/newbie.py +10 -10
  63. pyxllib/file/onenotelib.py +1469 -1469
  64. pyxllib/file/packlib/__init__.py +330 -293
  65. pyxllib/file/packlib/zipfile.py +2441 -2441
  66. pyxllib/file/pdflib.py +426 -426
  67. pyxllib/file/pupil.py +185 -185
  68. pyxllib/file/specialist/__init__.py +685 -685
  69. pyxllib/file/specialist/dirlib.py +799 -799
  70. pyxllib/file/specialist/download.py +193 -186
  71. pyxllib/file/specialist/filelib.py +2829 -2618
  72. pyxllib/file/xlsxlib.py +3131 -2976
  73. pyxllib/file/xlsyncfile.py +341 -0
  74. pyxllib/prog/__init__.py +5 -5
  75. pyxllib/prog/cachetools.py +64 -0
  76. pyxllib/prog/deprecatedlib.py +233 -233
  77. pyxllib/prog/filelock.py +42 -0
  78. pyxllib/prog/ipyexec.py +253 -253
  79. pyxllib/prog/multiprogs.py +940 -0
  80. pyxllib/prog/newbie.py +451 -444
  81. pyxllib/prog/pupil.py +1197 -1128
  82. pyxllib/prog/sitepackages.py +33 -33
  83. pyxllib/prog/specialist/__init__.py +391 -217
  84. pyxllib/prog/specialist/bc.py +203 -200
  85. pyxllib/prog/specialist/browser.py +497 -488
  86. pyxllib/prog/specialist/common.py +347 -347
  87. pyxllib/prog/specialist/datetime.py +199 -131
  88. pyxllib/prog/specialist/tictoc.py +240 -241
  89. pyxllib/prog/specialist/xllog.py +180 -180
  90. pyxllib/prog/xlosenv.py +108 -101
  91. pyxllib/stdlib/__init__.py +17 -17
  92. pyxllib/stdlib/tablepyxl/__init__.py +10 -10
  93. pyxllib/stdlib/tablepyxl/style.py +303 -303
  94. pyxllib/stdlib/tablepyxl/tablepyxl.py +130 -130
  95. pyxllib/text/__init__.py +8 -8
  96. pyxllib/text/ahocorasick.py +39 -39
  97. pyxllib/text/airscript.js +744 -0
  98. pyxllib/text/charclasslib.py +121 -109
  99. pyxllib/text/jiebalib.py +267 -264
  100. pyxllib/text/jinjalib.py +32 -0
  101. pyxllib/text/jsa_ai_prompt.md +271 -0
  102. pyxllib/text/jscode.py +922 -767
  103. pyxllib/text/latex/__init__.py +158 -158
  104. pyxllib/text/levenshtein.py +303 -303
  105. pyxllib/text/nestenv.py +1215 -1215
  106. pyxllib/text/newbie.py +300 -288
  107. pyxllib/text/pupil/__init__.py +8 -8
  108. pyxllib/text/pupil/common.py +1121 -1095
  109. pyxllib/text/pupil/xlalign.py +326 -326
  110. pyxllib/text/pycode.py +47 -47
  111. pyxllib/text/specialist/__init__.py +8 -8
  112. pyxllib/text/specialist/common.py +112 -112
  113. pyxllib/text/specialist/ptag.py +186 -186
  114. pyxllib/text/spellchecker.py +172 -172
  115. pyxllib/text/templates/echart_base.html +11 -0
  116. pyxllib/text/templates/highlight_code.html +17 -0
  117. pyxllib/text/templates/latex_editor.html +103 -0
  118. pyxllib/text/vbacode.py +17 -17
  119. pyxllib/text/xmllib.py +747 -685
  120. pyxllib/xl.py +42 -38
  121. pyxllib/xlcv.py +17 -17
  122. pyxllib-0.3.200.dist-info/METADATA +48 -0
  123. pyxllib-0.3.200.dist-info/RECORD +126 -0
  124. {pyxllib-0.3.96.dist-info → pyxllib-0.3.200.dist-info}/WHEEL +1 -2
  125. {pyxllib-0.3.96.dist-info → pyxllib-0.3.200.dist-info/licenses}/LICENSE +190 -190
  126. pyxllib/ext/autogui/__init__.py +0 -8
  127. pyxllib-0.3.96.dist-info/METADATA +0 -51
  128. pyxllib-0.3.96.dist-info/RECORD +0 -333
  129. pyxllib-0.3.96.dist-info/top_level.txt +0 -2
  130. pyxlpr/ai/__init__.py +0 -5
  131. pyxlpr/ai/clientlib.py +0 -1281
  132. pyxlpr/ai/specialist.py +0 -286
  133. pyxlpr/ai/torch_app.py +0 -172
  134. pyxlpr/ai/xlpaddle.py +0 -655
  135. pyxlpr/ai/xltorch.py +0 -705
  136. pyxlpr/data/__init__.py +0 -11
  137. pyxlpr/data/coco.py +0 -1325
  138. pyxlpr/data/datacls.py +0 -365
  139. pyxlpr/data/datasets.py +0 -200
  140. pyxlpr/data/gptlib.py +0 -1291
  141. pyxlpr/data/icdar/__init__.py +0 -96
  142. pyxlpr/data/icdar/deteval.py +0 -377
  143. pyxlpr/data/icdar/icdar2013.py +0 -341
  144. pyxlpr/data/icdar/iou.py +0 -340
  145. pyxlpr/data/icdar/rrc_evaluation_funcs_1_1.py +0 -463
  146. pyxlpr/data/imtextline.py +0 -473
  147. pyxlpr/data/labelme.py +0 -866
  148. pyxlpr/data/removeline.py +0 -179
  149. pyxlpr/data/specialist.py +0 -57
  150. pyxlpr/eval/__init__.py +0 -85
  151. pyxlpr/paddleocr.py +0 -776
  152. pyxlpr/ppocr/__init__.py +0 -15
  153. pyxlpr/ppocr/configs/rec/multi_language/generate_multi_language_configs.py +0 -226
  154. pyxlpr/ppocr/data/__init__.py +0 -135
  155. pyxlpr/ppocr/data/imaug/ColorJitter.py +0 -26
  156. pyxlpr/ppocr/data/imaug/__init__.py +0 -67
  157. pyxlpr/ppocr/data/imaug/copy_paste.py +0 -170
  158. pyxlpr/ppocr/data/imaug/east_process.py +0 -437
  159. pyxlpr/ppocr/data/imaug/gen_table_mask.py +0 -244
  160. pyxlpr/ppocr/data/imaug/iaa_augment.py +0 -114
  161. pyxlpr/ppocr/data/imaug/label_ops.py +0 -789
  162. pyxlpr/ppocr/data/imaug/make_border_map.py +0 -184
  163. pyxlpr/ppocr/data/imaug/make_pse_gt.py +0 -106
  164. pyxlpr/ppocr/data/imaug/make_shrink_map.py +0 -126
  165. pyxlpr/ppocr/data/imaug/operators.py +0 -433
  166. pyxlpr/ppocr/data/imaug/pg_process.py +0 -906
  167. pyxlpr/ppocr/data/imaug/randaugment.py +0 -143
  168. pyxlpr/ppocr/data/imaug/random_crop_data.py +0 -239
  169. pyxlpr/ppocr/data/imaug/rec_img_aug.py +0 -533
  170. pyxlpr/ppocr/data/imaug/sast_process.py +0 -777
  171. pyxlpr/ppocr/data/imaug/text_image_aug/__init__.py +0 -17
  172. pyxlpr/ppocr/data/imaug/text_image_aug/augment.py +0 -120
  173. pyxlpr/ppocr/data/imaug/text_image_aug/warp_mls.py +0 -168
  174. pyxlpr/ppocr/data/lmdb_dataset.py +0 -115
  175. pyxlpr/ppocr/data/pgnet_dataset.py +0 -104
  176. pyxlpr/ppocr/data/pubtab_dataset.py +0 -107
  177. pyxlpr/ppocr/data/simple_dataset.py +0 -372
  178. pyxlpr/ppocr/losses/__init__.py +0 -61
  179. pyxlpr/ppocr/losses/ace_loss.py +0 -52
  180. pyxlpr/ppocr/losses/basic_loss.py +0 -135
  181. pyxlpr/ppocr/losses/center_loss.py +0 -88
  182. pyxlpr/ppocr/losses/cls_loss.py +0 -30
  183. pyxlpr/ppocr/losses/combined_loss.py +0 -67
  184. pyxlpr/ppocr/losses/det_basic_loss.py +0 -208
  185. pyxlpr/ppocr/losses/det_db_loss.py +0 -80
  186. pyxlpr/ppocr/losses/det_east_loss.py +0 -63
  187. pyxlpr/ppocr/losses/det_pse_loss.py +0 -149
  188. pyxlpr/ppocr/losses/det_sast_loss.py +0 -121
  189. pyxlpr/ppocr/losses/distillation_loss.py +0 -272
  190. pyxlpr/ppocr/losses/e2e_pg_loss.py +0 -140
  191. pyxlpr/ppocr/losses/kie_sdmgr_loss.py +0 -113
  192. pyxlpr/ppocr/losses/rec_aster_loss.py +0 -99
  193. pyxlpr/ppocr/losses/rec_att_loss.py +0 -39
  194. pyxlpr/ppocr/losses/rec_ctc_loss.py +0 -44
  195. pyxlpr/ppocr/losses/rec_enhanced_ctc_loss.py +0 -70
  196. pyxlpr/ppocr/losses/rec_nrtr_loss.py +0 -30
  197. pyxlpr/ppocr/losses/rec_sar_loss.py +0 -28
  198. pyxlpr/ppocr/losses/rec_srn_loss.py +0 -47
  199. pyxlpr/ppocr/losses/table_att_loss.py +0 -109
  200. pyxlpr/ppocr/metrics/__init__.py +0 -44
  201. pyxlpr/ppocr/metrics/cls_metric.py +0 -45
  202. pyxlpr/ppocr/metrics/det_metric.py +0 -82
  203. pyxlpr/ppocr/metrics/distillation_metric.py +0 -73
  204. pyxlpr/ppocr/metrics/e2e_metric.py +0 -86
  205. pyxlpr/ppocr/metrics/eval_det_iou.py +0 -274
  206. pyxlpr/ppocr/metrics/kie_metric.py +0 -70
  207. pyxlpr/ppocr/metrics/rec_metric.py +0 -75
  208. pyxlpr/ppocr/metrics/table_metric.py +0 -50
  209. pyxlpr/ppocr/modeling/architectures/__init__.py +0 -32
  210. pyxlpr/ppocr/modeling/architectures/base_model.py +0 -88
  211. pyxlpr/ppocr/modeling/architectures/distillation_model.py +0 -60
  212. pyxlpr/ppocr/modeling/backbones/__init__.py +0 -54
  213. pyxlpr/ppocr/modeling/backbones/det_mobilenet_v3.py +0 -268
  214. pyxlpr/ppocr/modeling/backbones/det_resnet_vd.py +0 -246
  215. pyxlpr/ppocr/modeling/backbones/det_resnet_vd_sast.py +0 -285
  216. pyxlpr/ppocr/modeling/backbones/e2e_resnet_vd_pg.py +0 -265
  217. pyxlpr/ppocr/modeling/backbones/kie_unet_sdmgr.py +0 -186
  218. pyxlpr/ppocr/modeling/backbones/rec_mobilenet_v3.py +0 -138
  219. pyxlpr/ppocr/modeling/backbones/rec_mv1_enhance.py +0 -258
  220. pyxlpr/ppocr/modeling/backbones/rec_nrtr_mtb.py +0 -48
  221. pyxlpr/ppocr/modeling/backbones/rec_resnet_31.py +0 -210
  222. pyxlpr/ppocr/modeling/backbones/rec_resnet_aster.py +0 -143
  223. pyxlpr/ppocr/modeling/backbones/rec_resnet_fpn.py +0 -307
  224. pyxlpr/ppocr/modeling/backbones/rec_resnet_vd.py +0 -286
  225. pyxlpr/ppocr/modeling/heads/__init__.py +0 -54
  226. pyxlpr/ppocr/modeling/heads/cls_head.py +0 -52
  227. pyxlpr/ppocr/modeling/heads/det_db_head.py +0 -118
  228. pyxlpr/ppocr/modeling/heads/det_east_head.py +0 -121
  229. pyxlpr/ppocr/modeling/heads/det_pse_head.py +0 -37
  230. pyxlpr/ppocr/modeling/heads/det_sast_head.py +0 -128
  231. pyxlpr/ppocr/modeling/heads/e2e_pg_head.py +0 -253
  232. pyxlpr/ppocr/modeling/heads/kie_sdmgr_head.py +0 -206
  233. pyxlpr/ppocr/modeling/heads/multiheadAttention.py +0 -163
  234. pyxlpr/ppocr/modeling/heads/rec_aster_head.py +0 -393
  235. pyxlpr/ppocr/modeling/heads/rec_att_head.py +0 -202
  236. pyxlpr/ppocr/modeling/heads/rec_ctc_head.py +0 -88
  237. pyxlpr/ppocr/modeling/heads/rec_nrtr_head.py +0 -826
  238. pyxlpr/ppocr/modeling/heads/rec_sar_head.py +0 -402
  239. pyxlpr/ppocr/modeling/heads/rec_srn_head.py +0 -280
  240. pyxlpr/ppocr/modeling/heads/self_attention.py +0 -406
  241. pyxlpr/ppocr/modeling/heads/table_att_head.py +0 -246
  242. pyxlpr/ppocr/modeling/necks/__init__.py +0 -32
  243. pyxlpr/ppocr/modeling/necks/db_fpn.py +0 -111
  244. pyxlpr/ppocr/modeling/necks/east_fpn.py +0 -188
  245. pyxlpr/ppocr/modeling/necks/fpn.py +0 -138
  246. pyxlpr/ppocr/modeling/necks/pg_fpn.py +0 -314
  247. pyxlpr/ppocr/modeling/necks/rnn.py +0 -92
  248. pyxlpr/ppocr/modeling/necks/sast_fpn.py +0 -284
  249. pyxlpr/ppocr/modeling/necks/table_fpn.py +0 -110
  250. pyxlpr/ppocr/modeling/transforms/__init__.py +0 -28
  251. pyxlpr/ppocr/modeling/transforms/stn.py +0 -135
  252. pyxlpr/ppocr/modeling/transforms/tps.py +0 -308
  253. pyxlpr/ppocr/modeling/transforms/tps_spatial_transformer.py +0 -156
  254. pyxlpr/ppocr/optimizer/__init__.py +0 -61
  255. pyxlpr/ppocr/optimizer/learning_rate.py +0 -228
  256. pyxlpr/ppocr/optimizer/lr_scheduler.py +0 -49
  257. pyxlpr/ppocr/optimizer/optimizer.py +0 -160
  258. pyxlpr/ppocr/optimizer/regularizer.py +0 -52
  259. pyxlpr/ppocr/postprocess/__init__.py +0 -55
  260. pyxlpr/ppocr/postprocess/cls_postprocess.py +0 -33
  261. pyxlpr/ppocr/postprocess/db_postprocess.py +0 -234
  262. pyxlpr/ppocr/postprocess/east_postprocess.py +0 -143
  263. pyxlpr/ppocr/postprocess/locality_aware_nms.py +0 -200
  264. pyxlpr/ppocr/postprocess/pg_postprocess.py +0 -52
  265. pyxlpr/ppocr/postprocess/pse_postprocess/__init__.py +0 -15
  266. pyxlpr/ppocr/postprocess/pse_postprocess/pse/__init__.py +0 -29
  267. pyxlpr/ppocr/postprocess/pse_postprocess/pse/setup.py +0 -14
  268. pyxlpr/ppocr/postprocess/pse_postprocess/pse_postprocess.py +0 -118
  269. pyxlpr/ppocr/postprocess/rec_postprocess.py +0 -654
  270. pyxlpr/ppocr/postprocess/sast_postprocess.py +0 -355
  271. pyxlpr/ppocr/tools/__init__.py +0 -14
  272. pyxlpr/ppocr/tools/eval.py +0 -83
  273. pyxlpr/ppocr/tools/export_center.py +0 -77
  274. pyxlpr/ppocr/tools/export_model.py +0 -129
  275. pyxlpr/ppocr/tools/infer/predict_cls.py +0 -151
  276. pyxlpr/ppocr/tools/infer/predict_det.py +0 -300
  277. pyxlpr/ppocr/tools/infer/predict_e2e.py +0 -169
  278. pyxlpr/ppocr/tools/infer/predict_rec.py +0 -414
  279. pyxlpr/ppocr/tools/infer/predict_system.py +0 -204
  280. pyxlpr/ppocr/tools/infer/utility.py +0 -629
  281. pyxlpr/ppocr/tools/infer_cls.py +0 -83
  282. pyxlpr/ppocr/tools/infer_det.py +0 -134
  283. pyxlpr/ppocr/tools/infer_e2e.py +0 -122
  284. pyxlpr/ppocr/tools/infer_kie.py +0 -153
  285. pyxlpr/ppocr/tools/infer_rec.py +0 -146
  286. pyxlpr/ppocr/tools/infer_table.py +0 -107
  287. pyxlpr/ppocr/tools/program.py +0 -596
  288. pyxlpr/ppocr/tools/test_hubserving.py +0 -117
  289. pyxlpr/ppocr/tools/train.py +0 -163
  290. pyxlpr/ppocr/tools/xlprog.py +0 -748
  291. pyxlpr/ppocr/utils/EN_symbol_dict.txt +0 -94
  292. pyxlpr/ppocr/utils/__init__.py +0 -24
  293. pyxlpr/ppocr/utils/dict/ar_dict.txt +0 -117
  294. pyxlpr/ppocr/utils/dict/arabic_dict.txt +0 -162
  295. pyxlpr/ppocr/utils/dict/be_dict.txt +0 -145
  296. pyxlpr/ppocr/utils/dict/bg_dict.txt +0 -140
  297. pyxlpr/ppocr/utils/dict/chinese_cht_dict.txt +0 -8421
  298. pyxlpr/ppocr/utils/dict/cyrillic_dict.txt +0 -163
  299. pyxlpr/ppocr/utils/dict/devanagari_dict.txt +0 -167
  300. pyxlpr/ppocr/utils/dict/en_dict.txt +0 -63
  301. pyxlpr/ppocr/utils/dict/fa_dict.txt +0 -136
  302. pyxlpr/ppocr/utils/dict/french_dict.txt +0 -136
  303. pyxlpr/ppocr/utils/dict/german_dict.txt +0 -143
  304. pyxlpr/ppocr/utils/dict/hi_dict.txt +0 -162
  305. pyxlpr/ppocr/utils/dict/it_dict.txt +0 -118
  306. pyxlpr/ppocr/utils/dict/japan_dict.txt +0 -4399
  307. pyxlpr/ppocr/utils/dict/ka_dict.txt +0 -153
  308. pyxlpr/ppocr/utils/dict/korean_dict.txt +0 -3688
  309. pyxlpr/ppocr/utils/dict/latin_dict.txt +0 -185
  310. pyxlpr/ppocr/utils/dict/mr_dict.txt +0 -153
  311. pyxlpr/ppocr/utils/dict/ne_dict.txt +0 -153
  312. pyxlpr/ppocr/utils/dict/oc_dict.txt +0 -96
  313. pyxlpr/ppocr/utils/dict/pu_dict.txt +0 -130
  314. pyxlpr/ppocr/utils/dict/rs_dict.txt +0 -91
  315. pyxlpr/ppocr/utils/dict/rsc_dict.txt +0 -134
  316. pyxlpr/ppocr/utils/dict/ru_dict.txt +0 -125
  317. pyxlpr/ppocr/utils/dict/ta_dict.txt +0 -128
  318. pyxlpr/ppocr/utils/dict/table_dict.txt +0 -277
  319. pyxlpr/ppocr/utils/dict/table_structure_dict.txt +0 -2759
  320. pyxlpr/ppocr/utils/dict/te_dict.txt +0 -151
  321. pyxlpr/ppocr/utils/dict/ug_dict.txt +0 -114
  322. pyxlpr/ppocr/utils/dict/uk_dict.txt +0 -142
  323. pyxlpr/ppocr/utils/dict/ur_dict.txt +0 -137
  324. pyxlpr/ppocr/utils/dict/xi_dict.txt +0 -110
  325. pyxlpr/ppocr/utils/dict90.txt +0 -90
  326. pyxlpr/ppocr/utils/e2e_metric/Deteval.py +0 -574
  327. pyxlpr/ppocr/utils/e2e_metric/polygon_fast.py +0 -83
  328. pyxlpr/ppocr/utils/e2e_utils/extract_batchsize.py +0 -87
  329. pyxlpr/ppocr/utils/e2e_utils/extract_textpoint_fast.py +0 -457
  330. pyxlpr/ppocr/utils/e2e_utils/extract_textpoint_slow.py +0 -592
  331. pyxlpr/ppocr/utils/e2e_utils/pgnet_pp_utils.py +0 -162
  332. pyxlpr/ppocr/utils/e2e_utils/visual.py +0 -162
  333. pyxlpr/ppocr/utils/en_dict.txt +0 -95
  334. pyxlpr/ppocr/utils/gen_label.py +0 -81
  335. pyxlpr/ppocr/utils/ic15_dict.txt +0 -36
  336. pyxlpr/ppocr/utils/iou.py +0 -54
  337. pyxlpr/ppocr/utils/logging.py +0 -69
  338. pyxlpr/ppocr/utils/network.py +0 -84
  339. pyxlpr/ppocr/utils/ppocr_keys_v1.txt +0 -6623
  340. pyxlpr/ppocr/utils/profiler.py +0 -110
  341. pyxlpr/ppocr/utils/save_load.py +0 -150
  342. pyxlpr/ppocr/utils/stats.py +0 -72
  343. pyxlpr/ppocr/utils/utility.py +0 -80
  344. pyxlpr/ppstructure/__init__.py +0 -13
  345. pyxlpr/ppstructure/predict_system.py +0 -187
  346. pyxlpr/ppstructure/table/__init__.py +0 -13
  347. pyxlpr/ppstructure/table/eval_table.py +0 -72
  348. pyxlpr/ppstructure/table/matcher.py +0 -192
  349. pyxlpr/ppstructure/table/predict_structure.py +0 -136
  350. pyxlpr/ppstructure/table/predict_table.py +0 -221
  351. pyxlpr/ppstructure/table/table_metric/__init__.py +0 -16
  352. pyxlpr/ppstructure/table/table_metric/parallel.py +0 -51
  353. pyxlpr/ppstructure/table/table_metric/table_metric.py +0 -247
  354. pyxlpr/ppstructure/table/tablepyxl/__init__.py +0 -13
  355. pyxlpr/ppstructure/table/tablepyxl/style.py +0 -283
  356. pyxlpr/ppstructure/table/tablepyxl/tablepyxl.py +0 -118
  357. pyxlpr/ppstructure/utility.py +0 -71
  358. pyxlpr/xlai.py +0 -10
pyxllib/text/xmllib.py CHANGED
@@ -1,685 +1,747 @@
1
- #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
- # @Author : 陈坤泽
4
- # @Email : 877362867@qq.com
5
- # @Date : 2020/06/02 20:16
6
-
7
- """
8
- xml等网页结构方面的处理
9
- """
10
-
11
- from pyxllib.prog.pupil import check_install_package
12
-
13
- # 一个xpath解析库
14
- check_install_package('xpath_parser', 'xpath-parser')
15
-
16
- import collections
17
- from collections import Counter, defaultdict
18
- import re
19
- import os
20
-
21
- import requests
22
- import pandas as pd
23
- import bs4
24
- from bs4 import BeautifulSoup
25
- from humanfriendly import format_size
26
- from xpath_parser import XpathExpression
27
-
28
- from pyxllib.prog.newbie import round_int
29
- from pyxllib.prog.pupil import dprint, run_once, inject_members
30
- from pyxllib.prog.specialist import browser
31
- from pyxllib.algo.pupil import SearchBase
32
- from pyxllib.text.newbie import xldictstr
33
- from pyxllib.text.pupil import shorten, ensure_gbk, BookContents, strwidth, grp_chinese_char
34
- from pyxllib.file.specialist import File, Dir, get_etag
35
-
36
-
37
- class XlBs4Tag(bs4.element.Tag):
38
-
39
- @property
40
- def tag_name(self):
41
- """输入一个bs4的Tag或NavigableString,
42
- 返回tag.name或者'NavigableString'
43
- """
44
- if self.name:
45
- return self.name
46
- elif isinstance(self, bs4.element.NavigableString):
47
- return 'NavigableString'
48
- else:
49
- dprint(self) # 获取结点t名称失败
50
- return None
51
-
52
- def subtag_names(self):
53
- """ 列出结点的所有直接子结点(花括号后面跟的数字是连续出现次数)
54
- 例如body的: p{137},tbl,p{94},tbl,p{1640},sectPr
55
- """
56
-
57
- def counter(m):
58
- s1 = m.group(1)
59
- n = (m.end(0) - m.start(0)) // len(s1)
60
- s = s1[:-1] + '{' + str(n) + '}'
61
- if m.string[m.end(0) - 1] == '':
62
- s += ','
63
- return s
64
-
65
- if self.name and self.contents:
66
- s = ','.join([x.tag_name for x in self.contents]) + ','
67
- s = re.sub(r'([^,]+,)(\1)+', counter, s)
68
- else:
69
- s = ''
70
- if s and s[-1] == '':
71
- s = s[:-1]
72
- return s
73
-
74
- def treestruct_raw(self, **kwargs):
75
- """ 查看树形结构的raw版本
76
- 各参数含义详见dfs_base
77
- """
78
- # 1 先用dfs获得基本结果
79
- sb = SearchBase(self)
80
- s = sb.fmt_nodes(**kwargs)
81
- return s
82
-
83
- def treestruct_brief(self, linenum=True, prefix='- ', **kwargs):
84
- """ 查看树形结构的简洁版
85
- """
86
-
87
- class Search(SearchBase):
88
- def fmt_node(self, node, depth, *, prefix=prefix, show_node_type=False):
89
- if isinstance(node, bs4.element.ProcessingInstruction):
90
- s = 'ProcessingInstruction,' + str(node)
91
- elif isinstance(node, bs4.element.Tag):
92
- s = node.name + ',' + xldictstr(node.attrs, item_delimit=',')
93
- elif isinstance(node, bs4.element.NavigableString):
94
- s = shorten(str(node), 200)
95
- if not s.strip():
96
- s = '<??>'
97
- else:
98
- s = '遇到特殊类型,' + str(node)
99
- return (prefix * depth) + s
100
-
101
- search = Search(self)
102
- res = search.fmt_nodes(linenum=linenum, **kwargs)
103
- return res
104
-
105
- def treestruct_stat(self):
106
- """生成一个两个二维表的统计数据
107
- ls1, ls2 = treestruct_stat()
108
- ls1 结点规律表
109
- ls2属性规律表
110
- count_tagname、check_tag的功能基本都可以被这个函数代替
111
- """
112
-
113
- def text(t):
114
- """ 考虑到结果一般都是存储到excel,所以会把无法存成gbk的字符串删掉
115
- 另外控制了每个元素的长度上限
116
- """
117
- s = ensure_gbk(t)
118
- s = s[:100]
119
- return s
120
-
121
- def depth(t):
122
- """结点t的深度"""
123
- return len(tuple(t.parents))
124
-
125
- t = self.contents[0]
126
- # ls1 = [['element序号', '层级', '结构', '父结点', '当前结点', '属性值/字符串值', '直接子结点结构']]
127
- # ls2 = [['序号', 'element序号', '当前结点', '属性名', '属性值']] #
128
- ls1 = [] # 这个重点是分析结点规律
129
- ls2 = [] # 这个重点是分析属性规律
130
- i = 1
131
- while t:
132
- # 1 结点规律表
133
- d = depth(t)
134
- line = [i, d, '_' * d + str(d), t.parent.tag_name, t.tag_name,
135
- text(xldictstr(t.attrs) if t.name else t), # 结点存属性,字符串存值
136
- t.subtag_names()]
137
- ls1.append(line)
138
- # 2 属性规律表
139
- if t.name:
140
- k = len(ls2)
141
- for attr, value in t.attrs.items():
142
- ls2.append([k, i, t.tag_name, attr, value])
143
- k += 1
144
- # 下个结点
145
- t = t.next_element
146
- i += 1
147
- df1 = pd.DataFrame.from_records(ls1, columns=['element序号', '层级', '结构', '父结点', '当前结点', '属性值/字符串值', '直接子结点结构'])
148
- df2 = pd.DataFrame.from_records(ls2, columns=['序号', 'element序号', '当前结点', '属性名', '属性值'])
149
- return df1, df2
150
-
151
- def count_tagname(self):
152
- """统计每个标签出现的次数:
153
- 1 w:rpr 650
154
- 2 w:rfonts 650
155
- 3 w:szcs 618
156
- 4 w:r 565
157
- 5 None 532
158
- 6 w:t 531
159
- """
160
- ct = collections.Counter()
161
-
162
- def inner(node):
163
- try:
164
- ct[node.name] += 1
165
- for t in node.children:
166
- inner(t)
167
- except AttributeError:
168
- pass
169
-
170
- inner(self)
171
- return ct.most_common()
172
-
173
- def check_tag(self, tagname=None):
174
- """ 统计每个标签在不同层级出现的次数:
175
-
176
- :param tagname:
177
- None:统计全文出现的各种标签在不同层级出现次数
178
- 't'等值: tagname参数允许只检查特殊标签情况,此时会将所有tagname设为第0级
179
-
180
- TODO 检查一个标签内部是否有同名标签?
181
- """
182
- d = defaultdict()
183
-
184
- def add(name, depth):
185
- if name not in d:
186
- d[name] = defaultdict(int)
187
- d[name][depth] += 1
188
-
189
- def inner(node, depth):
190
- if isinstance(node, bs4.element.ProcessingInstruction):
191
- add('ProcessingInstruction', depth)
192
- elif isinstance(node, bs4.element.Tag):
193
- if node.name == tagname and depth:
194
- dprint(node, depth) # tagname里有同名子标签
195
- add(node.name, depth)
196
- for t in node.children:
197
- inner(t, depth + 1)
198
- elif isinstance(node, bs4.element.NavigableString):
199
- add('NavigableString', depth)
200
- else:
201
- add('其他特殊结点', depth)
202
-
203
- # 1 统计结点在每一层出现的次数
204
- if tagname:
205
- for t in self.find_all(tagname):
206
- inner(t, 0)
207
- else:
208
- inner(self, 0)
209
-
210
- # 2 总出现次数和?
211
-
212
- return d
213
-
214
- def check_namespace(self):
215
- """检查名称空间问题,会同时检查标签名和属性名:
216
- 1 cNvPr pic:cNvPr(579),wps:cNvPr(52),wpg:cNvPr(15)
217
- 2 spPr pic:spPr(579),wps:spPr(52)
218
- """
219
- # 1 获得所有名称
220
- # 因为是采用node的原始xml文本,所以能保证会取得带有名称空间的文本内容
221
- ct0 = Counter(re.findall(r'<([a-zA-Z:]+)', str(self)))
222
- ct = defaultdict(str)
223
- s = set()
224
- for key, value in ct0.items():
225
- k = re.sub(r'.*:', '', key)
226
- if k in ct:
227
- s.add(k)
228
- ct[k] += f',{key}({value})'
229
- else:
230
- ct[k] = f'{key}({value})'
231
-
232
- # 2 对有重复和无重复的元素划分存储
233
- ls1 = [] # 有重复的存储到ls1
234
- ls2 = [] # 没有重复的正常结果存储到ls2,可以不显示
235
- for k, v in ct.items():
236
- if k in s:
237
- ls1.append([k, v])
238
- else:
239
- ls2.append([k, v])
240
-
241
- # 3 显示有重复的情况
242
- # browser(ls1, filename='检查名称空间问题')
243
- return ls1
244
-
245
- def get_catalogue(self, *args, size=False, start_level=-1, **kwargs):
246
- """ 找到所有的h生成文本版的目录
247
-
248
- :param bool|int size: 布尔或者乘因子,表示是否展示文本,以及乘以倍率,比如双语阅读时,size可以缩放一半
249
-
250
- *args, **kwargs 参考 BookContents.format_str
251
-
252
- 注意这里算法跟css样式不太一样,避免这里能写代码,能做更细腻的操作
253
- """
254
- bc = BookContents()
255
- for h in self.find_all(re.compile(r'h\d')):
256
- if size:
257
- part_size = h.section_text_size(size, fmt=True)
258
- bc.add(int(h.name[1]), h.get_text().replace('\n', ' '), part_size)
259
- else:
260
- bc.add(int(h.name[1]), h.get_text().replace('\n', ' '))
261
-
262
- if 'page' not in kwargs:
263
- kwargs['page'] = size
264
-
265
- if bc.contents:
266
- return bc.format_str(*args, start_level=start_level, **kwargs)
267
- else:
268
- return ''
269
-
270
- def section_text_size(self, factor=1, fmt=False):
271
- """ 计算某节标题下的正文内容长度 """
272
- if not re.match(r'h\d+$', self.name):
273
- raise TypeError
274
-
275
- # 这应该是相对比较简便的计算每一节内容多长的算法~~
276
- part_size = 0
277
- for x in self.next_siblings:
278
- if x.name == self.name:
279
- break
280
- else:
281
- text = str(x) if isinstance(x, bs4.element.NavigableString) else x.get_text()
282
- part_size += strwidth(text)
283
- part_size = round_int(part_size * factor)
284
-
285
- if fmt:
286
- return format_size(part_size).replace(' ', '').replace('bytes', 'B')
287
- else:
288
- return part_size
289
-
290
- def head_add_size(self, factor=1):
291
- """ 标题增加每节内容大小标记
292
-
293
- :param factor: 乘因子,默认是1。但双语阅读等情况,内容会多拷贝一份,此时可以乘以0.5,显示正常原文的大小。
294
- """
295
- for h in self.find_all(re.compile(r'h\d')):
296
- part_size = h.section_text_size(factor, fmt=True)
297
- navi_str = list(h.strings)[-1].rstrip()
298
- navi_str.replace_with(str(navi_str) + '' + part_size)
299
-
300
- def head_add_number(self, start_level=-1, jump=True):
301
- """ 标题增加每节编号
302
- """
303
- bc = BookContents()
304
- heads = list(self.find_all(re.compile(r'h\d')))
305
- for h in heads:
306
- bc.add(int(h.name[1]), h.get_text().replace('\n', ' '))
307
-
308
- if not bc.contents:
309
- return
310
-
311
- nums = bc.format_numbers(start_level=start_level, jump=jump)
312
- for i, h in enumerate(heads):
313
- navi_strs = list(h.strings)
314
- if navi_strs:
315
- navi_str = navi_strs[0]
316
- if nums[i]:
317
- navi_str.replace_with(nums[i] + ' ' + str(navi_str))
318
- else:
319
- h.string = nums[i]
320
-
321
- def xltext(self):
322
- """ 自己特用的文本化方法
323
-
324
- 有些空格会丢掉,要用这句转回来
325
-
326
- 210924周五20:23,但后续实验又遭到了质疑,目前这功能虽然留着,但不建议使用
327
- """
328
- # return self.prettify(formatter=lambda s: s.replace(u'\xa0', '&nbsp;'))
329
- # \xa0好像是些特殊字符,删掉就行。。。 不对,也不是特殊字符~~
330
- # return self.prettify(formatter=lambda s: s.replace(u'\xa0', ''))
331
- # return self.prettify()
332
- return str(self)
333
-
334
- def browser(self):
335
- browser.html(self)
336
-
337
- @run_once('id,str')
338
- def get_nonempty_childrens(self, *args):
339
- """ 获得所有Tag类型的直接子结点 (偏定制,不是那么通用的接口)
340
-
341
- 会同时检查NavigableString类型,且必须是空白字符串,比如空格、\n之类
342
- """
343
-
344
- def check(x):
345
- if isinstance(x, bs4.element.Tag):
346
- return True
347
- elif isinstance(x, bs4.element.Comment):
348
- return False
349
- elif isinstance(x, bs4.element.NavigableString):
350
- assert not x.strip(), f'非空字符串值:{x}'
351
- return False
352
- else:
353
- raise ValueError(f'未见类型 {x}')
354
-
355
- ls = list(filter(check, self.children))
356
-
357
- if len(args):
358
- return ls[args[0]].get_nonempty_childrens(*args[1:])
359
- else:
360
- return ls
361
-
362
- def get_nonempty_children(self, *args):
363
- """ 输入args下标,指定获得某一个非空子结点 """
364
- if len(args):
365
- ls = self.get_nonempty_childrens(*args[:-1])
366
- return ls[args[-1]]
367
- else:
368
- return self
369
-
370
- def next_preorder_node(self, iter_child=True):
371
- """ 自己写的先序遍历
372
-
373
- 主要应用在xml、bs4相关遍历检索时,有时候遇到特殊结点
374
- 可能子结点不需要解析
375
- 或者整个cur_node和子结点已经被解析完了,不需要再按照通常的先序遍历继续进入子结点
376
- 此时可以 iter_child=False,进入下一个兄弟结点
377
- """
378
- # 传入的不一定是一个Tag结点~~
379
- if not isinstance(self, bs4.element.Tag):
380
- return None
381
-
382
- if iter_child and self.contents:
383
- return self.contents[0]
384
- else:
385
- cur_node = self
386
- while True:
387
- parent = cur_node.parent
388
- if parent is None:
389
- return None
390
- sibing = cur_node.find_next_sibling()
391
- if sibing:
392
- return sibing
393
- cur_node = parent
394
-
395
- def find_by_xpath(self, xpath):
396
- """ 使用xpath定位元素
397
-
398
- bs4官方没有自带,网上找到的很多也不中意。就自己根据需求简单定制一下。非完整版实现,但希望能支持常用的几个操作。
399
- 好在还是有现成的xpath解析库的,自己扩展实现也不会太难。
400
- """
401
- xp = XpathExpression(xpath)
402
-
403
- cur_tag = self
404
- for node in xp.nodes:
405
- if node.name == '*':
406
- name = None
407
- else:
408
- name = node.name
409
-
410
- # TODO 其他前缀功能: .. 父结点, / 根节点
411
- recursive = node.ignore_position
412
-
413
- attrs = {}
414
- limit = 1
415
- for a in node.attrs:
416
- if a[0] == '@':
417
- k, v = a.split('=')
418
- attrs[k[1:]] = v[1:-1]
419
- elif re.match(r'\d+$', a): # 索引下标
420
- limit = int(a)
421
- else:
422
- raise NotImplementedError
423
-
424
- # node.type没用上,应该有些需要用途的
425
-
426
- sub_tags = cur_tag.find_all(name, attrs, recursive, limit=limit)
427
- if sub_tags:
428
- cur_tag = sub_tags[-1]
429
- else: # 没找到
430
- return None
431
-
432
- return cur_tag
433
-
434
-
435
- inject_members(XlBs4Tag, bs4.element.Tag)
436
-
437
-
438
- def mathjax_html_head(s):
439
- """增加mathjax解析脚本"""
440
- head = r"""<!DOCTYPE html>
441
- <html>
442
- <head>
443
- <head><meta http-equiv=Content-Type content="text/html;charset=utf-8"></head>
444
- <script src="https://a.cdn.histudy.com/lib/config/mathjax_config-klxx.js?v=1.1"></script>
445
- <script type="text/javascript" async src="https://a.cdn.histudy.com/lib/mathjax/2.7.1/MathJax/MathJax.js?config=TeX-AMS-MML_SVG">
446
- MathJax.Hub.Config(MATHJAX_KLXX_CONFIG);
447
- </script>
448
- </head>
449
- <body>"""
450
- tail = '</body></html>'
451
- return head + s + tail
452
-
453
-
454
def html_bitran_template(htmlcontent):
    """Turn an HTML page into a bilingual (original + translatable copy) layout.

    Typical input is an HTML file exported from Word.  The trick relies on
    Chrome skipping elements marked ``class="notranslate"``: every text block
    is emitted twice, once tagged ``notranslate`` (stays in the source
    language) and once untouched (gets translated), so running Google
    Translate on the page yields a side-by-side reading view.

    Calling a translation API directly would be nicer, but that was postponed
    by the original author (googletrans was unreachable, other providers were
    not evaluated).

    :param htmlcontent: the full HTML text
    :return: the rewritten HTML text
    """
    from pyxllib.text.nestenv import NestEnv

    # 0 Clamp every negative margin-left to 0.  The fractional part is
    #   optional so that integer values such as "margin-left:-36pt" are
    #   handled as well as "margin-left:-5.4pt".
    htmlcontent = re.sub(r'margin-left:-\d+(\.\d+)?', 'margin-left:0', htmlcontent)

    # 1 Locate the spans to duplicate
    ne = NestEnv(htmlcontent)
    ne2 = ne.xmltag('p')
    for name in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'pre', 'ol', 'li'):
        ne2 += ne.xmltag(name, symmetry=True)

    # Specific to Python documentation pasted into Word (not necessarily
    # general): treat a bordered code <div> as one span so code is copied
    # block-wise rather than line by line.
    ne2 += ne.find2(re.compile("<div style=['\"]mso-element:para-border-div;.+?#AACC99"), '</div>')

    # 2 Rule applied to each located span
    def func(s):
        """Return the span twice: s1 (kept as original) followed by s2 (to translate)."""
        # 2.1 s1: same element with "notranslate" added to its classes
        first = next(BeautifulSoup(s, 'lxml').body.children)
        cls_ = first.get('class', None)
        first['class'] = (cls_ + ['notranslate']) if cls_ else 'notranslate'
        s1 = first.prettify()

        # 2.2 s2: the copy that will be translated (raw text by default)
        s2 = s
        second = next(BeautifulSoup(s, 'lxml').body.children)
        if re.match(r'h\d+$', second.name):
            second.name = 'p'  # drop heading formatting, use a plain paragraph
            s2 = second.prettify()
        elif second.name in ('div', 'pre'):
            s2 = ''  # code blocks read best untouched, so no translated copy
        # Elements without any text are not duplicated either
        if not second.get_text().strip():
            s2 = ''

        return s1 + '\n' + s2

    res = ne2.replace(func)

    return res
533
-
534
-
535
class MakeHtmlNavigation:
    """Add a hyperlinked navigation pane to an HTML page."""

    @classmethod
    def from_url(cls, url, **kwargs):
        """Download ``url``, then build the navigation view from its content.

        The value returned by ``get_etag(url)`` is used as the output title;
        the original author notes this does not need to be rigorous.
        """
        content = requests.get(url).content.decode('utf8')
        etag = get_etag(url)
        return cls.from_content(content, etag, **kwargs)

    @classmethod
    def from_file(cls, file, **kwargs):
        """Build the navigation view for a local HTML file.

        The file path without its extension is passed as the title, so the
        generated ``_content`` etc. files are meant to sit next to the input.
        """
        file = File(file)
        content = file.read()
        return cls.from_content(content, os.path.splitext(str(file))[0], **kwargs)

    @classmethod
    def from_content(cls, html_content, title='temphtml', *,
                     encoding=None, number=True, text_catalogue=True):
        """Generate the content, catalogue and index pages for ``html_content``.

        :param html_content: complete text of the original page
        :param title: base name of the generated files (TODO from the original
            author: derive it from head/title; currently a fixed default)
        :param encoding: encoding passed to every ``File.write`` call
        :param number: selects the stylesheet ``navigation{int(number)}.css``
            (automatic section numbering on or off)
        :param text_catalogue: whether to append the plain-text catalogue and
            the total text size to the navigation page
        :return: the value returned by ``File.write`` for the index page

        How it works: every ``<hN>`` element gets an ``<a name=...>`` anchor and
        the result is saved as the content page; a second page lists links to
        those anchors; a frameset index page shows both side by side.

        >> file = MakeHtmlNavigation.from_content(requests.get(url).content.decode('utf8'))
        >> browser(str(file))
        """
        from html import escape
        from humanfriendly import format_size

        # 1 Add anchors to the original html and save it as f2
        cnt = 0

        # The stylesheet could be generated locally; for now it is referenced
        # from github.
        refs = ['<html><head>',
                '<link rel=Stylesheet type="text/css" media=all '
                f'href="https://code4101.github.io/css/navigation{int(number)}.css">',
                '</head><body>']

        f2 = File(title + '_content', Dir.TEMP, suffix='.html')

        def func(m):
            nonlocal cnt
            cnt += 1
            name, content = m.group('name'), m.group('inner')
            # get_text() yields plain text, so it must be escaped again before
            # being embedded in the navigation markup.
            content = escape(BeautifulSoup(content, 'lxml').get_text(), quote=False)
            # Must be <h><a></a></h>, not <a><h></h></a>, otherwise the css
            # counter reset does not take effect.
            refs.append(f'<{name}><a href="{f2}#navigation{cnt}" target="showframe">{content}</a></{name}>')
            return f'<a name="navigation{cnt}"/>' + m.group()

        html_content = re.sub(r'<(?P<name>h\d+)(?:>|\s.*?>)(?P<body>\s*(?P<inner>.*?)\s*)</\1>',
                              func, html_content, flags=re.DOTALL)
        f2 = f2.write(html_content, encoding=encoding, if_exists='replace')

        # 2 Besides the links (already in refs), f1 carries reference info
        # 2.2 Plain-text catalogue
        bs = BeautifulSoup(html_content, 'lxml')
        text = bs.get_text()
        if text_catalogue:
            refs.append('<br/>【文本版的目录】')
            catalogue = bs.get_catalogue(indent='\t', start_level=-1, jump=True, size=True)
            refs.append(f'<pre>{escape(catalogue, quote=False)}</pre>')
            # Total text size
            n = strwidth(text)
            refs.append('<br/>【Total Bytes】' + format_size(n))

        # 2.3 Word frequencies: Chinese characters are removed, punctuation
        #     becomes a separator, the rest is split on whitespace
        #     (case-sensitive).
        text2 = re.sub(grp_chinese_char(), '', text)
        text2 = re.sub(r'[,\.,。\(\)();;??"]', ' ', text2)
        words = Counter(text2.split())
        msg = '\n'.join([(escape(w, quote=False) if c == 1 else f'{escape(w, quote=False)},{c}')
                         for w, c in words.most_common()])
        msg += f'<br/>共{len(words)}个词汇,用词数{sum(words.values())}。'
        refs.append(f'<br/>【词汇表】<pre>{msg}</pre>')

        # 2.5 Write f1
        refs.append('</body>\n</html>')
        f1 = File(title + '_catalogue', Dir.TEMP, suffix='.html').write('\n'.join(refs), encoding=encoding,
                                                                        if_exists='replace')

        # 3 Index page f0
        main_content = f"""<html>
<frameset cols="20%,80%">
<frame src="{f1}">
<frame src="{f2}" name="showframe">
</frameset></html>"""

        f0 = File(title + '_index', Dir.TEMP, suffix='.html').write(main_content, encoding=encoding,
                                                                    if_exists='replace')
        return f0
636
-
637
-
638
class HtmlParser:
    """Parsing scaffold for HTML documents whose tree layout is fixed.

    Subclasses define methods named ``parse``, ``parse_0``, ``parse_0_2`` ...;
    the digits are child indexes leading from the root to the node that the
    method receives.
    """

    def __init__(self, root):
        """Store the root node that ``run`` navigates from."""
        self.root = root

    @classmethod
    @run_once
    def get_parse_funcs(cls):
        """Return the names of all callable members matching ``parse(_<digits>)*``."""
        name_pattern = r'parse(_\d+)*$'
        return [attr for attr in dir(cls)
                if callable(getattr(cls, attr)) and re.match(name_pattern, attr)]

    def run(self):
        """Call every parse method with the node addressed by its name."""
        for method in self.get_parse_funcs():
            # 'parse_0_2'[5:] -> '_0_2' -> [0, 2]; plain 'parse' -> []
            suffix = method[5:]
            idxs = [int(piece) for piece in suffix.split('_') if piece]
            target = self.root.get_nonempty_children(*idxs)
            handler = getattr(self, method)
            handler(target)
669
-
670
-
671
def get_jinja_template(name, **kwargs):
    """Build a Jinja2 template from ``templates/<name>`` next to this module.

    :param name: file name inside the ``templates`` directory
    :param kwargs: forwarded to ``jinja2.Environment``
    :return: the template created by ``Environment.from_string``
    """
    from jinja2 import Environment
    from pyxllib.file.specialist import XlPath

    env = Environment(**kwargs)
    template_path = XlPath(__file__).parent / f'templates/{name}'
    source = template_path.read_text()
    return env.from_string(source)
677
-
678
-
679
def concat_htmlbody(ls):
    """Concatenate the ``<body>`` contents of several HTML documents.

    The first document serves as the template: its body content is replaced
    by the body contents of all documents joined with newlines, while its
    opening ``<body ...>`` tag (including attributes) is kept.

    :param ls: sequence of HTML texts
    :return: the merged HTML text, or ``''`` when ``ls`` is empty
    :raises ValueError: if a document has no ``<body>...</body>`` section
    """
    if not ls:
        return ''

    # Accept attributes on the opening tag (e.g. <body class="x">)
    body_pattern = re.compile(r'(<body\b[^>]*>)(.*?)</body>', flags=re.DOTALL)

    texts = []
    for i, page in enumerate(ls):
        m = body_pattern.search(page)
        if m is None:
            raise ValueError(f'document {i} has no <body>...</body> section')
        texts.append(m.group(2))
    merged = '\n'.join(texts)

    # A function replacement keeps backslashes in the content literal
    return body_pattern.sub(lambda m: m.group(1) + merged + '</body>', ls[0])
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ # @Author : 陈坤泽
4
+ # @Email : 877362867@qq.com
5
+ # @Date : 2020/06/02 20:16
6
+
7
+ """
8
+ xml等网页结构方面的处理
9
+ """
10
+
11
+ # from pyxllib.prog.pupil import check_install_package
12
+
13
+ # 一个xpath解析库
14
+ # check_install_package('xpath_parser', 'xpath-parser')
15
+
16
+ import collections
17
+ from collections import Counter, defaultdict
18
+ import re
19
+ import os
20
+
21
+ import requests
22
+ import pandas as pd
23
+ import bs4
24
+ from bs4 import BeautifulSoup
25
+ from humanfriendly import format_size
26
+ # from xpath_parser import XpathExpression
27
+
28
+ from pyxllib.prog.newbie import round_int
29
+ from pyxllib.prog.pupil import dprint, run_once, inject_members
30
+ from pyxllib.prog.specialist import browser
31
+ from pyxllib.algo.pupil import SearchBase
32
+ from pyxllib.text.newbie import xldictstr
33
+ from pyxllib.text.pupil import shorten, ensure_gbk, BookContents, strwidth, grp_chinese_char
34
+ from pyxllib.file.specialist import File, Dir, get_etag
35
+ from pyxllib.text.jinjalib import get_jinja_template
36
+
37
+
38
class XlBs4Tag(bs4.element.Tag):
    """Extra helper members for bs4 nodes.

    After the class definition the module passes this class to
    ``inject_members`` together with ``bs4.element.Tag`` and
    ``bs4.element.NavigableString``, so ``self`` may be either kind of node
    (presumably the members are grafted onto those classes).
    """

    @property
    def tag_name(self):
        """Return ``self.name``, or ``'NavigableString'`` for a string node.

        If neither applies the node is printed with ``dprint`` and ``None`` is
        returned.
        """
        if self.name:
            return self.name
        elif isinstance(self, bs4.element.NavigableString):
            return 'NavigableString'
        else:
            dprint(self)  # failed to determine the node name
            return None

    def subtag_names(self):
        """Summarise the direct children as a run-length encoded name list.

        Consecutive children with the same ``tag_name`` are collapsed into
        ``name{count}``, e.g. ``p{137},tbl,p{94},tbl,p{1640},sectPr``.
        Returns ``''`` for a node without a name or without children.
        """
        from itertools import groupby

        if not (self.name and self.contents):
            return ''

        # Grouping whole names (rather than regex-matching a joined string)
        # guarantees that a run can never start in the middle of a name,
        # e.g. children "tbl", "l" stay "tbl,l" instead of becoming "tbl{2}".
        parts = []
        for name, run in groupby(x.tag_name for x in self.contents):
            n = sum(1 for _ in run)
            parts.append(name if n == 1 else f'{name}{{{n}}}')
        return ','.join(parts)

    def treestruct_raw(self, **kwargs):
        """Return the raw tree dump produced by ``SearchBase.fmt_nodes``.

        ``kwargs`` are forwarded unchanged to ``fmt_nodes``.
        """
        sb = SearchBase(self)
        s = sb.fmt_nodes(**kwargs)
        return s

    def treestruct_brief(self, linenum=True, prefix='- ', **kwargs):
        """Return a compact tree dump, one formatted line per node.

        :param linenum: forwarded to ``fmt_nodes``
        :param prefix: string repeated ``depth`` times in front of each line
        :param kwargs: forwarded to ``fmt_nodes``
        """

        class Search(SearchBase):
            def fmt_node(self, node, depth, *, prefix=prefix, show_node_type=False):
                # ProcessingInstruction must be tested before NavigableString,
                # so the order of these branches matters.
                if isinstance(node, bs4.element.ProcessingInstruction):
                    s = 'ProcessingInstruction,' + str(node)
                elif isinstance(node, bs4.element.Tag):
                    s = node.name + ',' + xldictstr(node.attrs, item_delimit=',')
                elif isinstance(node, bs4.element.NavigableString):
                    s = shorten(str(node), 200)
                    if not s.strip():
                        s = '<??>'  # placeholder for whitespace-only text
                else:
                    s = '遇到特殊类型,' + str(node)
                return (prefix * depth) + s

        search = Search(self)
        res = search.fmt_nodes(linenum=linenum, **kwargs)
        return res

    def treestruct_stat(self):
        """Build two statistics tables about the tree.

        :return: ``(df1, df2)`` pandas DataFrames; ``df1`` has one row per
            visited element (node patterns), ``df2`` one row per attribute
            (attribute patterns).  Both are empty when the node has no
            children.

        The walk starts at the first child and follows ``next_element``.
        According to the original author this covers most of what
        ``count_tagname`` and ``check_tag`` provide.
        """

        def text(t):
            """Make a value storable (results usually go to Excel): pass it
            through ``ensure_gbk`` and cap the length at 100 characters."""
            s = ensure_gbk(t)
            s = s[:100]
            return s

        def depth(t):
            """Depth of node ``t`` (number of ancestors)."""
            return len(tuple(t.parents))

        ls1 = []  # node patterns
        ls2 = []  # attribute patterns
        t = self.contents[0] if self.contents else None
        i = 1
        # "is not None" rather than truthiness: an empty string node is falsy
        # but must not end the walk.
        while t is not None:
            # 1 node table
            d = depth(t)
            line = [i, d, '_' * d + str(d), t.parent.tag_name, t.tag_name,
                    text(xldictstr(t.attrs) if t.name else t),  # tags: attributes; strings: value
                    t.subtag_names()]
            ls1.append(line)
            # 2 attribute table
            if t.name:
                k = len(ls2)
                for attr, value in t.attrs.items():
                    ls2.append([k, i, t.tag_name, attr, value])
                    k += 1
            # next node
            t = t.next_element
            i += 1
        df1 = pd.DataFrame.from_records(ls1,
                                        columns=['element序号', '层级', '结构', '父结点', '当前结点', '属性值/字符串值',
                                                 '直接子结点结构'])
        df2 = pd.DataFrame.from_records(ls2, columns=['序号', 'element序号', '当前结点', '属性名', '属性值'])
        return df1, df2

    def count_tagname(self):
        """Count how often each node name occurs in this subtree.

        :return: ``Counter.most_common()`` list of ``(name, count)`` pairs, e.g.
            ``[('w:rpr', 650), ('w:rfonts', 650), ..., (None, 532), ...]``
        """
        ct = collections.Counter()

        def inner(node):
            try:
                ct[node.name] += 1
                for t in node.children:
                    inner(t)
            except AttributeError:
                # Nodes lacking .name/.children end the recursion here
                pass

        inner(self)
        return ct.most_common()

    def check_tag(self, tagname=None):
        """Count how often each node type occurs at each depth.

        :param tagname:
            ``None``: analyse the whole subtree starting at depth 0;
            a tag name such as ``'t'``: analyse only the tags found by
            ``find_all(tagname)``, each taken as depth 0.  A same-named tag
            nested inside one of them is reported through ``dprint``.
        :return: mapping ``name -> {depth: count}``
        """
        # Deliberately factory-less: looking up an unknown name in the
        # returned mapping still raises KeyError.
        d = defaultdict()

        def add(name, depth):
            if name not in d:
                d[name] = defaultdict(int)
            d[name][depth] += 1

        def inner(node, depth):
            if isinstance(node, bs4.element.ProcessingInstruction):
                add('ProcessingInstruction', depth)
            elif isinstance(node, bs4.element.Tag):
                if node.name == tagname and depth:
                    dprint(node, depth)  # tagname contains a same-named descendant
                add(node.name, depth)
                for t in node.children:
                    inner(t, depth + 1)
            elif isinstance(node, bs4.element.NavigableString):
                add('NavigableString', depth)
            else:
                add('其他特殊结点', depth)

        if tagname:
            for t in self.find_all(tagname):
                inner(t, 0)
        else:
            inner(self, 0)

        return d

    def check_namespace(self):
        """Report local tag names that occur under more than one prefix.

        Works on the serialised markup (``str(self)``) so that namespace
        prefixes are visible.

        :return: list of ``[local_name, 'prefixed(count),prefixed(count)...']``,
            e.g. ``['cNvPr', 'pic:cNvPr(579),wps:cNvPr(52),wpg:cNvPr(15)']``
        """
        # Names may contain digits, '_', '.' and '-' (e.g. "wp14:sizeRelH",
        # "h1"); matching letters only would truncate them.
        ct0 = Counter(re.findall(r'<([A-Za-z_][\w.-]*(?::[\w.-]+)*)', str(self)))

        ct = {}
        duplicated = set()
        for key, value in ct0.items():
            k = re.sub(r'.*:', '', key)  # strip the namespace prefix
            if k in ct:
                duplicated.add(k)
                ct[k] += f',{key}({value})'
            else:
                ct[k] = f'{key}({value})'

        return [[k, v] for k, v in ct.items() if k in duplicated]

    def get_catalogue(self, *args, size=False, start_level=-1, **kwargs):
        """Build a plain-text table of contents from the heading tags.

        :param bool|int size: falsy: titles only; otherwise each entry also
            carries its section size, multiplied by this factor (e.g. 0.5 for
            bilingual pages where the text is duplicated)
        :param start_level: forwarded to ``BookContents.format_str``
        :param args, kwargs: forwarded to ``BookContents.format_str``; ``page``
            defaults to ``size`` when not given
        :return: the formatted catalogue, or ``''`` when there are no headings
        """
        bc = BookContents()
        for h in self.find_all(re.compile(r'h\d')):
            title = h.get_text().replace('\n', ' ').strip()
            if size:
                part_size = h.section_text_size(size, fmt=True)
                bc.add(int(h.name[1]), title, part_size)
            else:
                bc.add(int(h.name[1]), title)

        if 'page' not in kwargs:
            kwargs['page'] = size

        if bc.contents:
            return bc.format_str(*args, start_level=start_level, **kwargs)
        else:
            return ''

    def section_text_size(self, factor=1, fmt=False):
        """Measure the text that follows this heading.

        Sums ``strwidth`` over the following siblings up to the next sibling
        with the same tag name, multiplies by ``factor`` and rounds.

        :param factor: multiplier applied before rounding
        :param fmt: return a compact human-readable string instead of an int
        :raises TypeError: if this node is not an ``hN`` tag
        """
        if not re.match(r'h\d+$', self.name):
            raise TypeError(f'section_text_size requires a heading tag, got {self.name!r}')

        part_size = 0
        for x in self.next_siblings:
            if x.name == self.name:
                break
            text = str(x) if isinstance(x, bs4.element.NavigableString) else x.get_text()
            part_size += strwidth(text)
        part_size = round_int(part_size * factor)

        if fmt:
            return format_size(part_size).replace(' ', '').replace('bytes', 'B')
        else:
            return part_size

    def head_add_size(self, factor=1):
        """Append the section size to the text of every heading.

        :param factor: multiplier for the size; use 0.5 when the content has
            been duplicated (e.g. bilingual reading) to show the original size
        """
        for h in self.find_all(re.compile(r'h\d')):
            part_size = h.section_text_size(factor, fmt=True)
            strings = list(h.strings)
            if not strings:
                # Heading without any text: the size becomes its only content
                h.string = part_size
                continue
            last = strings[-1]
            # Strip on the plain text, then replace the node itself; calling
            # .rstrip() on the node would yield a str without replace_with().
            last.replace_with(str(last).rstrip() + ',' + part_size)

    def head_add_number(self, start_level=-1, jump=True):
        """Prefix every heading with its section number.

        ``start_level`` and ``jump`` are forwarded to
        ``BookContents.format_numbers``.
        """
        bc = BookContents()
        heads = list(self.find_all(re.compile(r'h\d')))
        for h in heads:
            bc.add(int(h.name[1]), h.get_text().replace('\n', ' '))

        if not bc.contents:
            return

        nums = bc.format_numbers(start_level=start_level, jump=jump)
        for i, h in enumerate(heads):
            navi_strs = list(h.strings)
            if navi_strs:
                navi_str = navi_strs[0]
                if nums[i]:
                    navi_str.replace_with(nums[i] + ' ' + str(navi_str))
            else:
                # Heading without text: the number becomes its content
                h.string = nums[i]

    def xltext(self):
        """Return ``str(self)``.

        Earlier variants went through ``prettify`` with special handling of
        ``\\xa0``; those were abandoned and, per the original author's note
        (2021-09-24), this method is kept but not recommended.
        """
        return str(self)

    def browser(self):
        """Show this node via the module-level ``browser.html`` helper.

        Inside the body ``browser`` resolves to the imported module-level
        object, not to this method.
        """
        browser.html(self)

    @run_once('id,str')
    def get_nonempty_childrens(self, *args):
        """Return the direct children that are tags (a rather bespoke API).

        Comments are skipped; string children must be whitespace-only,
        otherwise the ``assert`` fails.

        :param args: optional child indexes; when given, descend into
            ``children[args[0]]`` and repeat with the remaining indexes
        :raises ValueError: for a child of an unexpected type
        """

        def check(x):
            if isinstance(x, bs4.element.Tag):
                return True
            elif isinstance(x, bs4.element.Comment):
                return False
            elif isinstance(x, bs4.element.NavigableString):
                assert not x.strip(), f'非空字符串值:{x}'
                return False
            else:
                raise ValueError(f'未见类型 {x}')

        ls = list(filter(check, self.children))

        if len(args):
            return ls[args[0]].get_nonempty_childrens(*args[1:])
        else:
            return ls

    def get_nonempty_children(self, *args):
        """Return the single tag child addressed by the index path ``args``.

        With no indexes the node itself is returned.
        """
        if len(args):
            ls = self.get_nonempty_childrens(*args[:-1])
            return ls[args[-1]]
        else:
            return self

    def next_preorder_node(self, iter_child=True):
        """Return the next node of a hand-written pre-order traversal.

        :param iter_child: ``True``: step into the first child when there is
            one; ``False``: skip this node's subtree (useful when it needs no
            parsing or has been handled already) and go to the next sibling
        :return: the next node, or ``None`` at the end of the tree or when
            ``self`` is not a Tag
        """
        # The receiver is not necessarily a Tag
        if not isinstance(self, bs4.element.Tag):
            return None

        if iter_child and self.contents:
            return self.contents[0]

        # Climb until some ancestor-or-self has a following sibling
        cur_node = self
        while True:
            parent = cur_node.parent
            if parent is None:
                return None
            sibling = cur_node.find_next_sibling()
            if sibling:
                return sibling
            cur_node = parent

    def find_by_xpath(self, xpath):
        """Locate an element with a (partial) xpath implementation.

        Supported per step: a tag name or ``*``, ``@attr='value'`` predicates
        and a numeric position.  Other predicate forms raise
        ``NotImplementedError``.

        :return: the matching tag, or ``None`` if some step finds nothing
        """
        from xpath_parser import XpathExpression

        xp = XpathExpression(xpath)

        cur_tag = self
        for node in xp.nodes:
            name = None if node.name == '*' else node.name

            # TODO other prefixes: '..' parent, '/' root
            recursive = node.ignore_position

            attrs = {}
            limit = 1
            for a in node.attrs:
                if a[0] == '@':
                    # Split once only: the value itself may contain '='
                    k, v = a.split('=', 1)
                    attrs[k[1:]] = v[1:-1]  # drop '@' and the surrounding quotes
                elif re.match(r'\d+$', a):  # position index
                    limit = int(a)
                else:
                    raise NotImplementedError

            # Fetch the first `limit` matches and keep the last one
            sub_tags = cur_tag.find_all(name, attrs, recursive, limit=limit)
            if sub_tags:
                cur_tag = sub_tags[-1]
            else:  # not found
                return None

        return cur_tag

    def __修改功能(self):
        """Placeholder separating the modifying methods below from the rest."""
        pass

    @classmethod
    def _to_node(cls, html):
        """Convert a string, a document or a node into a single node.

        Strings are parsed with lxml and the first child of ``<body>`` is
        returned; objects containing a ``body`` yield its first child; anything
        else is returned unchanged.
        """
        if isinstance(html, str):
            new_node = next(BeautifulSoup(html, 'lxml').body.children)
        elif html.find('body'):
            new_node = next(html.body.children)
        else:
            new_node = html
        return new_node

    @classmethod
    def _to_nodes(cls, html):
        """Convert a string, a document or a node into a list of nodes.

        Returns ``[]`` for markup that parses to a document without ``<body>``
        (e.g. an empty string).
        """
        if isinstance(html, str):
            body = BeautifulSoup(html, 'lxml').body
            new_nodes = list(body.children) if body is not None else []
        elif html.find('body'):
            new_nodes = list(html.body.children)
        else:
            new_nodes = [html]
        return new_nodes

    def replace_html_with(self, html):
        """Replace this node with the node(s) described by ``html``."""
        nodes = self._to_nodes(html)  # may be several nodes
        if not nodes:
            return
        self.replace_with(nodes[0])

        cur = nodes[0]
        for node in nodes[1:]:
            cur.insert_after(node)
            cur = node

    def insert_html_before(self, html):
        """Insert the node(s) described by ``html`` before this node, in order."""
        nodes = self._to_nodes(html)
        if not nodes:
            return
        self.insert_before(nodes[0])

        cur = nodes[0]
        for node in nodes[1:]:
            cur.insert_after(node)
            cur = node

    def insert_html_after(self, html):
        """Insert the node(s) described by ``html`` after this node, in order."""
        nodes = self._to_nodes(html)
        if not nodes:
            return

        cur = self
        for node in nodes:
            cur.insert_after(node)
            cur = node

    def append_html(self, html):
        """Append the node(s) described by ``html`` as children (extends ``append``)."""
        nodes = self._to_nodes(html)
        for node in nodes:
            self.append(node)
501
+
502
+
503
# Hand the XlBs4Tag helpers to bs4's own node classes.  Including
# NavigableString is inelegant but deliberate: it lets the special string
# nodes support members such as ``tag_name`` too.
for _bs4_class in (bs4.element.Tag, bs4.element.NavigableString):
    inject_members(XlBs4Tag, _bs4_class)
del _bs4_class
506
+
507
+
508
def mathjax_html_head(s):
    """Wrap an HTML body fragment in a full page that loads MathJax.

    :param s: HTML markup to place between ``<body>`` and ``</body>``
    :return: the complete HTML document as a string

    The statement inside the MathJax ``<script>`` element is kept on purpose:
    it passes ``MATHJAX_KLXX_CONFIG`` (defined by the first script) to
    ``MathJax.Hub.Config``.
    """
    # The page used to open <head> twice (a nested <head> around the <meta>);
    # the header below is a single, well-formed <head> element.
    head = r"""<!DOCTYPE html>
<html>
<head>
<meta http-equiv=Content-Type content="text/html;charset=utf-8">
<script src="https://a.cdn.histudy.com/lib/config/mathjax_config-klxx.js?v=1.1"></script>
<script type="text/javascript" async src="https://a.cdn.histudy.com/lib/mathjax/2.7.1/MathJax/MathJax.js?config=TeX-AMS-MML_SVG">
MathJax.Hub.Config(MATHJAX_KLXX_CONFIG);
</script>
</head>
<body>"""
    tail = '</body></html>'
    return head + s + tail
522
+
523
+
524
def html_bitran_template(htmlcontent):
    """Turn an HTML page into a bilingual (original + translatable copy) layout.

    Typical input is an HTML file exported from Word.  The trick relies on
    Chrome skipping elements marked ``class="notranslate"``: every text block
    is emitted twice, once tagged ``notranslate`` (stays in the source
    language) and once untouched (gets translated), so running Google
    Translate on the page yields a side-by-side reading view.

    Calling a translation API directly would be nicer, but that was postponed
    by the original author (googletrans was unreachable, other providers were
    not evaluated).

    :param htmlcontent: the full HTML text
    :return: the rewritten HTML text
    """
    from pyxllib.text.nestenv import NestEnv

    # 0 Clamp every negative margin-left to 0.  The fractional part is
    #   optional so that integer values such as "margin-left:-36pt" are
    #   handled as well as "margin-left:-5.4pt".
    htmlcontent = re.sub(r'margin-left:-\d+(\.\d+)?', 'margin-left:0', htmlcontent)

    # 1 Locate the spans to duplicate
    ne = NestEnv(htmlcontent)
    ne2 = ne.xmltag('p')
    for name in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'pre', 'ol', 'li'):
        ne2 += ne.xmltag(name, symmetry=True)

    # Specific to Python documentation pasted into Word (not necessarily
    # general): treat a bordered code <div> as one span so code is copied
    # block-wise rather than line by line.
    ne2 += ne.find2(re.compile("<div style=['\"]mso-element:para-border-div;.+?#AACC99"), '</div>')

    # 2 Rule applied to each located span
    def func(s):
        """Return the span twice: s1 (kept as original) followed by s2 (to translate)."""
        # 2.1 s1: same element with "notranslate" added to its classes
        first = next(BeautifulSoup(s, 'lxml').body.children)
        cls_ = first.get('class', None)
        first['class'] = (cls_ + ['notranslate']) if cls_ else 'notranslate'
        s1 = first.prettify()

        # 2.2 s2: the copy that will be translated (raw text by default)
        s2 = s
        second = next(BeautifulSoup(s, 'lxml').body.children)
        if re.match(r'h\d+$', second.name):
            second.name = 'p'  # drop heading formatting, use a plain paragraph
            s2 = second.prettify()
        elif second.name in ('div', 'pre'):
            s2 = ''  # code blocks read best untouched, so no translated copy
        # Elements without any text are not duplicated either
        if not second.get_text().strip():
            s2 = ''

        return s1 + '\n' + s2

    res = ne2.replace(func)

    return res
603
+
604
+
605
class MakeHtmlNavigation:
    """Add a hyperlinked navigation pane to an HTML page."""

    @classmethod
    def from_url(cls, url, **kwargs):
        """Download ``url``, then build the navigation view from its content.

        The value returned by ``get_etag(url)`` is used as the output title;
        the original author notes this does not need to be rigorous.
        """
        content = requests.get(url).content.decode('utf8')
        etag = get_etag(url)
        return cls.from_content(content, etag, **kwargs)

    @classmethod
    def from_file(cls, file, **kwargs):
        """Build the navigation view for a local HTML file.

        The file path without its extension is passed as the title, so the
        generated ``_content`` etc. files are meant to sit next to the input.
        """
        file = File(file)
        content = file.read()
        return cls.from_content(content, os.path.splitext(str(file))[0], **kwargs)

    @classmethod
    def from_content(cls, html_content, title='temphtml', *,
                     encoding=None, number=True, text_catalogue=True):
        """Generate the content, catalogue and index pages for ``html_content``.

        :param html_content: complete text of the original page
        :param title: base name of the generated files (TODO from the original
            author: derive it from head/title; currently a fixed default)
        :param encoding: encoding passed to every ``File.write`` call
        :param number: selects the stylesheet ``navigation{int(number)}.css``
            (automatic section numbering on or off)
        :param text_catalogue: whether to append the plain-text catalogue and
            the total text size to the navigation page
        :return: the value returned by ``File.write`` for the index page

        How it works: every ``<hN>`` element gets an ``<a name=...>`` anchor and
        the result is saved as the content page; a second page lists links to
        those anchors; a frameset index page shows both side by side.

        >> file = MakeHtmlNavigation.from_content(requests.get(url).content.decode('utf8'))
        >> browser(str(file))
        """
        from html import escape
        from humanfriendly import format_size

        # 1 Add anchors to the original html and save it as f2
        cnt = 0

        # The stylesheet could be generated locally; for now it is referenced
        # from github.
        refs = ['<html><head>',
                '<link rel=Stylesheet type="text/css" media=all '
                f'href="https://code4101.github.io/css/navigation{int(number)}.css">',
                '</head><body>']

        f2 = File(title + '_content', Dir.TEMP, suffix='.html')

        def func(m):
            nonlocal cnt
            cnt += 1
            name, content = m.group('name'), m.group('inner')
            # get_text() yields plain text, so it must be escaped again before
            # being embedded in the navigation markup.
            content = escape(BeautifulSoup(content, 'lxml').get_text(), quote=False)
            # Must be <h><a></a></h>, not <a><h></h></a>, otherwise the css
            # counter reset does not take effect.
            refs.append(f'<{name}><a href="{f2}#navigation{cnt}" target="showframe">{content}</a></{name}>')
            return f'<a name="navigation{cnt}"/>' + m.group()

        html_content = re.sub(r'<(?P<name>h\d+)(?:>|\s.*?>)(?P<body>\s*(?P<inner>.*?)\s*)</\1>',
                              func, html_content, flags=re.DOTALL)
        f2 = f2.write(html_content, encoding=encoding, if_exists='replace')

        # 2 Besides the links (already in refs), f1 carries reference info
        # 2.2 Plain-text catalogue
        bs = BeautifulSoup(html_content, 'lxml')
        text = bs.get_text()
        if text_catalogue:
            refs.append('<br/>【文本版的目录】')
            catalogue = bs.get_catalogue(indent='\t', start_level=-1, jump=True, size=True)
            refs.append(f'<pre>{escape(catalogue, quote=False)}</pre>')
            # Total text size
            n = strwidth(text)
            refs.append('<br/>【Total Bytes】' + format_size(n))

        # 2.3 Word frequencies: Chinese characters are removed, punctuation
        #     becomes a separator, the rest is split on whitespace
        #     (case-sensitive).
        text2 = re.sub(grp_chinese_char(), '', text)
        text2 = re.sub(r'[,\.,。\(\)();;??"]', ' ', text2)
        words = Counter(text2.split())
        msg = '\n'.join([(escape(w, quote=False) if c == 1 else f'{escape(w, quote=False)},{c}')
                         for w, c in words.most_common()])
        msg += f'<br/>共{len(words)}个词汇,用词数{sum(words.values())}。'
        refs.append(f'<br/>【词汇表】<pre>{msg}</pre>')

        # 2.5 Write f1
        refs.append('</body>\n</html>')
        f1 = File(title + '_catalogue', Dir.TEMP, suffix='.html').write('\n'.join(refs), encoding=encoding,
                                                                        if_exists='replace')

        # 3 Index page f0
        main_content = f"""<html>
<frameset cols="20%,80%">
<frame src="{f1}">
<frame src="{f2}" name="showframe">
</frameset></html>"""

        f0 = File(title + '_index', Dir.TEMP, suffix='.html').write(main_content, encoding=encoding,
                                                                    if_exists='replace')
        return f0
706
+
707
+
708
class HtmlParser:
    """Parsing scaffold for HTML documents whose tree layout is fixed.

    Subclasses define methods named ``parse``, ``parse_0``, ``parse_0_2`` ...;
    the digits are child indexes leading from the root to the node that the
    method receives.
    """

    def __init__(self, root):
        """Store the root node that ``run`` navigates from."""
        self.root = root

    @classmethod
    @run_once
    def get_parse_funcs(cls):
        """Return the names of all callable members matching ``parse(_<digits>)*``."""
        name_pattern = r'parse(_\d+)*$'
        return [attr for attr in dir(cls)
                if callable(getattr(cls, attr)) and re.match(name_pattern, attr)]

    def run(self):
        """Call every parse method with the node addressed by its name."""
        for method in self.get_parse_funcs():
            # 'parse_0_2'[5:] -> '_0_2' -> [0, 2]; plain 'parse' -> []
            suffix = method[5:]
            idxs = [int(piece) for piece in suffix.split('_') if piece]
            target = self.root.get_nonempty_children(*idxs)
            handler = getattr(self, method)
            handler(target)
739
+
740
+
741
def concat_htmlbody(ls):
    """Concatenate the ``<body>`` contents of several HTML documents.

    The first document serves as the template: its body content is replaced
    by the body contents of all documents joined with newlines, while its
    opening ``<body ...>`` tag (including attributes) is kept.

    :param ls: sequence of HTML texts
    :return: the merged HTML text, or ``''`` when ``ls`` is empty
    :raises ValueError: if a document has no ``<body>...</body>`` section
    """
    if not ls:
        return ''

    # Accept attributes on the opening tag (e.g. <body class="x">)
    body_pattern = re.compile(r'(<body\b[^>]*>)(.*?)</body>', flags=re.DOTALL)

    texts = []
    for i, page in enumerate(ls):
        m = body_pattern.search(page)
        if m is None:
            raise ValueError(f'document {i} has no <body>...</body> section')
        texts.append(m.group(2))
    merged = '\n'.join(texts)

    # A function replacement keeps backslashes in the content literal
    return body_pattern.sub(lambda m: m.group(1) + merged + '</body>', ls[0])