pyxllib 0.3.96__py3-none-any.whl → 0.3.200__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (358)
  1. pyxllib/__init__.py +21 -21
  2. pyxllib/algo/__init__.py +8 -8
  3. pyxllib/algo/disjoint.py +54 -54
  4. pyxllib/algo/geo.py +541 -529
  5. pyxllib/algo/intervals.py +964 -964
  6. pyxllib/algo/matcher.py +389 -311
  7. pyxllib/algo/newbie.py +166 -166
  8. pyxllib/algo/pupil.py +629 -461
  9. pyxllib/algo/shapelylib.py +67 -67
  10. pyxllib/algo/specialist.py +241 -240
  11. pyxllib/algo/stat.py +494 -458
  12. pyxllib/algo/treelib.py +149 -149
  13. pyxllib/algo/unitlib.py +66 -66
  14. {pyxlpr → pyxllib/autogui}/__init__.py +5 -5
  15. pyxllib/autogui/activewin.py +246 -0
  16. pyxllib/autogui/all.py +9 -0
  17. pyxllib/{ext/autogui → autogui}/autogui.py +852 -823
  18. pyxllib/autogui/uiautolib.py +362 -0
  19. pyxllib/{ext/autogui → autogui}/virtualkey.py +102 -102
  20. pyxllib/autogui/wechat.py +827 -0
  21. pyxllib/autogui/wechat_msg.py +421 -0
  22. pyxllib/autogui/wxautolib.py +84 -0
  23. pyxllib/cv/__init__.py +5 -5
  24. pyxllib/cv/expert.py +267 -267
  25. pyxllib/cv/imfile.py +159 -159
  26. pyxllib/cv/imhash.py +39 -39
  27. pyxllib/cv/pupil.py +9 -9
  28. pyxllib/cv/rgbfmt.py +1525 -1525
  29. pyxllib/cv/slidercaptcha.py +137 -0
  30. pyxllib/cv/trackbartools.py +251 -251
  31. pyxllib/cv/xlcvlib.py +1040 -1040
  32. pyxllib/cv/xlpillib.py +423 -423
  33. pyxllib/data/echarts.py +240 -129
  34. pyxllib/data/jsonlib.py +89 -0
  35. pyxllib/data/oss.py +72 -72
  36. pyxllib/data/pglib.py +1127 -643
  37. pyxllib/data/sqlite.py +568 -341
  38. pyxllib/data/sqllib.py +297 -297
  39. pyxllib/ext/JLineViewer.py +505 -492
  40. pyxllib/ext/__init__.py +6 -6
  41. pyxllib/ext/demolib.py +246 -246
  42. pyxllib/ext/drissionlib.py +277 -0
  43. pyxllib/ext/kq5034lib.py +12 -1606
  44. pyxllib/ext/old.py +663 -663
  45. pyxllib/ext/qt.py +449 -449
  46. pyxllib/ext/robustprocfile.py +497 -0
  47. pyxllib/ext/seleniumlib.py +76 -76
  48. pyxllib/ext/tk.py +173 -173
  49. pyxllib/ext/unixlib.py +827 -826
  50. pyxllib/ext/utools.py +351 -338
  51. pyxllib/ext/webhook.py +124 -101
  52. pyxllib/ext/win32lib.py +40 -40
  53. pyxllib/ext/wjxlib.py +88 -0
  54. pyxllib/ext/wpsapi.py +124 -0
  55. pyxllib/ext/xlwork.py +9 -0
  56. pyxllib/ext/yuquelib.py +1105 -173
  57. pyxllib/file/__init__.py +17 -17
  58. pyxllib/file/docxlib.py +761 -761
  59. pyxllib/file/gitlib.py +309 -309
  60. pyxllib/file/libreoffice.py +165 -0
  61. pyxllib/file/movielib.py +148 -139
  62. pyxllib/file/newbie.py +10 -10
  63. pyxllib/file/onenotelib.py +1469 -1469
  64. pyxllib/file/packlib/__init__.py +330 -293
  65. pyxllib/file/packlib/zipfile.py +2441 -2441
  66. pyxllib/file/pdflib.py +426 -426
  67. pyxllib/file/pupil.py +185 -185
  68. pyxllib/file/specialist/__init__.py +685 -685
  69. pyxllib/file/specialist/dirlib.py +799 -799
  70. pyxllib/file/specialist/download.py +193 -186
  71. pyxllib/file/specialist/filelib.py +2829 -2618
  72. pyxllib/file/xlsxlib.py +3131 -2976
  73. pyxllib/file/xlsyncfile.py +341 -0
  74. pyxllib/prog/__init__.py +5 -5
  75. pyxllib/prog/cachetools.py +64 -0
  76. pyxllib/prog/deprecatedlib.py +233 -233
  77. pyxllib/prog/filelock.py +42 -0
  78. pyxllib/prog/ipyexec.py +253 -253
  79. pyxllib/prog/multiprogs.py +940 -0
  80. pyxllib/prog/newbie.py +451 -444
  81. pyxllib/prog/pupil.py +1197 -1128
  82. pyxllib/prog/sitepackages.py +33 -33
  83. pyxllib/prog/specialist/__init__.py +391 -217
  84. pyxllib/prog/specialist/bc.py +203 -200
  85. pyxllib/prog/specialist/browser.py +497 -488
  86. pyxllib/prog/specialist/common.py +347 -347
  87. pyxllib/prog/specialist/datetime.py +199 -131
  88. pyxllib/prog/specialist/tictoc.py +240 -241
  89. pyxllib/prog/specialist/xllog.py +180 -180
  90. pyxllib/prog/xlosenv.py +108 -101
  91. pyxllib/stdlib/__init__.py +17 -17
  92. pyxllib/stdlib/tablepyxl/__init__.py +10 -10
  93. pyxllib/stdlib/tablepyxl/style.py +303 -303
  94. pyxllib/stdlib/tablepyxl/tablepyxl.py +130 -130
  95. pyxllib/text/__init__.py +8 -8
  96. pyxllib/text/ahocorasick.py +39 -39
  97. pyxllib/text/airscript.js +744 -0
  98. pyxllib/text/charclasslib.py +121 -109
  99. pyxllib/text/jiebalib.py +267 -264
  100. pyxllib/text/jinjalib.py +32 -0
  101. pyxllib/text/jsa_ai_prompt.md +271 -0
  102. pyxllib/text/jscode.py +922 -767
  103. pyxllib/text/latex/__init__.py +158 -158
  104. pyxllib/text/levenshtein.py +303 -303
  105. pyxllib/text/nestenv.py +1215 -1215
  106. pyxllib/text/newbie.py +300 -288
  107. pyxllib/text/pupil/__init__.py +8 -8
  108. pyxllib/text/pupil/common.py +1121 -1095
  109. pyxllib/text/pupil/xlalign.py +326 -326
  110. pyxllib/text/pycode.py +47 -47
  111. pyxllib/text/specialist/__init__.py +8 -8
  112. pyxllib/text/specialist/common.py +112 -112
  113. pyxllib/text/specialist/ptag.py +186 -186
  114. pyxllib/text/spellchecker.py +172 -172
  115. pyxllib/text/templates/echart_base.html +11 -0
  116. pyxllib/text/templates/highlight_code.html +17 -0
  117. pyxllib/text/templates/latex_editor.html +103 -0
  118. pyxllib/text/vbacode.py +17 -17
  119. pyxllib/text/xmllib.py +747 -685
  120. pyxllib/xl.py +42 -38
  121. pyxllib/xlcv.py +17 -17
  122. pyxllib-0.3.200.dist-info/METADATA +48 -0
  123. pyxllib-0.3.200.dist-info/RECORD +126 -0
  124. {pyxllib-0.3.96.dist-info → pyxllib-0.3.200.dist-info}/WHEEL +1 -2
  125. {pyxllib-0.3.96.dist-info → pyxllib-0.3.200.dist-info/licenses}/LICENSE +190 -190
  126. pyxllib/ext/autogui/__init__.py +0 -8
  127. pyxllib-0.3.96.dist-info/METADATA +0 -51
  128. pyxllib-0.3.96.dist-info/RECORD +0 -333
  129. pyxllib-0.3.96.dist-info/top_level.txt +0 -2
  130. pyxlpr/ai/__init__.py +0 -5
  131. pyxlpr/ai/clientlib.py +0 -1281
  132. pyxlpr/ai/specialist.py +0 -286
  133. pyxlpr/ai/torch_app.py +0 -172
  134. pyxlpr/ai/xlpaddle.py +0 -655
  135. pyxlpr/ai/xltorch.py +0 -705
  136. pyxlpr/data/__init__.py +0 -11
  137. pyxlpr/data/coco.py +0 -1325
  138. pyxlpr/data/datacls.py +0 -365
  139. pyxlpr/data/datasets.py +0 -200
  140. pyxlpr/data/gptlib.py +0 -1291
  141. pyxlpr/data/icdar/__init__.py +0 -96
  142. pyxlpr/data/icdar/deteval.py +0 -377
  143. pyxlpr/data/icdar/icdar2013.py +0 -341
  144. pyxlpr/data/icdar/iou.py +0 -340
  145. pyxlpr/data/icdar/rrc_evaluation_funcs_1_1.py +0 -463
  146. pyxlpr/data/imtextline.py +0 -473
  147. pyxlpr/data/labelme.py +0 -866
  148. pyxlpr/data/removeline.py +0 -179
  149. pyxlpr/data/specialist.py +0 -57
  150. pyxlpr/eval/__init__.py +0 -85
  151. pyxlpr/paddleocr.py +0 -776
  152. pyxlpr/ppocr/__init__.py +0 -15
  153. pyxlpr/ppocr/configs/rec/multi_language/generate_multi_language_configs.py +0 -226
  154. pyxlpr/ppocr/data/__init__.py +0 -135
  155. pyxlpr/ppocr/data/imaug/ColorJitter.py +0 -26
  156. pyxlpr/ppocr/data/imaug/__init__.py +0 -67
  157. pyxlpr/ppocr/data/imaug/copy_paste.py +0 -170
  158. pyxlpr/ppocr/data/imaug/east_process.py +0 -437
  159. pyxlpr/ppocr/data/imaug/gen_table_mask.py +0 -244
  160. pyxlpr/ppocr/data/imaug/iaa_augment.py +0 -114
  161. pyxlpr/ppocr/data/imaug/label_ops.py +0 -789
  162. pyxlpr/ppocr/data/imaug/make_border_map.py +0 -184
  163. pyxlpr/ppocr/data/imaug/make_pse_gt.py +0 -106
  164. pyxlpr/ppocr/data/imaug/make_shrink_map.py +0 -126
  165. pyxlpr/ppocr/data/imaug/operators.py +0 -433
  166. pyxlpr/ppocr/data/imaug/pg_process.py +0 -906
  167. pyxlpr/ppocr/data/imaug/randaugment.py +0 -143
  168. pyxlpr/ppocr/data/imaug/random_crop_data.py +0 -239
  169. pyxlpr/ppocr/data/imaug/rec_img_aug.py +0 -533
  170. pyxlpr/ppocr/data/imaug/sast_process.py +0 -777
  171. pyxlpr/ppocr/data/imaug/text_image_aug/__init__.py +0 -17
  172. pyxlpr/ppocr/data/imaug/text_image_aug/augment.py +0 -120
  173. pyxlpr/ppocr/data/imaug/text_image_aug/warp_mls.py +0 -168
  174. pyxlpr/ppocr/data/lmdb_dataset.py +0 -115
  175. pyxlpr/ppocr/data/pgnet_dataset.py +0 -104
  176. pyxlpr/ppocr/data/pubtab_dataset.py +0 -107
  177. pyxlpr/ppocr/data/simple_dataset.py +0 -372
  178. pyxlpr/ppocr/losses/__init__.py +0 -61
  179. pyxlpr/ppocr/losses/ace_loss.py +0 -52
  180. pyxlpr/ppocr/losses/basic_loss.py +0 -135
  181. pyxlpr/ppocr/losses/center_loss.py +0 -88
  182. pyxlpr/ppocr/losses/cls_loss.py +0 -30
  183. pyxlpr/ppocr/losses/combined_loss.py +0 -67
  184. pyxlpr/ppocr/losses/det_basic_loss.py +0 -208
  185. pyxlpr/ppocr/losses/det_db_loss.py +0 -80
  186. pyxlpr/ppocr/losses/det_east_loss.py +0 -63
  187. pyxlpr/ppocr/losses/det_pse_loss.py +0 -149
  188. pyxlpr/ppocr/losses/det_sast_loss.py +0 -121
  189. pyxlpr/ppocr/losses/distillation_loss.py +0 -272
  190. pyxlpr/ppocr/losses/e2e_pg_loss.py +0 -140
  191. pyxlpr/ppocr/losses/kie_sdmgr_loss.py +0 -113
  192. pyxlpr/ppocr/losses/rec_aster_loss.py +0 -99
  193. pyxlpr/ppocr/losses/rec_att_loss.py +0 -39
  194. pyxlpr/ppocr/losses/rec_ctc_loss.py +0 -44
  195. pyxlpr/ppocr/losses/rec_enhanced_ctc_loss.py +0 -70
  196. pyxlpr/ppocr/losses/rec_nrtr_loss.py +0 -30
  197. pyxlpr/ppocr/losses/rec_sar_loss.py +0 -28
  198. pyxlpr/ppocr/losses/rec_srn_loss.py +0 -47
  199. pyxlpr/ppocr/losses/table_att_loss.py +0 -109
  200. pyxlpr/ppocr/metrics/__init__.py +0 -44
  201. pyxlpr/ppocr/metrics/cls_metric.py +0 -45
  202. pyxlpr/ppocr/metrics/det_metric.py +0 -82
  203. pyxlpr/ppocr/metrics/distillation_metric.py +0 -73
  204. pyxlpr/ppocr/metrics/e2e_metric.py +0 -86
  205. pyxlpr/ppocr/metrics/eval_det_iou.py +0 -274
  206. pyxlpr/ppocr/metrics/kie_metric.py +0 -70
  207. pyxlpr/ppocr/metrics/rec_metric.py +0 -75
  208. pyxlpr/ppocr/metrics/table_metric.py +0 -50
  209. pyxlpr/ppocr/modeling/architectures/__init__.py +0 -32
  210. pyxlpr/ppocr/modeling/architectures/base_model.py +0 -88
  211. pyxlpr/ppocr/modeling/architectures/distillation_model.py +0 -60
  212. pyxlpr/ppocr/modeling/backbones/__init__.py +0 -54
  213. pyxlpr/ppocr/modeling/backbones/det_mobilenet_v3.py +0 -268
  214. pyxlpr/ppocr/modeling/backbones/det_resnet_vd.py +0 -246
  215. pyxlpr/ppocr/modeling/backbones/det_resnet_vd_sast.py +0 -285
  216. pyxlpr/ppocr/modeling/backbones/e2e_resnet_vd_pg.py +0 -265
  217. pyxlpr/ppocr/modeling/backbones/kie_unet_sdmgr.py +0 -186
  218. pyxlpr/ppocr/modeling/backbones/rec_mobilenet_v3.py +0 -138
  219. pyxlpr/ppocr/modeling/backbones/rec_mv1_enhance.py +0 -258
  220. pyxlpr/ppocr/modeling/backbones/rec_nrtr_mtb.py +0 -48
  221. pyxlpr/ppocr/modeling/backbones/rec_resnet_31.py +0 -210
  222. pyxlpr/ppocr/modeling/backbones/rec_resnet_aster.py +0 -143
  223. pyxlpr/ppocr/modeling/backbones/rec_resnet_fpn.py +0 -307
  224. pyxlpr/ppocr/modeling/backbones/rec_resnet_vd.py +0 -286
  225. pyxlpr/ppocr/modeling/heads/__init__.py +0 -54
  226. pyxlpr/ppocr/modeling/heads/cls_head.py +0 -52
  227. pyxlpr/ppocr/modeling/heads/det_db_head.py +0 -118
  228. pyxlpr/ppocr/modeling/heads/det_east_head.py +0 -121
  229. pyxlpr/ppocr/modeling/heads/det_pse_head.py +0 -37
  230. pyxlpr/ppocr/modeling/heads/det_sast_head.py +0 -128
  231. pyxlpr/ppocr/modeling/heads/e2e_pg_head.py +0 -253
  232. pyxlpr/ppocr/modeling/heads/kie_sdmgr_head.py +0 -206
  233. pyxlpr/ppocr/modeling/heads/multiheadAttention.py +0 -163
  234. pyxlpr/ppocr/modeling/heads/rec_aster_head.py +0 -393
  235. pyxlpr/ppocr/modeling/heads/rec_att_head.py +0 -202
  236. pyxlpr/ppocr/modeling/heads/rec_ctc_head.py +0 -88
  237. pyxlpr/ppocr/modeling/heads/rec_nrtr_head.py +0 -826
  238. pyxlpr/ppocr/modeling/heads/rec_sar_head.py +0 -402
  239. pyxlpr/ppocr/modeling/heads/rec_srn_head.py +0 -280
  240. pyxlpr/ppocr/modeling/heads/self_attention.py +0 -406
  241. pyxlpr/ppocr/modeling/heads/table_att_head.py +0 -246
  242. pyxlpr/ppocr/modeling/necks/__init__.py +0 -32
  243. pyxlpr/ppocr/modeling/necks/db_fpn.py +0 -111
  244. pyxlpr/ppocr/modeling/necks/east_fpn.py +0 -188
  245. pyxlpr/ppocr/modeling/necks/fpn.py +0 -138
  246. pyxlpr/ppocr/modeling/necks/pg_fpn.py +0 -314
  247. pyxlpr/ppocr/modeling/necks/rnn.py +0 -92
  248. pyxlpr/ppocr/modeling/necks/sast_fpn.py +0 -284
  249. pyxlpr/ppocr/modeling/necks/table_fpn.py +0 -110
  250. pyxlpr/ppocr/modeling/transforms/__init__.py +0 -28
  251. pyxlpr/ppocr/modeling/transforms/stn.py +0 -135
  252. pyxlpr/ppocr/modeling/transforms/tps.py +0 -308
  253. pyxlpr/ppocr/modeling/transforms/tps_spatial_transformer.py +0 -156
  254. pyxlpr/ppocr/optimizer/__init__.py +0 -61
  255. pyxlpr/ppocr/optimizer/learning_rate.py +0 -228
  256. pyxlpr/ppocr/optimizer/lr_scheduler.py +0 -49
  257. pyxlpr/ppocr/optimizer/optimizer.py +0 -160
  258. pyxlpr/ppocr/optimizer/regularizer.py +0 -52
  259. pyxlpr/ppocr/postprocess/__init__.py +0 -55
  260. pyxlpr/ppocr/postprocess/cls_postprocess.py +0 -33
  261. pyxlpr/ppocr/postprocess/db_postprocess.py +0 -234
  262. pyxlpr/ppocr/postprocess/east_postprocess.py +0 -143
  263. pyxlpr/ppocr/postprocess/locality_aware_nms.py +0 -200
  264. pyxlpr/ppocr/postprocess/pg_postprocess.py +0 -52
  265. pyxlpr/ppocr/postprocess/pse_postprocess/__init__.py +0 -15
  266. pyxlpr/ppocr/postprocess/pse_postprocess/pse/__init__.py +0 -29
  267. pyxlpr/ppocr/postprocess/pse_postprocess/pse/setup.py +0 -14
  268. pyxlpr/ppocr/postprocess/pse_postprocess/pse_postprocess.py +0 -118
  269. pyxlpr/ppocr/postprocess/rec_postprocess.py +0 -654
  270. pyxlpr/ppocr/postprocess/sast_postprocess.py +0 -355
  271. pyxlpr/ppocr/tools/__init__.py +0 -14
  272. pyxlpr/ppocr/tools/eval.py +0 -83
  273. pyxlpr/ppocr/tools/export_center.py +0 -77
  274. pyxlpr/ppocr/tools/export_model.py +0 -129
  275. pyxlpr/ppocr/tools/infer/predict_cls.py +0 -151
  276. pyxlpr/ppocr/tools/infer/predict_det.py +0 -300
  277. pyxlpr/ppocr/tools/infer/predict_e2e.py +0 -169
  278. pyxlpr/ppocr/tools/infer/predict_rec.py +0 -414
  279. pyxlpr/ppocr/tools/infer/predict_system.py +0 -204
  280. pyxlpr/ppocr/tools/infer/utility.py +0 -629
  281. pyxlpr/ppocr/tools/infer_cls.py +0 -83
  282. pyxlpr/ppocr/tools/infer_det.py +0 -134
  283. pyxlpr/ppocr/tools/infer_e2e.py +0 -122
  284. pyxlpr/ppocr/tools/infer_kie.py +0 -153
  285. pyxlpr/ppocr/tools/infer_rec.py +0 -146
  286. pyxlpr/ppocr/tools/infer_table.py +0 -107
  287. pyxlpr/ppocr/tools/program.py +0 -596
  288. pyxlpr/ppocr/tools/test_hubserving.py +0 -117
  289. pyxlpr/ppocr/tools/train.py +0 -163
  290. pyxlpr/ppocr/tools/xlprog.py +0 -748
  291. pyxlpr/ppocr/utils/EN_symbol_dict.txt +0 -94
  292. pyxlpr/ppocr/utils/__init__.py +0 -24
  293. pyxlpr/ppocr/utils/dict/ar_dict.txt +0 -117
  294. pyxlpr/ppocr/utils/dict/arabic_dict.txt +0 -162
  295. pyxlpr/ppocr/utils/dict/be_dict.txt +0 -145
  296. pyxlpr/ppocr/utils/dict/bg_dict.txt +0 -140
  297. pyxlpr/ppocr/utils/dict/chinese_cht_dict.txt +0 -8421
  298. pyxlpr/ppocr/utils/dict/cyrillic_dict.txt +0 -163
  299. pyxlpr/ppocr/utils/dict/devanagari_dict.txt +0 -167
  300. pyxlpr/ppocr/utils/dict/en_dict.txt +0 -63
  301. pyxlpr/ppocr/utils/dict/fa_dict.txt +0 -136
  302. pyxlpr/ppocr/utils/dict/french_dict.txt +0 -136
  303. pyxlpr/ppocr/utils/dict/german_dict.txt +0 -143
  304. pyxlpr/ppocr/utils/dict/hi_dict.txt +0 -162
  305. pyxlpr/ppocr/utils/dict/it_dict.txt +0 -118
  306. pyxlpr/ppocr/utils/dict/japan_dict.txt +0 -4399
  307. pyxlpr/ppocr/utils/dict/ka_dict.txt +0 -153
  308. pyxlpr/ppocr/utils/dict/korean_dict.txt +0 -3688
  309. pyxlpr/ppocr/utils/dict/latin_dict.txt +0 -185
  310. pyxlpr/ppocr/utils/dict/mr_dict.txt +0 -153
  311. pyxlpr/ppocr/utils/dict/ne_dict.txt +0 -153
  312. pyxlpr/ppocr/utils/dict/oc_dict.txt +0 -96
  313. pyxlpr/ppocr/utils/dict/pu_dict.txt +0 -130
  314. pyxlpr/ppocr/utils/dict/rs_dict.txt +0 -91
  315. pyxlpr/ppocr/utils/dict/rsc_dict.txt +0 -134
  316. pyxlpr/ppocr/utils/dict/ru_dict.txt +0 -125
  317. pyxlpr/ppocr/utils/dict/ta_dict.txt +0 -128
  318. pyxlpr/ppocr/utils/dict/table_dict.txt +0 -277
  319. pyxlpr/ppocr/utils/dict/table_structure_dict.txt +0 -2759
  320. pyxlpr/ppocr/utils/dict/te_dict.txt +0 -151
  321. pyxlpr/ppocr/utils/dict/ug_dict.txt +0 -114
  322. pyxlpr/ppocr/utils/dict/uk_dict.txt +0 -142
  323. pyxlpr/ppocr/utils/dict/ur_dict.txt +0 -137
  324. pyxlpr/ppocr/utils/dict/xi_dict.txt +0 -110
  325. pyxlpr/ppocr/utils/dict90.txt +0 -90
  326. pyxlpr/ppocr/utils/e2e_metric/Deteval.py +0 -574
  327. pyxlpr/ppocr/utils/e2e_metric/polygon_fast.py +0 -83
  328. pyxlpr/ppocr/utils/e2e_utils/extract_batchsize.py +0 -87
  329. pyxlpr/ppocr/utils/e2e_utils/extract_textpoint_fast.py +0 -457
  330. pyxlpr/ppocr/utils/e2e_utils/extract_textpoint_slow.py +0 -592
  331. pyxlpr/ppocr/utils/e2e_utils/pgnet_pp_utils.py +0 -162
  332. pyxlpr/ppocr/utils/e2e_utils/visual.py +0 -162
  333. pyxlpr/ppocr/utils/en_dict.txt +0 -95
  334. pyxlpr/ppocr/utils/gen_label.py +0 -81
  335. pyxlpr/ppocr/utils/ic15_dict.txt +0 -36
  336. pyxlpr/ppocr/utils/iou.py +0 -54
  337. pyxlpr/ppocr/utils/logging.py +0 -69
  338. pyxlpr/ppocr/utils/network.py +0 -84
  339. pyxlpr/ppocr/utils/ppocr_keys_v1.txt +0 -6623
  340. pyxlpr/ppocr/utils/profiler.py +0 -110
  341. pyxlpr/ppocr/utils/save_load.py +0 -150
  342. pyxlpr/ppocr/utils/stats.py +0 -72
  343. pyxlpr/ppocr/utils/utility.py +0 -80
  344. pyxlpr/ppstructure/__init__.py +0 -13
  345. pyxlpr/ppstructure/predict_system.py +0 -187
  346. pyxlpr/ppstructure/table/__init__.py +0 -13
  347. pyxlpr/ppstructure/table/eval_table.py +0 -72
  348. pyxlpr/ppstructure/table/matcher.py +0 -192
  349. pyxlpr/ppstructure/table/predict_structure.py +0 -136
  350. pyxlpr/ppstructure/table/predict_table.py +0 -221
  351. pyxlpr/ppstructure/table/table_metric/__init__.py +0 -16
  352. pyxlpr/ppstructure/table/table_metric/parallel.py +0 -51
  353. pyxlpr/ppstructure/table/table_metric/table_metric.py +0 -247
  354. pyxlpr/ppstructure/table/tablepyxl/__init__.py +0 -13
  355. pyxlpr/ppstructure/table/tablepyxl/style.py +0 -283
  356. pyxlpr/ppstructure/table/tablepyxl/tablepyxl.py +0 -118
  357. pyxlpr/ppstructure/utility.py +0 -71
  358. pyxlpr/xlai.py +0 -10
pyxllib/file/pdflib.py CHANGED
@@ -1,426 +1,426 @@
1
- #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
- # @Author : 陈坤泽
4
- # @Email : 877362867@qq.com
5
- # @Date : 2020/06/02 16:06
6
-
7
- from pyxllib.prog.pupil import check_install_package
8
-
9
- check_install_package('fitz', 'PyMuPdf>=1.18.17')
10
-
11
- import json
12
- import os
13
- import pprint
14
- import re
15
-
16
- import fitz
17
-
18
- from pyxllib.prog.newbie import round_int, decode_bitflags
19
- from pyxllib.prog.pupil import DictTool, inject_members, dprint
20
- from pyxllib.prog.specialist import browser
21
- from pyxllib.algo.newbie import round_unit
22
- from pyxllib.algo.pupil import get_number_width
23
- from pyxllib.file.specialist import XlPath, writefile, get_etag
24
- from pyxllib.cv.expert import xlcv, xlpil
25
- from pyxlpr.data.labelme import LabelmeDict
26
-
27
-
28
- def __fitz():
29
- print(fitz.__doc__)
30
-
31
-
32
- class FitzDoc:
33
- """ 原名叫FitzPdf,但不一定是处理pdf,也可能是其他文档,所以改名 FitzDoc
34
- """
35
-
36
- def __init__(self, file):
37
- self.src_file = XlPath(file)
38
- self.doc = fitz.open(str(file))
39
-
40
- def to_images(self, dst_dir=None, file_fmt='{filestem}_{number}.jpg', num_width=None, *,
41
- scale=1, start=1, fmt_onepage=False):
42
- """ 将pdf转为若干页图片
43
-
44
- :param dst_dir: 目标目录
45
- 默认情况下,只有一页pdf则存储到对应的pdf目录,多页则存储到同名子目录下
46
- 如果不想这样被智能控制,只要指定明确的dst即可
47
- :param file_fmt: 后缀格式,包括修改导出的图片类型,注意要用 {} 占位符表示页码编号
48
- :param num_width: 生成的每一页文件编号,使用的数字前导0域宽
49
- 默认根据pdf总页数来设置对应所用域宽
50
- 0表示不设域宽
51
- :param scale: 对每页图片进行缩放,一般推荐都要设成2,导出的图片才清晰
52
- :param start: 起始页码,一般建议从1开始比较符合常识直觉
53
- :param fmt_onepage: 当pdf就只有一页的时候,是否还对导出的图片编号
54
- 默认只有一页的时候,进行优化,不增设后缀格式
55
- :return: 返回转换完的图片名称清单
56
-
57
- 注:如果要导出单张图,可以用 FitzPdfPage.get_cv_image
58
- """
59
- # 1 基本参数计算
60
- srcfile, doc = self.src_file, self.doc
61
- filestem, n_page = srcfile.stem, doc.page_count
62
-
63
- # 自动推导目标目录
64
- if dst_dir is None:
65
- dst_dir = XlPath.init(srcfile.stem, srcfile.parent) if n_page > 1 else XlPath(srcfile.parent)
66
- os.makedirs(dst_dir, exist_ok=True)
67
-
68
- # 域宽
69
- num_width = num_width or get_number_width(n_page) # 根据总页数计算需要的对齐域宽
70
-
71
- # 2 导出图片
72
- if fmt_onepage or n_page != 1: # 多页的处理规则
73
- res = []
74
- for i in range(n_page):
75
- im = self.load_page(i).get_cv_image(scale)
76
- number = ('{:0' + str(num_width) + 'd}').format(i + start) # 前面的括号不要删,这样才是完整的一个字符串来使用format
77
- f = xlcv.write(im, XlPath.init(file_fmt.format(filestem=filestem, number=number), dst_dir))
78
- res.append(f)
79
- return res
80
- else:
81
- im = self.load_page(0).get_cv_image(scale)
82
- return [xlcv.write(im, XlPath.init(srcfile.stem + os.path.splitext(file_fmt)[1], dst_dir))]
83
-
84
- def to_labelmes(self, imfiles, opt='dict', *, views=(0, 0, 1, 0), scale=1, indent=None):
85
- """ 生成图片对应的标注,常跟to_images配合使用 """
86
- for i, imfile in enumerate(imfiles):
87
- page = self.load_page(i)
88
- lmdict = LabelmeDict.gen_data(imfile)
89
- lmdict['shapes'] = page.get_labelme_shapes(opt, views=views, scale=scale)
90
- imfile.with_suffix('.json').write(lmdict, indent=indent)
91
-
92
- def to_docx(self, docx_file=None):
93
- """ pdf转docx """
94
- check_install_package('pdf2docx')
95
- from pdf2docx import parse
96
-
97
- pdf_file = self.src_file
98
-
99
- if docx_file is None:
100
- docx_file = pdf_file.with_suffix('.docx')
101
-
102
- # 注意这里是日志显示进度,不是printf输出.
103
- parse(str(pdf_file), str(docx_file))
104
-
105
- def browser(self, opt='pdf'):
106
- if opt == 'pdf':
107
- f = self.src_file
108
- browser(self.src_file)
109
- elif opt == 'html':
110
- ls = []
111
- for i in range(self.page_count):
112
- page = self.load_page(i)
113
- ls.append(page.get_text('html'))
114
- data = '\n'.join(ls)
115
- etag = get_etag(data)
116
- f = XlPath.init(etag, XlPath.tempdir(), suffix='.html')
117
- f.write(data)
118
- browser(f)
119
- else:
120
- raise ValueError(f'{opt}')
121
- return f
122
-
123
- def __getattr__(self, item):
124
- return getattr(self.doc, item)
125
-
126
-
127
- class XlFitzPage(fitz.fitz.Page):
128
- """ 对fitz.fitz.Page的扩展成员方法 """
129
-
130
- def get_svg_image2(self, scale=1):
131
- # svg 是一段表述性文本
132
- if scale != 1:
133
- txt = self.get_svg_image(matrix=fitz.Matrix(scale, scale))
134
- else:
135
- txt = self.get_svg_image()
136
- return txt
137
-
138
- def _get_png_data(self, scale=1):
139
- # TODO 增加透明通道?
140
- if scale != 1:
141
- pix = self.get_pixmap(matrix=fitz.Matrix(scale, scale)) # 长宽放大到scale倍
142
- else:
143
- pix = self.get_pixmap()
144
- return pix.tobytes()
145
-
146
- def get_cv_image(self, scale=1):
147
- return xlcv.read_from_buffer(self._get_png_data(scale), flags=1)
148
-
149
- def get_pil_image(self, scale=1):
150
- # TODO 可以优化,直接从内存数据转pil,不用这样先转cv再转pil
151
- return xlpil.read_from_buffer(self._get_png_data(scale), flags=1)
152
-
153
- def to_image(self, outfile, *, scale=1, if_exists=None):
154
- """ 转成为文件 """
155
- f = XlPath(outfile)
156
- suffix = f.suffix.lower()
157
-
158
- if suffix == '.svg':
159
- content = self.get_svg_image()
160
- f.write(content, if_exists=if_exists)
161
- else:
162
- im = self.get_cv_image(scale)
163
- xlcv.write(im, if_exists=if_exists)
164
-
165
- def get_labelme_shapes(self, opt='dict', *, views=1, scale=1):
166
- """ 得到labelme版本的shapes标注信息
167
-
168
- :param opt: get_text的参数,默认使用无字符集标注的精简的dict
169
- 也可以使用rawdict,带有字符集标注的数据
170
- :param views: 若非list或者长度不足4,会补足
171
- 各位标记依次代表是否显示对应细粒度的标注:blocks、lines、spans、chars
172
- 默认只显示blocks
173
- 例如 (0, 0, 1, 0),表示只显示spans的标注
174
- :param scale: 是否需要对坐标按比例放大 (pdf经常放大两倍提取图片,则这里标注也要对应放大两倍)
175
-
176
- 【字典属性解释】
177
- blocks:
178
- number: int, 区块编号
179
- type: 0表示文本行,1表示图片
180
- lines:
181
- wmode: 好像都是0,不知道啥东西
182
- dir: [1, 0],可能是文本方向吧
183
- spans:
184
- size: 字号
185
- flags: 格式标记
186
- 1,superscript,上标
187
- 2,italic,斜体
188
- 4,serifed,有衬线。如果没开,对立面就是"sans",无衬线。
189
- 8,monospaced,等距。对立面proportional,均衡。
190
- 16,bold,加粗
191
- font:字体名称(直接用字符串赋值)
192
- color:颜色
193
- ascender:?
194
- descender:?
195
- origin:所在方格右上角坐标
196
- text/chars: dict模式有text内容,rawdict有chars详细信息。我扩展的版本,rawdict也会有text属性。
197
- char:
198
- origin: 差不多是其所在方格的右上角坐标,同一行文本,其top位置是会对齐的
199
- c: 字符内容
200
- """
201
- from pyxlpr.data.labelme import LabelmeDict
202
-
203
- # 1 参数配置
204
- if isinstance(views, int):
205
- views = [views]
206
- if len(views) < 4:
207
- views += [0] * (4 - len(views))
208
-
209
- shapes = []
210
- page_dict = self.get_text(opt)
211
-
212
- # 2 辅助函数
213
- def add_shape(name, refdict, add_keys, drop_keys=('bbox',)):
214
- """ 生成一个标注框 """
215
- msgdict = {'category_name': name}
216
- msgdict.update(add_keys)
217
- DictTool.ior(msgdict, refdict)
218
- DictTool.isub(msgdict, drop_keys)
219
- bbox = [round_int(v * scale) for v in refdict['bbox']]
220
-
221
- if 'size' in msgdict:
222
- x = round_unit(msgdict['size'], 0.5)
223
- msgdict['size'] = round_int(x) if (x * 10) % 10 < 1 else x # 没有小数的时候,优先展示为11,而不是11.0
224
- if 'color' in msgdict:
225
- # 把color映射为直观的(r, g, b)
226
- # 这个pdf解析器获取的color,不一定精确等于原值,可能会有偏差,小一个像素
227
- v = msgdict['color']
228
- msgdict['color'] = (v // 256 // 256, (v // 256) % 256, v % 256)
229
- if 'origin' in msgdict:
230
- msgdict['origin'] = [round_int(v) for v in msgdict['origin']]
231
-
232
- sp = LabelmeDict.gen_shape(json.dumps(msgdict), bbox)
233
- shapes.append(sp)
234
-
235
- # 3 遍历获取标注数据
236
- for block in page_dict['blocks']:
237
- if block['type'] == 0: # 普通的文本行
238
- if views[0]:
239
- add_shape('text_block', block, {'n_lines': len(block['lines'])}, ['bbox', 'lines'])
240
- for line in block['lines']:
241
- if views[1]:
242
- add_shape('line', line, {'n_spans': len(line['spans'])}, ['bbox', 'spans'])
243
- for span in line['spans']:
244
- if 'text' not in span and 'chars' in span:
245
- span['text'] = ''.join([x['c'] for x in span['chars']])
246
- if views[2]:
247
- add_shape('span', span, {'n_chars': len(span.get('text', ''))}, ['bbox', 'chars'])
248
- if views[3] and 'chars' in span: # 最后层算法不太一样,这样写可以加速
249
- for char in span['chars']:
250
- add_shape('char', char, {}, ['bbox'])
251
- elif block['type'] == 1: # 应该是图片
252
- add_shape('image', block, {'image_filesize': len(block['image'])}, ['bbox', 'image'])
253
- else:
254
- raise ValueError
255
-
256
- return shapes
257
-
258
- @classmethod
259
- def parse_flags(cls, n):
260
- """ 解析spans的flags参数明文含义 """
261
- flags = decode_bitflags(n, ('superscript', 'italic', 'serifed', 'monospaced', 'bold'))
262
- flags['sans'] = not flags['serifed']
263
- flags['proportional'] = not flags['monospaced']
264
- return flags
265
-
266
- def browser(self, opt='html'):
267
- if opt == 'html':
268
- data = self.get_text('html') # html、xhtml 可以转网页,虽然排版相对来说还是会乱一点
269
- data = ''.join(data)
270
- etag = get_etag(data)
271
- f = XlPath.init(etag, XlPath.tempdir(), suffix='.html')
272
- f.write(data)
273
- browser(f)
274
- else:
275
- raise ValueError
276
-
277
-
278
- inject_members(XlFitzPage, fitz.fitz.Page)
279
-
280
-
281
- class DemoFitz:
282
- """
283
- 安装: pip install PyMuPdf
284
- 使用: import fitz
285
- 官方文档: https://pymupdf.readthedocs.io/en/latest/intro/
286
- demo: https://github.com/rk700/PyMuPDF/tree/master/demo
287
- examples: https://github.com/rk700/PyMuPDF/tree/master/examples
288
- """
289
-
290
- def __init__(self, file):
291
- self.doc = fitz.open(file)
292
-
293
- def message(self):
294
- """查看pdf文档一些基础信息"""
295
- dprint(fitz.version) # fitz模块的版本
296
- dprint(self.doc.pageCount) # pdf页数
297
- dprint(self.doc._getXrefLength()) # 文档的对象总数
298
-
299
- def getToC(self):
300
- """获得书签目录"""
301
- toc = self.doc.getToC()
302
- browser(toc)
303
-
304
- def setToC(self):
305
- """设置书签目录
306
- 可以调层级、改名称、修改指向页码
307
- """
308
- toc = self.doc.getToC()
309
- toc[1][1] = '改标题名称'
310
- self.doc.setToC(toc)
311
- file = XlPath('a.pdf', XlPath.tempdir()).to_str()
312
- self.doc.save(file, garbage=4)
313
- browser(file)
314
-
315
- def setToC2(self):
316
- """修改人教版教材的标签名"""
317
- toc = self.doc.getToC()
318
- newtoc = []
319
- for i in range(len(toc)):
320
- name = toc[i][1]
321
- if '.' in name: continue
322
- # m = re.search(r'\d+', name)
323
- # if m: name = name.replace(m.group(), digits2chinese(int(m.group())))
324
- m = re.search(r'([一二三四五六]年级).*?([上下])', name)
325
- if i < len(toc) - 1:
326
- pages = toc[i + 1][2] - toc[i][2] + 1
327
- else:
328
- pages = self.doc.pageCount - toc[i][2] + 1
329
- toc[i][1] = m.group(1) + m.group(2) + ',' + str(pages)
330
- newtoc.append(toc[i])
331
- self.doc.setToC(newtoc)
332
- file = writefile(b'', 'a.pdf', if_exists='replace')
333
- self.doc.save(file, garbage=4)
334
-
335
- def rearrange_pages(self):
336
- """重新布局页面"""
337
- self.doc.select([0, 0, 1]) # 第1页展示两次后,再跟第2页
338
- file = writefile(b'', 'a.pdf', root=XlPath.tempdir(), if_exists='replace')
339
- self.doc.save(file, garbage=4) # 注意要设置garbage,否则文档并没有实际删除内容压缩文件大小
340
- browser(file)
341
-
342
- def page2png(self, page=0):
343
- """ 查看单页渲染图片 """
344
- page = self.doc.loadPage(page) # 索引第i页,下标规律同py,支持-1索引最后页
345
- # dprint(page.bound()) # 页面边界,x,y轴同图像处理中的常识定义,返回Rect(x0, y0, x1, y1)
346
-
347
- pix = page.getPixmap(fitz.Matrix(2, 2)) # 获得页面的RGBA图像,Pixmap类型;还可以用page.getSVGimage()获得矢量图
348
- # pix.writePNG('page-0.png') # 将Pixmal
349
- pngdata = pix.tobytes() # 获png文件的bytes字节码
350
- # print(len(pngdata))
351
- # browser(pngdata, 'a.png') # 用我的工具函数打开图片
352
-
353
- return pngdata
354
-
355
- def pagetext(self):
356
- """单页上的文本"""
357
- page = self.doc[0]
358
-
359
- # 获得页面上的所有文本,还支持参数: html,dict,xml,xhtml,json
360
- text = page.getText('text')
361
- dprint(text)
362
-
363
- # 获得页面上的所有文本(返回字典对象)
364
- textdict = page.getText('dict')
365
- textdict['blocks'] = textdict['blocks'][:-1]
366
- browser(pprint.pformat(textdict))
367
-
368
- def text(self):
369
- """获得整份pdf的所有文本"""
370
- return '\n'.join([page.getText('text') for page in self.doc])
371
-
372
- def xrefstr(self):
373
- """查看pdf文档的所有对象"""
374
- xrefstr = []
375
- n = self.doc._getXrefLength()
376
- for i in range(1, n): # 注意下标实际要从1卡开始
377
- # 可以边遍历边删除,不影响下标位置,因为其本质只是去除关联引用而已
378
- xrefstr.append(self.doc._getXrefString(i))
379
- browser('\n'.join(xrefstr))
380
-
381
- def page_add_ele(self):
382
- """往页面添加元素
383
- 添加元素前后xrefstr的区别: https://paste.ubuntu.com/p/Dxhnzp4XJ2/
384
- """
385
- self.doc.select([0])
386
- page = self.doc.loadPage(0)
387
- # page.insertText(fitz.Point(100, 200), 'test\ntest')
388
- file = str(XlPath.tempdir() / 'a.pdf')
389
- dprint(file)
390
- self.doc.save(file, garbage=4)
391
- browser(file)
392
-
393
-
394
- def __pdfminer():
395
- """ pdfminer的实验代码也先放这里
396
-
397
- !pip install pdfminer.six
398
- """
399
-
400
- import pdfminer
401
- print(pdfminer.__version__)
402
- # 20201018
403
-
404
-
405
- class PdfMiner:
406
- @classmethod
407
- def to_html(cls, pdf_file):
408
- """ 相比fitz,pdfminer能正常提取出下划线
409
-
410
- 文本重叠比fitz更严重,整体来说其实更不好用~~
411
- """
412
-
413
- from io import StringIO
414
-
415
- from pdfminer.high_level import extract_text_to_fp
416
- from pdfminer.layout import LAParams
417
-
418
- output_string = StringIO()
419
- with open(str(pdf_file)) as fin:
420
- extract_text_to_fp(fin, output_string, laparams=LAParams(),
421
- output_type='html', codec=None)
422
-
423
- # 打开浏览器查看重建的html效果
424
- f = pdf_file.with_suffix('.html')
425
- f.write(output_string.getvalue())
426
- browser(f)
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ # @Author : 陈坤泽
4
+ # @Email : 877362867@qq.com
5
+ # @Date : 2020/06/02 16:06
6
+
7
+ from pyxllib.prog.pupil import check_install_package
8
+
9
+ check_install_package('fitz', 'PyMuPdf>=1.18.17')
10
+
11
+ import json
12
+ import os
13
+ import pprint
14
+ import re
15
+
16
+ import fitz
17
+
18
+ from pyxllib.prog.newbie import round_int, decode_bitflags
19
+ from pyxllib.prog.pupil import DictTool, inject_members, dprint
20
+ from pyxllib.prog.specialist import browser
21
+ from pyxllib.algo.newbie import round_unit
22
+ from pyxllib.prog.pupil import get_number_width
23
+ from pyxllib.file.specialist import XlPath, writefile, get_etag
24
+ from pyxllib.cv.expert import xlcv, xlpil
25
+ from pyxlpr.data.labelme import LabelmeDict
26
+
27
+
28
def __fitz():
    """ Print the docstring of the imported ``fitz`` module. """
    module_doc = fitz.__doc__
    print(module_doc)
30
+
31
+
32
class FitzDoc:
    """ Thin wrapper around a document opened with ``fitz.open``.

    Formerly named FitzPdf; renamed because the document is not necessarily a pdf.
    Attributes not defined on this class are looked up on the wrapped ``self.doc``.
    """

    def __init__(self, file):
        """
        :param file: path of the document to open
        """
        self.src_file = XlPath(file)
        self.doc = fitz.open(str(file))

    def to_images(self, dst_dir=None, file_fmt='{filestem}_{number}.jpg', num_width=None, *,
                  scale=1, start=1, fmt_onepage=False):
        """ Render every page of the document to an image file.

        :param dst_dir: target directory
            By default a single-page document is exported next to the source file, and a
            multi-page document into a sub directory named after the source file stem.
            Pass an explicit directory to disable this behaviour.
        :param file_fmt: output file name pattern, whose suffix also selects the image type;
            it is formatted with the keys ``filestem`` and ``number``
        :param num_width: zero-padded width of the page number
            None (default) derives the width from the total page count
            0 means no padding
        :param scale: zoom factor applied to each page; 2 is usually recommended for sharp images
        :param start: number given to the first page, 1 by default
        :param fmt_onepage: whether a single-page document is still numbered through ``file_fmt``
            By default a single page is exported as ``<stem><suffix of file_fmt>``
        :return: list with the results of ``xlcv.write`` for each exported page

        Note: to export a single page, see ``XlFitzPage.get_cv_image``.
        """
        # 1 Basic parameters
        srcfile, doc = self.src_file, self.doc
        filestem, n_page = srcfile.stem, doc.page_count

        # Derive the target directory automatically
        if dst_dir is None:
            dst_dir = XlPath.init(srcfile.stem, srcfile.parent) if n_page > 1 else XlPath(srcfile.parent)
        os.makedirs(dst_dir, exist_ok=True)

        # Only fall back to the automatic width for None, so that an explicit 0 is honoured
        if num_width is None:
            num_width = get_number_width(n_page)

        # 2 Export the images
        if fmt_onepage or n_page != 1:  # numbered output
            res = []
            for i in range(n_page):
                im = self.load_page(i).get_cv_image(scale)
                number = str(i + start).zfill(num_width)
                f = xlcv.write(im, XlPath.init(file_fmt.format(filestem=filestem, number=number), dst_dir))
                res.append(f)
            return res

        # Single page: keep the source stem and only borrow the suffix of file_fmt
        im = self.load_page(0).get_cv_image(scale)
        return [xlcv.write(im, XlPath.init(srcfile.stem + os.path.splitext(file_fmt)[1], dst_dir))]

    def to_labelmes(self, imfiles, opt='dict', *, views=(0, 0, 1, 0), scale=1, indent=None):
        """ Write a labelme json next to each image; usually combined with to_images.

        :param imfiles: image files, where the i-th image corresponds to the i-th page
        :param opt: forwarded to the page's get_labelme_shapes
        :param views: forwarded to the page's get_labelme_shapes
        :param scale: forwarded to the page's get_labelme_shapes
        :param indent: forwarded to the json writer
        """
        for i, imfile in enumerate(imfiles):
            page = self.load_page(i)
            lmdict = LabelmeDict.gen_data(imfile)
            lmdict['shapes'] = page.get_labelme_shapes(opt, views=views, scale=scale)
            # Wrap in XlPath so that plain string paths work as well
            XlPath(imfile).with_suffix('.json').write(lmdict, indent=indent)

    def to_docx(self, docx_file=None):
        """ Convert the source pdf to docx with pdf2docx.

        :param docx_file: output file; defaults to the source path with a ``.docx`` suffix
        """
        check_install_package('pdf2docx')
        from pdf2docx import parse

        pdf_file = self.src_file

        if docx_file is None:
            docx_file = pdf_file.with_suffix('.docx')

        # Note from the original author: progress is reported through logging, not print.
        parse(str(pdf_file), str(docx_file))

    def browser(self, opt='pdf'):
        """ Open the document in the browser.

        :param opt:
            'pdf', open the source file itself
            'html', join the html text of all pages into a temp file and open that
        :return: the file that was opened
        :raises ValueError: for any other opt
        """
        if opt == 'pdf':
            f = self.src_file
        elif opt == 'html':
            pages_html = [self.load_page(i).get_text('html') for i in range(self.page_count)]
            data = '\n'.join(pages_html)
            etag = get_etag(data)
            f = XlPath.init(etag, XlPath.tempdir(), suffix='.html')
            f.write(data)
        else:
            raise ValueError(f'{opt}')
        browser(f)  # the module-level browser function, not this method
        return f

    def __getattr__(self, item):
        """ Delegate unknown attributes to the wrapped document. """
        # Only reached when normal lookup fails. If 'doc' itself is missing (for example
        # __init__ did not finish), delegating would recurse without end.
        if item == 'doc':
            raise AttributeError(item)
        return getattr(self.doc, item)
125
+
126
+
127
class XlFitzPage(fitz.fitz.Page):
    """ Extension methods for fitz.fitz.Page """

    def get_svg_image2(self, scale=1):
        """ Return the page as svg text, zoomed by ``scale``. """
        # svg is a descriptive text format
        if scale != 1:
            return self.get_svg_image(matrix=fitz.Matrix(scale, scale))
        return self.get_svg_image()

    def _get_png_data(self, scale=1):
        """ Render the page to a pixmap zoomed by ``scale`` and return ``pix.tobytes()``. """
        # TODO add an alpha channel?
        if scale != 1:
            pix = self.get_pixmap(matrix=fitz.Matrix(scale, scale))  # width and height times scale
        else:
            pix = self.get_pixmap()
        return pix.tobytes()

    def get_cv_image(self, scale=1):
        """ Render the page and decode it with ``xlcv.read_from_buffer``. """
        return xlcv.read_from_buffer(self._get_png_data(scale), flags=1)

    def get_pil_image(self, scale=1):
        """ Render the page and decode it with ``xlpil.read_from_buffer``. """
        # TODO could be optimised to build the pil image directly from the in-memory data
        return xlpil.read_from_buffer(self._get_png_data(scale), flags=1)

    def to_image(self, outfile, *, scale=1, if_exists=None):
        """ Save the page to a file.

        :param outfile: output path; a ``.svg`` suffix writes svg text, anything else a raster image
        :param scale: zoom factor of the exported image
        :param if_exists: forwarded to the underlying write call
        """
        f = XlPath(outfile)
        suffix = f.suffix.lower()

        if suffix == '.svg':
            # get_svg_image2 equals get_svg_image() for scale == 1 and honours scale otherwise
            content = self.get_svg_image2(scale)
            f.write(content, if_exists=if_exists)
        else:
            im = self.get_cv_image(scale)
            # The output path has to be passed, otherwise nothing says where to write
            xlcv.write(im, f, if_exists=if_exists)

    def get_labelme_shapes(self, opt='dict', *, views=1, scale=1):
        """ Build labelme style shapes from the text structure of the page.

        :param opt: argument for get_text; 'dict' by default, a compact dict without per-char data
            'rawdict' can be used as well and carries per-char data
        :param views: an int or a sequence; padded with 0 up to 4 entries
            The flags say whether to emit shapes for: blocks, lines, spans, chars
            By default only blocks are emitted
            For example (0, 0, 1, 0) emits only span shapes
        :param scale: factor applied to the bbox coordinates (if the page image was exported
            at twice the size, the annotations need the same factor)
        :return: list of shapes
        :raises ValueError: for a block whose type is neither 0 nor 1

        [Notes of the original author on the dict attributes]
        blocks:
            number: int, block number
            type: 0 for text, 1 for image
        lines:
            wmode: seems to always be 0, meaning unknown
            dir: [1, 0], probably the text direction
        spans:
            size: font size
            flags: format flags
                1, superscript
                2, italic
                4, serifed; when not set the opposite is "sans"
                8, monospaced; the opposite is proportional
                16, bold
            font: font name (plain string)
            color: colour
            ascender: ?
            descender: ?
            origin: top-right corner of the enclosing cell
            text/chars: 'dict' mode has text, 'rawdict' has chars details.
                In this extended version spans from 'rawdict' also get a text attribute.
        char:
            origin: roughly the top-right corner of its cell; tops are aligned within a line
            c: the character
        """
        from pyxlpr.data.labelme import LabelmeDict

        # 1 Parameters
        # Copy into a fresh list: tuples cannot be extended with a list, and a list passed
        # by the caller must not be modified.
        views = [views] if isinstance(views, int) else list(views)
        views += [0] * (4 - len(views))

        shapes = []
        page_dict = self.get_text(opt)

        # 2 Helper
        def add_shape(name, refdict, add_keys, drop_keys=('bbox',)):
            """ Append one shape built from refdict.

            The label is the json dump of a dict made of category_name, add_keys and
            refdict (merged with DictTool.ior) minus drop_keys (removed with DictTool.isub).
            """
            msgdict = {'category_name': name}
            msgdict.update(add_keys)
            DictTool.ior(msgdict, refdict)
            DictTool.isub(msgdict, drop_keys)
            bbox = [round_int(v * scale) for v in refdict['bbox']]

            if 'size' in msgdict:
                x = round_unit(msgdict['size'], 0.5)
                # Without a fractional part prefer 11 over 11.0
                msgdict['size'] = round_int(x) if (x * 10) % 10 < 1 else x
            if 'color' in msgdict:
                # Map the packed colour int to a readable (r, g, b).
                # Original author's note: the parsed colour may differ slightly from the source.
                v = msgdict['color']
                msgdict['color'] = (v // 256 // 256, (v // 256) % 256, v % 256)
            if 'origin' in msgdict:
                msgdict['origin'] = [round_int(v) for v in msgdict['origin']]

            sp = LabelmeDict.gen_shape(json.dumps(msgdict), bbox)
            shapes.append(sp)

        # 3 Walk the structure and collect the shapes
        for block in page_dict['blocks']:
            if block['type'] == 0:  # text block
                if views[0]:
                    add_shape('text_block', block, {'n_lines': len(block['lines'])}, ['bbox', 'lines'])
                for line in block['lines']:
                    if views[1]:
                        add_shape('line', line, {'n_spans': len(line['spans'])}, ['bbox', 'spans'])
                    for span in line['spans']:
                        if 'text' not in span and 'chars' in span:
                            span['text'] = ''.join([x['c'] for x in span['chars']])
                        if views[2]:
                            add_shape('span', span, {'n_chars': len(span.get('text', ''))}, ['bbox', 'chars'])
                        if views[3] and 'chars' in span:  # checked first so chars are skipped cheaply
                            for char in span['chars']:
                                add_shape('char', char, {}, ['bbox'])
            elif block['type'] == 1:  # image block
                add_shape('image', block, {'image_filesize': len(block['image'])}, ['bbox', 'image'])
            else:
                raise ValueError(f"unknown block type: {block['type']}")

        return shapes

    @classmethod
    def parse_flags(cls, n):
        """ Decode the flags value of a span into named booleans.

        :param n: flags value
        :return: result of decode_bitflags, plus 'sans' and 'proportional' as the negations
            of 'serifed' and 'monospaced'
        """
        flags = decode_bitflags(n, ('superscript', 'italic', 'serifed', 'monospaced', 'bold'))
        flags['sans'] = not flags['serifed']
        flags['proportional'] = not flags['monospaced']
        return flags

    def browser(self, opt='html'):
        """ Write the html text of the page to a temp file and open it in the browser.

        :param opt: only 'html' is supported
        :raises ValueError: for any other opt
        """
        if opt == 'html':
            # Original author's note: html/xhtml can be viewed as a web page, layout may be a bit off
            data = self.get_text('html')
            data = ''.join(data)
            etag = get_etag(data)
            f = XlPath.init(etag, XlPath.tempdir(), suffix='.html')
            f.write(data)
            browser(f)  # the module-level browser function, not this method
        else:
            raise ValueError(f'{opt}')


# NOTE(review): presumably copies the members defined on XlFitzPage onto fitz.fitz.Page,
# so that regular page objects gain the methods above; confirm against inject_members.
inject_members(XlFitzPage, fitz.fitz.Page)
279
+
280
+
281
class DemoFitz:
    """ Collection of small PyMuPDF usage demos.

    Install: pip install PyMuPdf
    Usage: import fitz
    Official docs: https://pymupdf.readthedocs.io/en/latest/intro/
    demo: https://github.com/rk700/PyMuPDF/tree/master/demo
    examples: https://github.com/rk700/PyMuPDF/tree/master/examples

    The snake_case PyMuPDF names are used throughout; the camelCase ones are deprecated aliases.
    """

    def __init__(self, file):
        """
        :param file: document to open with fitz.open
        """
        self.doc = fitz.open(file)

    def message(self):
        """ Print some basic information about the document. """
        dprint(fitz.version)  # version of the fitz module
        dprint(self.doc.page_count)  # number of pages
        dprint(self.doc.xref_length())  # length of the xref table

    def getToC(self):
        """ Show the table of contents (bookmarks). """
        toc = self.doc.get_toc()
        browser(toc)

    def setToC(self):
        """ Modify the table of contents.

        Levels, titles and target pages can all be changed; this demo renames the second entry
        and saves the result as ``a.pdf`` in the temp directory.
        """
        toc = self.doc.get_toc()
        toc[1][1] = '改标题名称'
        self.doc.set_toc(toc)
        # Join with '/', so the result is a file inside the temp directory
        file = str(XlPath.tempdir() / 'a.pdf')
        self.doc.save(file, garbage=4)
        browser(file)

    def setToC2(self):
        """ Rename the bookmarks of a textbook pdf to '<grade><volume>,<page count>'.

        Entries whose title contains '.' or does not match the grade/volume pattern are dropped.
        """
        toc = self.doc.get_toc()
        newtoc = []
        for i, entry in enumerate(toc):
            name = entry[1]
            if '.' in name:
                continue
            m = re.search(r'([一二三四五六]年级).*?([上下])', name)
            if m is None:
                # Not a grade/volume heading; skip it instead of failing on m.group
                continue
            # Page span up to the next entry, or to the end of the document for the last one
            if i < len(toc) - 1:
                pages = toc[i + 1][2] - entry[2] + 1
            else:
                pages = self.doc.page_count - entry[2] + 1
            entry[1] = m.group(1) + m.group(2) + ',' + str(pages)
            newtoc.append(entry)
        self.doc.set_toc(newtoc)
        file = writefile(b'', 'a.pdf', if_exists='replace')
        self.doc.save(file, garbage=4)

    def rearrange_pages(self):
        """ Rearrange the pages of the document. """
        self.doc.select([0, 0, 1])  # page 1 twice, followed by page 2
        file = writefile(b'', 'a.pdf', root=XlPath.tempdir(), if_exists='replace')
        # Original author's note: garbage must be set, otherwise removed content stays in the file
        self.doc.save(file, garbage=4)
        browser(file)

    def page2png(self, page=0):
        """ Render a single page.

        :param page: page index; per the original author it follows python indexing, -1 being the last page
        :return: the bytes returned by ``pix.tobytes()``
        """
        page = self.doc.load_page(page)
        # page.bound() would give the page rectangle as Rect(x0, y0, x1, y1)

        # Render at twice the size; page.get_svg_image() gives a vector image instead
        pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
        pngdata = pix.tobytes()

        return pngdata

    def pagetext(self):
        """ Show the text of the first page. """
        page = self.doc[0]

        # Plain text of the page; other supported options: html, dict, xml, xhtml, json
        text = page.get_text('text')
        dprint(text)

        # Same content as a dict, shown without its last block
        textdict = page.get_text('dict')
        textdict['blocks'] = textdict['blocks'][:-1]
        browser(pprint.pformat(textdict))

    def text(self):
        """ Return the text of all pages joined by newlines. """
        return '\n'.join([page.get_text('text') for page in self.doc])

    def xrefstr(self):
        """ Show the source of every object of the document. """
        n = self.doc.xref_length()
        # xref numbers start at 1
        objs = [self.doc.xref_object(i) for i in range(1, n)]
        browser('\n'.join(objs))

    def page_add_ele(self):
        """ Demo scaffold for adding elements to a page.

        Keeps only the first page, saves the result as ``a.pdf`` in the temp directory and opens it.
        The actual insertion is left as a commented example, so the saved file is just page 1.

        Difference of the xref strings before/after adding an element:
        https://paste.ubuntu.com/p/Dxhnzp4XJ2/
        """
        self.doc.select([0])
        page = self.doc.load_page(0)
        # example: page.insert_text(fitz.Point(100, 200), 'test\ntest')
        file = str(XlPath.tempdir() / 'a.pdf')
        dprint(file)
        self.doc.save(file, garbage=4)
        browser(file)
392
+
393
+
394
def __pdfminer():
    """ Scratch area for pdfminer experiments, kept in this module for now.

    Install with: pip install pdfminer.six
    """
    import pdfminer as _pdfminer

    version = _pdfminer.__version__
    print(version)
    # value noted by the original author: 20201018
403
+
404
+
405
class PdfMiner:
    """ Helpers built on pdfminer.six. """

    @classmethod
    def to_html(cls, pdf_file):
        """ Convert a pdf to html with pdfminer and open the result in the browser.

        Original author's notes: compared with fitz, pdfminer extracts underlines correctly,
        but overlapping text is worse than with fitz, so overall it is less convenient.

        :param pdf_file: path of the pdf; the html is written next to it with a ``.html`` suffix
        """
        from io import StringIO

        from pdfminer.high_level import extract_text_to_fp
        from pdfminer.layout import LAParams

        # Accept plain strings as well: with_suffix()/write() below need a path object.
        pdf_file = XlPath(pdf_file)

        output_string = StringIO()
        # pdfminer parses raw bytes, so the pdf must be opened in binary mode.
        with open(str(pdf_file), 'rb') as fin:
            extract_text_to_fp(fin, output_string, laparams=LAParams(),
                               output_type='html', codec=None)

        # Open the rebuilt html in the browser to inspect the result.
        f = pdf_file.with_suffix('.html')
        f.write(output_string.getvalue())
        browser(f)