pyxllib 0.0.43__py3-none-any.whl → 0.3.197__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyxllib/__init__.py +9 -2
- pyxllib/algo/__init__.py +8 -0
- pyxllib/algo/disjoint.py +54 -0
- pyxllib/algo/geo.py +541 -0
- pyxllib/{util/mathlib.py → algo/intervals.py} +172 -36
- pyxllib/algo/matcher.py +389 -0
- pyxllib/algo/newbie.py +166 -0
- pyxllib/algo/pupil.py +629 -0
- pyxllib/algo/shapelylib.py +67 -0
- pyxllib/algo/specialist.py +241 -0
- pyxllib/algo/stat.py +494 -0
- pyxllib/algo/treelib.py +149 -0
- pyxllib/algo/unitlib.py +66 -0
- pyxllib/autogui/__init__.py +5 -0
- pyxllib/autogui/activewin.py +246 -0
- pyxllib/autogui/all.py +9 -0
- pyxllib/autogui/autogui.py +852 -0
- pyxllib/autogui/uiautolib.py +362 -0
- pyxllib/autogui/virtualkey.py +102 -0
- pyxllib/autogui/wechat.py +827 -0
- pyxllib/autogui/wechat_msg.py +421 -0
- pyxllib/autogui/wxautolib.py +84 -0
- pyxllib/cv/__init__.py +1 -11
- pyxllib/cv/expert.py +267 -0
- pyxllib/cv/{imlib.py → imfile.py} +18 -83
- pyxllib/cv/imhash.py +39 -0
- pyxllib/cv/pupil.py +9 -0
- pyxllib/cv/rgbfmt.py +1525 -0
- pyxllib/cv/slidercaptcha.py +137 -0
- pyxllib/cv/trackbartools.py +163 -49
- pyxllib/cv/xlcvlib.py +1040 -0
- pyxllib/cv/xlpillib.py +423 -0
- pyxllib/data/__init__.py +0 -0
- pyxllib/data/echarts.py +240 -0
- pyxllib/data/jsonlib.py +89 -0
- pyxllib/{util/oss2_.py → data/oss.py} +11 -9
- pyxllib/data/pglib.py +1127 -0
- pyxllib/data/sqlite.py +568 -0
- pyxllib/{util → data}/sqllib.py +13 -31
- pyxllib/ext/JLineViewer.py +505 -0
- pyxllib/ext/__init__.py +6 -0
- pyxllib/{util → ext}/demolib.py +119 -35
- pyxllib/ext/drissionlib.py +277 -0
- pyxllib/ext/kq5034lib.py +12 -0
- pyxllib/{util/main.py → ext/old.py} +122 -284
- pyxllib/ext/qt.py +449 -0
- pyxllib/ext/robustprocfile.py +497 -0
- pyxllib/ext/seleniumlib.py +76 -0
- pyxllib/{util/tklib.py → ext/tk.py} +10 -11
- pyxllib/ext/unixlib.py +827 -0
- pyxllib/ext/utools.py +351 -0
- pyxllib/{util/webhooklib.py → ext/webhook.py} +45 -17
- pyxllib/ext/win32lib.py +40 -0
- pyxllib/ext/wjxlib.py +88 -0
- pyxllib/ext/wpsapi.py +124 -0
- pyxllib/ext/xlwork.py +9 -0
- pyxllib/ext/yuquelib.py +1105 -0
- pyxllib/file/__init__.py +17 -0
- pyxllib/file/docxlib.py +761 -0
- pyxllib/{util → file}/gitlib.py +40 -27
- pyxllib/file/libreoffice.py +165 -0
- pyxllib/file/movielib.py +148 -0
- pyxllib/file/newbie.py +10 -0
- pyxllib/file/onenotelib.py +1469 -0
- pyxllib/file/packlib/__init__.py +330 -0
- pyxllib/{util → file/packlib}/zipfile.py +598 -195
- pyxllib/file/pdflib.py +426 -0
- pyxllib/file/pupil.py +185 -0
- pyxllib/file/specialist/__init__.py +685 -0
- pyxllib/{basic/_5_dirlib.py → file/specialist/dirlib.py} +364 -93
- pyxllib/file/specialist/download.py +193 -0
- pyxllib/file/specialist/filelib.py +2829 -0
- pyxllib/file/xlsxlib.py +3131 -0
- pyxllib/file/xlsyncfile.py +341 -0
- pyxllib/prog/__init__.py +5 -0
- pyxllib/prog/cachetools.py +64 -0
- pyxllib/prog/deprecatedlib.py +233 -0
- pyxllib/prog/filelock.py +42 -0
- pyxllib/prog/ipyexec.py +253 -0
- pyxllib/prog/multiprogs.py +940 -0
- pyxllib/prog/newbie.py +451 -0
- pyxllib/prog/pupil.py +1197 -0
- pyxllib/{sitepackages.py → prog/sitepackages.py} +5 -3
- pyxllib/prog/specialist/__init__.py +391 -0
- pyxllib/prog/specialist/bc.py +203 -0
- pyxllib/prog/specialist/browser.py +497 -0
- pyxllib/prog/specialist/common.py +347 -0
- pyxllib/prog/specialist/datetime.py +199 -0
- pyxllib/prog/specialist/tictoc.py +240 -0
- pyxllib/prog/specialist/xllog.py +180 -0
- pyxllib/prog/xlosenv.py +108 -0
- pyxllib/stdlib/__init__.py +17 -0
- pyxllib/{util → stdlib}/tablepyxl/__init__.py +1 -3
- pyxllib/{util → stdlib}/tablepyxl/style.py +1 -1
- pyxllib/{util → stdlib}/tablepyxl/tablepyxl.py +2 -4
- pyxllib/text/__init__.py +8 -0
- pyxllib/text/ahocorasick.py +39 -0
- pyxllib/text/airscript.js +744 -0
- pyxllib/text/charclasslib.py +121 -0
- pyxllib/text/jiebalib.py +267 -0
- pyxllib/text/jinjalib.py +32 -0
- pyxllib/text/jsa_ai_prompt.md +271 -0
- pyxllib/text/jscode.py +922 -0
- pyxllib/text/latex/__init__.py +158 -0
- pyxllib/text/levenshtein.py +303 -0
- pyxllib/text/nestenv.py +1215 -0
- pyxllib/text/newbie.py +300 -0
- pyxllib/text/pupil/__init__.py +8 -0
- pyxllib/text/pupil/common.py +1121 -0
- pyxllib/text/pupil/xlalign.py +326 -0
- pyxllib/text/pycode.py +47 -0
- pyxllib/text/specialist/__init__.py +8 -0
- pyxllib/text/specialist/common.py +112 -0
- pyxllib/text/specialist/ptag.py +186 -0
- pyxllib/text/spellchecker.py +172 -0
- pyxllib/text/templates/echart_base.html +11 -0
- pyxllib/text/templates/highlight_code.html +17 -0
- pyxllib/text/templates/latex_editor.html +103 -0
- pyxllib/text/vbacode.py +17 -0
- pyxllib/text/xmllib.py +747 -0
- pyxllib/xl.py +39 -0
- pyxllib/xlcv.py +17 -0
- pyxllib-0.3.197.dist-info/METADATA +48 -0
- pyxllib-0.3.197.dist-info/RECORD +126 -0
- {pyxllib-0.0.43.dist-info → pyxllib-0.3.197.dist-info}/WHEEL +4 -5
- pyxllib/basic/_1_strlib.py +0 -945
- pyxllib/basic/_2_timelib.py +0 -488
- pyxllib/basic/_3_pathlib.py +0 -916
- pyxllib/basic/_4_loglib.py +0 -419
- pyxllib/basic/__init__.py +0 -54
- pyxllib/basic/arrow_.py +0 -250
- pyxllib/basic/chardet_.py +0 -66
- pyxllib/basic/dirlib.py +0 -529
- pyxllib/basic/dprint.py +0 -202
- pyxllib/basic/extension.py +0 -12
- pyxllib/basic/judge.py +0 -31
- pyxllib/basic/log.py +0 -204
- pyxllib/basic/pathlib_.py +0 -705
- pyxllib/basic/pytictoc.py +0 -102
- pyxllib/basic/qiniu_.py +0 -61
- pyxllib/basic/strlib.py +0 -761
- pyxllib/basic/timer.py +0 -132
- pyxllib/cv/cv.py +0 -834
- pyxllib/cv/cvlib/_1_geo.py +0 -543
- pyxllib/cv/cvlib/_2_cvprcs.py +0 -309
- pyxllib/cv/cvlib/_2_imgproc.py +0 -594
- pyxllib/cv/cvlib/_3_pilprcs.py +0 -80
- pyxllib/cv/cvlib/_4_cvimg.py +0 -211
- pyxllib/cv/cvlib/__init__.py +0 -10
- pyxllib/cv/debugtools.py +0 -82
- pyxllib/cv/fitz_.py +0 -300
- pyxllib/cv/installer.py +0 -42
- pyxllib/debug/_0_installer.py +0 -38
- pyxllib/debug/_1_typelib.py +0 -277
- pyxllib/debug/_2_chrome.py +0 -198
- pyxllib/debug/_3_showdir.py +0 -161
- pyxllib/debug/_4_bcompare.py +0 -140
- pyxllib/debug/__init__.py +0 -49
- pyxllib/debug/bcompare.py +0 -132
- pyxllib/debug/chrome.py +0 -198
- pyxllib/debug/installer.py +0 -38
- pyxllib/debug/showdir.py +0 -158
- pyxllib/debug/typelib.py +0 -278
- pyxllib/image/__init__.py +0 -12
- pyxllib/torch/__init__.py +0 -20
- pyxllib/torch/modellib.py +0 -37
- pyxllib/torch/trainlib.py +0 -344
- pyxllib/util/__init__.py +0 -20
- pyxllib/util/aip_.py +0 -141
- pyxllib/util/casiadb.py +0 -59
- pyxllib/util/excellib.py +0 -495
- pyxllib/util/filelib.py +0 -612
- pyxllib/util/jsondata.py +0 -27
- pyxllib/util/jsondata2.py +0 -92
- pyxllib/util/labelmelib.py +0 -139
- pyxllib/util/onepy/__init__.py +0 -29
- pyxllib/util/onepy/onepy.py +0 -574
- pyxllib/util/onepy/onmanager.py +0 -170
- pyxllib/util/pyautogui_.py +0 -219
- pyxllib/util/textlib.py +0 -1305
- pyxllib/util/unorder.py +0 -22
- pyxllib/util/xmllib.py +0 -639
- pyxllib-0.0.43.dist-info/METADATA +0 -39
- pyxllib-0.0.43.dist-info/RECORD +0 -80
- pyxllib-0.0.43.dist-info/top_level.txt +0 -1
- {pyxllib-0.0.43.dist-info → pyxllib-0.3.197.dist-info/licenses}/LICENSE +0 -0
pyxllib/file/pdflib.py
ADDED
@@ -0,0 +1,426 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
# @Author : 陈坤泽
|
4
|
+
# @Email : 877362867@qq.com
|
5
|
+
# @Date : 2020/06/02 16:06
|
6
|
+
|
7
|
+
from pyxllib.prog.pupil import check_install_package
|
8
|
+
|
9
|
+
# Runs before the `import fitz` further down; presumably makes sure the PyMuPdf
# distribution (which provides the `fitz` module) is installed - confirm against check_install_package.
check_install_package('fitz', 'PyMuPdf>=1.18.17')
|
10
|
+
|
11
|
+
import json
|
12
|
+
import os
|
13
|
+
import pprint
|
14
|
+
import re
|
15
|
+
|
16
|
+
import fitz
|
17
|
+
|
18
|
+
from pyxllib.prog.newbie import round_int, decode_bitflags
|
19
|
+
from pyxllib.prog.pupil import DictTool, inject_members, dprint
|
20
|
+
from pyxllib.prog.specialist import browser
|
21
|
+
from pyxllib.algo.newbie import round_unit
|
22
|
+
from pyxllib.prog.pupil import get_number_width
|
23
|
+
from pyxllib.file.specialist import XlPath, writefile, get_etag
|
24
|
+
from pyxllib.cv.expert import xlcv, xlpil
|
25
|
+
from pyxlpr.data.labelme import LabelmeDict
|
26
|
+
|
27
|
+
|
28
|
+
def __fitz():
    """ Print the docstring of the fitz module. """
    module_doc = fitz.__doc__
    print(module_doc)
|
30
|
+
|
31
|
+
|
32
|
+
class FitzDoc:
    """ Wrapper around a document opened with fitz.

    Originally named FitzPdf; renamed to FitzDoc because the processed file is not necessarily a pdf.
    Attribute lookups that fail on this object are forwarded to the wrapped fitz document
    (see __getattr__).
    """

    def __init__(self, file):
        """
        :param file: path of the document to open
        """
        self.src_file = XlPath(file)
        self.doc = fitz.open(str(file))

    def to_images(self, dst_dir=None, file_fmt='{filestem}_{number}.jpg', num_width=None, *,
                  scale=1, start=1, fmt_onepage=False):
        """ Render every page of the document to an image file

        :param dst_dir: target directory
            By default a single-page document is exported next to the source file, and a
            multi-page document into a sub-directory named after the source file's stem.
            Pass an explicit directory to bypass this automatic choice.
        :param file_fmt: name template of the exported files; its suffix selects the image type.
            Supported placeholders: {filestem} and {number}
        :param num_width: zero-padded field width used for the page number
            None (default): derived from the total page count
            0: no padding
        :param scale: zoom factor applied to each page (the original author recommends 2 for sharp output)
        :param start: number given to the first page
        :param fmt_onepage: whether a single-page document is still named through file_fmt
            By default a single page is saved as "<filestem><suffix of file_fmt>", without a number
        :return: list holding the value returned by xlcv.write for every exported page

        Note: to export one page only, use get_cv_image of the page object.
        """
        # 1 Basic parameters
        srcfile = self.src_file
        filestem, n_page = srcfile.stem, self.doc.page_count

        # Derive the target directory automatically
        if dst_dir is None:
            dst_dir = XlPath.init(filestem, srcfile.parent) if n_page > 1 else XlPath(srcfile.parent)
        os.makedirs(dst_dir, exist_ok=True)

        # Field width: only None triggers the automatic width, so an explicit 0 ("no padding") is honoured
        if num_width is None:
            num_width = get_number_width(n_page)

        # 2 Export the images
        if fmt_onepage or n_page != 1:  # numbered naming
            res = []
            for i in range(n_page):
                im = self.load_page(i).get_cv_image(scale)
                number = f'{i + start:0{num_width}d}'
                res.append(xlcv.write(im, XlPath.init(file_fmt.format(filestem=filestem, number=number), dst_dir)))
            return res

        # single page: keep the plain stem and borrow only the suffix of file_fmt
        im = self.load_page(0).get_cv_image(scale)
        return [xlcv.write(im, XlPath.init(filestem + os.path.splitext(file_fmt)[1], dst_dir))]

    def to_labelmes(self, imfiles, opt='dict', *, views=(0, 0, 1, 0), scale=1, indent=None):
        """ Write a labelme json file next to every image; usually combined with to_images

        :param imfiles: image files; the i-th image is annotated from the i-th page.
            Each item must offer with_suffix(...).write(...), as the code below relies on it.
        :param opt: forwarded to the page's get_labelme_shapes
        :param views: forwarded to the page's get_labelme_shapes
        :param scale: forwarded to the page's get_labelme_shapes
        :param indent: forwarded to the write call of the json file
        """
        for i, imfile in enumerate(imfiles):
            page = self.load_page(i)
            lmdict = LabelmeDict.gen_data(imfile)
            lmdict['shapes'] = page.get_labelme_shapes(opt, views=views, scale=scale)
            imfile.with_suffix('.json').write(lmdict, indent=indent)

    def to_docx(self, docx_file=None):
        """ Convert the source file to docx with pdf2docx

        :param docx_file: output file, defaults to the source file with a '.docx' suffix
        """
        check_install_package('pdf2docx')
        from pdf2docx import parse

        pdf_file = self.src_file
        if docx_file is None:
            docx_file = pdf_file.with_suffix('.docx')

        # Original author's note: progress is reported through logging, not through print.
        parse(str(pdf_file), str(docx_file))

    def browser(self, opt='pdf'):
        """ Open the document with the module-level browser helper

        :param opt:
            'pdf': open the source file itself
            'html': join the html text of all pages, store it in a temp file named by its etag, open that file
        :return: the file that was opened
        :raises ValueError: for any other opt
        """
        if opt == 'pdf':
            f = self.src_file
        elif opt == 'html':
            data = '\n'.join(self.load_page(i).get_text('html') for i in range(self.page_count))
            f = XlPath.init(get_etag(data), XlPath.tempdir(), suffix='.html')
            f.write(data)
        else:
            raise ValueError(f'{opt}')

        # inside a method the bare name resolves to the module-level browser function, not to this method
        browser(f)
        return f

    def __getattr__(self, item):
        """ Forward unknown attributes to the wrapped fitz document """
        # __getattr__ only runs after normal lookup failed.  If 'doc' itself is the missing
        # attribute (instance created without __init__), reading self.doc below would
        # re-enter this method without end, so fail with a regular AttributeError instead.
        if item == 'doc':
            raise AttributeError(item)
        return getattr(self.doc, item)
|
125
|
+
|
126
|
+
|
127
|
+
class XlFitzPage(fitz.fitz.Page):
    """ Extension methods for fitz.fitz.Page """

    def get_svg_image2(self, scale=1):
        """ Return the page as svg text (svg is a textual description), zoomed by scale """
        if scale == 1:
            return self.get_svg_image()
        return self.get_svg_image(matrix=fitz.Matrix(scale, scale))

    def _get_png_data(self, scale=1):
        """ Render the page to a pixmap, zoomed by scale, and return pix.tobytes() """
        # TODO add an alpha channel?
        if scale == 1:
            pix = self.get_pixmap()
        else:
            pix = self.get_pixmap(matrix=fitz.Matrix(scale, scale))  # width and height enlarged scale times
        return pix.tobytes()

    def get_cv_image(self, scale=1):
        """ Render the page and decode it with xlcv.read_from_buffer """
        return xlcv.read_from_buffer(self._get_png_data(scale), flags=1)

    def get_pil_image(self, scale=1):
        """ Render the page and decode it with xlpil.read_from_buffer """
        # TODO (original author) could be optimised: build the pil image straight from memory
        return xlpil.read_from_buffer(self._get_png_data(scale), flags=1)

    def to_image(self, outfile, *, scale=1, if_exists=None):
        """ Save the page to a file

        :param outfile: target file; a '.svg' suffix (case-insensitive) stores svg text,
            any other suffix stores the rendered image through xlcv.write
        :param scale: zoom factor, applied to both the svg and the raster output
        :param if_exists: forwarded to the underlying write call
        """
        f = XlPath(outfile)

        if f.suffix.lower() == '.svg':
            # use the scale-aware variant; for scale=1 this is the plain get_svg_image()
            f.write(self.get_svg_image2(scale), if_exists=if_exists)
        else:
            # the target file has to be handed to xlcv.write, otherwise nothing names the output
            xlcv.write(self.get_cv_image(scale), f, if_exists=if_exists)

    def get_labelme_shapes(self, opt='dict', *, views=1, scale=1):
        """ Build labelme style shapes from the text structure of the page

        :param opt: argument for get_text; 'dict' (default) is the compact form without per-character data,
            'rawdict' additionally carries per-character data
        :param views: an int or a sequence of flags, padded with 0 up to length 4.
            The flags select, in this order, which levels are emitted: blocks, lines, spans, chars.
            The default emits blocks only; (0, 0, 1, 0) for example emits spans only.
        :param scale: factor applied to every bbox (when the page image was exported at 2x,
            the annotations have to be scaled by 2 as well)
        :return: list of the shapes produced by LabelmeDict.gen_shape
        :raises ValueError: when a block has a type other than 0 or 1

        [Notes of the original author on the dict fields]
        blocks:
            number: int, block number
            type: 0 is text, 1 is image
        lines:
            wmode: seems to always be 0, meaning unknown to the original author
            dir: [1, 0], probably the text direction
        spans:
            size: font size
            flags: format flags
                1, superscript
                2, italic
                4, serifed; when unset the opposite is "sans"
                8, monospaced; the opposite is proportional
                16, bold
            font: font name
            color: colour
            ascender: ?
            descender: ?
            origin: described by the original author as the top-right corner of the cell - TODO confirm
            text/chars: 'dict' mode has text, 'rawdict' has chars; this implementation also fills
                text for 'rawdict' spans
        char:
            origin: see spans; per the original author the top position is aligned within one text line
            c: the character
        """
        from pyxlpr.data.labelme import LabelmeDict

        # 1 Parameters
        if isinstance(views, int):
            views = [views]
        # work on a private list: tuples can be padded and the caller's list is never modified
        views = list(views)
        views += [0] * (4 - len(views))

        shapes = []
        page_dict = self.get_text(opt)

        # 2 Helper
        def add_shape(name, refdict, add_keys, drop_keys=('bbox',)):
            """ Append one annotation box built from refdict """
            msgdict = {'category_name': name}
            msgdict.update(add_keys)
            DictTool.ior(msgdict, refdict)
            DictTool.isub(msgdict, drop_keys)
            bbox = [round_int(v * scale) for v in refdict['bbox']]

            if 'size' in msgdict:
                x = round_unit(msgdict['size'], 0.5)
                # without a fractional part show 11 rather than 11.0
                msgdict['size'] = round_int(x) if (x * 10) % 10 < 1 else x
            if 'color' in msgdict:
                # map the packed integer colour to a readable (r, g, b).
                # Original author's note: the parsed colour may deviate slightly from the source value.
                v = msgdict['color']
                msgdict['color'] = (v // 256 // 256, (v // 256) % 256, v % 256)
            if 'origin' in msgdict:
                msgdict['origin'] = [round_int(v) for v in msgdict['origin']]

            shapes.append(LabelmeDict.gen_shape(json.dumps(msgdict), bbox))

        # 3 Walk the structure and collect the annotations
        for block in page_dict['blocks']:
            if block['type'] == 0:  # text
                if views[0]:
                    add_shape('text_block', block, {'n_lines': len(block['lines'])}, ['bbox', 'lines'])
                for line in block['lines']:
                    if views[1]:
                        add_shape('line', line, {'n_spans': len(line['spans'])}, ['bbox', 'spans'])
                    for span in line['spans']:
                        if 'text' not in span and 'chars' in span:
                            span['text'] = ''.join([x['c'] for x in span['chars']])
                        if views[2]:
                            add_shape('span', span, {'n_chars': len(span.get('text', ''))}, ['bbox', 'chars'])
                        if views[3] and 'chars' in span:  # checked up front so spans without chars are skipped quickly
                            for char in span['chars']:
                                add_shape('char', char, {}, ['bbox'])
            elif block['type'] == 1:  # presumably an image
                add_shape('image', block, {'image_filesize': len(block['image'])}, ['bbox', 'image'])
            else:
                raise ValueError(f"unknown block type: {block['type']}")

        return shapes

    @classmethod
    def parse_flags(cls, n):
        """ Decode the integer 'flags' of a span into named booleans

        :return: result of decode_bitflags for superscript/italic/serifed/monospaced/bold,
            extended by 'sans' (not serifed) and 'proportional' (not monospaced)
        """
        flags = decode_bitflags(n, ('superscript', 'italic', 'serifed', 'monospaced', 'bold'))
        flags['sans'] = not flags['serifed']
        flags['proportional'] = not flags['monospaced']
        return flags

    def browser(self, opt='html'):
        """ Store the html text of the page in a temp file named by its etag and open it

        :raises ValueError: when opt is not 'html'
        """
        if opt != 'html':
            raise ValueError(f'{opt}')

        # html / xhtml output can be shown as a web page, although the layout is somewhat messy
        data = self.get_text('html')
        data = ''.join(data)
        f = XlPath.init(get_etag(data), XlPath.tempdir(), suffix='.html')
        f.write(data)
        # the bare name resolves to the module-level browser function, not to this method
        browser(f)
|
276
|
+
|
277
|
+
|
278
|
+
# Presumably copies the members defined on XlFitzPage onto fitz's own Page class, so that
# pages returned by fitz gain these methods - confirm against inject_members.
inject_members(XlFitzPage, fitz.fitz.Page)
|
279
|
+
|
280
|
+
|
281
|
+
class DemoFitz:
    """ Small demos of PyMuPDF (fitz) operations

    install: pip install PyMuPdf
    usage: import fitz
    official docs: https://pymupdf.readthedocs.io/en/latest/intro/
    demo: https://github.com/rk700/PyMuPDF/tree/master/demo
    examples: https://github.com/rk700/PyMuPDF/tree/master/examples

    The demos use the snake_case PyMuPDF names (page_count, get_toc, load_page, ...), in line
    with the rest of this module, instead of the deprecated camelCase aliases.
    """

    def __init__(self, file):
        """
        :param file: document to open with fitz.open
        """
        self.doc = fitz.open(file)

    def message(self):
        """ Show some basic information about the document """
        dprint(fitz.version)  # version of the fitz module
        dprint(self.doc.page_count)  # number of pages
        dprint(self.doc.xref_length())  # length of the xref table (number of objects)

    def getToC(self):
        """ Show the bookmarks (table of contents) """
        toc = self.doc.get_toc()
        browser(toc)

    def setToC(self):
        """ Modify the bookmarks

        Level, title and target page of an entry can all be changed; this demo renames the second entry.
        """
        toc = self.doc.get_toc()
        toc[1][1] = '改标题名称'
        self.doc.set_toc(toc)
        # join with '/': passing the temp dir as a second path segment after 'a.pdf'
        # would discard 'a.pdf' whenever the temp dir is absolute
        file = str(XlPath.tempdir() / 'a.pdf')
        self.doc.save(file, garbage=4)
        browser(file)

    def setToC2(self):
        """ Rename the bookmarks of a textbook pdf (demo for one specific document)

        Entries whose title contains '.' are dropped; the others are renamed to
        "<grade><volume>,<pages>" when the title matches the grade/volume pattern.
        """
        toc = self.doc.get_toc()
        newtoc = []
        last = len(toc) - 1
        for i, entry in enumerate(toc):
            name = entry[1]
            if '.' in name:
                continue
            m = re.search(r'([一二三四五六]年级).*?([上下])', name)
            if i < last:
                pages = toc[i + 1][2] - entry[2] + 1
            else:
                pages = self.doc.page_count - entry[2] + 1
            if m:  # titles that do not match keep their original name instead of crashing on m.group
                entry[1] = m.group(1) + m.group(2) + ',' + str(pages)
            newtoc.append(entry)
        self.doc.set_toc(newtoc)
        file = writefile(b'', 'a.pdf', if_exists='replace')
        self.doc.save(file, garbage=4)

    def rearrange_pages(self):
        """ Rearrange the pages """
        self.doc.select([0, 0, 1])  # page 1 twice, followed by page 2
        file = writefile(b'', 'a.pdf', root=XlPath.tempdir(), if_exists='replace')
        # Original author's note: garbage must be set, otherwise the dropped content is not
        # actually removed and the file does not shrink.
        self.doc.save(file, garbage=4)
        browser(file)

    def page2png(self, page=0):
        """ Render a single page

        :param page: page index; per the original author it follows python indexing, -1 being the last page
        :return: bytes returned by pix.tobytes() for the page rendered at 2x
        """
        page = self.doc.load_page(page)
        # page.bound() would give the page rectangle, Rect(x0, y0, x1, y1)

        # Pixmap of the page; page.get_svg_image() gives a vector image instead
        pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
        pngdata = pix.tobytes()
        return pngdata

    def pagetext(self):
        """ Show the text of the first page """
        page = self.doc[0]

        # all text of the page; other supported arguments: html, dict, xml, xhtml, json
        text = page.get_text('text')
        dprint(text)

        # all text of the page as a dict object (the last block is dropped before display)
        textdict = page.get_text('dict')
        textdict['blocks'] = textdict['blocks'][:-1]
        browser(pprint.pformat(textdict))

    def text(self):
        """ Return the text of the whole document, pages joined by a newline """
        return '\n'.join([page.get_text('text') for page in self.doc])

    def xrefstr(self):
        """ Show all objects of the document """
        n = self.doc.xref_length()
        # xref numbers start at 1
        xrefstr = [self.doc.xref_object(i) for i in range(1, n)]
        browser('\n'.join(xrefstr))

    def page_add_ele(self):
        """ Add elements to a page

        Difference of xrefstr before/after adding an element: https://paste.ubuntu.com/p/Dxhnzp4XJ2/
        """
        self.doc.select([0])
        page = self.doc.load_page(0)
        # page.insert_text(fitz.Point(100, 200), 'test\ntest')
        file = str(XlPath.tempdir() / 'a.pdf')
        dprint(file)
        self.doc.save(file, garbage=4)
        browser(file)
|
392
|
+
|
393
|
+
|
394
|
+
def __pdfminer():
    """ Scratch code for pdfminer experiments, parked here for now

    !pip install pdfminer.six
    """
    import pdfminer

    version = pdfminer.__version__
    print(version)
    # original note next to the print: 20201018
|
403
|
+
|
404
|
+
|
405
|
+
class PdfMiner:
    @classmethod
    def to_html(cls, pdf_file):
        """ Convert a pdf to html with pdfminer, write it next to the pdf and open it

        Original author's notes: compared with fitz, pdfminer extracts underlines correctly,
        but overlapping text is worse, so overall it is less usable.

        :param pdf_file: path of the pdf; the html is written to the same path with a '.html' suffix
        """
        from io import StringIO

        from pdfminer.high_level import extract_text_to_fp
        from pdfminer.layout import LAParams

        # accept plain strings as well: with_suffix/write below need a path object
        pdf_file = XlPath(pdf_file)

        output_string = StringIO()
        # a pdf is binary data, pdfminer has to read it from a binary stream
        with open(pdf_file, 'rb') as fin:
            extract_text_to_fp(fin, output_string, laparams=LAParams(),
                               output_type='html', codec=None)

        # open the rebuilt html in the browser
        f = pdf_file.with_suffix('.html')
        f.write(output_string.getvalue())
        browser(f)
|
pyxllib/file/pupil.py
ADDED
@@ -0,0 +1,185 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
# @Author : 陈坤泽
|
4
|
+
# @Email : 877362867@qq.com
|
5
|
+
# @Date : 2021/06/03 21:17
|
6
|
+
|
7
|
+
import os
|
8
|
+
import re
|
9
|
+
import shutil
|
10
|
+
import struct
|
11
|
+
|
12
|
+
|
13
|
+
def struct_unpack(f, fmt):
    r""" Read and parse binary data, similar in spirit to np.fromfile

    :param f:
        an object with a read method: exactly the number of bytes required by fmt is read from it
        anything else (e.g. bytes): sliced directly, extra trailing data is ignored
    :param fmt: struct format string
        Without a prefix the struct module's default byte order applies; the examples below
        show little-endian results, (2, 1, 0, 0) -> 258.  Prefix with '>' for big-endian.
        1 byte: c=char, b=signed char, B=unsigned char, ?=bool
        2 byte integer: h=short, H=unsigned short (upper case is the unsigned variant, same below)
        4 byte integer: i, I, l, L
        8 byte integer: q, Q
        float: e=2 bytes, f=4 bytes, d=8 bytes
    :return: the single value when fmt describes exactly one value, otherwise the tuple of values
    :raises ValueError: when the read method delivers fewer bytes than fmt requires

    >>> b = struct.pack('B', 127)
    >>> b
    b'\x7f'
    >>> struct_unpack(b, 'c')
    b'\x7f'
    >>> struct_unpack(b, 'B')
    127

    >>> b = struct.pack('I', 258)
    >>> b
    b'\x02\x01\x00\x00'
    >>> struct_unpack(b, 'I')  # packed and parsed with the same default byte order
    258
    >>> struct_unpack(b, '>I')  # counter example: the value when parsed as big-endian
    33619968
    >>> struct_unpack(b, 'H'*2)  # parse two values, simply repeat the format
    (258, 0)

    >>> f = io.BytesIO(b'\x02\x01\x03\x04')
    >>> struct_unpack(f, 'B'*3)  # first 3 values, like np.fromfile(f, dtype='uint8', count=3)
    (2, 1, 3)
    >>> struct_unpack(f, 'B')  # the 4th value
    4
    """
    # 1 Fetch the raw bytes
    need = struct.calcsize(fmt)
    if not hasattr(f, 'read'):
        # bytes-like input may be longer than needed; only the leading part is parsed
        raw = f[:need]
    else:
        raw = f.read(need)
        if len(raw) < need:
            raise ValueError(f'剩余数据长度 {len(raw)} 小于 fmt 需要的长度 {need}')

    # 2 Parse; a lone value is returned bare instead of as a 1-tuple
    values = struct.unpack(fmt, raw)
    return values[0] if len(values) == 1 else values
|
66
|
+
|
67
|
+
|
68
|
+
def recreate_folders(*dsts):
    """ Recreate every given directory as an empty directory

    :param dsts: directories to reset; an argument that makes the calls below raise
        TypeError is skipped silently
    """
    for folder in dsts:
        try:
            # remove the directory with its content; ignore_errors covers the "does not exist" case
            shutil.rmtree(folder, ignore_errors=True)
            # makedirs rather than mkdir, because parent levels may be missing too
            os.makedirs(folder)
        except TypeError:
            continue
|
77
|
+
|
78
|
+
|
79
|
+
def checkpathfile(name):
    r""" Look for a file called name in the directories of the PATH environment variable

    Typical names: BCompare.exe, Chrome.exe, mogrify.exe, xelatex.exe

    :param name: file name to look for
    :return: the first "<PATH entry>/<name>" that exists, None when there is none

    >> checkpathfile('xelatex.exe')
    'C:\\CTEX\\MiKTeX\\miktex\\bin\\xelatex.exe'
    >> checkpathfile('abcd.exe')
    """
    # 'PATH' in upper case works on every platform (environment names are case-sensitive
    # outside Windows), and os.pathsep is ';' on Windows but ':' elsewhere.
    for folder in os.environ.get('PATH', '').split(os.pathsep):
        if not folder:  # an empty entry would silently test the current directory
            continue
        candidate = os.path.join(folder, name)
        if os.path.exists(candidate):
            return candidate
    return None
|
92
|
+
|
93
|
+
|
94
|
+
def filename_tail(fn, tail):
    """ Insert tail between the end of the file name and its extension """
    stem, ext = os.path.splitext(fn)
    return stem + tail + ext
|
98
|
+
|
99
|
+
|
100
|
+
def hasext(f, *exts):
    """ Check whether the extension of file f is one of exts

    The comparison is done in lower case on both sides.

    :return: the lower-cased extension of f when it is one of exts, otherwise False
    """
    ext = os.path.splitext(f)[1].lower()
    candidates = tuple(e.lower() for e in exts)
    return ext if ext in candidates else False
|
111
|
+
|
112
|
+
|
113
|
+
def isdir(fn):
    """ Tell whether fn is a valid path that exists and is a directory

    :return: result of os.path.isdir, or False when that call raises ValueError
        (e.g. over-long name) or TypeError (input of an unsupported type)
    """
    try:
        result = os.path.isdir(fn)
    except (ValueError, TypeError):
        return False
    return result
|
121
|
+
|
122
|
+
|
123
|
+
__mygetfiles = """
|
124
|
+
py有os.walk可以递归遍历得到一个目录下的所有文件
|
125
|
+
但是“我们”常常要过滤掉备份文件(171020-153959),Old、temp目、.git等目录
|
126
|
+
特别是windows还有一个很坑爹的$RECYCLE.BIN目录。
|
127
|
+
所以在os.walk的基础上,再做了封装得到myoswalk。
|
128
|
+
|
129
|
+
然后在myoswalk基础上,实现mygetfiles。
|
130
|
+
"""
|
131
|
+
|
132
|
+
|
133
|
+
def gen_file_filter(s):
    """ Build a file name filter function from a rule string

    :param s: a rule starting with '.' is treated as a suffix and tested with str.endswith;
        any other rule is used as a regular expression with re.search
    :return: a function taking a file name; its result is truthy when the name matches
    """
    # startswith instead of s[0], so an empty rule does not raise IndexError
    # (an empty regex matches every name)
    if s.startswith('.'):
        return lambda x: x.endswith(s)

    # NOTE(review): the original comment says a *Chinese* question mark stands for any Chinese
    # character, yet the literal replaced here is the ASCII '?', which also rewrites regex
    # quantifiers such as 'a?' or '.*?'.  Kept unchanged for compatibility - confirm the intended character.
    pattern = s.replace('?', r'[\u4e00-\u9fa5]')
    return lambda x: re.search(pattern, x)
|
140
|
+
|
141
|
+
|
142
|
+
def getfiles(root, filter_rule=None):
    r""" Thin wrapper around os.walk that yields the path of every matching file

    Walk all files below a directory like this:
        for f in getfiles(r'C:\pycode\code4101py', r'.py'):
            print(f)

    :param root: directory to walk
    :param filter_rule: None (no filtering), a function taking the bare file name, or a rule
        string that is converted with gen_file_filter ('.' + suffix, or a regular expression)
    :return: generator over the paths, directories visited top-down
    """
    if isinstance(filter_rule, str):
        filter_rule = gen_file_filter(filter_rule)

    # The filter must not be handed to os.walk: its second parameter is `topdown`, so the
    # traversal order used to depend on whether a filter was supplied.
    for folder, _, files in os.walk(root):
        for name in files:
            if filter_rule and not filter_rule(name):
                continue
            # os.path.join uses the separator of the running platform instead of a fixed '\\'
            yield os.path.join(folder, name)
|
158
|
+
|
159
|
+
|
160
|
+
def tex_content_filefilter(f):
    """ Accept only content tex files: '.tex' names containing neither 'Conf' nor 'settings' """
    is_tex = f.endswith('.tex')
    is_conf = 'Conf' in f or 'settings' in f
    return is_tex and not is_conf
|
166
|
+
|
167
|
+
|
168
|
+
def tex_conf_filefilter(f):
    """ Accept only configuration tex files: '.tex' names containing 'Conf' or 'settings' """
    if not f.endswith('.tex'):
        return False
    return 'Conf' in f or 'settings' in f
|
174
|
+
|
175
|
+
|
176
|
+
def change_ext(filename, ext):
    """ Swap the extension of a file name

    :param filename: may come without an extension, e.g. 'A/B/C/a'; the name with
        extension ext is then still built and checked
    :param ext: new extension without the leading '.', e.g. 'tex' or 'txt'
    :return: (new file name, whether that file exists)
    """
    stem = os.path.splitext(filename)[0]  # 'A/B/C/a.txt' --> 'A/B/C/a'
    target = stem + '.' + ext
    return target, os.path.exists(target)
|