pyxllib 0.3.197__py3-none-any.whl → 0.3.200__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyxllib/__init__.py +21 -21
- pyxllib/algo/__init__.py +8 -8
- pyxllib/algo/disjoint.py +54 -54
- pyxllib/algo/geo.py +541 -541
- pyxllib/algo/intervals.py +964 -964
- pyxllib/algo/matcher.py +389 -389
- pyxllib/algo/newbie.py +166 -166
- pyxllib/algo/pupil.py +629 -629
- pyxllib/algo/shapelylib.py +67 -67
- pyxllib/algo/specialist.py +241 -241
- pyxllib/algo/stat.py +494 -494
- pyxllib/algo/treelib.py +149 -149
- pyxllib/algo/unitlib.py +66 -66
- pyxllib/autogui/__init__.py +5 -5
- pyxllib/autogui/activewin.py +246 -246
- pyxllib/autogui/all.py +9 -9
- pyxllib/autogui/autogui.py +852 -852
- pyxllib/autogui/uiautolib.py +362 -362
- pyxllib/autogui/virtualkey.py +102 -102
- pyxllib/autogui/wechat.py +827 -827
- pyxllib/autogui/wechat_msg.py +421 -421
- pyxllib/autogui/wxautolib.py +84 -84
- pyxllib/cv/__init__.py +5 -5
- pyxllib/cv/expert.py +267 -267
- pyxllib/cv/imfile.py +159 -159
- pyxllib/cv/imhash.py +39 -39
- pyxllib/cv/pupil.py +9 -9
- pyxllib/cv/rgbfmt.py +1525 -1525
- pyxllib/cv/slidercaptcha.py +137 -137
- pyxllib/cv/trackbartools.py +251 -251
- pyxllib/cv/xlcvlib.py +1040 -1040
- pyxllib/cv/xlpillib.py +423 -423
- pyxllib/data/echarts.py +240 -240
- pyxllib/data/jsonlib.py +89 -89
- pyxllib/data/oss.py +72 -72
- pyxllib/data/pglib.py +1127 -1127
- pyxllib/data/sqlite.py +568 -568
- pyxllib/data/sqllib.py +297 -297
- pyxllib/ext/JLineViewer.py +505 -505
- pyxllib/ext/__init__.py +6 -6
- pyxllib/ext/demolib.py +246 -246
- pyxllib/ext/drissionlib.py +277 -277
- pyxllib/ext/kq5034lib.py +12 -12
- pyxllib/ext/old.py +663 -663
- pyxllib/ext/qt.py +449 -449
- pyxllib/ext/robustprocfile.py +497 -497
- pyxllib/ext/seleniumlib.py +76 -76
- pyxllib/ext/tk.py +173 -173
- pyxllib/ext/unixlib.py +827 -827
- pyxllib/ext/utools.py +351 -351
- pyxllib/ext/webhook.py +124 -119
- pyxllib/ext/win32lib.py +40 -40
- pyxllib/ext/wjxlib.py +88 -88
- pyxllib/ext/wpsapi.py +124 -124
- pyxllib/ext/xlwork.py +9 -9
- pyxllib/ext/yuquelib.py +1105 -1105
- pyxllib/file/__init__.py +17 -17
- pyxllib/file/docxlib.py +761 -761
- pyxllib/file/gitlib.py +309 -309
- pyxllib/file/libreoffice.py +165 -165
- pyxllib/file/movielib.py +148 -148
- pyxllib/file/newbie.py +10 -10
- pyxllib/file/onenotelib.py +1469 -1469
- pyxllib/file/packlib/__init__.py +330 -330
- pyxllib/file/packlib/zipfile.py +2441 -2441
- pyxllib/file/pdflib.py +426 -426
- pyxllib/file/pupil.py +185 -185
- pyxllib/file/specialist/__init__.py +685 -685
- pyxllib/file/specialist/dirlib.py +799 -799
- pyxllib/file/specialist/download.py +193 -193
- pyxllib/file/specialist/filelib.py +2829 -2829
- pyxllib/file/xlsxlib.py +3131 -3131
- pyxllib/file/xlsyncfile.py +341 -341
- pyxllib/prog/__init__.py +5 -5
- pyxllib/prog/cachetools.py +64 -64
- pyxllib/prog/deprecatedlib.py +233 -233
- pyxllib/prog/filelock.py +42 -42
- pyxllib/prog/ipyexec.py +253 -253
- pyxllib/prog/multiprogs.py +940 -940
- pyxllib/prog/newbie.py +451 -451
- pyxllib/prog/pupil.py +1197 -1197
- pyxllib/prog/sitepackages.py +33 -33
- pyxllib/prog/specialist/__init__.py +391 -391
- pyxllib/prog/specialist/bc.py +203 -203
- pyxllib/prog/specialist/browser.py +497 -497
- pyxllib/prog/specialist/common.py +347 -347
- pyxllib/prog/specialist/datetime.py +198 -198
- pyxllib/prog/specialist/tictoc.py +240 -240
- pyxllib/prog/specialist/xllog.py +180 -180
- pyxllib/prog/xlosenv.py +108 -108
- pyxllib/stdlib/__init__.py +17 -17
- pyxllib/stdlib/tablepyxl/__init__.py +10 -10
- pyxllib/stdlib/tablepyxl/style.py +303 -303
- pyxllib/stdlib/tablepyxl/tablepyxl.py +130 -130
- pyxllib/text/__init__.py +8 -8
- pyxllib/text/ahocorasick.py +39 -39
- pyxllib/text/airscript.js +744 -744
- pyxllib/text/charclasslib.py +121 -121
- pyxllib/text/jiebalib.py +267 -267
- pyxllib/text/jinjalib.py +32 -32
- pyxllib/text/jsa_ai_prompt.md +271 -271
- pyxllib/text/jscode.py +922 -922
- pyxllib/text/latex/__init__.py +158 -158
- pyxllib/text/levenshtein.py +303 -303
- pyxllib/text/nestenv.py +1215 -1215
- pyxllib/text/newbie.py +300 -300
- pyxllib/text/pupil/__init__.py +8 -8
- pyxllib/text/pupil/common.py +1121 -1121
- pyxllib/text/pupil/xlalign.py +326 -326
- pyxllib/text/pycode.py +47 -47
- pyxllib/text/specialist/__init__.py +8 -8
- pyxllib/text/specialist/common.py +112 -112
- pyxllib/text/specialist/ptag.py +186 -186
- pyxllib/text/spellchecker.py +172 -172
- pyxllib/text/templates/echart_base.html +10 -10
- pyxllib/text/templates/highlight_code.html +16 -16
- pyxllib/text/templates/latex_editor.html +102 -102
- pyxllib/text/vbacode.py +17 -17
- pyxllib/text/xmllib.py +747 -747
- pyxllib/xl.py +42 -39
- pyxllib/xlcv.py +17 -17
- {pyxllib-0.3.197.dist-info → pyxllib-0.3.200.dist-info}/METADATA +1 -1
- pyxllib-0.3.200.dist-info/RECORD +126 -0
- {pyxllib-0.3.197.dist-info → pyxllib-0.3.200.dist-info}/licenses/LICENSE +190 -190
- pyxllib-0.3.197.dist-info/RECORD +0 -126
- {pyxllib-0.3.197.dist-info → pyxllib-0.3.200.dist-info}/WHEEL +0 -0
pyxllib/cv/expert.py
CHANGED
@@ -1,267 +1,267 @@
|
|
1
|
-
#!/usr/bin/env python3
|
2
|
-
# -*- coding: utf-8 -*-
|
3
|
-
# @Author : 陈坤泽
|
4
|
-
# @Email : 877362867@qq.com
|
5
|
-
# @Date : 2021/08/25 15:57
|
6
|
-
|
7
|
-
from collections import defaultdict
|
8
|
-
import concurrent.futures
|
9
|
-
|
10
|
-
import cv2
|
11
|
-
import pandas as pd
|
12
|
-
from tqdm import tqdm
|
13
|
-
|
14
|
-
import PIL.Image
|
15
|
-
|
16
|
-
from pyxllib.algo.stat import update_dataframes_to_excel
|
17
|
-
from pyxllib.file.specialist import get_etag, XlPath
|
18
|
-
from pyxllib.prog.specialist import Iterate
|
19
|
-
from pyxllib.cv.xlcvlib import CvImg, xlcv
|
20
|
-
from pyxllib.cv.xlpillib import PilImg, xlpil
|
21
|
-
|
22
|
-
|
23
|
-
def __1_目录级处理图片的功能():
|
24
|
-
pass
|
25
|
-
|
26
|
-
|
27
|
-
class ImagesDir(XlPath):
|
28
|
-
""" 这个函数功能,默认都是原地操作,如果怕以防万一出问题,最好对原始数据有另外的备份,而在新的目录里操作 """
|
29
|
-
|
30
|
-
def debug_image_func(self, func, pattern='*', *, save=None, show=False):
|
31
|
-
"""
|
32
|
-
:param func: 对每张图片执行的功能,函数应该只有一个图片路径参数 new_img = func(img)
|
33
|
-
当函数有多个参数时,可以用lambda函数技巧: lambda im: func(im, arg1=..., arg2=...)
|
34
|
-
:param save: 如果输入一个目录,会将debug结果图存储到对应的目录里
|
35
|
-
:param show: 如果该参数为True,则每处理一张会imshow显示处理效果
|
36
|
-
此时弹出的窗口里,每按任意键则显示下一张,按ESC退出
|
37
|
-
:return:
|
38
|
-
|
39
|
-
TODO 显示原图、处理后图的对比效果
|
40
|
-
TODO 支持同时显示多张图处理效果
|
41
|
-
"""
|
42
|
-
if save:
|
43
|
-
save = XlPath(save)
|
44
|
-
|
45
|
-
for f in self.glob_images(pattern):
|
46
|
-
im1 = xlcv.read(f)
|
47
|
-
im2 = func(im1)
|
48
|
-
|
49
|
-
if save:
|
50
|
-
xlcv.write(im2, self / save / f.name)
|
51
|
-
|
52
|
-
if show:
|
53
|
-
xlcv.imshow2(im2)
|
54
|
-
key = cv2.waitKey()
|
55
|
-
if key == '0x1B': # ESC 键
|
56
|
-
break
|
57
|
-
|
58
|
-
def fix_suffixs(self, pattern='**/*', log_file='_图片统计.xlsx', max_workers=None, pinterval=None):
|
59
|
-
""" 修正错误的后缀名
|
60
|
-
|
61
|
-
:param pinterval: 支持智能地判断进度间隔
|
62
|
-
"""
|
63
|
-
|
64
|
-
# 1 修改后缀
|
65
|
-
# 定义并行处理子函数
|
66
|
-
def process_image_file(args):
|
67
|
-
""" 处理单个图片文件,修正后缀名 """
|
68
|
-
file, ext = args
|
69
|
-
xlcv.write(xlcv.read(file), file) # 读取图片,并按照原本文件名期望的格式存储
|
70
|
-
ls.append([file.relpath(self).as_posix(), ext])
|
71
|
-
|
72
|
-
ls = []
|
73
|
-
files_with_exts = list(self.xglob_faker_suffix_images(pattern))
|
74
|
-
if pinterval is None and files_with_exts:
|
75
|
-
p = max(1000 * 100 // len(files_with_exts), 1) # 最小也按1%进度展示
|
76
|
-
if p < 50: # 间隔只有小余50%,才比较有显示的意义
|
77
|
-
pinterval = f'{p}%' # 每1千张显示进度
|
78
|
-
Iterate(files_with_exts).run(process_image_file, max_workers=max_workers, pinterval=pinterval)
|
79
|
-
|
80
|
-
# 2 记录修改情况
|
81
|
-
df = pd.DataFrame.from_records(ls, columns=['图片名', '原图片类型'])
|
82
|
-
if log_file:
|
83
|
-
update_dataframes_to_excel(XlPath.init(log_file, self), {'修改后缀名': df})
|
84
|
-
return df
|
85
|
-
|
86
|
-
def reduce_image_filesize(self, pattern='**/*',
|
87
|
-
limit_size=4 * 1024 ** 2, *,
|
88
|
-
read_flags=None,
|
89
|
-
change_length=False,
|
90
|
-
suffix=None,
|
91
|
-
log_file='_图片统计.xlsx',
|
92
|
-
max_workers=None, pinterval=None):
|
93
|
-
""" 减小图片尺寸,可以限制目录里尺寸最大的图片不超过多少
|
94
|
-
|
95
|
-
:param limit_size: 限制的尺寸
|
96
|
-
一般自己的相册图片,亲测300kb其实就够了~~,即 300 * 1024
|
97
|
-
百度API那边,好像不同接口不太一样,4M、6M、10M等好像都有
|
98
|
-
但百度那是base64后的尺寸,会大出1/3
|
99
|
-
为了够用,一般要限定在4M等比例的3/4比例内
|
100
|
-
:param read_flags: 读取图片时的参数,设为1,可以把各种RGBA等奇怪的格式,统一为RGB
|
101
|
-
:param change_length: 默认是要减小图片的边长,尺寸,来压缩图片的
|
102
|
-
可以设为False,不调整尺寸,纯粹读取后再重写,可能也能压缩不少尺寸
|
103
|
-
:param suffix: 可以统一图片后缀格式,默认保留原图片名称
|
104
|
-
要带前缀'.',例如'.jpg'
|
105
|
-
注意其他格式的原图会被删除
|
106
|
-
|
107
|
-
因为所有图片都会读入后再重新写入,速度可能会稍慢
|
108
|
-
"""
|
109
|
-
|
110
|
-
# 1 调试信息
|
111
|
-
print('原始大小', self.size(human_readable=True))
|
112
|
-
|
113
|
-
# 2 精简图片尺寸
|
114
|
-
# 定义并行处理子函数
|
115
|
-
def process_image_file(f):
|
116
|
-
"""处理单个图片文件,减小图片尺寸"""
|
117
|
-
size1 = f.size()
|
118
|
-
im = xlpil.read(f, read_flags)
|
119
|
-
_suffix = suffix or f.suffix
|
120
|
-
if change_length:
|
121
|
-
im = xlpil.reduce_filesize(im, limit_size, _suffix)
|
122
|
-
size2 = xlpil.evaluate_image_file_size(im, _suffix)
|
123
|
-
dst_f = f.with_suffix(_suffix)
|
124
|
-
if size2 < size1: # 只有文件尺寸确实变小的才更新
|
125
|
-
xlpil.write(im, dst_f)
|
126
|
-
if f.suffix != _suffix:
|
127
|
-
f.delete()
|
128
|
-
ls.append([f.relpath(self).as_posix(), dst_f.relpath(self).as_posix(), size1, size2])
|
129
|
-
|
130
|
-
ls = []
|
131
|
-
files = list(self.glob_images(pattern))
|
132
|
-
if pinterval is None and files:
|
133
|
-
p = max(100 * 100 // len(files), 1) # 最小也按1%进度展示
|
134
|
-
if p < 50: # 间隔只有小余50%,才比较有显示的意义
|
135
|
-
pinterval = f'{p}%' # 每1千张显示进度
|
136
|
-
Iterate(files).run(process_image_file, max_workers=max_workers, pinterval=pinterval)
|
137
|
-
|
138
|
-
print('新目录大小', self.size(human_readable=True))
|
139
|
-
|
140
|
-
# 3 记录修改细节
|
141
|
-
# 注意,如果不使用suffix参数,'新图片'的值应该跟'原图片'是一样的
|
142
|
-
# 以及当尝试精简的'新文件大小'大于'原文件大小'时,图片其实是不会被覆盖更新的
|
143
|
-
df = pd.DataFrame.from_records(ls, columns=['原图片', '新图片', '原文件大小', '新文件大小'])
|
144
|
-
if log_file:
|
145
|
-
update_dataframes_to_excel(XlPath.init(log_file, self), {'图片瘦身': df})
|
146
|
-
return df
|
147
|
-
|
148
|
-
def adjust_image_shape(self, pattern='*', min_length=None, max_length=None, print_mode=True):
|
149
|
-
""" 调整图片尺寸 """
|
150
|
-
|
151
|
-
def printf(*args, **kwargs):
|
152
|
-
if print_mode:
|
153
|
-
print(*args, **kwargs)
|
154
|
-
|
155
|
-
j = 1
|
156
|
-
for f in self.glob_images(pattern):
|
157
|
-
# 用pil库判断图片尺寸更快,但处理过程用的是cv2库
|
158
|
-
h, w = xlpil.read(f).size[::-1]
|
159
|
-
x, y = min(h, w), max(h, w)
|
160
|
-
|
161
|
-
if (min_length and x < min_length) or (max_length and y > max_length):
|
162
|
-
im = xlcv.read(f)
|
163
|
-
im2 = xlcv.adjust_shape(im, min_length, max_length)
|
164
|
-
if im2.shape != im.shape:
|
165
|
-
printf(f'{j}、{f} {im.shape} -> {im2.shape}')
|
166
|
-
xlcv.write(im2, f)
|
167
|
-
j += 1
|
168
|
-
|
169
|
-
def check_repeat_phash_images(self, pattern='**/*', **kwargs):
|
170
|
-
from pyxllib.cv.imhash import phash
|
171
|
-
if 'files' not in kwargs:
|
172
|
-
kwargs['files'] = self.glob_images(pattern)
|
173
|
-
if 'hash_func' not in kwargs:
|
174
|
-
kwargs['hash_func'] = lambda p: phash(p)
|
175
|
-
self.check_repeat_files(pattern, **kwargs)
|
176
|
-
|
177
|
-
def check_repeat_dhash_images(self, pattern='**/*', **kwargs):
|
178
|
-
from pyxllib.cv.imhash import dhash
|
179
|
-
if 'files' not in kwargs:
|
180
|
-
kwargs['files'] = self.glob_images(pattern)
|
181
|
-
if 'hash_func' not in kwargs:
|
182
|
-
kwargs['hash_func'] = lambda p: dhash(p)
|
183
|
-
self.check_repeat_files(pattern, **kwargs)
|
184
|
-
|
185
|
-
def clear_exif(self):
|
186
|
-
""" 清除图片中的exif标记 """
|
187
|
-
cnt = 0
|
188
|
-
for file in tqdm(self.rglob_images()):
|
189
|
-
im = xlpil.read(file)
|
190
|
-
exif = xlpil.get_exif(im)
|
191
|
-
if exif:
|
192
|
-
orientation = exif.get("Orientation", None)
|
193
|
-
if orientation:
|
194
|
-
cnt += 1
|
195
|
-
im = xlpil.apply_exif_orientation(im)
|
196
|
-
xlpil.write(im, file)
|
197
|
-
print(f'处理了{cnt}份exif')
|
198
|
-
|
199
|
-
|
200
|
-
def find_modified_images(dirs, print_mode=False):
|
201
|
-
""" 查找可能被修改过的图片
|
202
|
-
|
203
|
-
一般用在数据标注工作中,对收回来的数据目录,和原本数据目录做个对比,
|
204
|
-
以name作为对应关联,看前后图片是否内容发生变换,比如旋转。
|
205
|
-
|
206
|
-
:param list[str] dirs: 图片所在目录列表
|
207
|
-
:param bool print_mode: 是否打印进度提示,默认为 False
|
208
|
-
:return dict[str, list[str]]: 包含图片名字和可能被修改过的图片路径列表的字典
|
209
|
-
|
210
|
-
示例用法:
|
211
|
-
import os
|
212
|
-
from pprint import pprint
|
213
|
-
from pyxllib.cv.expert import find_modified_images
|
214
|
-
|
215
|
-
os.chdir('/home/chenkunze/data')
|
216
|
-
res = find_modified_images([r'm2305latex2lgx/train_images_sub',
|
217
|
-
r'm2305latex2lg/1、做完的数据'])
|
218
|
-
pprint(res)
|
219
|
-
"""
|
220
|
-
from pyxllib.file.specialist import get_etag # 发现不能用相似,还是得用etag
|
221
|
-
|
222
|
-
# 1 将图片按名字分组
|
223
|
-
def group_by_name(dirs):
|
224
|
-
""" 将图片按名字分组
|
225
|
-
|
226
|
-
:param list[str] dirs: 图片所在目录列表
|
227
|
-
:return dict[str, list[str]]: 包含图片名字和对应图片路径列表的字典
|
228
|
-
|
229
|
-
>>> group_by_name(['path/to/dir1', 'path/to/dir2'])
|
230
|
-
{'image1.jpg': ['path/to/dir1/image1.jpg'], 'image2.png': ['path/to/dir2/image2.png']}
|
231
|
-
"""
|
232
|
-
image_groups = {}
|
233
|
-
for dir in dirs:
|
234
|
-
for path in XlPath(dir).rglob_images():
|
235
|
-
image_name = path.name
|
236
|
-
if image_name not in image_groups:
|
237
|
-
image_groups[image_name] = []
|
238
|
-
image_groups[image_name].append(path)
|
239
|
-
return image_groups
|
240
|
-
|
241
|
-
image_groups = group_by_name(dirs)
|
242
|
-
|
243
|
-
# 2 存储有哪些变化的分组
|
244
|
-
modified_images = {}
|
245
|
-
progress_counter = 0
|
246
|
-
|
247
|
-
if print_mode:
|
248
|
-
total_files = sum(len(paths) for paths in image_groups.values())
|
249
|
-
print(f"Total files: {total_files}")
|
250
|
-
|
251
|
-
for image_name, paths in image_groups.items():
|
252
|
-
if len(paths) <= 1:
|
253
|
-
continue
|
254
|
-
|
255
|
-
hash_values = [get_etag(str(path)) for path in paths]
|
256
|
-
sizes = [PIL.Image.open(path).size for path in paths]
|
257
|
-
|
258
|
-
# 这里可以增强,更加详细展示差异,比如是不是被旋转了90度、180度、270度,但会大大提升运算量,暂时不添加
|
259
|
-
if len(set(hash_values)) > 1 or len(set(sizes)) > 1:
|
260
|
-
# 获取posix风格路径
|
261
|
-
modified_images[image_name] = [XlPath(path).as_posix() for path in paths]
|
262
|
-
|
263
|
-
if print_mode:
|
264
|
-
progress_counter += len(paths)
|
265
|
-
print(f"Progress: {progress_counter}/{total_files}")
|
266
|
-
|
267
|
-
return modified_images
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
# @Author : 陈坤泽
|
4
|
+
# @Email : 877362867@qq.com
|
5
|
+
# @Date : 2021/08/25 15:57
|
6
|
+
|
7
|
+
from collections import defaultdict
|
8
|
+
import concurrent.futures
|
9
|
+
|
10
|
+
import cv2
|
11
|
+
import pandas as pd
|
12
|
+
from tqdm import tqdm
|
13
|
+
|
14
|
+
import PIL.Image
|
15
|
+
|
16
|
+
from pyxllib.algo.stat import update_dataframes_to_excel
|
17
|
+
from pyxllib.file.specialist import get_etag, XlPath
|
18
|
+
from pyxllib.prog.specialist import Iterate
|
19
|
+
from pyxllib.cv.xlcvlib import CvImg, xlcv
|
20
|
+
from pyxllib.cv.xlpillib import PilImg, xlpil
|
21
|
+
|
22
|
+
|
23
|
+
def __1_目录级处理图片的功能():
|
24
|
+
pass
|
25
|
+
|
26
|
+
|
27
|
+
class ImagesDir(XlPath):
|
28
|
+
""" 这个函数功能,默认都是原地操作,如果怕以防万一出问题,最好对原始数据有另外的备份,而在新的目录里操作 """
|
29
|
+
|
30
|
+
def debug_image_func(self, func, pattern='*', *, save=None, show=False):
|
31
|
+
"""
|
32
|
+
:param func: 对每张图片执行的功能,函数应该只有一个图片路径参数 new_img = func(img)
|
33
|
+
当函数有多个参数时,可以用lambda函数技巧: lambda im: func(im, arg1=..., arg2=...)
|
34
|
+
:param save: 如果输入一个目录,会将debug结果图存储到对应的目录里
|
35
|
+
:param show: 如果该参数为True,则每处理一张会imshow显示处理效果
|
36
|
+
此时弹出的窗口里,每按任意键则显示下一张,按ESC退出
|
37
|
+
:return:
|
38
|
+
|
39
|
+
TODO 显示原图、处理后图的对比效果
|
40
|
+
TODO 支持同时显示多张图处理效果
|
41
|
+
"""
|
42
|
+
if save:
|
43
|
+
save = XlPath(save)
|
44
|
+
|
45
|
+
for f in self.glob_images(pattern):
|
46
|
+
im1 = xlcv.read(f)
|
47
|
+
im2 = func(im1)
|
48
|
+
|
49
|
+
if save:
|
50
|
+
xlcv.write(im2, self / save / f.name)
|
51
|
+
|
52
|
+
if show:
|
53
|
+
xlcv.imshow2(im2)
|
54
|
+
key = cv2.waitKey()
|
55
|
+
if key == '0x1B': # ESC 键
|
56
|
+
break
|
57
|
+
|
58
|
+
def fix_suffixs(self, pattern='**/*', log_file='_图片统计.xlsx', max_workers=None, pinterval=None):
|
59
|
+
""" 修正错误的后缀名
|
60
|
+
|
61
|
+
:param pinterval: 支持智能地判断进度间隔
|
62
|
+
"""
|
63
|
+
|
64
|
+
# 1 修改后缀
|
65
|
+
# 定义并行处理子函数
|
66
|
+
def process_image_file(args):
|
67
|
+
""" 处理单个图片文件,修正后缀名 """
|
68
|
+
file, ext = args
|
69
|
+
xlcv.write(xlcv.read(file), file) # 读取图片,并按照原本文件名期望的格式存储
|
70
|
+
ls.append([file.relpath(self).as_posix(), ext])
|
71
|
+
|
72
|
+
ls = []
|
73
|
+
files_with_exts = list(self.xglob_faker_suffix_images(pattern))
|
74
|
+
if pinterval is None and files_with_exts:
|
75
|
+
p = max(1000 * 100 // len(files_with_exts), 1) # 最小也按1%进度展示
|
76
|
+
if p < 50: # 间隔只有小余50%,才比较有显示的意义
|
77
|
+
pinterval = f'{p}%' # 每1千张显示进度
|
78
|
+
Iterate(files_with_exts).run(process_image_file, max_workers=max_workers, pinterval=pinterval)
|
79
|
+
|
80
|
+
# 2 记录修改情况
|
81
|
+
df = pd.DataFrame.from_records(ls, columns=['图片名', '原图片类型'])
|
82
|
+
if log_file:
|
83
|
+
update_dataframes_to_excel(XlPath.init(log_file, self), {'修改后缀名': df})
|
84
|
+
return df
|
85
|
+
|
86
|
+
def reduce_image_filesize(self, pattern='**/*',
|
87
|
+
limit_size=4 * 1024 ** 2, *,
|
88
|
+
read_flags=None,
|
89
|
+
change_length=False,
|
90
|
+
suffix=None,
|
91
|
+
log_file='_图片统计.xlsx',
|
92
|
+
max_workers=None, pinterval=None):
|
93
|
+
""" 减小图片尺寸,可以限制目录里尺寸最大的图片不超过多少
|
94
|
+
|
95
|
+
:param limit_size: 限制的尺寸
|
96
|
+
一般自己的相册图片,亲测300kb其实就够了~~,即 300 * 1024
|
97
|
+
百度API那边,好像不同接口不太一样,4M、6M、10M等好像都有
|
98
|
+
但百度那是base64后的尺寸,会大出1/3
|
99
|
+
为了够用,一般要限定在4M等比例的3/4比例内
|
100
|
+
:param read_flags: 读取图片时的参数,设为1,可以把各种RGBA等奇怪的格式,统一为RGB
|
101
|
+
:param change_length: 默认是要减小图片的边长,尺寸,来压缩图片的
|
102
|
+
可以设为False,不调整尺寸,纯粹读取后再重写,可能也能压缩不少尺寸
|
103
|
+
:param suffix: 可以统一图片后缀格式,默认保留原图片名称
|
104
|
+
要带前缀'.',例如'.jpg'
|
105
|
+
注意其他格式的原图会被删除
|
106
|
+
|
107
|
+
因为所有图片都会读入后再重新写入,速度可能会稍慢
|
108
|
+
"""
|
109
|
+
|
110
|
+
# 1 调试信息
|
111
|
+
print('原始大小', self.size(human_readable=True))
|
112
|
+
|
113
|
+
# 2 精简图片尺寸
|
114
|
+
# 定义并行处理子函数
|
115
|
+
def process_image_file(f):
|
116
|
+
"""处理单个图片文件,减小图片尺寸"""
|
117
|
+
size1 = f.size()
|
118
|
+
im = xlpil.read(f, read_flags)
|
119
|
+
_suffix = suffix or f.suffix
|
120
|
+
if change_length:
|
121
|
+
im = xlpil.reduce_filesize(im, limit_size, _suffix)
|
122
|
+
size2 = xlpil.evaluate_image_file_size(im, _suffix)
|
123
|
+
dst_f = f.with_suffix(_suffix)
|
124
|
+
if size2 < size1: # 只有文件尺寸确实变小的才更新
|
125
|
+
xlpil.write(im, dst_f)
|
126
|
+
if f.suffix != _suffix:
|
127
|
+
f.delete()
|
128
|
+
ls.append([f.relpath(self).as_posix(), dst_f.relpath(self).as_posix(), size1, size2])
|
129
|
+
|
130
|
+
ls = []
|
131
|
+
files = list(self.glob_images(pattern))
|
132
|
+
if pinterval is None and files:
|
133
|
+
p = max(100 * 100 // len(files), 1) # 最小也按1%进度展示
|
134
|
+
if p < 50: # 间隔只有小余50%,才比较有显示的意义
|
135
|
+
pinterval = f'{p}%' # 每1千张显示进度
|
136
|
+
Iterate(files).run(process_image_file, max_workers=max_workers, pinterval=pinterval)
|
137
|
+
|
138
|
+
print('新目录大小', self.size(human_readable=True))
|
139
|
+
|
140
|
+
# 3 记录修改细节
|
141
|
+
# 注意,如果不使用suffix参数,'新图片'的值应该跟'原图片'是一样的
|
142
|
+
# 以及当尝试精简的'新文件大小'大于'原文件大小'时,图片其实是不会被覆盖更新的
|
143
|
+
df = pd.DataFrame.from_records(ls, columns=['原图片', '新图片', '原文件大小', '新文件大小'])
|
144
|
+
if log_file:
|
145
|
+
update_dataframes_to_excel(XlPath.init(log_file, self), {'图片瘦身': df})
|
146
|
+
return df
|
147
|
+
|
148
|
+
def adjust_image_shape(self, pattern='*', min_length=None, max_length=None, print_mode=True):
|
149
|
+
""" 调整图片尺寸 """
|
150
|
+
|
151
|
+
def printf(*args, **kwargs):
|
152
|
+
if print_mode:
|
153
|
+
print(*args, **kwargs)
|
154
|
+
|
155
|
+
j = 1
|
156
|
+
for f in self.glob_images(pattern):
|
157
|
+
# 用pil库判断图片尺寸更快,但处理过程用的是cv2库
|
158
|
+
h, w = xlpil.read(f).size[::-1]
|
159
|
+
x, y = min(h, w), max(h, w)
|
160
|
+
|
161
|
+
if (min_length and x < min_length) or (max_length and y > max_length):
|
162
|
+
im = xlcv.read(f)
|
163
|
+
im2 = xlcv.adjust_shape(im, min_length, max_length)
|
164
|
+
if im2.shape != im.shape:
|
165
|
+
printf(f'{j}、{f} {im.shape} -> {im2.shape}')
|
166
|
+
xlcv.write(im2, f)
|
167
|
+
j += 1
|
168
|
+
|
169
|
+
def check_repeat_phash_images(self, pattern='**/*', **kwargs):
|
170
|
+
from pyxllib.cv.imhash import phash
|
171
|
+
if 'files' not in kwargs:
|
172
|
+
kwargs['files'] = self.glob_images(pattern)
|
173
|
+
if 'hash_func' not in kwargs:
|
174
|
+
kwargs['hash_func'] = lambda p: phash(p)
|
175
|
+
self.check_repeat_files(pattern, **kwargs)
|
176
|
+
|
177
|
+
def check_repeat_dhash_images(self, pattern='**/*', **kwargs):
|
178
|
+
from pyxllib.cv.imhash import dhash
|
179
|
+
if 'files' not in kwargs:
|
180
|
+
kwargs['files'] = self.glob_images(pattern)
|
181
|
+
if 'hash_func' not in kwargs:
|
182
|
+
kwargs['hash_func'] = lambda p: dhash(p)
|
183
|
+
self.check_repeat_files(pattern, **kwargs)
|
184
|
+
|
185
|
+
def clear_exif(self):
|
186
|
+
""" 清除图片中的exif标记 """
|
187
|
+
cnt = 0
|
188
|
+
for file in tqdm(self.rglob_images()):
|
189
|
+
im = xlpil.read(file)
|
190
|
+
exif = xlpil.get_exif(im)
|
191
|
+
if exif:
|
192
|
+
orientation = exif.get("Orientation", None)
|
193
|
+
if orientation:
|
194
|
+
cnt += 1
|
195
|
+
im = xlpil.apply_exif_orientation(im)
|
196
|
+
xlpil.write(im, file)
|
197
|
+
print(f'处理了{cnt}份exif')
|
198
|
+
|
199
|
+
|
200
|
+
def find_modified_images(dirs, print_mode=False):
|
201
|
+
""" 查找可能被修改过的图片
|
202
|
+
|
203
|
+
一般用在数据标注工作中,对收回来的数据目录,和原本数据目录做个对比,
|
204
|
+
以name作为对应关联,看前后图片是否内容发生变换,比如旋转。
|
205
|
+
|
206
|
+
:param list[str] dirs: 图片所在目录列表
|
207
|
+
:param bool print_mode: 是否打印进度提示,默认为 False
|
208
|
+
:return dict[str, list[str]]: 包含图片名字和可能被修改过的图片路径列表的字典
|
209
|
+
|
210
|
+
示例用法:
|
211
|
+
import os
|
212
|
+
from pprint import pprint
|
213
|
+
from pyxllib.cv.expert import find_modified_images
|
214
|
+
|
215
|
+
os.chdir('/home/chenkunze/data')
|
216
|
+
res = find_modified_images([r'm2305latex2lgx/train_images_sub',
|
217
|
+
r'm2305latex2lg/1、做完的数据'])
|
218
|
+
pprint(res)
|
219
|
+
"""
|
220
|
+
from pyxllib.file.specialist import get_etag # 发现不能用相似,还是得用etag
|
221
|
+
|
222
|
+
# 1 将图片按名字分组
|
223
|
+
def group_by_name(dirs):
|
224
|
+
""" 将图片按名字分组
|
225
|
+
|
226
|
+
:param list[str] dirs: 图片所在目录列表
|
227
|
+
:return dict[str, list[str]]: 包含图片名字和对应图片路径列表的字典
|
228
|
+
|
229
|
+
>>> group_by_name(['path/to/dir1', 'path/to/dir2'])
|
230
|
+
{'image1.jpg': ['path/to/dir1/image1.jpg'], 'image2.png': ['path/to/dir2/image2.png']}
|
231
|
+
"""
|
232
|
+
image_groups = {}
|
233
|
+
for dir in dirs:
|
234
|
+
for path in XlPath(dir).rglob_images():
|
235
|
+
image_name = path.name
|
236
|
+
if image_name not in image_groups:
|
237
|
+
image_groups[image_name] = []
|
238
|
+
image_groups[image_name].append(path)
|
239
|
+
return image_groups
|
240
|
+
|
241
|
+
image_groups = group_by_name(dirs)
|
242
|
+
|
243
|
+
# 2 存储有哪些变化的分组
|
244
|
+
modified_images = {}
|
245
|
+
progress_counter = 0
|
246
|
+
|
247
|
+
if print_mode:
|
248
|
+
total_files = sum(len(paths) for paths in image_groups.values())
|
249
|
+
print(f"Total files: {total_files}")
|
250
|
+
|
251
|
+
for image_name, paths in image_groups.items():
|
252
|
+
if len(paths) <= 1:
|
253
|
+
continue
|
254
|
+
|
255
|
+
hash_values = [get_etag(str(path)) for path in paths]
|
256
|
+
sizes = [PIL.Image.open(path).size for path in paths]
|
257
|
+
|
258
|
+
# 这里可以增强,更加详细展示差异,比如是不是被旋转了90度、180度、270度,但会大大提升运算量,暂时不添加
|
259
|
+
if len(set(hash_values)) > 1 or len(set(sizes)) > 1:
|
260
|
+
# 获取posix风格路径
|
261
|
+
modified_images[image_name] = [XlPath(path).as_posix() for path in paths]
|
262
|
+
|
263
|
+
if print_mode:
|
264
|
+
progress_counter += len(paths)
|
265
|
+
print(f"Progress: {progress_counter}/{total_files}")
|
266
|
+
|
267
|
+
return modified_images
|