pyxllib 0.3.197__py3-none-any.whl → 3.201.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127) hide show
  1. pyxllib/__init__.py +14 -21
  2. pyxllib/algo/__init__.py +8 -8
  3. pyxllib/algo/disjoint.py +54 -54
  4. pyxllib/algo/geo.py +537 -541
  5. pyxllib/algo/intervals.py +964 -964
  6. pyxllib/algo/matcher.py +389 -389
  7. pyxllib/algo/newbie.py +166 -166
  8. pyxllib/algo/pupil.py +629 -629
  9. pyxllib/algo/shapelylib.py +67 -67
  10. pyxllib/algo/specialist.py +241 -241
  11. pyxllib/algo/stat.py +494 -494
  12. pyxllib/algo/treelib.py +145 -149
  13. pyxllib/algo/unitlib.py +62 -66
  14. pyxllib/autogui/__init__.py +5 -5
  15. pyxllib/autogui/activewin.py +246 -246
  16. pyxllib/autogui/all.py +9 -9
  17. pyxllib/autogui/autogui.py +846 -852
  18. pyxllib/autogui/uiautolib.py +362 -362
  19. pyxllib/autogui/virtualkey.py +102 -102
  20. pyxllib/autogui/wechat.py +827 -827
  21. pyxllib/autogui/wechat_msg.py +421 -421
  22. pyxllib/autogui/wxautolib.py +84 -84
  23. pyxllib/cv/__init__.py +5 -5
  24. pyxllib/cv/expert.py +267 -267
  25. pyxllib/cv/imfile.py +159 -159
  26. pyxllib/cv/imhash.py +39 -39
  27. pyxllib/cv/pupil.py +9 -9
  28. pyxllib/cv/rgbfmt.py +1525 -1525
  29. pyxllib/cv/slidercaptcha.py +137 -137
  30. pyxllib/cv/trackbartools.py +251 -251
  31. pyxllib/cv/xlcvlib.py +1040 -1040
  32. pyxllib/cv/xlpillib.py +423 -423
  33. pyxllib/data/echarts.py +236 -240
  34. pyxllib/data/jsonlib.py +85 -89
  35. pyxllib/data/oss.py +72 -72
  36. pyxllib/data/pglib.py +1111 -1127
  37. pyxllib/data/sqlite.py +568 -568
  38. pyxllib/data/sqllib.py +297 -297
  39. pyxllib/ext/JLineViewer.py +505 -505
  40. pyxllib/ext/__init__.py +6 -6
  41. pyxllib/ext/demolib.py +251 -246
  42. pyxllib/ext/drissionlib.py +277 -277
  43. pyxllib/ext/kq5034lib.py +12 -12
  44. pyxllib/ext/qt.py +449 -449
  45. pyxllib/ext/robustprocfile.py +493 -497
  46. pyxllib/ext/seleniumlib.py +76 -76
  47. pyxllib/ext/tk.py +173 -173
  48. pyxllib/ext/unixlib.py +821 -827
  49. pyxllib/ext/utools.py +345 -351
  50. pyxllib/ext/webhook.py +124 -119
  51. pyxllib/ext/win32lib.py +40 -40
  52. pyxllib/ext/wjxlib.py +91 -88
  53. pyxllib/ext/wpsapi.py +124 -124
  54. pyxllib/ext/xlwork.py +9 -9
  55. pyxllib/ext/yuquelib.py +1110 -1105
  56. pyxllib/file/__init__.py +17 -17
  57. pyxllib/file/docxlib.py +757 -761
  58. pyxllib/file/gitlib.py +309 -309
  59. pyxllib/file/libreoffice.py +165 -165
  60. pyxllib/file/movielib.py +144 -148
  61. pyxllib/file/newbie.py +10 -10
  62. pyxllib/file/onenotelib.py +1469 -1469
  63. pyxllib/file/packlib/__init__.py +330 -330
  64. pyxllib/file/packlib/zipfile.py +2441 -2441
  65. pyxllib/file/pdflib.py +422 -426
  66. pyxllib/file/pupil.py +185 -185
  67. pyxllib/file/specialist/__init__.py +681 -685
  68. pyxllib/file/specialist/dirlib.py +799 -799
  69. pyxllib/file/specialist/download.py +193 -193
  70. pyxllib/file/specialist/filelib.py +2825 -2829
  71. pyxllib/file/xlsxlib.py +3122 -3131
  72. pyxllib/file/xlsyncfile.py +341 -341
  73. pyxllib/prog/__init__.py +5 -5
  74. pyxllib/prog/cachetools.py +58 -64
  75. pyxllib/prog/deprecatedlib.py +233 -233
  76. pyxllib/prog/filelock.py +42 -42
  77. pyxllib/prog/ipyexec.py +253 -253
  78. pyxllib/prog/multiprogs.py +940 -940
  79. pyxllib/prog/newbie.py +451 -451
  80. pyxllib/prog/pupil.py +1208 -1197
  81. pyxllib/prog/sitepackages.py +33 -33
  82. pyxllib/prog/specialist/__init__.py +348 -391
  83. pyxllib/prog/specialist/bc.py +203 -203
  84. pyxllib/prog/specialist/browser.py +497 -497
  85. pyxllib/prog/specialist/common.py +347 -347
  86. pyxllib/prog/specialist/datetime.py +198 -198
  87. pyxllib/prog/specialist/tictoc.py +240 -240
  88. pyxllib/prog/specialist/xllog.py +180 -180
  89. pyxllib/prog/xlosenv.py +110 -108
  90. pyxllib/stdlib/__init__.py +17 -17
  91. pyxllib/stdlib/tablepyxl/__init__.py +10 -10
  92. pyxllib/stdlib/tablepyxl/style.py +303 -303
  93. pyxllib/stdlib/tablepyxl/tablepyxl.py +130 -130
  94. pyxllib/text/__init__.py +8 -8
  95. pyxllib/text/ahocorasick.py +36 -39
  96. pyxllib/text/airscript.js +754 -744
  97. pyxllib/text/charclasslib.py +121 -121
  98. pyxllib/text/jiebalib.py +267 -267
  99. pyxllib/text/jinjalib.py +27 -32
  100. pyxllib/text/jsa_ai_prompt.md +271 -271
  101. pyxllib/text/jscode.py +922 -922
  102. pyxllib/text/latex/__init__.py +158 -158
  103. pyxllib/text/levenshtein.py +303 -303
  104. pyxllib/text/nestenv.py +1215 -1215
  105. pyxllib/text/newbie.py +300 -300
  106. pyxllib/text/pupil/__init__.py +8 -8
  107. pyxllib/text/pupil/common.py +1121 -1121
  108. pyxllib/text/pupil/xlalign.py +326 -326
  109. pyxllib/text/pycode.py +47 -47
  110. pyxllib/text/specialist/__init__.py +8 -8
  111. pyxllib/text/specialist/common.py +112 -112
  112. pyxllib/text/specialist/ptag.py +186 -186
  113. pyxllib/text/spellchecker.py +172 -172
  114. pyxllib/text/templates/echart_base.html +10 -10
  115. pyxllib/text/templates/highlight_code.html +16 -16
  116. pyxllib/text/templates/latex_editor.html +102 -102
  117. pyxllib/text/vbacode.py +17 -17
  118. pyxllib/text/xmllib.py +741 -747
  119. pyxllib/xl.py +42 -39
  120. pyxllib/xlcv.py +17 -17
  121. pyxllib-3.201.1.dist-info/METADATA +296 -0
  122. pyxllib-3.201.1.dist-info/RECORD +125 -0
  123. {pyxllib-0.3.197.dist-info → pyxllib-3.201.1.dist-info}/licenses/LICENSE +190 -190
  124. pyxllib/ext/old.py +0 -663
  125. pyxllib-0.3.197.dist-info/METADATA +0 -48
  126. pyxllib-0.3.197.dist-info/RECORD +0 -126
  127. {pyxllib-0.3.197.dist-info → pyxllib-3.201.1.dist-info}/WHEEL +0 -0
pyxllib/cv/expert.py CHANGED
@@ -1,267 +1,267 @@
1
- #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
- # @Author : 陈坤泽
4
- # @Email : 877362867@qq.com
5
- # @Date : 2021/08/25 15:57
6
-
7
- from collections import defaultdict
8
- import concurrent.futures
9
-
10
- import cv2
11
- import pandas as pd
12
- from tqdm import tqdm
13
-
14
- import PIL.Image
15
-
16
- from pyxllib.algo.stat import update_dataframes_to_excel
17
- from pyxllib.file.specialist import get_etag, XlPath
18
- from pyxllib.prog.specialist import Iterate
19
- from pyxllib.cv.xlcvlib import CvImg, xlcv
20
- from pyxllib.cv.xlpillib import PilImg, xlpil
21
-
22
-
23
def __1_目录级处理图片的功能():
    """Section marker (name reads "directory-level image processing features").

    The function has no behavior; it appears to exist only so the section
    shows up as a named entry when navigating the module.
    """
    return None
25
-
26
-
27
class ImagesDir(XlPath):
    """Directory-level batch operations on the images stored under this path.

    The methods work in place by default. If a mistake would be costly, keep a
    separate backup of the original data and run these methods on a copy.
    """

    def debug_image_func(self, func, pattern='*', *, save=None, show=False):
        """Run ``func`` on every image matched by ``pattern`` to inspect its effect.

        :param func: callable applied to each image, ``new_img = func(img)``.
            It receives the image returned by ``xlcv.read`` (not a file path).
            For a function with extra parameters, wrap it in a lambda:
            ``lambda im: func(im, arg1=..., arg2=...)``
        :param pattern: glob pattern forwarded to ``self.glob_images``
        :param save: if given, a directory (joined onto this directory) where each
            processed image is written under its original file name
        :param show: if True, display each processed image with ``xlcv.imshow2`` and
            block on a key press; any key moves on to the next image, ESC stops
        :return: None

        TODO show the original and the processed image side by side
        TODO support showing several processed images at once
        """
        if save:
            save = XlPath(save)

        for f in self.glob_images(pattern):
            im1 = xlcv.read(f)
            im2 = func(im1)

            if save:
                xlcv.write(im2, self / save / f.name)

            if show:
                xlcv.imshow2(im2)
                key = cv2.waitKey()
                # cv2.waitKey returns an int key code (-1 when no key was pressed).
                # The previous comparison against the string '0x1B' could never be
                # true, so ESC never left the loop. Mask to the low byte because
                # some backends set higher bits on the returned code.
                if key != -1 and (key & 0xFF) == 0x1B:  # ESC key
                    break

    def fix_suffixs(self, pattern='**/*', log_file='_图片统计.xlsx', max_workers=None, pinterval=None):
        """Rewrite images whose suffix was reported as not matching their content.

        Every ``(file, ext)`` pair yielded by ``self.xglob_faker_suffix_images(pattern)``
        is read with ``xlcv.read`` and written back to the same path with ``xlcv.write``.

        :param pattern: glob pattern forwarded to ``self.xglob_faker_suffix_images``
        :param log_file: excel file (resolved via ``XlPath.init(log_file, self)``) that
            receives the change log in a sheet named '修改后缀名'; falsy to skip logging
        :param max_workers: forwarded to ``Iterate.run``
        :param pinterval: progress interval forwarded to ``Iterate.run``. When None, a
            percentage interval is derived from the number of files (see below)
        :return: DataFrame with columns '图片名' (path relative to this directory)
            and '原图片类型' (the ``ext`` value reported for the file)
        """
        # 1 Fix the suffixes
        ls = []  # one [relative_path, ext] record per processed file

        def process_image_file(args):
            """Rewrite a single image so that its content follows its file name."""
            file, ext = args
            # Read the image and store it again under the unchanged file name.
            xlcv.write(xlcv.read(file), file)
            ls.append([file.relpath(self).as_posix(), ext])

        files_with_exts = list(self.xglob_faker_suffix_images(pattern))
        if pinterval is None and files_with_exts:
            # Percentage that 1000 files represent, but never less than 1%.
            p = max(1000 * 100 // len(files_with_exts), 1)
            if p < 50:  # an interval of 50% or more is not worth reporting
                pinterval = f'{p}%'
        Iterate(files_with_exts).run(process_image_file, max_workers=max_workers, pinterval=pinterval)

        # 2 Record what was changed
        df = pd.DataFrame.from_records(ls, columns=['图片名', '原图片类型'])
        if log_file:
            update_dataframes_to_excel(XlPath.init(log_file, self), {'修改后缀名': df})
        return df

    def reduce_image_filesize(self, pattern='**/*',
                              limit_size=4 * 1024 ** 2, *,
                              read_flags=None,
                              change_length=False,
                              suffix=None,
                              log_file='_图片统计.xlsx',
                              max_workers=None, pinterval=None):
        """Shrink the file size of the images in this directory.

        Every matched image is read and re-encoded, so this may be somewhat slow.
        A file is only overwritten when the re-encoded result is smaller.

        :param pattern: glob pattern forwarded to ``self.glob_images``
        :param limit_size: target upper bound of the file size in bytes; only used
            when ``change_length`` is True.
            Author's notes: about 300 KB (``300 * 1024``) is enough for personal
            photos. Baidu API limits vary by endpoint (4M, 6M, 10M, ...) and apply
            to the base64 payload, which is 1/3 larger, so stay within 3/4 of the
            nominal limit.
        :param read_flags: flags forwarded to ``xlpil.read``; per the author, 1
            normalizes unusual modes such as RGBA to RGB
        :param change_length: when True, the image is passed through
            ``xlpil.reduce_filesize`` to fit ``limit_size``. When False (the default)
            the image is only read and re-written, which may already save space
        :param suffix: unify all images to this suffix, including the leading '.'
            (e.g. '.jpg'). By default each image keeps its own suffix.
            Note that originals with a different suffix are deleted once the
            converted file has been written
        :param log_file: excel file (resolved via ``XlPath.init(log_file, self)``) that
            receives the details in a sheet named '图片瘦身'; falsy to skip logging
        :param max_workers: forwarded to ``Iterate.run``
        :param pinterval: progress interval forwarded to ``Iterate.run``. When None, a
            percentage interval is derived from the number of files
        :return: DataFrame with columns '原图片', '新图片', '原文件大小', '新文件大小'
        """
        # 1 Debug information
        print('原始大小', self.size(human_readable=True))

        # 2 Shrink the images
        ls = []  # one [src, dst, old_size, new_size] record per processed file

        def process_image_file(f):
            """Re-encode a single image and keep the result only if it is smaller."""
            size1 = f.size()
            im = xlpil.read(f, read_flags)
            _suffix = suffix or f.suffix
            if change_length:
                im = xlpil.reduce_filesize(im, limit_size, _suffix)
            size2 = xlpil.evaluate_image_file_size(im, _suffix)
            dst_f = f.with_suffix(_suffix)
            if size2 < size1:  # only update files that really became smaller
                xlpil.write(im, dst_f)
                if f.suffix != _suffix:
                    f.delete()
            ls.append([f.relpath(self).as_posix(), dst_f.relpath(self).as_posix(), size1, size2])

        files = list(self.glob_images(pattern))
        if pinterval is None and files:
            # Percentage that 100 files represent, but never less than 1%.
            p = max(100 * 100 // len(files), 1)
            if p < 50:  # an interval of 50% or more is not worth reporting
                pinterval = f'{p}%'
        Iterate(files).run(process_image_file, max_workers=max_workers, pinterval=pinterval)

        print('新目录大小', self.size(human_readable=True))

        # 3 Record the details
        # Without the suffix parameter, '新图片' equals '原图片'.
        # When '新文件大小' is not smaller than '原文件大小' the file was left
        # untouched, although it is still listed here.
        df = pd.DataFrame.from_records(ls, columns=['原图片', '新图片', '原文件大小', '新文件大小'])
        if log_file:
            update_dataframes_to_excel(XlPath.init(log_file, self), {'图片瘦身': df})
        return df

    def adjust_image_shape(self, pattern='*', min_length=None, max_length=None, print_mode=True):
        """Resize images whose shorter/longer side is outside the given bounds.

        :param pattern: glob pattern forwarded to ``self.glob_images``
        :param min_length: lower bound for the shorter side; falsy to disable
        :param max_length: upper bound for the longer side; falsy to disable
        :param print_mode: print one line per image that was actually resized
        :return: None
        """

        def printf(*args, **kwargs):
            if print_mode:
                print(*args, **kwargs)

        j = 1  # running number of resized images, used in the printed line
        for f in self.glob_images(pattern):
            # PIL is faster for just checking the size; the resize itself uses cv2.
            h, w = xlpil.read(f).size[::-1]
            x, y = min(h, w), max(h, w)

            if (min_length and x < min_length) or (max_length and y > max_length):
                im = xlcv.read(f)
                im2 = xlcv.adjust_shape(im, min_length, max_length)
                if im2.shape != im.shape:
                    printf(f'{j}、{f} {im.shape} -> {im2.shape}')
                    xlcv.write(im2, f)
                    j += 1

    def check_repeat_phash_images(self, pattern='**/*', **kwargs):
        """Call ``self.check_repeat_files`` using ``phash`` as the default hash.

        :param pattern: glob pattern for the images to compare
        :param kwargs: forwarded to ``self.check_repeat_files``; 'files' defaults to
            ``self.glob_images(pattern)`` and 'hash_func' defaults to ``phash``
        """
        from pyxllib.cv.imhash import phash
        if 'files' not in kwargs:
            kwargs['files'] = self.glob_images(pattern)
        if 'hash_func' not in kwargs:
            kwargs['hash_func'] = lambda p: phash(p)
        self.check_repeat_files(pattern, **kwargs)

    def check_repeat_dhash_images(self, pattern='**/*', **kwargs):
        """Call ``self.check_repeat_files`` using ``dhash`` as the default hash.

        :param pattern: glob pattern for the images to compare
        :param kwargs: forwarded to ``self.check_repeat_files``; 'files' defaults to
            ``self.glob_images(pattern)`` and 'hash_func' defaults to ``dhash``
        """
        from pyxllib.cv.imhash import dhash
        if 'files' not in kwargs:
            kwargs['files'] = self.glob_images(pattern)
        if 'hash_func' not in kwargs:
            kwargs['hash_func'] = lambda p: dhash(p)
        self.check_repeat_files(pattern, **kwargs)

    def clear_exif(self):
        """Apply the exif orientation to the images and write them back.

        Only images whose exif data has a truthy "Orientation" entry are
        rewritten; the number of rewritten images is printed at the end.
        """
        cnt = 0
        for file in tqdm(self.rglob_images()):
            im = xlpil.read(file)
            exif = xlpil.get_exif(im)
            if exif:
                orientation = exif.get("Orientation", None)
                if orientation:
                    cnt += 1
                    im = xlpil.apply_exif_orientation(im)
                    xlpil.write(im, file)
        print(f'处理了{cnt}份exif')
198
-
199
-
200
def find_modified_images(dirs, print_mode=False):
    """Find images that may have been modified between several directories.

    Typically used in data-annotation work: compare the directory received back
    from annotators with the original data directory. Images are matched by file
    name, and a name is reported when its copies differ in etag or in pixel
    dimensions (for example because an image was rotated).

    :param list[str] dirs: directories containing the images (searched recursively)
    :param bool print_mode: print progress information, default False
    :return dict[str, list[str]]: maps an image file name to the posix-style paths
        of all its copies, only for names whose copies differ

    Example:
        import os
        from pprint import pprint
        from pyxllib.cv.expert import find_modified_images

        os.chdir('/home/chenkunze/data')
        res = find_modified_images([r'm2305latex2lgx/train_images_sub',
                                    r'm2305latex2lg/1、做完的数据'])
        pprint(res)
    """
    # Per the author, similarity hashes turned out not to work here, hence etag.
    from pyxllib.file.specialist import get_etag

    # 1 Group the images by file name
    def group_by_name(dirs):
        """Map each image file name to the list of paths carrying that name.

        :param list[str] dirs: directories containing the images
        :return dict: file name -> list of path objects yielded by ``rglob_images``
        """
        groups = {}
        for directory in dirs:  # avoid shadowing the builtin ``dir``
            for path in XlPath(directory).rglob_images():
                groups.setdefault(path.name, []).append(path)
        return groups

    def image_size(path):
        """Return the (width, height) of the image and close the file right away."""
        # PIL.Image.open keeps the file handle open until the image is closed;
        # the context manager prevents leaking one handle per compared file.
        with PIL.Image.open(path) as im:
            return im.size

    image_groups = group_by_name(dirs)

    # 2 Collect the groups whose copies differ
    modified_images = {}
    progress_counter = 0
    total_files = sum(len(paths) for paths in image_groups.values())

    if print_mode:
        print(f"Total files: {total_files}")

    for image_name, paths in image_groups.items():
        # Count every file, including names with a single copy, so that the
        # reported progress can actually reach the total.
        progress_counter += len(paths)
        if len(paths) <= 1:
            continue

        hash_values = {get_etag(str(path)) for path in paths}
        sizes = {image_size(path) for path in paths}

        # This could be extended to describe the difference in more detail (e.g.
        # rotated by 90/180/270 degrees), but that would cost far more computation.
        if len(hash_values) > 1 or len(sizes) > 1:
            # Report posix-style paths
            modified_images[image_name] = [XlPath(path).as_posix() for path in paths]

        if print_mode:
            print(f"Progress: {progress_counter}/{total_files}")

    return modified_images
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ # @Author : 陈坤泽
4
+ # @Email : 877362867@qq.com
5
+ # @Date : 2021/08/25 15:57
6
+
7
+ from collections import defaultdict
8
+ import concurrent.futures
9
+
10
+ import cv2
11
+ import pandas as pd
12
+ from tqdm import tqdm
13
+
14
+ import PIL.Image
15
+
16
+ from pyxllib.algo.stat import update_dataframes_to_excel
17
+ from pyxllib.file.specialist import get_etag, XlPath
18
+ from pyxllib.prog.specialist import Iterate
19
+ from pyxllib.cv.xlcvlib import CvImg, xlcv
20
+ from pyxllib.cv.xlpillib import PilImg, xlpil
21
+
22
+
23
def __1_目录级处理图片的功能():
    """Section marker (name reads "directory-level image processing features").

    The function has no behavior; it appears to exist only so the section
    shows up as a named entry when navigating the module.
    """
    return None
25
+
26
+
27
class ImagesDir(XlPath):
    """Directory-level batch operations on the images stored under this path.

    The methods work in place by default. If a mistake would be costly, keep a
    separate backup of the original data and run these methods on a copy.
    """

    def debug_image_func(self, func, pattern='*', *, save=None, show=False):
        """Run ``func`` on every image matched by ``pattern`` to inspect its effect.

        :param func: callable applied to each image, ``new_img = func(img)``.
            It receives the image returned by ``xlcv.read`` (not a file path).
            For a function with extra parameters, wrap it in a lambda:
            ``lambda im: func(im, arg1=..., arg2=...)``
        :param pattern: glob pattern forwarded to ``self.glob_images``
        :param save: if given, a directory (joined onto this directory) where each
            processed image is written under its original file name
        :param show: if True, display each processed image with ``xlcv.imshow2`` and
            block on a key press; any key moves on to the next image, ESC stops
        :return: None

        TODO show the original and the processed image side by side
        TODO support showing several processed images at once
        """
        if save:
            save = XlPath(save)

        for f in self.glob_images(pattern):
            im1 = xlcv.read(f)
            im2 = func(im1)

            if save:
                xlcv.write(im2, self / save / f.name)

            if show:
                xlcv.imshow2(im2)
                key = cv2.waitKey()
                # cv2.waitKey returns an int key code (-1 when no key was pressed).
                # The previous comparison against the string '0x1B' could never be
                # true, so ESC never left the loop. Mask to the low byte because
                # some backends set higher bits on the returned code.
                if key != -1 and (key & 0xFF) == 0x1B:  # ESC key
                    break

    def fix_suffixs(self, pattern='**/*', log_file='_图片统计.xlsx', max_workers=None, pinterval=None):
        """Rewrite images whose suffix was reported as not matching their content.

        Every ``(file, ext)`` pair yielded by ``self.xglob_faker_suffix_images(pattern)``
        is read with ``xlcv.read`` and written back to the same path with ``xlcv.write``.

        :param pattern: glob pattern forwarded to ``self.xglob_faker_suffix_images``
        :param log_file: excel file (resolved via ``XlPath.init(log_file, self)``) that
            receives the change log in a sheet named '修改后缀名'; falsy to skip logging
        :param max_workers: forwarded to ``Iterate.run``
        :param pinterval: progress interval forwarded to ``Iterate.run``. When None, a
            percentage interval is derived from the number of files (see below)
        :return: DataFrame with columns '图片名' (path relative to this directory)
            and '原图片类型' (the ``ext`` value reported for the file)
        """
        # 1 Fix the suffixes
        ls = []  # one [relative_path, ext] record per processed file

        def process_image_file(args):
            """Rewrite a single image so that its content follows its file name."""
            file, ext = args
            # Read the image and store it again under the unchanged file name.
            xlcv.write(xlcv.read(file), file)
            ls.append([file.relpath(self).as_posix(), ext])

        files_with_exts = list(self.xglob_faker_suffix_images(pattern))
        if pinterval is None and files_with_exts:
            # Percentage that 1000 files represent, but never less than 1%.
            p = max(1000 * 100 // len(files_with_exts), 1)
            if p < 50:  # an interval of 50% or more is not worth reporting
                pinterval = f'{p}%'
        Iterate(files_with_exts).run(process_image_file, max_workers=max_workers, pinterval=pinterval)

        # 2 Record what was changed
        df = pd.DataFrame.from_records(ls, columns=['图片名', '原图片类型'])
        if log_file:
            update_dataframes_to_excel(XlPath.init(log_file, self), {'修改后缀名': df})
        return df

    def reduce_image_filesize(self, pattern='**/*',
                              limit_size=4 * 1024 ** 2, *,
                              read_flags=None,
                              change_length=False,
                              suffix=None,
                              log_file='_图片统计.xlsx',
                              max_workers=None, pinterval=None):
        """Shrink the file size of the images in this directory.

        Every matched image is read and re-encoded, so this may be somewhat slow.
        A file is only overwritten when the re-encoded result is smaller.

        :param pattern: glob pattern forwarded to ``self.glob_images``
        :param limit_size: target upper bound of the file size in bytes; only used
            when ``change_length`` is True.
            Author's notes: about 300 KB (``300 * 1024``) is enough for personal
            photos. Baidu API limits vary by endpoint (4M, 6M, 10M, ...) and apply
            to the base64 payload, which is 1/3 larger, so stay within 3/4 of the
            nominal limit.
        :param read_flags: flags forwarded to ``xlpil.read``; per the author, 1
            normalizes unusual modes such as RGBA to RGB
        :param change_length: when True, the image is passed through
            ``xlpil.reduce_filesize`` to fit ``limit_size``. When False (the default)
            the image is only read and re-written, which may already save space
        :param suffix: unify all images to this suffix, including the leading '.'
            (e.g. '.jpg'). By default each image keeps its own suffix.
            Note that originals with a different suffix are deleted once the
            converted file has been written
        :param log_file: excel file (resolved via ``XlPath.init(log_file, self)``) that
            receives the details in a sheet named '图片瘦身'; falsy to skip logging
        :param max_workers: forwarded to ``Iterate.run``
        :param pinterval: progress interval forwarded to ``Iterate.run``. When None, a
            percentage interval is derived from the number of files
        :return: DataFrame with columns '原图片', '新图片', '原文件大小', '新文件大小'
        """
        # 1 Debug information
        print('原始大小', self.size(human_readable=True))

        # 2 Shrink the images
        ls = []  # one [src, dst, old_size, new_size] record per processed file

        def process_image_file(f):
            """Re-encode a single image and keep the result only if it is smaller."""
            size1 = f.size()
            im = xlpil.read(f, read_flags)
            _suffix = suffix or f.suffix
            if change_length:
                im = xlpil.reduce_filesize(im, limit_size, _suffix)
            size2 = xlpil.evaluate_image_file_size(im, _suffix)
            dst_f = f.with_suffix(_suffix)
            if size2 < size1:  # only update files that really became smaller
                xlpil.write(im, dst_f)
                if f.suffix != _suffix:
                    f.delete()
            ls.append([f.relpath(self).as_posix(), dst_f.relpath(self).as_posix(), size1, size2])

        files = list(self.glob_images(pattern))
        if pinterval is None and files:
            # Percentage that 100 files represent, but never less than 1%.
            p = max(100 * 100 // len(files), 1)
            if p < 50:  # an interval of 50% or more is not worth reporting
                pinterval = f'{p}%'
        Iterate(files).run(process_image_file, max_workers=max_workers, pinterval=pinterval)

        print('新目录大小', self.size(human_readable=True))

        # 3 Record the details
        # Without the suffix parameter, '新图片' equals '原图片'.
        # When '新文件大小' is not smaller than '原文件大小' the file was left
        # untouched, although it is still listed here.
        df = pd.DataFrame.from_records(ls, columns=['原图片', '新图片', '原文件大小', '新文件大小'])
        if log_file:
            update_dataframes_to_excel(XlPath.init(log_file, self), {'图片瘦身': df})
        return df

    def adjust_image_shape(self, pattern='*', min_length=None, max_length=None, print_mode=True):
        """Resize images whose shorter/longer side is outside the given bounds.

        :param pattern: glob pattern forwarded to ``self.glob_images``
        :param min_length: lower bound for the shorter side; falsy to disable
        :param max_length: upper bound for the longer side; falsy to disable
        :param print_mode: print one line per image that was actually resized
        :return: None
        """

        def printf(*args, **kwargs):
            if print_mode:
                print(*args, **kwargs)

        j = 1  # running number of resized images, used in the printed line
        for f in self.glob_images(pattern):
            # PIL is faster for just checking the size; the resize itself uses cv2.
            h, w = xlpil.read(f).size[::-1]
            x, y = min(h, w), max(h, w)

            if (min_length and x < min_length) or (max_length and y > max_length):
                im = xlcv.read(f)
                im2 = xlcv.adjust_shape(im, min_length, max_length)
                if im2.shape != im.shape:
                    printf(f'{j}、{f} {im.shape} -> {im2.shape}')
                    xlcv.write(im2, f)
                    j += 1

    def check_repeat_phash_images(self, pattern='**/*', **kwargs):
        """Call ``self.check_repeat_files`` using ``phash`` as the default hash.

        :param pattern: glob pattern for the images to compare
        :param kwargs: forwarded to ``self.check_repeat_files``; 'files' defaults to
            ``self.glob_images(pattern)`` and 'hash_func' defaults to ``phash``
        """
        from pyxllib.cv.imhash import phash
        if 'files' not in kwargs:
            kwargs['files'] = self.glob_images(pattern)
        if 'hash_func' not in kwargs:
            kwargs['hash_func'] = lambda p: phash(p)
        self.check_repeat_files(pattern, **kwargs)

    def check_repeat_dhash_images(self, pattern='**/*', **kwargs):
        """Call ``self.check_repeat_files`` using ``dhash`` as the default hash.

        :param pattern: glob pattern for the images to compare
        :param kwargs: forwarded to ``self.check_repeat_files``; 'files' defaults to
            ``self.glob_images(pattern)`` and 'hash_func' defaults to ``dhash``
        """
        from pyxllib.cv.imhash import dhash
        if 'files' not in kwargs:
            kwargs['files'] = self.glob_images(pattern)
        if 'hash_func' not in kwargs:
            kwargs['hash_func'] = lambda p: dhash(p)
        self.check_repeat_files(pattern, **kwargs)

    def clear_exif(self):
        """Apply the exif orientation to the images and write them back.

        Only images whose exif data has a truthy "Orientation" entry are
        rewritten; the number of rewritten images is printed at the end.
        """
        cnt = 0
        for file in tqdm(self.rglob_images()):
            im = xlpil.read(file)
            exif = xlpil.get_exif(im)
            if exif:
                orientation = exif.get("Orientation", None)
                if orientation:
                    cnt += 1
                    im = xlpil.apply_exif_orientation(im)
                    xlpil.write(im, file)
        print(f'处理了{cnt}份exif')
198
+
199
+
200
def find_modified_images(dirs, print_mode=False):
    """Find images that may have been modified between several directories.

    Typically used in data-annotation work: compare the directory received back
    from annotators with the original data directory. Images are matched by file
    name, and a name is reported when its copies differ in etag or in pixel
    dimensions (for example because an image was rotated).

    :param list[str] dirs: directories containing the images (searched recursively)
    :param bool print_mode: print progress information, default False
    :return dict[str, list[str]]: maps an image file name to the posix-style paths
        of all its copies, only for names whose copies differ

    Example:
        import os
        from pprint import pprint
        from pyxllib.cv.expert import find_modified_images

        os.chdir('/home/chenkunze/data')
        res = find_modified_images([r'm2305latex2lgx/train_images_sub',
                                    r'm2305latex2lg/1、做完的数据'])
        pprint(res)
    """
    # Per the author, similarity hashes turned out not to work here, hence etag.
    from pyxllib.file.specialist import get_etag

    # 1 Group the images by file name
    def group_by_name(dirs):
        """Map each image file name to the list of paths carrying that name.

        :param list[str] dirs: directories containing the images
        :return dict: file name -> list of path objects yielded by ``rglob_images``
        """
        groups = {}
        for directory in dirs:  # avoid shadowing the builtin ``dir``
            for path in XlPath(directory).rglob_images():
                groups.setdefault(path.name, []).append(path)
        return groups

    def image_size(path):
        """Return the (width, height) of the image and close the file right away."""
        # PIL.Image.open keeps the file handle open until the image is closed;
        # the context manager prevents leaking one handle per compared file.
        with PIL.Image.open(path) as im:
            return im.size

    image_groups = group_by_name(dirs)

    # 2 Collect the groups whose copies differ
    modified_images = {}
    progress_counter = 0
    total_files = sum(len(paths) for paths in image_groups.values())

    if print_mode:
        print(f"Total files: {total_files}")

    for image_name, paths in image_groups.items():
        # Count every file, including names with a single copy, so that the
        # reported progress can actually reach the total.
        progress_counter += len(paths)
        if len(paths) <= 1:
            continue

        hash_values = {get_etag(str(path)) for path in paths}
        sizes = {image_size(path) for path in paths}

        # This could be extended to describe the difference in more detail (e.g.
        # rotated by 90/180/270 degrees), but that would cost far more computation.
        if len(hash_values) > 1 or len(sizes) > 1:
            # Report posix-style paths
            modified_images[image_name] = [XlPath(path).as_posix() for path in paths]

        if print_mode:
            print(f"Progress: {progress_counter}/{total_files}")

    return modified_images