audiolibrarian 0.16.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
picard_src/README.md ADDED
@@ -0,0 +1,11 @@
1
+ # Notes #
2
+
3
+ Code in this directory comes from (or is derived from) the super-cool Picard project.
4
+
5
+ https://github.com/metabrainz/picard
6
+
7
+ I'd like to express my thanks to all who have contributed to that project.
8
+
9
+ Note: we could have just set the Picard library as a dependency of audiolibrarian, but
10
+ it includes other dependencies (such as PyQT) that add a lot of overhead not required
11
+ by this project.
picard_src/__init__.py ADDED
@@ -0,0 +1,7 @@
1
+ from picard_src.textencoding import (
2
+ replace_non_ascii,
3
+ unicode_simplify_accents,
4
+ unicode_simplify_combinations,
5
+ unicode_simplify_compatibility,
6
+ unicode_simplify_punctuation,
7
+ )
@@ -0,0 +1,495 @@
1
+ # -*- coding: utf-8 -*-
2
+ #
3
+ # Picard, the next-generation MusicBrainz tagger
4
+ #
5
+ # Copyright (C) 2004 Robert Kaye
6
+ # Copyright (C) 2006 Lukáš Lalinský
7
+ # Copyright (C) 2014 Sophist-UK
8
+ # Copyright (C) 2014, 2018 Laurent Monin
9
+ # Copyright (C) 2017 Sambhav Kothari
10
+ # Copyright (C) 2018-2019 Philipp Wolfer
11
+ #
12
+ # This program is free software; you can redistribute it and/or
13
+ # modify it under the terms of the GNU General Public License
14
+ # as published by the Free Software Foundation; either version 2
15
+ # of the License, or (at your option) any later version.
16
+ #
17
+ # This program is distributed in the hope that it will be useful,
18
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
19
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20
+ # GNU General Public License for more details.
21
+ #
22
+ # You should have received a copy of the GNU General Public License
23
+ # along with this program; if not, write to the Free Software
24
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
25
+
26
+
27
+ # This module provides functionality for simplifying unicode strings.
28
+
29
+ # The unicode character set (of over 1m codepoints and 24,000 characters) includes:
30
+ # Normal ascii (latin) non-accented characters
31
+ # Combined latin characters e.g. ae in normal usage
32
+ # Compatibility combined latin characters (retained for compatibility with other character sets)
33
+ # These can look very similar to normal characters and can be confusing for searches, sort
34
+ # orders etc.
35
+ # Non-latin (e.g. japanese, greek, hebrew etc.) characters
36
+ # Both latin and non-latin characters can be accented. Accents can be either:
37
+ # Provided by separate nonspacing_mark characters which are visually overlaid (visually 1
38
+ # character is actually 2); or
39
+ # Integrated accented characters (i.e. non-accented characters combined with a nonspacing_mark
40
+ # into a single character)
41
+ # Again these can be confusing for searches, sort orders etc.
42
+ # Punctuation can also be confusing in unicode e.g. several types of single or double quote mark.
43
+
44
+ # For latin script:
45
+ # Combined characters, accents and punctuation can be visually similar but look different to
46
+ # search engines,
47
+ # sort orders etc. and the number of ways to use similar looking characters can (does) result in
48
+ # inconsistent
49
+ # usage inside Music metadata.
50
+ #
51
+ # Simplifying the unicode character sets by many-to-one mappings can improve consistency and
52
+ # reduce confusion,
53
+ # however sometimes the choice of specific characters can be a deliberate part of an album, song
54
+ # title or artist name
55
+ # (and should not therefore be changed without careful thought) and occasionally the choice of
56
+ # characters can be
57
+ # malicious (i.e. to defeat firewalls or spam filters or to appear to be something else).
58
+ #
59
+ # Finally, given the size of the unicode character set, fonts are unlikely to display all
60
+ # characters,
61
+ # making simplification a necessity.
62
+ #
63
+ # Simplification may also be needed to make tags conform to ISO-8859-1 (extended ascii) or to
64
+ # make tags or filenames
65
+ # into ascii, perhaps because the file system or player cannot support unicode.
66
+ #
67
+ # Non-latin scripts may also need to be converted to latin scripts through:
68
+ # Translation (e.g. hebrew word for mother is translated to "mother"); or
69
+ # Transliteration (e.g. the SOUND of the hebrew letter or word is spelt out in latin)
70
+ # These are non-trivial, and the software to do these is far from comprehensive.
71
+
72
+ # This module provides utility functions to enable simplification of latin and punctuation unicode:
73
+ # 1. simplify compatibility characters;
74
+ # 2. split combined characters;
75
+ # 3. remove accents (entirely or if not in ISO-8859-1 as applicable);
76
+ # 4. replace remaining non-ascii or non-ISO-8859-1 characters with a default character
77
+ # This module also provides an extension infrastructure to allow translation and / or
78
+ # transliteration plugins to be added.
79
+
80
+ import codecs
81
+ import unicodedata
82
+ from functools import partial
83
+
84
+ # STJ # from picard.util import sanitize_filename
85
+
86
+ # LATIN SIMPLIFICATION
87
+ # The translation tables for punctuation and latin combined-characters are taken from
88
+ # http://unicode.org/repos/cldr/trunk/common/transforms/Latin-ASCII.xml
89
+ # Various bugs and mistakes in this have been ironed out during testing.
90
+
91
+
92
# Extra one-to-one character replacements that unicode_simplify_compatibility()
# applies to every character of its input before NFKC normalization.
_additional_compatibility = {
    "\u0276": "Œ",  # LATIN LETTER SMALL CAPITAL OE
    "\u1d00": "A",  # LATIN LETTER SMALL CAPITAL A
    "\u1d01": "Æ",  # LATIN LETTER SMALL CAPITAL AE
    "\u1d04": "C",  # LATIN LETTER SMALL CAPITAL C
    "\u1d05": "D",  # LATIN LETTER SMALL CAPITAL D
    "\u1d07": "E",  # LATIN LETTER SMALL CAPITAL E
    "\u1d0a": "J",  # LATIN LETTER SMALL CAPITAL J
    "\u1d0b": "K",  # LATIN LETTER SMALL CAPITAL K
    "\u1d0d": "M",  # LATIN LETTER SMALL CAPITAL M
    "\u1d0f": "O",  # LATIN LETTER SMALL CAPITAL O
    "\u1d18": "P",  # LATIN LETTER SMALL CAPITAL P
    "\u1d1b": "T",  # LATIN LETTER SMALL CAPITAL T
    "\u1d1c": "U",  # LATIN LETTER SMALL CAPITAL U
    "\u1d20": "V",  # LATIN LETTER SMALL CAPITAL V
    "\u1d21": "W",  # LATIN LETTER SMALL CAPITAL W
    "\u1d22": "Z",  # LATIN LETTER SMALL CAPITAL Z
    "\u3007": "0",  # IDEOGRAPHIC NUMBER ZERO
    "\u00a0": " ",  # NO-BREAK SPACE
    "\u3000": " ",  # IDEOGRAPHIC SPACE (from ‹character-fallback›)
    # NOTE(review): the replacement below is a typographic quotation mark, not
    # the ASCII '"' that _simplify_punctuation uses for the same key.
    "\u2033": "”",  # DOUBLE PRIME
}
114
+
115
+
116
def unicode_simplify_compatibility(string):
    """Replace compatibility characters in ``string`` with simpler equivalents.

    Each character is first swapped for its entry in
    ``_additional_compatibility`` (characters without an entry are kept), and
    the resulting text is then NFKC-normalized.
    """
    lookup = _additional_compatibility.get
    remapped = [lookup(char, char) for char in string]
    return unicodedata.normalize("NFKC", "".join(remapped))
119
+
120
+
121
# Maps typographic, mathematical and CJK punctuation characters to ASCII
# replacement strings (which may be empty or longer than one character).
# Consulted per character by unicode_simplify_punctuation().
_simplify_punctuation = {
    "\u013f": "L",  # LATIN CAPITAL LETTER L WITH MIDDLE DOT (compat)
    "\u0140": "l",  # LATIN SMALL LETTER L WITH MIDDLE DOT (compat)
    "\u2018": "'",  # LEFT SINGLE QUOTATION MARK (from ‹character-fallback›)
    "\u2019": "'",  # RIGHT SINGLE QUOTATION MARK (from ‹character-fallback›)
    "\u201a": "'",  # SINGLE LOW-9 QUOTATION MARK (from ‹character-fallback›)
    "\u201b": "'",  # SINGLE HIGH-REVERSED-9 QUOTATION MARK (from ‹character-fallback›)
    "\u201c": '"',  # LEFT DOUBLE QUOTATION MARK (from ‹character-fallback›)
    "\u201d": '"',  # RIGHT DOUBLE QUOTATION MARK (from ‹character-fallback›)
    "\u201e": '"',  # DOUBLE LOW-9 QUOTATION MARK (from ‹character-fallback›)
    "\u201f": '"',  # DOUBLE HIGH-REVERSED-9 QUOTATION MARK (from ‹character-fallback›)
    "\u2032": "'",  # PRIME
    "\u2033": '"',  # DOUBLE PRIME
    "\u301d": '"',  # REVERSED DOUBLE PRIME QUOTATION MARK
    "\u301e": '"',  # DOUBLE PRIME QUOTATION MARK
    "\u00ab": "<<",  # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK (from ‹character-fallback›)
    "\u00bb": ">>",  # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK (from ‹character-fallback›)
    "\u2039": "<",  # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    "\u203a": ">",  # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    "\u00ad": "",  # SOFT HYPHEN (from ‹character-fallback›)
    "\u2010": "-",  # HYPHEN (from ‹character-fallback›)
    "\u2011": "-",  # NON-BREAKING HYPHEN (from ‹character-fallback›)
    "\u2012": "-",  # FIGURE DASH (from ‹character-fallback›)
    "\u2013": "-",  # EN DASH (from ‹character-fallback›)
    "\u2014": "-",  # EM DASH (from ‹character-fallback›)
    "\u2015": "-",  # HORIZONTAL BAR (from ‹character-fallback›)
    "\ufe31": "|",  # PRESENTATION FORM FOR VERTICAL EM DASH (compat)
    "\ufe32": "|",  # PRESENTATION FORM FOR VERTICAL EN DASH (compat)
    "\ufe58": "-",  # SMALL EM DASH (compat)
    "\u2016": "||",  # DOUBLE VERTICAL LINE
    "\u2044": "/",  # FRACTION SLASH (from ‹character-fallback›)
    "\u2045": "[",  # LEFT SQUARE BRACKET WITH QUILL
    "\u2046": "]",  # RIGHT SQUARE BRACKET WITH QUILL
    "\u204e": "*",  # LOW ASTERISK
    "\u3008": "<",  # LEFT ANGLE BRACKET
    "\u3009": ">",  # RIGHT ANGLE BRACKET
    "\u300a": "<<",  # LEFT DOUBLE ANGLE BRACKET
    "\u300b": ">>",  # RIGHT DOUBLE ANGLE BRACKET
    "\u3014": "[",  # LEFT TORTOISE SHELL BRACKET
    "\u3015": "]",  # RIGHT TORTOISE SHELL BRACKET
    "\u3018": "[",  # LEFT WHITE TORTOISE SHELL BRACKET
    "\u3019": "]",  # RIGHT WHITE TORTOISE SHELL BRACKET
    "\u301a": "[",  # LEFT WHITE SQUARE BRACKET
    "\u301b": "]",  # RIGHT WHITE SQUARE BRACKET
    "\ufe11": ",",  # PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC COMMA (compat)
    "\ufe12": ".",  # PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC FULL STOP (compat)
    "\ufe39": "[",  # PRESENTATION FORM FOR VERTICAL LEFT TORTOISE SHELL BRACKET (compat)
    "\ufe3a": "]",  # PRESENTATION FORM FOR VERTICAL RIGHT TORTOISE SHELL BRACKET (compat)
    "\ufe3d": "<<",  # PRESENTATION FORM FOR VERTICAL LEFT DOUBLE ANGLE BRACKET (compat)
    "\ufe3e": ">>",  # PRESENTATION FORM FOR VERTICAL RIGHT DOUBLE ANGLE BRACKET (compat)
    "\ufe3f": "<",  # PRESENTATION FORM FOR VERTICAL LEFT ANGLE BRACKET (compat)
    "\ufe40": ">",  # PRESENTATION FORM FOR VERTICAL RIGHT ANGLE BRACKET (compat)
    "\ufe51": ",",  # SMALL IDEOGRAPHIC COMMA (compat)
    "\ufe5d": "[",  # SMALL LEFT TORTOISE SHELL BRACKET (compat)
    "\ufe5e": "]",  # SMALL RIGHT TORTOISE SHELL BRACKET (compat)
    "\uff5f": "((",  # FULLWIDTH LEFT WHITE PARENTHESIS (compat)(from ‹character-fallback›)
    "\uff60": "))",  # FULLWIDTH RIGHT WHITE PARENTHESIS (compat)(from ‹character-fallback›)
    "\uff61": ".",  # HALFWIDTH IDEOGRAPHIC FULL STOP (compat)
    "\uff64": ",",  # HALFWIDTH IDEOGRAPHIC COMMA (compat)
    "\u2212": "-",  # MINUS SIGN (from ‹character-fallback›)
    "\u2215": "/",  # DIVISION SLASH (from ‹character-fallback›)
    "\u2216": "\\",  # SET MINUS (from ‹character-fallback›)
    "\u2223": "|",  # DIVIDES (from ‹character-fallback›)
    "\u2225": "||",  # PARALLEL TO (from ‹character-fallback›)
    "\u226a": "<<",  # MUCH LESS-THAN
    "\u226b": ">>",  # MUCH GREATER-THAN
    "\u2985": "((",  # LEFT WHITE PARENTHESIS
    "\u2986": "))",  # RIGHT WHITE PARENTHESIS
    "\u2022": "-",  # BULLET
    "\u200b": "",  # Zero Width Space
}
192
+
193
+
194
# STJ # def unicode_simplify_punctuation(string, pathsave=False, win_compat=False):
def unicode_simplify_punctuation(string):  # STJ #
    """Replace punctuation characters in ``string`` using ``_simplify_punctuation``.

    Every character that has an entry in the table is swapped for its
    replacement string; all other characters are passed through unchanged.

    The commented-out signature above is the earlier form with ``pathsave``
    and ``win_compat`` (filename sanitizing of replacements); this version
    does not take those parameters.
    """
    return "".join(_simplify_punctuation.get(char, char) for char in string)
206
+
207
+
208
# Maps single characters (modified Latin letters, ligatures, digraphs, and
# assorted currency / unit / fraction symbols) to ASCII replacement strings.
# Looked up per character by _replace_unicode_simplify_combinations().
_simplify_combinations = {
    "\u00c6": "AE",  # LATIN CAPITAL LETTER AE (from ‹character-fallback›)
    "\u00d0": "D",  # LATIN CAPITAL LETTER ETH
    "\u00d8": "OE",  # LATIN CAPITAL LETTER O WITH STROKE see https://en.wikipedia.org/wiki/%C3%98
    "\u00de": "TH",  # LATIN CAPITAL LETTER THORN
    "\u00df": "ss",  # LATIN SMALL LETTER SHARP S (from ‹character-fallback›)
    "\u00e6": "ae",  # LATIN SMALL LETTER AE (from ‹character-fallback›)
    "\u00f0": "d",  # LATIN SMALL LETTER ETH
    "\u00f8": "oe",  # LATIN SMALL LETTER O WITH STROKE (see https://en.wikipedia.org/wiki/%C3%98)
    "\u00fe": "th",  # LATIN SMALL LETTER THORN
    "\u0110": "D",  # LATIN CAPITAL LETTER D WITH STROKE
    "\u0111": "d",  # LATIN SMALL LETTER D WITH STROKE
    "\u0126": "H",  # LATIN CAPITAL LETTER H WITH STROKE
    "\u0127": "h",  # LATIN SMALL LETTER H WITH STROKE
    "\u0131": "i",  # LATIN SMALL LETTER DOTLESS I
    "\u0138": "q",  # LATIN SMALL LETTER KRA (collates with q in DUCET)
    "\u0141": "L",  # LATIN CAPITAL LETTER L WITH STROKE
    "\u0142": "l",  # LATIN SMALL LETTER L WITH STROKE
    "\u0149": "'n",  # LATIN SMALL LETTER N PRECEDED BY APOSTROPHE (from ‹character-fallback›)
    "\u014a": "N",  # LATIN CAPITAL LETTER ENG
    "\u014b": "n",  # LATIN SMALL LETTER ENG
    "\u0152": "OE",  # LATIN CAPITAL LIGATURE OE (from ‹character-fallback›)
    "\u0153": "oe",  # LATIN SMALL LIGATURE OE (from ‹character-fallback›)
    "\u0166": "T",  # LATIN CAPITAL LETTER T WITH STROKE
    "\u0167": "t",  # LATIN SMALL LETTER T WITH STROKE
    "\u0180": "b",  # LATIN SMALL LETTER B WITH STROKE
    "\u0181": "B",  # LATIN CAPITAL LETTER B WITH HOOK
    "\u0182": "B",  # LATIN CAPITAL LETTER B WITH TOPBAR
    "\u0183": "b",  # LATIN SMALL LETTER B WITH TOPBAR
    "\u0187": "C",  # LATIN CAPITAL LETTER C WITH HOOK
    "\u0188": "c",  # LATIN SMALL LETTER C WITH HOOK
    "\u0189": "D",  # LATIN CAPITAL LETTER AFRICAN D
    "\u018a": "D",  # LATIN CAPITAL LETTER D WITH HOOK
    "\u018b": "D",  # LATIN CAPITAL LETTER D WITH TOPBAR
    "\u018c": "d",  # LATIN SMALL LETTER D WITH TOPBAR
    "\u0190": "E",  # LATIN CAPITAL LETTER OPEN E
    "\u0191": "F",  # LATIN CAPITAL LETTER F WITH HOOK
    "\u0192": "f",  # LATIN SMALL LETTER F WITH HOOK
    "\u0193": "G",  # LATIN CAPITAL LETTER G WITH HOOK
    "\u0195": "hv",  # LATIN SMALL LETTER HV
    "\u0196": "I",  # LATIN CAPITAL LETTER IOTA
    "\u0197": "I",  # LATIN CAPITAL LETTER I WITH STROKE
    "\u0198": "K",  # LATIN CAPITAL LETTER K WITH HOOK
    "\u0199": "k",  # LATIN SMALL LETTER K WITH HOOK
    "\u019a": "l",  # LATIN SMALL LETTER L WITH BAR
    "\u019d": "N",  # LATIN CAPITAL LETTER N WITH LEFT HOOK
    "\u019e": "n",  # LATIN SMALL LETTER N WITH LONG RIGHT LEG
    "\u01a2": "GH",  # LATIN CAPITAL LETTER GHA (see http://unicode.org/notes/tn27/)
    "\u01a3": "gh",  # LATIN SMALL LETTER GHA (see http://unicode.org/notes/tn27/)
    "\u01a4": "P",  # LATIN CAPITAL LETTER P WITH HOOK
    "\u01a5": "p",  # LATIN SMALL LETTER P WITH HOOK
    "\u01ab": "t",  # LATIN SMALL LETTER T WITH PALATAL HOOK
    "\u01ac": "T",  # LATIN CAPITAL LETTER T WITH HOOK
    "\u01ad": "t",  # LATIN SMALL LETTER T WITH HOOK
    "\u01ae": "T",  # LATIN CAPITAL LETTER T WITH RETROFLEX HOOK
    "\u01b2": "V",  # LATIN CAPITAL LETTER V WITH HOOK
    "\u01b3": "Y",  # LATIN CAPITAL LETTER Y WITH HOOK
    "\u01b4": "y",  # LATIN SMALL LETTER Y WITH HOOK
    "\u01b5": "Z",  # LATIN CAPITAL LETTER Z WITH STROKE
    "\u01b6": "z",  # LATIN SMALL LETTER Z WITH STROKE
    "\u01c4": "DZ",  # LATIN CAPITAL LETTER DZ WITH CARON (compat)
    "\u01c5": "Dz",  # LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON (compat)
    "\u01c6": "dz",  # LATIN SMALL LETTER DZ WITH CARON (compat)
    "\u01e4": "G",  # LATIN CAPITAL LETTER G WITH STROKE
    "\u01e5": "g",  # LATIN SMALL LETTER G WITH STROKE
    "\u0221": "d",  # LATIN SMALL LETTER D WITH CURL
    "\u0224": "Z",  # LATIN CAPITAL LETTER Z WITH HOOK
    "\u0225": "z",  # LATIN SMALL LETTER Z WITH HOOK
    "\u0234": "l",  # LATIN SMALL LETTER L WITH CURL
    "\u0235": "n",  # LATIN SMALL LETTER N WITH CURL
    "\u0236": "t",  # LATIN SMALL LETTER T WITH CURL
    "\u0237": "j",  # LATIN SMALL LETTER DOTLESS J
    "\u0238": "db",  # LATIN SMALL LETTER DB DIGRAPH
    "\u0239": "qp",  # LATIN SMALL LETTER QP DIGRAPH
    "\u023a": "A",  # LATIN CAPITAL LETTER A WITH STROKE
    "\u023b": "C",  # LATIN CAPITAL LETTER C WITH STROKE
    "\u023c": "c",  # LATIN SMALL LETTER C WITH STROKE
    "\u023d": "L",  # LATIN CAPITAL LETTER L WITH BAR
    "\u023e": "T",  # LATIN CAPITAL LETTER T WITH DIAGONAL STROKE
    "\u023f": "s",  # LATIN SMALL LETTER S WITH SWASH TAIL
    "\u0240": "z",  # LATIN SMALL LETTER Z WITH SWASH TAIL
    "\u0243": "B",  # LATIN CAPITAL LETTER B WITH STROKE
    "\u0244": "U",  # LATIN CAPITAL LETTER U BAR
    "\u0246": "E",  # LATIN CAPITAL LETTER E WITH STROKE
    "\u0247": "e",  # LATIN SMALL LETTER E WITH STROKE
    "\u0248": "J",  # LATIN CAPITAL LETTER J WITH STROKE
    "\u0249": "j",  # LATIN SMALL LETTER J WITH STROKE
    "\u024c": "R",  # LATIN CAPITAL LETTER R WITH STROKE
    "\u024d": "r",  # LATIN SMALL LETTER R WITH STROKE
    "\u024e": "Y",  # LATIN CAPITAL LETTER Y WITH STROKE
    "\u024f": "y",  # LATIN SMALL LETTER Y WITH STROKE
    "\u0253": "b",  # LATIN SMALL LETTER B WITH HOOK
    "\u0255": "c",  # LATIN SMALL LETTER C WITH CURL
    "\u0256": "d",  # LATIN SMALL LETTER D WITH TAIL
    "\u0257": "d",  # LATIN SMALL LETTER D WITH HOOK
    "\u025b": "e",  # LATIN SMALL LETTER OPEN E
    "\u025f": "j",  # LATIN SMALL LETTER DOTLESS J WITH STROKE
    "\u0260": "g",  # LATIN SMALL LETTER G WITH HOOK
    "\u0261": "g",  # LATIN SMALL LETTER SCRIPT G
    "\u0262": "G",  # LATIN LETTER SMALL CAPITAL G
    "\u0266": "h",  # LATIN SMALL LETTER H WITH HOOK
    "\u0267": "h",  # LATIN SMALL LETTER HENG WITH HOOK
    "\u0268": "i",  # LATIN SMALL LETTER I WITH STROKE
    "\u026a": "I",  # LATIN LETTER SMALL CAPITAL I
    "\u026b": "l",  # LATIN SMALL LETTER L WITH MIDDLE TILDE
    "\u026c": "l",  # LATIN SMALL LETTER L WITH BELT
    "\u026d": "l",  # LATIN SMALL LETTER L WITH RETROFLEX HOOK
    "\u0271": "m",  # LATIN SMALL LETTER M WITH HOOK
    "\u0272": "n",  # LATIN SMALL LETTER N WITH LEFT HOOK
    "\u0273": "n",  # LATIN SMALL LETTER N WITH RETROFLEX HOOK
    "\u0274": "N",  # LATIN LETTER SMALL CAPITAL N
    "\u0276": "OE",  # LATIN LETTER SMALL CAPITAL OE
    "\u027c": "r",  # LATIN SMALL LETTER R WITH LONG LEG
    "\u027d": "r",  # LATIN SMALL LETTER R WITH TAIL
    "\u027e": "r",  # LATIN SMALL LETTER R WITH FISHHOOK
    "\u0280": "R",  # LATIN LETTER SMALL CAPITAL R
    "\u0282": "s",  # LATIN SMALL LETTER S WITH HOOK
    "\u0288": "t",  # LATIN SMALL LETTER T WITH RETROFLEX HOOK
    "\u0289": "u",  # LATIN SMALL LETTER U BAR
    "\u028b": "v",  # LATIN SMALL LETTER V WITH HOOK
    "\u028f": "Y",  # LATIN LETTER SMALL CAPITAL Y
    "\u0290": "z",  # LATIN SMALL LETTER Z WITH RETROFLEX HOOK
    "\u0291": "z",  # LATIN SMALL LETTER Z WITH CURL
    "\u0299": "B",  # LATIN LETTER SMALL CAPITAL B
    "\u029b": "G",  # LATIN LETTER SMALL CAPITAL G WITH HOOK
    "\u029c": "H",  # LATIN LETTER SMALL CAPITAL H
    "\u029d": "j",  # LATIN SMALL LETTER J WITH CROSSED-TAIL
    "\u029f": "L",  # LATIN LETTER SMALL CAPITAL L
    "\u02a0": "q",  # LATIN SMALL LETTER Q WITH HOOK
    "\u02a3": "dz",  # LATIN SMALL LETTER DZ DIGRAPH
    "\u02a5": "dz",  # LATIN SMALL LETTER DZ DIGRAPH WITH CURL
    "\u02a6": "ts",  # LATIN SMALL LETTER TS DIGRAPH
    "\u02aa": "ls",  # LATIN SMALL LETTER LS DIGRAPH
    "\u02ab": "lz",  # LATIN SMALL LETTER LZ DIGRAPH
    "\u1d01": "AE",  # LATIN LETTER SMALL CAPITAL AE
    "\u1d03": "B",  # LATIN LETTER SMALL CAPITAL BARRED B
    "\u1d06": "D",  # LATIN LETTER SMALL CAPITAL ETH
    "\u1d0c": "L",  # LATIN LETTER SMALL CAPITAL L WITH STROKE
    "\u1d6b": "ue",  # LATIN SMALL LETTER UE
    "\u1d6c": "b",  # LATIN SMALL LETTER B WITH MIDDLE TILDE
    "\u1d6d": "d",  # LATIN SMALL LETTER D WITH MIDDLE TILDE
    "\u1d6e": "f",  # LATIN SMALL LETTER F WITH MIDDLE TILDE
    "\u1d6f": "m",  # LATIN SMALL LETTER M WITH MIDDLE TILDE
    "\u1d70": "n",  # LATIN SMALL LETTER N WITH MIDDLE TILDE
    "\u1d71": "p",  # LATIN SMALL LETTER P WITH MIDDLE TILDE
    "\u1d72": "r",  # LATIN SMALL LETTER R WITH MIDDLE TILDE
    "\u1d73": "r",  # LATIN SMALL LETTER R WITH FISHHOOK AND MIDDLE TILDE
    "\u1d74": "s",  # LATIN SMALL LETTER S WITH MIDDLE TILDE
    "\u1d75": "t",  # LATIN SMALL LETTER T WITH MIDDLE TILDE
    "\u1d76": "z",  # LATIN SMALL LETTER Z WITH MIDDLE TILDE
    "\u1d7a": "th",  # LATIN SMALL LETTER TH WITH STRIKETHROUGH
    "\u1d7b": "I",  # LATIN SMALL CAPITAL LETTER I WITH STROKE
    "\u1d7d": "p",  # LATIN SMALL LETTER P WITH STROKE
    "\u1d7e": "U",  # LATIN SMALL CAPITAL LETTER U WITH STROKE
    "\u1d80": "b",  # LATIN SMALL LETTER B WITH PALATAL HOOK
    "\u1d81": "d",  # LATIN SMALL LETTER D WITH PALATAL HOOK
    "\u1d82": "f",  # LATIN SMALL LETTER F WITH PALATAL HOOK
    "\u1d83": "g",  # LATIN SMALL LETTER G WITH PALATAL HOOK
    "\u1d84": "k",  # LATIN SMALL LETTER K WITH PALATAL HOOK
    "\u1d85": "l",  # LATIN SMALL LETTER L WITH PALATAL HOOK
    "\u1d86": "m",  # LATIN SMALL LETTER M WITH PALATAL HOOK
    "\u1d87": "n",  # LATIN SMALL LETTER N WITH PALATAL HOOK
    "\u1d88": "p",  # LATIN SMALL LETTER P WITH PALATAL HOOK
    "\u1d89": "r",  # LATIN SMALL LETTER R WITH PALATAL HOOK
    "\u1d8a": "s",  # LATIN SMALL LETTER S WITH PALATAL HOOK
    "\u1d8c": "v",  # LATIN SMALL LETTER V WITH PALATAL HOOK
    "\u1d8d": "x",  # LATIN SMALL LETTER X WITH PALATAL HOOK
    "\u1d8e": "z",  # LATIN SMALL LETTER Z WITH PALATAL HOOK
    "\u1d8f": "a",  # LATIN SMALL LETTER A WITH RETROFLEX HOOK
    "\u1d91": "d",  # LATIN SMALL LETTER D WITH HOOK AND TAIL
    "\u1d92": "e",  # LATIN SMALL LETTER E WITH RETROFLEX HOOK
    "\u1d93": "e",  # LATIN SMALL LETTER OPEN E WITH RETROFLEX HOOK
    "\u1d96": "i",  # LATIN SMALL LETTER I WITH RETROFLEX HOOK
    "\u1d99": "u",  # LATIN SMALL LETTER U WITH RETROFLEX HOOK
    "\u1e9a": "a",  # LATIN SMALL LETTER A WITH RIGHT HALF RING
    "\u1e9c": "s",  # LATIN SMALL LETTER LONG S WITH DIAGONAL STROKE
    "\u1e9d": "s",  # LATIN SMALL LETTER LONG S WITH HIGH STROKE
    "\u1e9e": "SS",  # LATIN CAPITAL LETTER SHARP S
    "\u1efa": "LL",  # LATIN CAPITAL LETTER MIDDLE-WELSH LL
    "\u1efb": "ll",  # LATIN SMALL LETTER MIDDLE-WELSH LL
    "\u1efc": "V",  # LATIN CAPITAL LETTER MIDDLE-WELSH V
    "\u1efd": "v",  # LATIN SMALL LETTER MIDDLE-WELSH V
    "\u1efe": "Y",  # LATIN CAPITAL LETTER Y WITH LOOP
    "\u1eff": "y",  # LATIN SMALL LETTER Y WITH LOOP
    "\u00a9": "(C)",  # COPYRIGHT SIGN (from ‹character-fallback›)
    "\u00ae": "(R)",  # REGISTERED SIGN (from ‹character-fallback›)
    "\u20a0": "CE",  # EURO-CURRENCY SIGN (from ‹character-fallback›)
    "\u20a2": "Cr",  # CRUZEIRO SIGN (from ‹character-fallback›)
    "\u20a3": "Fr.",  # FRENCH FRANC SIGN (from ‹character-fallback›)
    "\u20a4": "L.",  # LIRA SIGN (from ‹character-fallback›)
    "\u20a7": "Pts",  # PESETA SIGN (from ‹character-fallback›)
    "\u20ba": "TL",  # TURKISH LIRA SIGN (from ‹character-fallback›)
    "\u20b9": "Rs",  # INDIAN RUPEE SIGN (from ‹character-fallback›)
    "\u211e": "Rx",  # PRESCRIPTION TAKE (from ‹character-fallback›)
    "\u33a7": "m/s",  # SQUARE M OVER S (compat) (from ‹character-fallback›)
    "\u33ae": "rad/s",  # SQUARE RAD OVER S (compat) (from ‹character-fallback›)
    "\u33c6": "C/kg",  # SQUARE C OVER KG (compat) (from ‹character-fallback›)
    "\u33de": "V/m",  # SQUARE V OVER M (compat) (from ‹character-fallback›)
    "\u33df": "A/m",  # SQUARE A OVER M (compat) (from ‹character-fallback›)
    "\u00bc": " 1/4",  # VULGAR FRACTION ONE QUARTER (from ‹character-fallback›)
    "\u00bd": " 1/2",  # VULGAR FRACTION ONE HALF (from ‹character-fallback›)
    "\u00be": " 3/4",  # VULGAR FRACTION THREE QUARTERS (from ‹character-fallback›)
    "\u2153": " 1/3",  # VULGAR FRACTION ONE THIRD (from ‹character-fallback›)
    "\u2154": " 2/3",  # VULGAR FRACTION TWO THIRDS (from ‹character-fallback›)
    "\u2155": " 1/5",  # VULGAR FRACTION ONE FIFTH (from ‹character-fallback›)
    "\u2156": " 2/5",  # VULGAR FRACTION TWO FIFTHS (from ‹character-fallback›)
    "\u2157": " 3/5",  # VULGAR FRACTION THREE FIFTHS (from ‹character-fallback›)
    "\u2158": " 4/5",  # VULGAR FRACTION FOUR FIFTHS (from ‹character-fallback›)
    "\u2159": " 1/6",  # VULGAR FRACTION ONE SIXTH (from ‹character-fallback›)
    "\u215a": " 5/6",  # VULGAR FRACTION FIVE SIXTHS (from ‹character-fallback›)
    "\u215b": " 1/8",  # VULGAR FRACTION ONE EIGHTH (from ‹character-fallback›)
    "\u215c": " 3/8",  # VULGAR FRACTION THREE EIGHTHS (from ‹character-fallback›)
    "\u215d": " 5/8",  # VULGAR FRACTION FIVE EIGHTHS (from ‹character-fallback›)
    "\u215e": " 7/8",  # VULGAR FRACTION SEVEN EIGHTHS (from ‹character-fallback›)
    "\u215f": " 1/",  # FRACTION NUMERATOR ONE (from ‹character-fallback›)
    "\u3001": ",",  # IDEOGRAPHIC COMMA
    "\u3002": ".",  # IDEOGRAPHIC FULL STOP
    "\u00d7": "x",  # MULTIPLICATION SIGN
    "\u00f7": "/",  # DIVISION SIGN
    "\u00b7": ".",  # MIDDLE DOT
    "\u1e9f": "dd",  # LATIN SMALL LETTER DELTA
    "\u0184": "H",  # LATIN CAPITAL LETTER TONE SIX
    "\u0185": "h",  # LATIN SMALL LETTER TONE SIX
    "\u01be": "ts",  # LATIN LETTER TS LIGATION (see http://unicode.org/notes/tn27/)
}
433
+
434
+
435
# STJ # def _replace_unicode_simplify_combinations(char, pathsave, win_compat):
def _replace_unicode_simplify_combinations(char):  # STJ #
    """Return the ``_simplify_combinations`` replacement for ``char``.

    Characters without a table entry are returned unchanged. The commented-out
    signature above is the earlier form that also took ``pathsave`` and
    ``win_compat`` to sanitize the replacement for use in filenames; this
    version has no such step.
    """
    return _simplify_combinations.get(char, char)
445
+
446
+
447
# STJ # def unicode_simplify_combinations(string, pathsave=False, win_compat=False):
def unicode_simplify_combinations(string):  # STJ #
    """Expand combined characters in ``string`` into their plain replacements.

    Each character is passed through
    ``_replace_unicode_simplify_combinations`` and the pieces are joined back
    into a single string.
    """
    pieces = map(_replace_unicode_simplify_combinations, string)
    return "".join(pieces)
454
+
455
+
456
def unicode_simplify_accents(string):
    """Strip accents from ``string``.

    The text is decomposed with NFKD normalization and every code point with a
    non-zero canonical combining class is dropped, leaving the base
    characters behind.
    """
    decomposed = unicodedata.normalize("NFKD", string)
    base_chars = [char for char in decomposed if unicodedata.combining(char) == 0]
    return "".join(base_chars)
461
+
462
+
463
def asciipunct(string):
    """Simplify compatibility characters in ``string``, then its punctuation."""
    return unicode_simplify_punctuation(unicode_simplify_compatibility(string))
466
+
467
+
468
def unaccent(string):
    """Remove accents from ``string``."""
    stripped = unicode_simplify_accents(string)
    return stripped
471
+
472
+
473
# STJ # def replace_non_ascii(string, repl="_", pathsave=False, win_compat=False):
def replace_non_ascii(string, repl="_"):  # STJ #
    """Replace non-ASCII characters from ``string`` by ``repl``.

    The input is first simplified so that as many characters as possible end
    up with an ASCII equivalent: combined characters are expanded, accents are
    stripped, punctuation is simplified and compatibility characters are
    normalized (in that order). Every character that is still outside the
    ASCII range is then replaced by ``repl``, one replacement per character.

    Args:
        string: The text to convert.
        repl: Replacement text for each remaining non-ASCII character; may be
            empty or longer than one character.

    Returns:
        A string containing only ASCII characters.

    Raises:
        UnicodeEncodeError: If ``repl`` itself contains non-ASCII characters
            and at least one replacement was needed.
    """
    interim = unicode_simplify_combinations(string)
    interim = unicode_simplify_accents(interim)
    interim = unicode_simplify_punctuation(interim)
    interim = unicode_simplify_compatibility(interim)

    # Substitute per character instead of registering a codecs error handler:
    # codecs.register_error() mutates process-wide state, so re-registering
    # "repl" on every call let concurrent calls with different ``repl`` values
    # overwrite each other's handler.
    replaced = "".join(char if ord(char) < 128 else repl for char in interim)

    # Strict round-trip keeps the ASCII-only guarantee: a non-ASCII ``repl``
    # that was actually used raises UnicodeEncodeError here.
    return replaced.encode("ascii").decode("ascii")