audiolibrarian 0.16.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- audiolibrarian/__init__.py +19 -0
- audiolibrarian/audiofile/__init__.py +23 -0
- audiolibrarian/audiofile/audiofile.py +114 -0
- audiolibrarian/audiofile/formats/__init__.py +1 -0
- audiolibrarian/audiofile/formats/flac.py +207 -0
- audiolibrarian/audiofile/formats/m4a.py +221 -0
- audiolibrarian/audiofile/formats/mp3.py +259 -0
- audiolibrarian/audiofile/tags.py +48 -0
- audiolibrarian/audiosource.py +215 -0
- audiolibrarian/base.py +433 -0
- audiolibrarian/cli.py +123 -0
- audiolibrarian/commands.py +283 -0
- audiolibrarian/genremanager.py +176 -0
- audiolibrarian/musicbrainz.py +465 -0
- audiolibrarian/output.py +57 -0
- audiolibrarian/records.py +259 -0
- audiolibrarian/settings.py +79 -0
- audiolibrarian/sh.py +55 -0
- audiolibrarian/text.py +115 -0
- audiolibrarian-0.16.2.dist-info/METADATA +334 -0
- audiolibrarian-0.16.2.dist-info/RECORD +28 -0
- audiolibrarian-0.16.2.dist-info/WHEEL +4 -0
- audiolibrarian-0.16.2.dist-info/entry_points.txt +2 -0
- audiolibrarian-0.16.2.dist-info/licenses/COPYING +674 -0
- audiolibrarian-0.16.2.dist-info/licenses/LICENSE +674 -0
- picard_src/README.md +11 -0
- picard_src/__init__.py +7 -0
- picard_src/textencoding.py +495 -0
picard_src/README.md
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
# Notes #
|
2
|
+
|
3
|
+
Code in this directory comes from (or is derived from) the super-cool Picard project.
|
4
|
+
|
5
|
+
https://github.com/metabrainz/picard
|
6
|
+
|
7
|
+
I'd like to express my thanks to all who have contributed to that project.
|
8
|
+
|
9
|
+
Note: we could have just set the Picard library as a dependency of audiolibrarian, but
|
10
|
+
it includes other dependencies (such as PyQT) that add a lot of overhead not required
|
11
|
+
by this project.
|
picard_src/__init__.py
ADDED
@@ -0,0 +1,495 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
#
|
3
|
+
# Picard, the next-generation MusicBrainz tagger
|
4
|
+
#
|
5
|
+
# Copyright (C) 2004 Robert Kaye
|
6
|
+
# Copyright (C) 2006 Lukáš Lalinský
|
7
|
+
# Copyright (C) 2014 Sophist-UK
|
8
|
+
# Copyright (C) 2014, 2018 Laurent Monin
|
9
|
+
# Copyright (C) 2017 Sambhav Kothari
|
10
|
+
# Copyright (C) 2018-2019 Philipp Wolfer
|
11
|
+
#
|
12
|
+
# This program is free software; you can redistribute it and/or
|
13
|
+
# modify it under the terms of the GNU General Public License
|
14
|
+
# as published by the Free Software Foundation; either version 2
|
15
|
+
# of the License, or (at your option) any later version.
|
16
|
+
#
|
17
|
+
# This program is distributed in the hope that it will be useful,
|
18
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
19
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
20
|
+
# GNU General Public License for more details.
|
21
|
+
#
|
22
|
+
# You should have received a copy of the GNU General Public License
|
23
|
+
# along with this program; if not, write to the Free Software
|
24
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
25
|
+
|
26
|
+
|
27
|
+
# This module provides functionality for simplifying unicode strings.
|
28
|
+
|
29
|
+
# The unicode character set (of over 1m codepoints and 24,000 characters) includes:
|
30
|
+
# Normal ascii (latin) non-accented characters
|
31
|
+
# Combined latin characters e.g. ae in normal usage
|
32
|
+
# Compatibility combined latin characters (retained for compatibility with other character sets)
|
33
|
+
# These can look very similar to normal characters and can be confusing for searches, sort
|
34
|
+
# orders etc.
|
35
|
+
# Non-latin (e.g. japanese, greek, hebrew etc.) characters
|
36
|
+
# Both latin and non-latin characters can be accented. Accents can be either:
|
37
|
+
# Provided by separate nonspacing_mark characters which are visually overlaid (visually 1
|
38
|
+
# character is actually 2); or
|
39
|
+
# Integrated accented characters (i.e. non-accented characters combined with a nonspace_mark
|
40
|
+
# into a single character)
|
41
|
+
# Again these can be confusing for searches, sort orders etc.
|
42
|
+
# Punctuation can also be confusing in unicode e.g. several types of single or double quote mark.
|
43
|
+
|
44
|
+
# For latin script:
|
45
|
+
# Combined characters, accents and punctuation can be visually similar but look different to
|
46
|
+
# search engines,
|
47
|
+
# sort orders etc. and the number of ways to use similar looking characters can (does) result in
|
48
|
+
# inconsistent
|
49
|
+
# usage inside Music metadata.
|
50
|
+
#
|
51
|
+
# Simplifying the unicode character sets by many-to-one mappings can improve consistency and
|
52
|
+
# reduce confusion,
|
53
|
+
# however sometimes the choice of specific characters can be a deliberate part of an album, song
|
54
|
+
# title or artist name
|
55
|
+
# (and should not therefore be changed without careful thought) and occasionally the choice of
|
56
|
+
# characters can be
|
57
|
+
# malicious (i.e. to defeat firewalls or spam filters or to appear to be something else).
|
58
|
+
#
|
59
|
+
# Finally, given the size of the unicode character set, fonts are unlikely to display all
|
60
|
+
# characters,
|
61
|
+
# making simplification a necessity.
|
62
|
+
#
|
63
|
+
# Simplification may also be needed to make tags conform to ISO-8859-1 (extended ascii) or to
|
64
|
+
# make tags or filenames
|
65
|
+
# into ascii, perhaps because the file system or player cannot support unicode.
|
66
|
+
#
|
67
|
+
# Non-latin scripts may also need to be converted to latin scripts through:
|
68
|
+
# Translation (e.g. hebrew word for mother is translated to "mother"); or
|
69
|
+
# Transliteration (e.g. the SOUND of the hebrew letter or word is spelt out in latin)
|
70
|
+
# These are non-trivial, and the software to do these is far from comprehensive.
|
71
|
+
|
72
|
+
# This module provides utility functions to enable simplification of latin and punctuation unicode:
|
73
|
+
# 1. simplify compatibility characters;
|
74
|
+
# 2. split combined characters;
|
75
|
+
# 3. remove accents (entirely or if not in ISO-8859-1 as applicable);
|
76
|
+
# 4. replace remaining non-ascii or non-ISO-8859-1 characters with a default character
|
77
|
+
# This module also provides an extension infrastructure to allow translation and / or
|
78
|
+
# transliteration plugins to be added.
|
79
|
+
|
80
|
+
import codecs
|
81
|
+
import unicodedata
|
82
|
+
from functools import partial
|
83
|
+
|
84
|
+
# STJ # from picard.util import sanitize_filename
|
85
|
+
|
86
|
+
# LATIN SIMPLIFICATION
|
87
|
+
# The translation tables for punctuation and latin combined-characters are taken from
|
88
|
+
# http://unicode.org/repos/cldr/trunk/common/transforms/Latin-ASCII.xml
|
89
|
+
# Various bugs and mistakes in this have been ironed out during testing.
|
90
|
+
|
91
|
+
|
92
|
+
_additional_compatibility = {
|
93
|
+
"\u0276": "Œ", # LATIN LETTER SMALL CAPITAL OE
|
94
|
+
"\u1d00": "A", # LATIN LETTER SMALL CAPITAL A
|
95
|
+
"\u1d01": "Æ", # LATIN LETTER SMALL CAPITAL AE
|
96
|
+
"\u1d04": "C", # LATIN LETTER SMALL CAPITAL C
|
97
|
+
"\u1d05": "D", # LATIN LETTER SMALL CAPITAL D
|
98
|
+
"\u1d07": "E", # LATIN LETTER SMALL CAPITAL E
|
99
|
+
"\u1d0a": "J", # LATIN LETTER SMALL CAPITAL J
|
100
|
+
"\u1d0b": "K", # LATIN LETTER SMALL CAPITAL K
|
101
|
+
"\u1d0d": "M", # LATIN LETTER SMALL CAPITAL M
|
102
|
+
"\u1d0f": "O", # LATIN LETTER SMALL CAPITAL O
|
103
|
+
"\u1d18": "P", # LATIN LETTER SMALL CAPITAL P
|
104
|
+
"\u1d1b": "T", # LATIN LETTER SMALL CAPITAL T
|
105
|
+
"\u1d1c": "U", # LATIN LETTER SMALL CAPITAL U
|
106
|
+
"\u1d20": "V", # LATIN LETTER SMALL CAPITAL V
|
107
|
+
"\u1d21": "W", # LATIN LETTER SMALL CAPITAL W
|
108
|
+
"\u1d22": "Z", # LATIN LETTER SMALL CAPITAL Z
|
109
|
+
"\u3007": "0", # IDEOGRAPHIC NUMBER ZERO
|
110
|
+
"\u00a0": " ", # NO-BREAK SPACE
|
111
|
+
"\u3000": " ", # IDEOGRAPHIC SPACE (from ‹character-fallback›)
|
112
|
+
"\u2033": "”", # DOUBLE PRIME
|
113
|
+
}
|
114
|
+
|
115
|
+
|
116
|
+
def unicode_simplify_compatibility(string):
    """Replace compatibility characters in ``string`` with simpler equivalents.

    Each character is first looked up in ``_additional_compatibility``
    (characters without an entry are kept unchanged); the joined result is
    then normalized to Unicode form NFKC.
    """
    substituted = []
    for char in string:
        substituted.append(_additional_compatibility.get(char, char))
    return unicodedata.normalize("NFKC", "".join(substituted))
|
119
|
+
|
120
|
+
|
121
|
+
_simplify_punctuation = {
|
122
|
+
"\u013f": "L", # LATIN CAPITAL LETTER L WITH MIDDLE DOT (compat)
|
123
|
+
"\u0140": "l", # LATIN SMALL LETTER L WITH MIDDLE DOT (compat)
|
124
|
+
"\u2018": "'", # LEFT SINGLE QUOTATION MARK (from ‹character-fallback›)
|
125
|
+
"\u2019": "'", # RIGHT SINGLE QUOTATION MARK (from ‹character-fallback›)
|
126
|
+
"\u201a": "'", # SINGLE LOW-9 QUOTATION MARK (from ‹character-fallback›)
|
127
|
+
"\u201b": "'", # SINGLE HIGH-REVERSED-9 QUOTATION MARK (from ‹character-fallback›)
|
128
|
+
"\u201c": '"', # LEFT DOUBLE QUOTATION MARK (from ‹character-fallback›)
|
129
|
+
"\u201d": '"', # RIGHT DOUBLE QUOTATION MARK (from ‹character-fallback›)
|
130
|
+
"\u201e": '"', # DOUBLE LOW-9 QUOTATION MARK (from ‹character-fallback›)
|
131
|
+
"\u201f": '"', # DOUBLE HIGH-REVERSED-9 QUOTATION MARK (from ‹character-fallback›)
|
132
|
+
"\u2032": "'", # PRIME
|
133
|
+
"\u2033": '"', # DOUBLE PRIME
|
134
|
+
"\u301d": '"', # REVERSED DOUBLE PRIME QUOTATION MARK
|
135
|
+
"\u301e": '"', # DOUBLE PRIME QUOTATION MARK
|
136
|
+
"\u00ab": "<<", # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK (from ‹character-fallback›)
|
137
|
+
"\u00bb": ">>", # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK (from ‹character-fallback›)
|
138
|
+
"\u2039": "<", # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
|
139
|
+
"\u203a": ">", # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
|
140
|
+
"\u00ad": "", # SOFT HYPHEN (from ‹character-fallback›)
|
141
|
+
"\u2010": "-", # HYPHEN (from ‹character-fallback›)
|
142
|
+
"\u2011": "-", # NON-BREAKING HYPHEN (from ‹character-fallback›)
|
143
|
+
"\u2012": "-", # FIGURE DASH (from ‹character-fallback›)
|
144
|
+
"\u2013": "-", # EN DASH (from ‹character-fallback›)
|
145
|
+
"\u2014": "-", # EM DASH (from ‹character-fallback›)
|
146
|
+
"\u2015": "-", # HORIZONTAL BAR (from ‹character-fallback›)
|
147
|
+
"\ufe31": "|", # PRESENTATION FORM FOR VERTICAL EM DASH (compat)
|
148
|
+
"\ufe32": "|", # PRESENTATION FORM FOR VERTICAL EN DASH (compat)
|
149
|
+
"\ufe58": "-", # SMALL EM DASH (compat)
|
150
|
+
"\u2016": "||", # DOUBLE VERTICAL LINE
|
151
|
+
"\u2044": "/", # FRACTION SLASH (from ‹character-fallback›)
|
152
|
+
"\u2045": "[", # LEFT SQUARE BRACKET WITH QUILL
|
153
|
+
"\u2046": "]", # RIGHT SQUARE BRACKET WITH QUILL
|
154
|
+
"\u204e": "*", # LOW ASTERISK
|
155
|
+
"\u3008": "<", # LEFT ANGLE BRACKET
|
156
|
+
"\u3009": ">", # RIGHT ANGLE BRACKET
|
157
|
+
"\u300a": "<<", # LEFT DOUBLE ANGLE BRACKET
|
158
|
+
"\u300b": ">>", # RIGHT DOUBLE ANGLE BRACKET
|
159
|
+
"\u3014": "[", # LEFT TORTOISE SHELL BRACKET
|
160
|
+
"\u3015": "]", # RIGHT TORTOISE SHELL BRACKET
|
161
|
+
"\u3018": "[", # LEFT WHITE TORTOISE SHELL BRACKET
|
162
|
+
"\u3019": "]", # RIGHT WHITE TORTOISE SHELL BRACKET
|
163
|
+
"\u301a": "[", # LEFT WHITE SQUARE BRACKET
|
164
|
+
"\u301b": "]", # RIGHT WHITE SQUARE BRACKET
|
165
|
+
"\ufe11": ",", # PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC COMMA (compat)
|
166
|
+
"\ufe12": ".", # PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC FULL STOP (compat)
|
167
|
+
"\ufe39": "[", # PRESENTATION FORM FOR VERTICAL LEFT TORTOISE SHELL BRACKET (compat)
|
168
|
+
"\ufe3a": "]", # PRESENTATION FORM FOR VERTICAL RIGHT TORTOISE SHELL BRACKET (compat)
|
169
|
+
"\ufe3d": "<<", # PRESENTATION FORM FOR VERTICAL LEFT DOUBLE ANGLE BRACKET (compat)
|
170
|
+
"\ufe3e": ">>", # PRESENTATION FORM FOR VERTICAL RIGHT DOUBLE ANGLE BRACKET (compat)
|
171
|
+
"\ufe3f": "<", # PRESENTATION FORM FOR VERTICAL LEFT ANGLE BRACKET (compat)
|
172
|
+
"\ufe40": ">", # PRESENTATION FORM FOR VERTICAL RIGHT ANGLE BRACKET (compat)
|
173
|
+
"\ufe51": ",", # SMALL IDEOGRAPHIC COMMA (compat)
|
174
|
+
"\ufe5d": "[", # SMALL LEFT TORTOISE SHELL BRACKET (compat)
|
175
|
+
"\ufe5e": "]", # SMALL RIGHT TORTOISE SHELL BRACKET (compat)
|
176
|
+
"\uff5f": "((", # FULLWIDTH LEFT WHITE PARENTHESIS (compat)(from ‹character-fallback›)
|
177
|
+
"\uff60": "))", # FULLWIDTH RIGHT WHITE PARENTHESIS (compat)(from ‹character-fallback›)
|
178
|
+
"\uff61": ".", # HALFWIDTH IDEOGRAPHIC FULL STOP (compat)
|
179
|
+
"\uff64": ",", # HALFWIDTH IDEOGRAPHIC COMMA (compat)
|
180
|
+
"\u2212": "-", # MINUS SIGN (from ‹character-fallback›)
|
181
|
+
"\u2215": "/", # DIVISION SLASH (from ‹character-fallback›)
|
182
|
+
"\u2216": "\\", # SET MINUS (from ‹character-fallback›)
|
183
|
+
"\u2223": "|", # DIVIDES (from ‹character-fallback›)
|
184
|
+
"\u2225": "||", # PARALLEL TO (from ‹character-fallback›)
|
185
|
+
"\u226a": "<<", # MUCH LESS-THAN
|
186
|
+
"\u226b": ">>", # MUCH GREATER-THAN
|
187
|
+
"\u2985": "((", # LEFT WHITE PARENTHESIS
|
188
|
+
"\u2986": "))", # RIGHT WHITE PARENTHESIS
|
189
|
+
"\u2022": "-", # BULLET
|
190
|
+
"\u200b": "", # Zero Width Space
|
191
|
+
}
|
192
|
+
|
193
|
+
|
194
|
+
# STJ # def unicode_simplify_punctuation(string, pathsave=False, win_compat=False):
|
195
|
+
def unicode_simplify_punctuation(string):  # STJ #
    """Map the punctuation in ``string`` through ``_simplify_punctuation``.

    A character with an entry in the table is swapped for the table value
    (which may be empty or longer than one character); every other character
    is kept as it is.
    """
    # STJ # The original signature also took ``pathsave`` and ``win_compat``
    # STJ # and ran changed replacements through ``sanitize_filename``; that
    # STJ # step is disabled here.
    return "".join(_simplify_punctuation.get(char, char) for char in string)
|
206
|
+
|
207
|
+
|
208
|
+
_simplify_combinations = {
|
209
|
+
"\u00c6": "AE", # LATIN CAPITAL LETTER AE (from ‹character-fallback›)
|
210
|
+
"\u00d0": "D", # LATIN CAPITAL LETTER ETH
|
211
|
+
"\u00d8": "OE", # LATIN CAPITAL LETTER O WITH STROKE see https://en.wikipedia.org/wiki/%C3%98
|
212
|
+
"\u00de": "TH", # LATIN CAPITAL LETTER THORN
|
213
|
+
"\u00df": "ss", # LATIN SMALL LETTER SHARP S (from ‹character-fallback›)
|
214
|
+
"\u00e6": "ae", # LATIN SMALL LETTER AE (from ‹character-fallback›)
|
215
|
+
"\u00f0": "d", # LATIN SMALL LETTER ETH
|
216
|
+
"\u00f8": "oe", # LATIN SMALL LETTER O WITH STROKE (see https://en.wikipedia.org/wiki/%C3%98)
|
217
|
+
"\u00fe": "th", # LATIN SMALL LETTER THORN
|
218
|
+
"\u0110": "D", # LATIN CAPITAL LETTER D WITH STROKE
|
219
|
+
"\u0111": "d", # LATIN SMALL LETTER D WITH STROKE
|
220
|
+
"\u0126": "H", # LATIN CAPITAL LETTER H WITH STROKE
|
221
|
+
"\u0127": "h", # LATIN SMALL LETTER H WITH STROKE
|
222
|
+
"\u0131": "i", # LATIN SMALL LETTER DOTLESS I
|
223
|
+
"\u0138": "q", # LATIN SMALL LETTER KRA (collates with q in DUCET)
|
224
|
+
"\u0141": "L", # LATIN CAPITAL LETTER L WITH STROKE
|
225
|
+
"\u0142": "l", # LATIN SMALL LETTER L WITH STROKE
|
226
|
+
"\u0149": "'n", # LATIN SMALL LETTER N PRECEDED BY APOSTROPHE (from ‹character-fallback›)
|
227
|
+
"\u014a": "N", # LATIN CAPITAL LETTER ENG
|
228
|
+
"\u014b": "n", # LATIN SMALL LETTER ENG
|
229
|
+
"\u0152": "OE", # LATIN CAPITAL LIGATURE OE (from ‹character-fallback›)
|
230
|
+
"\u0153": "oe", # LATIN SMALL LIGATURE OE (from ‹character-fallback›)
|
231
|
+
"\u0166": "T", # LATIN CAPITAL LETTER T WITH STROKE
|
232
|
+
"\u0167": "t", # LATIN SMALL LETTER T WITH STROKE
|
233
|
+
"\u0180": "b", # LATIN SMALL LETTER B WITH STROKE
|
234
|
+
"\u0181": "B", # LATIN CAPITAL LETTER B WITH HOOK
|
235
|
+
"\u0182": "B", # LATIN CAPITAL LETTER B WITH TOPBAR
|
236
|
+
"\u0183": "b", # LATIN SMALL LETTER B WITH TOPBAR
|
237
|
+
"\u0187": "C", # LATIN CAPITAL LETTER C WITH HOOK
|
238
|
+
"\u0188": "c", # LATIN SMALL LETTER C WITH HOOK
|
239
|
+
"\u0189": "D", # LATIN CAPITAL LETTER AFRICAN D
|
240
|
+
"\u018a": "D", # LATIN CAPITAL LETTER D WITH HOOK
|
241
|
+
"\u018b": "D", # LATIN CAPITAL LETTER D WITH TOPBAR
|
242
|
+
"\u018c": "d", # LATIN SMALL LETTER D WITH TOPBAR
|
243
|
+
"\u0190": "E", # LATIN CAPITAL LETTER OPEN E
|
244
|
+
"\u0191": "F", # LATIN CAPITAL LETTER F WITH HOOK
|
245
|
+
"\u0192": "f", # LATIN SMALL LETTER F WITH HOOK
|
246
|
+
"\u0193": "G", # LATIN CAPITAL LETTER G WITH HOOK
|
247
|
+
"\u0195": "hv", # LATIN SMALL LETTER HV
|
248
|
+
"\u0196": "I", # LATIN CAPITAL LETTER IOTA
|
249
|
+
"\u0197": "I", # LATIN CAPITAL LETTER I WITH STROKE
|
250
|
+
"\u0198": "K", # LATIN CAPITAL LETTER K WITH HOOK
|
251
|
+
"\u0199": "k", # LATIN SMALL LETTER K WITH HOOK
|
252
|
+
"\u019a": "l", # LATIN SMALL LETTER L WITH BAR
|
253
|
+
"\u019d": "N", # LATIN CAPITAL LETTER N WITH LEFT HOOK
|
254
|
+
"\u019e": "n", # LATIN SMALL LETTER N WITH LONG RIGHT LEG
|
255
|
+
"\u01a2": "GH", # LATIN CAPITAL LETTER GHA (see http://unicode.org/notes/tn27/)
|
256
|
+
"\u01a3": "gh", # LATIN SMALL LETTER GHA (see http://unicode.org/notes/tn27/)
|
257
|
+
"\u01a4": "P", # LATIN CAPITAL LETTER P WITH HOOK
|
258
|
+
"\u01a5": "p", # LATIN SMALL LETTER P WITH HOOK
|
259
|
+
"\u01ab": "t", # LATIN SMALL LETTER T WITH PALATAL HOOK
|
260
|
+
"\u01ac": "T", # LATIN CAPITAL LETTER T WITH HOOK
|
261
|
+
"\u01ad": "t", # LATIN SMALL LETTER T WITH HOOK
|
262
|
+
"\u01ae": "T", # LATIN CAPITAL LETTER T WITH RETROFLEX HOOK
|
263
|
+
"\u01b2": "V", # LATIN CAPITAL LETTER V WITH HOOK
|
264
|
+
"\u01b3": "Y", # LATIN CAPITAL LETTER Y WITH HOOK
|
265
|
+
"\u01b4": "y", # LATIN SMALL LETTER Y WITH HOOK
|
266
|
+
"\u01b5": "Z", # LATIN CAPITAL LETTER Z WITH STROKE
|
267
|
+
"\u01b6": "z", # LATIN SMALL LETTER Z WITH STROKE
|
268
|
+
"\u01c4": "DZ", # LATIN CAPITAL LETTER DZ WITH CARON (compat)
|
269
|
+
"\u01c5": "Dz", # LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON (compat)
|
270
|
+
"\u01c6": "dz", # LATIN SMALL LETTER DZ WITH CARON (compat)
|
271
|
+
"\u01e4": "G", # LATIN CAPITAL LETTER G WITH STROKE
|
272
|
+
"\u01e5": "g", # LATIN SMALL LETTER G WITH STROKE
|
273
|
+
"\u0221": "d", # LATIN SMALL LETTER D WITH CURL
|
274
|
+
"\u0224": "Z", # LATIN CAPITAL LETTER Z WITH HOOK
|
275
|
+
"\u0225": "z", # LATIN SMALL LETTER Z WITH HOOK
|
276
|
+
"\u0234": "l", # LATIN SMALL LETTER L WITH CURL
|
277
|
+
"\u0235": "n", # LATIN SMALL LETTER N WITH CURL
|
278
|
+
"\u0236": "t", # LATIN SMALL LETTER T WITH CURL
|
279
|
+
"\u0237": "j", # LATIN SMALL LETTER DOTLESS J
|
280
|
+
"\u0238": "db", # LATIN SMALL LETTER DB DIGRAPH
|
281
|
+
"\u0239": "qp", # LATIN SMALL LETTER QP DIGRAPH
|
282
|
+
"\u023a": "A", # LATIN CAPITAL LETTER A WITH STROKE
|
283
|
+
"\u023b": "C", # LATIN CAPITAL LETTER C WITH STROKE
|
284
|
+
"\u023c": "c", # LATIN SMALL LETTER C WITH STROKE
|
285
|
+
"\u023d": "L", # LATIN CAPITAL LETTER L WITH BAR
|
286
|
+
"\u023e": "T", # LATIN CAPITAL LETTER T WITH DIAGONAL STROKE
|
287
|
+
"\u023f": "s", # LATIN SMALL LETTER S WITH SWASH TAIL
|
288
|
+
"\u0240": "z", # LATIN SMALL LETTER Z WITH SWASH TAIL
|
289
|
+
"\u0243": "B", # LATIN CAPITAL LETTER B WITH STROKE
|
290
|
+
"\u0244": "U", # LATIN CAPITAL LETTER U BAR
|
291
|
+
"\u0246": "E", # LATIN CAPITAL LETTER E WITH STROKE
|
292
|
+
"\u0247": "e", # LATIN SMALL LETTER E WITH STROKE
|
293
|
+
"\u0248": "J", # LATIN CAPITAL LETTER J WITH STROKE
|
294
|
+
"\u0249": "j", # LATIN SMALL LETTER J WITH STROKE
|
295
|
+
"\u024c": "R", # LATIN CAPITAL LETTER R WITH STROKE
|
296
|
+
"\u024d": "r", # LATIN SMALL LETTER R WITH STROKE
|
297
|
+
"\u024e": "Y", # LATIN CAPITAL LETTER Y WITH STROKE
|
298
|
+
"\u024f": "y", # LATIN SMALL LETTER Y WITH STROKE
|
299
|
+
"\u0253": "b", # LATIN SMALL LETTER B WITH HOOK
|
300
|
+
"\u0255": "c", # LATIN SMALL LETTER C WITH CURL
|
301
|
+
"\u0256": "d", # LATIN SMALL LETTER D WITH TAIL
|
302
|
+
"\u0257": "d", # LATIN SMALL LETTER D WITH HOOK
|
303
|
+
"\u025b": "e", # LATIN SMALL LETTER OPEN E
|
304
|
+
"\u025f": "j", # LATIN SMALL LETTER DOTLESS J WITH STROKE
|
305
|
+
"\u0260": "g", # LATIN SMALL LETTER G WITH HOOK
|
306
|
+
"\u0261": "g", # LATIN SMALL LETTER SCRIPT G
|
307
|
+
"\u0262": "G", # LATIN LETTER SMALL CAPITAL G
|
308
|
+
"\u0266": "h", # LATIN SMALL LETTER H WITH HOOK
|
309
|
+
"\u0267": "h", # LATIN SMALL LETTER HENG WITH HOOK
|
310
|
+
"\u0268": "i", # LATIN SMALL LETTER I WITH STROKE
|
311
|
+
"\u026a": "I", # LATIN LETTER SMALL CAPITAL I
|
312
|
+
"\u026b": "l", # LATIN SMALL LETTER L WITH MIDDLE TILDE
|
313
|
+
"\u026c": "l", # LATIN SMALL LETTER L WITH BELT
|
314
|
+
"\u026d": "l", # LATIN SMALL LETTER L WITH RETROFLEX HOOK
|
315
|
+
"\u0271": "m", # LATIN SMALL LETTER M WITH HOOK
|
316
|
+
"\u0272": "n", # LATIN SMALL LETTER N WITH LEFT HOOK
|
317
|
+
"\u0273": "n", # LATIN SMALL LETTER N WITH RETROFLEX HOOK
|
318
|
+
"\u0274": "N", # LATIN LETTER SMALL CAPITAL N
|
319
|
+
"\u0276": "OE", # LATIN LETTER SMALL CAPITAL OE
|
320
|
+
"\u027c": "r", # LATIN SMALL LETTER R WITH LONG LEG
|
321
|
+
"\u027d": "r", # LATIN SMALL LETTER R WITH TAIL
|
322
|
+
"\u027e": "r", # LATIN SMALL LETTER R WITH FISHHOOK
|
323
|
+
"\u0280": "R", # LATIN LETTER SMALL CAPITAL R
|
324
|
+
"\u0282": "s", # LATIN SMALL LETTER S WITH HOOK
|
325
|
+
"\u0288": "t", # LATIN SMALL LETTER T WITH RETROFLEX HOOK
|
326
|
+
"\u0289": "u", # LATIN SMALL LETTER U BAR
|
327
|
+
"\u028b": "v", # LATIN SMALL LETTER V WITH HOOK
|
328
|
+
"\u028f": "Y", # LATIN LETTER SMALL CAPITAL Y
|
329
|
+
"\u0290": "z", # LATIN SMALL LETTER Z WITH RETROFLEX HOOK
|
330
|
+
"\u0291": "z", # LATIN SMALL LETTER Z WITH CURL
|
331
|
+
"\u0299": "B", # LATIN LETTER SMALL CAPITAL B
|
332
|
+
"\u029b": "G", # LATIN LETTER SMALL CAPITAL G WITH HOOK
|
333
|
+
"\u029c": "H", # LATIN LETTER SMALL CAPITAL H
|
334
|
+
"\u029d": "j", # LATIN SMALL LETTER J WITH CROSSED-TAIL
|
335
|
+
"\u029f": "L", # LATIN LETTER SMALL CAPITAL L
|
336
|
+
"\u02a0": "q", # LATIN SMALL LETTER Q WITH HOOK
|
337
|
+
"\u02a3": "dz", # LATIN SMALL LETTER DZ DIGRAPH
|
338
|
+
"\u02a5": "dz", # LATIN SMALL LETTER DZ DIGRAPH WITH CURL
|
339
|
+
"\u02a6": "ts", # LATIN SMALL LETTER TS DIGRAPH
|
340
|
+
"\u02aa": "ls", # LATIN SMALL LETTER LS DIGRAPH
|
341
|
+
"\u02ab": "lz", # LATIN SMALL LETTER LZ DIGRAPH
|
342
|
+
"\u1d01": "AE", # LATIN LETTER SMALL CAPITAL AE
|
343
|
+
"\u1d03": "B", # LATIN LETTER SMALL CAPITAL BARRED B
|
344
|
+
"\u1d06": "D", # LATIN LETTER SMALL CAPITAL ETH
|
345
|
+
"\u1d0c": "L", # LATIN LETTER SMALL CAPITAL L WITH STROKE
|
346
|
+
"\u1d6b": "ue", # LATIN SMALL LETTER UE
|
347
|
+
"\u1d6c": "b", # LATIN SMALL LETTER B WITH MIDDLE TILDE
|
348
|
+
"\u1d6d": "d", # LATIN SMALL LETTER D WITH MIDDLE TILDE
|
349
|
+
"\u1d6e": "f", # LATIN SMALL LETTER F WITH MIDDLE TILDE
|
350
|
+
"\u1d6f": "m", # LATIN SMALL LETTER M WITH MIDDLE TILDE
|
351
|
+
"\u1d70": "n", # LATIN SMALL LETTER N WITH MIDDLE TILDE
|
352
|
+
"\u1d71": "p", # LATIN SMALL LETTER P WITH MIDDLE TILDE
|
353
|
+
"\u1d72": "r", # LATIN SMALL LETTER R WITH MIDDLE TILDE
|
354
|
+
"\u1d73": "r", # LATIN SMALL LETTER R WITH FISHHOOK AND MIDDLE TILDE
|
355
|
+
"\u1d74": "s", # LATIN SMALL LETTER S WITH MIDDLE TILDE
|
356
|
+
"\u1d75": "t", # LATIN SMALL LETTER T WITH MIDDLE TILDE
|
357
|
+
"\u1d76": "z", # LATIN SMALL LETTER Z WITH MIDDLE TILDE
|
358
|
+
"\u1d7a": "th", # LATIN SMALL LETTER TH WITH STRIKETHROUGH
|
359
|
+
"\u1d7b": "I", # LATIN SMALL CAPITAL LETTER I WITH STROKE
|
360
|
+
"\u1d7d": "p", # LATIN SMALL LETTER P WITH STROKE
|
361
|
+
"\u1d7e": "U", # LATIN SMALL CAPITAL LETTER U WITH STROKE
|
362
|
+
"\u1d80": "b", # LATIN SMALL LETTER B WITH PALATAL HOOK
|
363
|
+
"\u1d81": "d", # LATIN SMALL LETTER D WITH PALATAL HOOK
|
364
|
+
"\u1d82": "f", # LATIN SMALL LETTER F WITH PALATAL HOOK
|
365
|
+
"\u1d83": "g", # LATIN SMALL LETTER G WITH PALATAL HOOK
|
366
|
+
"\u1d84": "k", # LATIN SMALL LETTER K WITH PALATAL HOOK
|
367
|
+
"\u1d85": "l", # LATIN SMALL LETTER L WITH PALATAL HOOK
|
368
|
+
"\u1d86": "m", # LATIN SMALL LETTER M WITH PALATAL HOOK
|
369
|
+
"\u1d87": "n", # LATIN SMALL LETTER N WITH PALATAL HOOK
|
370
|
+
"\u1d88": "p", # LATIN SMALL LETTER P WITH PALATAL HOOK
|
371
|
+
"\u1d89": "r", # LATIN SMALL LETTER R WITH PALATAL HOOK
|
372
|
+
"\u1d8a": "s", # LATIN SMALL LETTER S WITH PALATAL HOOK
|
373
|
+
"\u1d8c": "v", # LATIN SMALL LETTER V WITH PALATAL HOOK
|
374
|
+
"\u1d8d": "x", # LATIN SMALL LETTER X WITH PALATAL HOOK
|
375
|
+
"\u1d8e": "z", # LATIN SMALL LETTER Z WITH PALATAL HOOK
|
376
|
+
"\u1d8f": "a", # LATIN SMALL LETTER A WITH RETROFLEX HOOK
|
377
|
+
"\u1d91": "d", # LATIN SMALL LETTER D WITH HOOK AND TAIL
|
378
|
+
"\u1d92": "e", # LATIN SMALL LETTER E WITH RETROFLEX HOOK
|
379
|
+
"\u1d93": "e", # LATIN SMALL LETTER OPEN E WITH RETROFLEX HOOK
|
380
|
+
"\u1d96": "i", # LATIN SMALL LETTER I WITH RETROFLEX HOOK
|
381
|
+
"\u1d99": "u", # LATIN SMALL LETTER U WITH RETROFLEX HOOK
|
382
|
+
"\u1e9a": "a", # LATIN SMALL LETTER A WITH RIGHT HALF RING
|
383
|
+
"\u1e9c": "s", # LATIN SMALL LETTER LONG S WITH DIAGONAL STROKE
|
384
|
+
"\u1e9d": "s", # LATIN SMALL LETTER LONG S WITH HIGH STROKE
|
385
|
+
"\u1e9e": "SS", # LATIN CAPITAL LETTER SHARP S
|
386
|
+
"\u1efa": "LL", # LATIN CAPITAL LETTER MIDDLE-WELSH LL
|
387
|
+
"\u1efb": "ll", # LATIN SMALL LETTER MIDDLE-WELSH LL
|
388
|
+
"\u1efc": "V", # LATIN CAPITAL LETTER MIDDLE-WELSH V
|
389
|
+
"\u1efd": "v", # LATIN SMALL LETTER MIDDLE-WELSH V
|
390
|
+
"\u1efe": "Y", # LATIN CAPITAL LETTER Y WITH LOOP
|
391
|
+
"\u1eff": "y", # LATIN SMALL LETTER Y WITH LOOP
|
392
|
+
"\u00a9": "(C)", # COPYRIGHT SIGN (from ‹character-fallback›)
|
393
|
+
"\u00ae": "(R)", # REGISTERED SIGN (from ‹character-fallback›)
|
394
|
+
"\u20a0": "CE", # EURO-CURRENCY SIGN (from ‹character-fallback›)
|
395
|
+
"\u20a2": "Cr", # CRUZEIRO SIGN (from ‹character-fallback›)
|
396
|
+
"\u20a3": "Fr.", # FRENCH FRANC SIGN (from ‹character-fallback›)
|
397
|
+
"\u20a4": "L.", # LIRA SIGN (from ‹character-fallback›)
|
398
|
+
"\u20a7": "Pts", # PESETA SIGN (from ‹character-fallback›)
|
399
|
+
"\u20ba": "TL", # TURKISH LIRA SIGN (from ‹character-fallback›)
|
400
|
+
"\u20b9": "Rs", # INDIAN RUPEE SIGN (from ‹character-fallback›)
|
401
|
+
"\u211e": "Rx", # PRESCRIPTION TAKE (from ‹character-fallback›)
|
402
|
+
"\u33a7": "m/s", # SQUARE M OVER S (compat) (from ‹character-fallback›)
|
403
|
+
"\u33ae": "rad/s", # SQUARE RAD OVER S (compat) (from ‹character-fallback›)
|
404
|
+
"\u33c6": "C/kg", # SQUARE C OVER KG (compat) (from ‹character-fallback›)
|
405
|
+
"\u33de": "V/m", # SQUARE V OVER M (compat) (from ‹character-fallback›)
|
406
|
+
"\u33df": "A/m", # SQUARE A OVER M (compat) (from ‹character-fallback›)
|
407
|
+
"\u00bc": " 1/4", # VULGAR FRACTION ONE QUARTER (from ‹character-fallback›)
|
408
|
+
"\u00bd": " 1/2", # VULGAR FRACTION ONE HALF (from ‹character-fallback›)
|
409
|
+
"\u00be": " 3/4", # VULGAR FRACTION THREE QUARTERS (from ‹character-fallback›)
|
410
|
+
"\u2153": " 1/3", # VULGAR FRACTION ONE THIRD (from ‹character-fallback›)
|
411
|
+
"\u2154": " 2/3", # VULGAR FRACTION TWO THIRDS (from ‹character-fallback›)
|
412
|
+
"\u2155": " 1/5", # VULGAR FRACTION ONE FIFTH (from ‹character-fallback›)
|
413
|
+
"\u2156": " 2/5", # VULGAR FRACTION TWO FIFTHS (from ‹character-fallback›)
|
414
|
+
"\u2157": " 3/5", # VULGAR FRACTION THREE FIFTHS (from ‹character-fallback›)
|
415
|
+
"\u2158": " 4/5", # VULGAR FRACTION FOUR FIFTHS (from ‹character-fallback›)
|
416
|
+
"\u2159": " 1/6", # VULGAR FRACTION ONE SIXTH (from ‹character-fallback›)
|
417
|
+
"\u215a": " 5/6", # VULGAR FRACTION FIVE SIXTHS (from ‹character-fallback›)
|
418
|
+
"\u215b": " 1/8", # VULGAR FRACTION ONE EIGHTH (from ‹character-fallback›)
|
419
|
+
"\u215c": " 3/8", # VULGAR FRACTION THREE EIGHTHS (from ‹character-fallback›)
|
420
|
+
"\u215d": " 5/8", # VULGAR FRACTION FIVE EIGHTHS (from ‹character-fallback›)
|
421
|
+
"\u215e": " 7/8", # VULGAR FRACTION SEVEN EIGHTHS (from ‹character-fallback›)
|
422
|
+
"\u215f": " 1/", # FRACTION NUMERATOR ONE (from ‹character-fallback›)
|
423
|
+
"\u3001": ",", # IDEOGRAPHIC COMMA
|
424
|
+
"\u3002": ".", # IDEOGRAPHIC FULL STOP
|
425
|
+
"\u00d7": "x", # MULTIPLICATION SIGN
|
426
|
+
"\u00f7": "/", # DIVISION SIGN
|
427
|
+
"\u00b7": ".", # MIDDLE DOT
|
428
|
+
"\u1e9f": "dd", # LATIN SMALL LETTER DELTA
|
429
|
+
"\u0184": "H", # LATIN CAPITAL LETTER TONE SIX
|
430
|
+
"\u0185": "h", # LATIN SMALL LETTER TONE SIX
|
431
|
+
"\u01be": "ts", # LATIN LETTER TS LIGATION (see http://unicode.org/notes/tn27/)
|
432
|
+
}
|
433
|
+
|
434
|
+
|
435
|
+
# STJ # def _replace_unicode_simplify_combinations(char, pathsave, win_compat):
|
436
|
+
def _replace_unicode_simplify_combinations(char):  # STJ #
    """Return the ``_simplify_combinations`` replacement for ``char``.

    ``char`` itself is returned when the table has no entry for it.
    """
    # STJ # The original ``pathsave``/``win_compat`` branch, which passed the
    # STJ # replacement through ``sanitize_filename``, is disabled here.
    replacement = _simplify_combinations.get(char)
    return char if replacement is None else replacement  # STJ #
|
445
|
+
|
446
|
+
|
447
|
+
# STJ # def unicode_simplify_combinations(string, pathsave=False, win_compat=False):
|
448
|
+
def unicode_simplify_combinations(string):  # STJ #
    """Expand combined characters in ``string`` one character at a time.

    Every character is passed through
    ``_replace_unicode_simplify_combinations`` and the pieces are joined.
    """
    # STJ # The original also forwarded ``pathsave`` and ``win_compat`` to
    # STJ # the per-character helper.
    pieces = map(_replace_unicode_simplify_combinations, string)
    return "".join(pieces)
|
454
|
+
|
455
|
+
|
456
|
+
def unicode_simplify_accents(string):
    """Strip combining marks from ``string``.

    The text is decomposed with NFKD and every character for which
    ``unicodedata.combining`` reports a non-zero class is dropped.
    """
    kept = []
    for char in unicodedata.normalize("NFKD", string):
        if unicodedata.combining(char):
            continue  # combining mark (e.g. an accent): leave it out
        kept.append(char)
    return "".join(kept)
|
461
|
+
|
462
|
+
|
463
|
+
def asciipunct(string):
    """Simplify compatibility characters, then punctuation, in ``string``."""
    return unicode_simplify_punctuation(unicode_simplify_compatibility(string))
|
466
|
+
|
467
|
+
|
468
|
+
def unaccent(string):
    """Return ``string`` with its accents removed.

    Thin wrapper around ``unicode_simplify_accents``.
    """
    unaccented = unicode_simplify_accents(string)
    return unaccented
|
471
|
+
|
472
|
+
|
473
|
+
# STJ # def replace_non_ascii(string, repl="_", pathsave=False, win_compat=False):
|
474
|
+
def replace_non_ascii(string, repl="_"):  # STJ #
    """Replace non-ASCII characters from ``string`` by ``repl``.

    The string is first simplified (combinations, accents, punctuation and
    compatibility characters, in that order) so that as many characters as
    possible get an ASCII equivalent. Each character that is still non-ASCII
    afterwards is replaced by ``repl``.

    The replacement is done directly instead of through
    ``codecs.register_error``: the codec error registry is process-wide, so
    re-registering a handler named "repl" on every call would let concurrent
    calls with different ``repl`` values interfere with each other and would
    overwrite any handler of the same name registered elsewhere.

    Args:
        string: The text to convert.
        repl: Text substituted for each remaining non-ASCII character.

    Returns:
        A string made up of ASCII characters only.

    Raises:
        UnicodeEncodeError: If a replacement was needed and ``repl`` itself
            contains non-ASCII characters.
    """
    # STJ # The original signature also took ``pathsave`` and ``win_compat``
    # STJ # and forwarded them to the simplify helpers.
    interim = unicode_simplify_combinations(string)  # STJ #
    interim = unicode_simplify_accents(interim)
    interim = unicode_simplify_punctuation(interim)  # STJ #
    interim = unicode_simplify_compatibility(interim)

    # One ``repl`` per leftover non-ASCII character.
    replaced = "".join(char if ord(char) < 128 else repl for char in interim)

    # Strict round-trip: guarantees the result is pure ASCII and raises
    # UnicodeEncodeError when a non-ASCII ``repl`` ended up in the output.
    return replaced.encode("ascii").decode("ascii")
|