datamarket 0.9.30__py3-none-any.whl → 0.9.31__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamarket/utils/strings/normalization.py +88 -29
- {datamarket-0.9.30.dist-info → datamarket-0.9.31.dist-info}/METADATA +1 -1
- {datamarket-0.9.30.dist-info → datamarket-0.9.31.dist-info}/RECORD +5 -5
- {datamarket-0.9.30.dist-info → datamarket-0.9.31.dist-info}/LICENSE +0 -0
- {datamarket-0.9.30.dist-info → datamarket-0.9.31.dist-info}/WHEEL +0 -0
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
|
|
4
4
|
import unicodedata
|
|
5
5
|
from enum import Enum, auto
|
|
6
|
-
from typing import Any
|
|
6
|
+
from typing import Any, Optional, Set
|
|
7
7
|
|
|
8
8
|
import numpy as np
|
|
9
9
|
from inflection import camelize, parameterize, titleize, underscore
|
|
@@ -23,6 +23,8 @@ class NormalizationMode(Enum):
|
|
|
23
23
|
|
|
24
24
|
class NamingConvention(Enum):
|
|
25
25
|
NONE = auto() # no style change
|
|
26
|
+
LOWER = auto() # lowercase
|
|
27
|
+
UPPER = auto() # UPPERCASE
|
|
26
28
|
CONSTANT = auto() # CONSTANT_CASE (uppercase, underscores)
|
|
27
29
|
SNAKE = auto() # snake_case (lowercase, underscores)
|
|
28
30
|
CAMEL = auto() # camelCase (capitalize words except first one, no spaces)
|
|
@@ -35,20 +37,52 @@ class NamingConvention(Enum):
|
|
|
35
37
|
# FUNCTIONS
|
|
36
38
|
|
|
37
39
|
|
|
38
|
-
def transliterate_symbols(s: str) -> str:
|
|
40
|
+
def get_unidecoded_text(input_text: str, allowed_chars: Set[str], apply_lowercase: bool = False) -> str:
    """
    Transliterates a string one character at a time with unidecode, copying any
    character found in ``allowed_chars`` to the output untouched.

    Args:
        input_text: The string to process.
        allowed_chars: Characters to keep exactly as they appear; they are neither
            passed through unidecode nor lowercased.
        apply_lowercase: When True, every character produced by unidecode is
            lowercased. Defaults to False.

    Returns:
        The processed string.
    """
    pieces: list[str] = []
    for ch in input_text:
        if ch in allowed_chars:
            pieces.append(ch)
            continue

        # One input character can decode to several output characters;
        # extending the list with a string adds them one by one.
        replacement = unidecode(ch)
        pieces.extend(map(str.lower, replacement) if apply_lowercase else replacement)

    return "".join(pieces)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def transliterate_symbols(s: str, allowed_symbols_set: Optional[Set[str]] = None) -> str:
|
|
39
68
|
"""
|
|
40
69
|
Translates Unicode symbols (category S*) in the input string to their lowercase Unicode names,
|
|
41
|
-
with spaces replaced by underscores. Other characters remain unchanged.
|
|
70
|
+
with spaces replaced by underscores. Other characters, or characters in allowed_symbols_set, remain unchanged.
|
|
42
71
|
|
|
43
72
|
Args:
|
|
44
73
|
s: The input string.
|
|
74
|
+
allowed_symbols_set: A set of characters to preserve without transliteration.
|
|
45
75
|
|
|
46
76
|
Returns:
|
|
47
|
-
The string with symbols transliterated.
|
|
77
|
+
The string with symbols transliterated or preserved.
|
|
48
78
|
"""
|
|
79
|
+
if allowed_symbols_set is None:
|
|
80
|
+
allowed_symbols_set = set()
|
|
49
81
|
out: list[str] = []
|
|
50
82
|
for c in s:
|
|
51
|
-
if
|
|
83
|
+
if c in allowed_symbols_set:
|
|
84
|
+
out.append(c)
|
|
85
|
+
elif unicodedata.category(c).startswith("S"):
|
|
52
86
|
name = unicodedata.name(c, "")
|
|
53
87
|
if name:
|
|
54
88
|
out.append(name.lower().replace(" ", "_"))
|
|
@@ -58,21 +92,32 @@ def transliterate_symbols(s: str) -> str:
|
|
|
58
92
|
|
|
59
93
|
|
|
60
94
|
def normalize(
|
|
61
|
-
s: Any,
|
|
95
|
+
s: Any,
|
|
96
|
+
mode: NormalizationMode = NormalizationMode.BASIC,
|
|
97
|
+
naming: NamingConvention = NamingConvention.LOWER,
|
|
98
|
+
allowed_symbols: Optional[str] = None,
|
|
62
99
|
) -> str:
|
|
63
100
|
"""
|
|
64
101
|
Normalizes and applies a naming convention to the input.
|
|
65
102
|
|
|
66
103
|
Handles None and NaN values by returning an empty string. Converts non-string inputs to strings.
|
|
67
104
|
|
|
68
|
-
Normalization
|
|
69
|
-
- NONE: Returns the input as a string without any normalization.
|
|
70
|
-
- BASIC: Removes accents
|
|
105
|
+
Normalization (controlled by `mode`) occurs first, followed by naming convention application.
|
|
106
|
+
- NONE: Returns the input as a string without any normalization. Case is preserved.
|
|
107
|
+
- BASIC: Removes accents (via unidecode). Punctuation and spaces typically become single spaces between tokens.
|
|
108
|
+
Case is preserved from the unidecode step by default.
|
|
71
109
|
- SYMBOLS: Translates only Unicode symbols (category S*) to their lowercase Unicode names with underscores.
|
|
72
|
-
|
|
110
|
+
Other characters are preserved, including their case.
|
|
111
|
+
- FULL: Applies unidecode (case-preserved by default) and then SYMBOLS-like transliteration for S* category
|
|
112
|
+
characters not otherwise handled.
|
|
113
|
+
|
|
114
|
+
The `allowed_symbols` parameter can be used to specify characters that should be preserved in their original form
|
|
115
|
+
throughout the normalization process. These characters will not be unidecoded or transliterated by the symbol logic.
|
|
73
116
|
|
|
74
|
-
After normalization, a naming convention
|
|
75
|
-
- NONE: Returns the normalized text.
|
|
117
|
+
After normalization, a naming convention (controlled by `naming`) is applied:
|
|
118
|
+
- NONE: Returns the normalized text, preserving its case from the normalization step.
|
|
119
|
+
- LOWER: Converts the normalized text to lowercase. (Default)
|
|
120
|
+
- UPPER: Converts the normalized text to UPPERCASE.
|
|
76
121
|
- CONSTANT: Converts to CONSTANT_CASE (uppercase with underscores).
|
|
77
122
|
- SNAKE: Converts to snake_case (lowercase with underscores).
|
|
78
123
|
- CAMEL: Converts to camelCase (lowercase first word, capitalize subsequent words, no spaces).
|
|
@@ -83,7 +128,8 @@ def normalize(
|
|
|
83
128
|
Args:
|
|
84
129
|
s: The input value to normalize and format. Can be any type.
|
|
85
130
|
mode: The normalization mode to apply. Defaults to NormalizationMode.BASIC.
|
|
86
|
-
naming: The naming convention to apply. Defaults to NamingConvention.
|
|
131
|
+
naming: The naming convention to apply. Defaults to NamingConvention.LOWER.
|
|
132
|
+
allowed_symbols: A string of characters to preserve during normalization.
|
|
87
133
|
|
|
88
134
|
Returns:
|
|
89
135
|
The normalized and formatted string.
|
|
@@ -94,6 +140,8 @@ def normalize(
|
|
|
94
140
|
if isinstance(naming, str):
|
|
95
141
|
naming = NamingConvention[naming]
|
|
96
142
|
|
|
143
|
+
_allowed_symbols_set: Set[str] = set(allowed_symbols) if allowed_symbols else set()
|
|
144
|
+
|
|
97
145
|
# Handling null values
|
|
98
146
|
if s is None or (isinstance(s, float) and np.isnan(s)):
|
|
99
147
|
normalized = ""
|
|
@@ -101,41 +149,52 @@ def normalize(
|
|
|
101
149
|
return str(s)
|
|
102
150
|
else:
|
|
103
151
|
text = prettify(strip_html(str(s), True))
|
|
152
|
+
|
|
104
153
|
if mode is NormalizationMode.NONE:
|
|
105
154
|
normalized = text
|
|
106
155
|
elif mode is NormalizationMode.SYMBOLS:
|
|
107
|
-
normalized = transliterate_symbols(text)
|
|
156
|
+
normalized = transliterate_symbols(text, _allowed_symbols_set)
|
|
108
157
|
else:
|
|
109
|
-
# BASIC and FULL
|
|
110
|
-
|
|
158
|
+
# BASIC and FULL modes
|
|
159
|
+
intermediate_text = get_unidecoded_text(text, _allowed_symbols_set)
|
|
160
|
+
|
|
161
|
+
# Now, tokenize the intermediate_text for BASIC and FULL
|
|
111
162
|
tokens: list[str] = []
|
|
112
|
-
|
|
163
|
+
current_token_chars: list[str] = []
|
|
113
164
|
|
|
114
|
-
def
|
|
115
|
-
nonlocal
|
|
116
|
-
if
|
|
117
|
-
tokens.append("".join(
|
|
118
|
-
|
|
165
|
+
def flush_current_token():
|
|
166
|
+
nonlocal current_token_chars
|
|
167
|
+
if current_token_chars:
|
|
168
|
+
tokens.append("".join(current_token_chars))
|
|
169
|
+
current_token_chars.clear()
|
|
119
170
|
|
|
120
|
-
for c in
|
|
171
|
+
for c in intermediate_text:
|
|
121
172
|
cat = unicodedata.category(c)
|
|
122
|
-
if c
|
|
123
|
-
|
|
173
|
+
if c in _allowed_symbols_set: # Allowed symbols are part of tokens
|
|
174
|
+
current_token_chars.append(c)
|
|
175
|
+
elif c.isalnum():
|
|
176
|
+
current_token_chars.append(c)
|
|
124
177
|
elif mode is NormalizationMode.FULL and cat.startswith("S"):
|
|
125
|
-
|
|
178
|
+
# Transliterate S* category symbols not in allowed_symbols
|
|
179
|
+
flush_current_token()
|
|
126
180
|
name = unicodedata.name(c, "")
|
|
127
181
|
if name:
|
|
128
182
|
tokens.append(name.lower().replace(" ", "_"))
|
|
129
183
|
elif cat.startswith("P") or c.isspace():
|
|
130
|
-
|
|
131
|
-
|
|
184
|
+
# Punctuation (not allowed) or space acts as a separator
|
|
185
|
+
flush_current_token()
|
|
186
|
+
# Other characters are ignored
|
|
132
187
|
|
|
133
|
-
|
|
188
|
+
flush_current_token()
|
|
134
189
|
normalized = " ".join(tokens)
|
|
135
190
|
|
|
136
191
|
# Apply naming convention
|
|
137
192
|
if naming is NamingConvention.NONE:
|
|
138
193
|
return normalized
|
|
194
|
+
if naming is NamingConvention.LOWER:
|
|
195
|
+
return normalized.lower()
|
|
196
|
+
if naming is NamingConvention.UPPER:
|
|
197
|
+
return normalized.upper()
|
|
139
198
|
if naming is NamingConvention.PARAM:
|
|
140
199
|
return parameterize(normalized)
|
|
141
200
|
if naming is NamingConvention.TITLE:
|
|
@@ -17,11 +17,11 @@ datamarket/utils/main.py,sha256=j8wnAxeLvijdRU9M4V6HunWH7vgWWHP4u4xamzkWcUU,7009
|
|
|
17
17
|
datamarket/utils/selenium.py,sha256=IMKlbLzXABFhACnWzhHmB0l2hhVzNwHGZwbo14nEewQ,2499
|
|
18
18
|
datamarket/utils/soda.py,sha256=eZTXFbI1P3WoMd1MM-YjoVTpdjTcDSWuvBb7ViBMhSQ,941
|
|
19
19
|
datamarket/utils/strings/__init__.py,sha256=RmyN3hKGXmUym8w5tn28yWkw2uM-b5OvntB4D0lU1eo,84
|
|
20
|
-
datamarket/utils/strings/normalization.py,sha256=
|
|
20
|
+
datamarket/utils/strings/normalization.py,sha256=z2dDXFVQ-nVqPDRR1T4HWmELpJuXUVt_P6leHqPVheY,8666
|
|
21
21
|
datamarket/utils/strings/obfuscation.py,sha256=8gMepfjPq0N4_IpKR6i2dy_9VJugQ3qJiRiRvKavB3s,5246
|
|
22
22
|
datamarket/utils/typer.py,sha256=FDF3l6gh3UlAFPsHCtesnekvct2rKz0oFn3uKARBQvE,814
|
|
23
23
|
datamarket/utils/types.py,sha256=vxdQZdwdXrfPR4Es52gBgol-tMRIOD6oK9cBo3rB0JQ,74
|
|
24
|
-
datamarket-0.9.
|
|
25
|
-
datamarket-0.9.
|
|
26
|
-
datamarket-0.9.
|
|
27
|
-
datamarket-0.9.
|
|
24
|
+
datamarket-0.9.31.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
25
|
+
datamarket-0.9.31.dist-info/METADATA,sha256=bM8NgTxQZXA9bSE5vsraTMx_TZTuSsC9VuTU1fyhalQ,6871
|
|
26
|
+
datamarket-0.9.31.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
|
27
|
+
datamarket-0.9.31.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|