datamarket 0.9.29__tar.gz → 0.9.31__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.
Files changed (28)
  1. {datamarket-0.9.29 → datamarket-0.9.31}/PKG-INFO +10 -5
  2. {datamarket-0.9.29 → datamarket-0.9.31}/pyproject.toml +11 -2
  3. datamarket-0.9.31/src/datamarket/utils/strings/__init__.py +2 -0
  4. datamarket-0.9.31/src/datamarket/utils/strings/normalization.py +211 -0
  5. datamarket-0.9.31/src/datamarket/utils/strings/obfuscation.py +153 -0
  6. datamarket-0.9.29/src/datamarket/utils/strings.py +0 -152
  7. {datamarket-0.9.29 → datamarket-0.9.31}/LICENSE +0 -0
  8. {datamarket-0.9.29 → datamarket-0.9.31}/README.md +0 -0
  9. {datamarket-0.9.29 → datamarket-0.9.31}/src/datamarket/__init__.py +0 -0
  10. {datamarket-0.9.29 → datamarket-0.9.31}/src/datamarket/interfaces/__init__.py +0 -0
  11. {datamarket-0.9.29 → datamarket-0.9.31}/src/datamarket/interfaces/alchemy.py +0 -0
  12. {datamarket-0.9.29 → datamarket-0.9.31}/src/datamarket/interfaces/aws.py +0 -0
  13. {datamarket-0.9.29 → datamarket-0.9.31}/src/datamarket/interfaces/drive.py +0 -0
  14. {datamarket-0.9.29 → datamarket-0.9.31}/src/datamarket/interfaces/ftp.py +0 -0
  15. {datamarket-0.9.29 → datamarket-0.9.31}/src/datamarket/interfaces/nominatim.py +0 -0
  16. {datamarket-0.9.29 → datamarket-0.9.31}/src/datamarket/interfaces/peerdb.py +0 -0
  17. {datamarket-0.9.29 → datamarket-0.9.31}/src/datamarket/interfaces/proxy.py +0 -0
  18. {datamarket-0.9.29 → datamarket-0.9.31}/src/datamarket/interfaces/tinybird.py +0 -0
  19. {datamarket-0.9.29 → datamarket-0.9.31}/src/datamarket/params/__init__.py +0 -0
  20. {datamarket-0.9.29 → datamarket-0.9.31}/src/datamarket/params/nominatim.py +0 -0
  21. {datamarket-0.9.29 → datamarket-0.9.31}/src/datamarket/utils/__init__.py +0 -0
  22. {datamarket-0.9.29 → datamarket-0.9.31}/src/datamarket/utils/airflow.py +0 -0
  23. {datamarket-0.9.29 → datamarket-0.9.31}/src/datamarket/utils/alchemy.py +0 -0
  24. {datamarket-0.9.29 → datamarket-0.9.31}/src/datamarket/utils/main.py +0 -0
  25. {datamarket-0.9.29 → datamarket-0.9.31}/src/datamarket/utils/selenium.py +0 -0
  26. {datamarket-0.9.29 → datamarket-0.9.31}/src/datamarket/utils/soda.py +0 -0
  27. {datamarket-0.9.29 → datamarket-0.9.31}/src/datamarket/utils/typer.py +0 -0
  28. {datamarket-0.9.29 → datamarket-0.9.31}/src/datamarket/utils/types.py +0 -0
{datamarket-0.9.29 → datamarket-0.9.31}/PKG-INFO
@@ -1,17 +1,17 @@
- Metadata-Version: 2.3
+ Metadata-Version: 2.1
  Name: datamarket
- Version: 0.9.29
+ Version: 0.9.31
  Summary: Utilities that integrate advanced scraping knowledge into just one library.
+ Home-page: https://datamarket.es
  License: GPL-3.0-or-later
  Author: DataMarket
  Author-email: techsupport@datamarket.es
- Requires-Python: >=3.12,<4.0
+ Requires-Python: >=3.12,<3.13
  Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
  Classifier: License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)
  Classifier: Operating System :: OS Independent
  Classifier: Programming Language :: Python :: 3
  Classifier: Programming Language :: Python :: 3.12
- Classifier: Programming Language :: Python :: 3.13
  Provides-Extra: alchemy
  Provides-Extra: aws
  Provides-Extra: azure-storage-blob
@@ -40,6 +40,7 @@ Provides-Extra: openpyxl
  Provides-Extra: pandas
  Provides-Extra: pandera
  Provides-Extra: peerdb
+ Provides-Extra: pii
  Provides-Extra: pillow
  Provides-Extra: playwright
  Provides-Extra: playwright-stealth
@@ -96,6 +97,8 @@ Requires-Dist: pendulum (>=3.0.0,<4.0.0)
  Requires-Dist: pillow (>=11.0.0,<12.0.0) ; extra == "pillow"
  Requires-Dist: playwright (==1.47.0) ; extra == "playwright"
  Requires-Dist: pre-commit (>=4.0.0,<5.0.0)
+ Requires-Dist: presidio-analyzer[phonenumbers] (>=2.0.0,<3.0.0) ; extra == "pii"
+ Requires-Dist: presidio-anonymizer (>=2.0.0,<3.0.0) ; extra == "pii"
  Requires-Dist: psycopg2-binary (>=2.0.0,<3.0.0)
  Requires-Dist: pyarrow (>=19.0.0,<20.0.0) ; extra == "pyarrow"
  Requires-Dist: pycountry (>=24.0.0,<25.0.0)
@@ -111,14 +114,16 @@ Requires-Dist: retry (>=0.9.0,<0.10.0) ; extra == "retry"
  Requires-Dist: shapely (>=2.0.0,<3.0.0) ; extra == "shapely"
  Requires-Dist: soda-core-mysql (>=3.0.0,<4.0.0) ; extra == "soda-core-mysql"
  Requires-Dist: soda-core-postgres (>=3.0.0,<4.0.0) ; extra == "soda-core-postgres"
+ Requires-Dist: spacy (>=3.0.0,<4.0.0) ; extra == "pii"
+ Requires-Dist: spacy-langdetect (>=0.1.0,<0.2.0) ; extra == "pii"
  Requires-Dist: stem (>=1.0.0,<2.0.0) ; extra == "stem" or extra == "proxy"
  Requires-Dist: tenacity (>=9.0.0,<10.0.0)
+ Requires-Dist: tf-playwright-stealth (>=1.0.0,<2.0.0)
  Requires-Dist: tqdm (>=4.0.0,<5.0.0) ; extra == "tqdm"
  Requires-Dist: typer (>=0.15.0,<0.16.0)
  Requires-Dist: unidecode (>=1.0.0,<2.0.0)
  Requires-Dist: xmltodict (>=0.14.0,<0.15.0) ; extra == "xmltodict"
  Project-URL: Documentation, https://github.com/Data-Market/datamarket
- Project-URL: Homepage, https://datamarket.es
  Project-URL: Repository, https://github.com/Data-Market/datamarket
  Description-Content-Type: text/markdown
{datamarket-0.9.29 → datamarket-0.9.31}/pyproject.toml
@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "datamarket"
- version = "0.9.29"
+ version = "0.9.31"
  description = "Utilities that integrate advanced scraping knowledge into just one library."
  authors = ["DataMarket <techsupport@datamarket.es>"]
  license = "GPL-3.0-or-later"
@@ -15,7 +15,7 @@ classifiers = [
  ]

  [tool.poetry.dependencies]
- python = "^3.12"
+ python = ">=3.12,<3.13"
  typer = "~0.15.0"
  psycopg2-binary = "^2.0.0"
  requests = "^2.0.0"
@@ -76,6 +76,12 @@ openpyxl = { version = "^3.0.0", optional = true }
  httpx = { extras = ["http2"], version = "~0.28.0", optional = true }
  SQLAlchemy = { version = "^2.0.0", optional = true }
  camoufox = { extras = ["geoip"], version = "~0.4.11", optional = true }
+ presidio-analyzer = { version = "^2.0.0", optional = true, extras = [
+     "phonenumbers",
+ ] }
+ presidio-anonymizer = { version = "^2.0.0", optional = true }
+ spacy = { version = "^3.0.0", optional = true }
+ spacy-langdetect = { version = "~0.1.0", optional = true }

  [tool.poetry.extras]
  boto3 = ["boto3"]
@@ -128,6 +134,9 @@ peerdb = ["boto3", "clickhouse-driver"]
  proxy = ["stem"]
  alchemy = ["SQLAlchemy"]

+ # Other groups
+ pii = ["presidio-analyzer", "presidio-anonymizer", "spacy", "spacy-langdetect"]
+

  [build-system]
  requires = ["poetry-core>=1.0.0"]
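
Note for consumers of the new `pii` extra defined above: the Presidio and spaCy dependencies are pulled in only when the extra is requested, and the spaCy language models are a separate download. A minimal setup, using the same commands the new obfuscation module prints in its own error messages:

pip install "datamarket[pii]"
python -m spacy download en_core_web_md
python -m spacy download es_core_news_md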
datamarket-0.9.31/src/datamarket/utils/strings/__init__.py
@@ -0,0 +1,2 @@
+ from .normalization import *  # noqa: F403
+ from .obfuscation import *  # noqa: F403
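
Because the new package `__init__.py` star-imports both submodules, the public import path survives the `strings.py` → `strings/` split. A sketch of an unchanged call site follows; note that, as written, `obfuscation.py` raises `PiiDependenciesMissingError` at import time when the Presidio/spaCy dependencies are absent, so even this import appears to require the `pii` extra:

# Hypothetical downstream usage; the names come from the star-imported submodules.
from datamarket.utils.strings import NamingConvention, NormalizationMode, PiiAnonymizer, normalize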
datamarket-0.9.31/src/datamarket/utils/strings/normalization.py
@@ -0,0 +1,211 @@
+ ########################################################################################################################
+ # IMPORTS
+
+ import unicodedata
+ from enum import Enum, auto
+ from typing import Any, Optional, Set
+
+ import numpy as np
+ from inflection import camelize, parameterize, titleize, underscore
+ from string_utils import prettify, strip_html
+ from unidecode import unidecode
+
+ ########################################################################################################################
+ # CLASSES
+
+
+ class NormalizationMode(Enum):
+     NONE = auto()
+     BASIC = auto()  # removes accents and converts punctuation to spaces
+     SYMBOLS = auto()  # translates only symbols to Unicode name
+     FULL = auto()  # BASIC + SYMBOLS
+
+
+ class NamingConvention(Enum):
+     NONE = auto()  # no style change
+     LOWER = auto()  # lowercase
+     UPPER = auto()  # UPPERCASE
+     CONSTANT = auto()  # CONSTANT_CASE (uppercase, underscores)
+     SNAKE = auto()  # snake_case (lowercase, underscores)
+     CAMEL = auto()  # camelCase (capitalize words except first one, no spaces)
+     PASCAL = auto()  # PascalCase (capitalize words including first one, no spaces)
+     PARAM = auto()  # parameterize (hyphens)
+     TITLE = auto()  # titleize (capitalize words)
+
+
+ ########################################################################################################################
+ # FUNCTIONS
+
+
+ def get_unidecoded_text(input_text: str, allowed_chars: Set[str], apply_lowercase: bool = False) -> str:
+     """
+     Processes a string by unidecoding characters, optionally lowercasing them,
+     while preserving a specified set of allowed characters.
+
+     Args:
+         input_text: The string to process.
+         allowed_chars: A set of characters to preserve in their original form.
+         apply_lowercase: Whether to convert unidecoded characters to lowercase. Defaults to False.
+
+     Returns:
+         The processed string.
+     """
+     chars_list: list[str] = []
+     for char_original in input_text:
+         if char_original in allowed_chars:
+             chars_list.append(char_original)
+         else:
+             decoded_segment = unidecode(char_original)
+             for dc in decoded_segment:  # unidecode can return multiple chars
+                 if apply_lowercase:
+                     chars_list.append(dc.lower())
+                 else:
+                     chars_list.append(dc)
+     return "".join(chars_list)
+
+
+ def transliterate_symbols(s: str, allowed_symbols_set: Optional[Set[str]] = None) -> str:
+     """
+     Translates Unicode symbols (category S*) in the input string to their lowercase Unicode names,
+     with spaces replaced by underscores. Other characters, or characters in allowed_symbols_set, remain unchanged.
+
+     Args:
+         s: The input string.
+         allowed_symbols_set: A set of characters to preserve without transliteration.
+
+     Returns:
+         The string with symbols transliterated or preserved.
+     """
+     if allowed_symbols_set is None:
+         allowed_symbols_set = set()
+     out: list[str] = []
+     for c in s:
+         if c in allowed_symbols_set:
+             out.append(c)
+         elif unicodedata.category(c).startswith("S"):
+             name = unicodedata.name(c, "")
+             if name:
+                 out.append(name.lower().replace(" ", "_"))
+         else:
+             out.append(c)
+     return "".join(out)
+
+
+ def normalize(
+     s: Any,
+     mode: NormalizationMode = NormalizationMode.BASIC,
+     naming: NamingConvention = NamingConvention.LOWER,
+     allowed_symbols: Optional[str] = None,
+ ) -> str:
+     """
+     Normalizes and applies a naming convention to the input.
+
+     Handles None and NaN values by returning an empty string. Converts non-string inputs to strings.
+
+     Normalization (controlled by `mode`) occurs first, followed by naming convention application.
+     - NONE: Returns the input as a string without any normalization. Case is preserved.
+     - BASIC: Removes accents (via unidecode). Punctuation and spaces typically become single spaces between tokens.
+       Case is preserved from the unidecode step by default.
+     - SYMBOLS: Translates only Unicode symbols (category S*) to their lowercase Unicode names with underscores.
+       Other characters are preserved, including their case.
+     - FULL: Applies unidecode (case-preserved by default) and then SYMBOLS-like transliteration for S* category
+       characters not otherwise handled.
+
+     The `allowed_symbols` parameter can be used to specify characters that should be preserved in their original form
+     throughout the normalization process. These characters will not be unidecoded or transliterated by the symbol logic.
+
+     After normalization, a naming convention (controlled by `naming`) is applied:
+     - NONE: Returns the normalized text, preserving its case from the normalization step.
+     - LOWER: Converts the normalized text to lowercase. (Default)
+     - UPPER: Converts the normalized text to UPPERCASE.
+     - CONSTANT: Converts to CONSTANT_CASE (uppercase with underscores).
+     - SNAKE: Converts to snake_case (lowercase with underscores).
+     - CAMEL: Converts to camelCase (lowercase first word, capitalize subsequent words, no spaces).
+     - PASCAL: Converts to PascalCase (capitalize all words, no spaces).
+     - PARAM: Converts to parameterize (lowercase with hyphens).
+     - TITLE: Converts to Title Case (capitalize each word).
+
+     Args:
+         s: The input value to normalize and format. Can be any type.
+         mode: The normalization mode to apply. Defaults to NormalizationMode.BASIC.
+         naming: The naming convention to apply. Defaults to NamingConvention.LOWER.
+         allowed_symbols: A string of characters to preserve during normalization.
+
+     Returns:
+         The normalized and formatted string.
+     """
+     # Parameter mapping
+     if isinstance(mode, str):
+         mode = NormalizationMode[mode]
+     if isinstance(naming, str):
+         naming = NamingConvention[naming]
+
+     _allowed_symbols_set: Set[str] = set(allowed_symbols) if allowed_symbols else set()
+
+     # Handling null values
+     if s is None or (isinstance(s, float) and np.isnan(s)):
+         normalized = ""
+     elif not isinstance(s, str):
+         return str(s)
+     else:
+         text = prettify(strip_html(str(s), True))
+
+         if mode is NormalizationMode.NONE:
+             normalized = text
+         elif mode is NormalizationMode.SYMBOLS:
+             normalized = transliterate_symbols(text, _allowed_symbols_set)
+         else:
+             # BASIC and FULL modes
+             intermediate_text = get_unidecoded_text(text, _allowed_symbols_set)
+
+             # Now, tokenize the intermediate_text for BASIC and FULL
+             tokens: list[str] = []
+             current_token_chars: list[str] = []
+
+             def flush_current_token():
+                 nonlocal current_token_chars
+                 if current_token_chars:
+                     tokens.append("".join(current_token_chars))
+                     current_token_chars.clear()
+
+             for c in intermediate_text:
+                 cat = unicodedata.category(c)
+                 if c in _allowed_symbols_set:  # Allowed symbols are part of tokens
+                     current_token_chars.append(c)
+                 elif c.isalnum():
+                     current_token_chars.append(c)
+                 elif mode is NormalizationMode.FULL and cat.startswith("S"):
+                     # Transliterate S* category symbols not in allowed_symbols
+                     flush_current_token()
+                     name = unicodedata.name(c, "")
+                     if name:
+                         tokens.append(name.lower().replace(" ", "_"))
+                 elif cat.startswith("P") or c.isspace():
+                     # Punctuation (not allowed) or space acts as a separator
+                     flush_current_token()
+                 # Other characters are ignored
+
+             flush_current_token()
+             normalized = " ".join(tokens)
+
+     # Apply naming convention
+     if naming is NamingConvention.NONE:
+         return normalized
+     if naming is NamingConvention.LOWER:
+         return normalized.lower()
+     if naming is NamingConvention.UPPER:
+         return normalized.upper()
+     if naming is NamingConvention.PARAM:
+         return parameterize(normalized)
+     if naming is NamingConvention.TITLE:
+         return titleize(normalized)
+
+     underscored = underscore(parameterize(normalized))
+     if naming is NamingConvention.CONSTANT:
+         return underscored.upper()
+     if naming is NamingConvention.CAMEL:
+         return camelize(underscored, False)
+     if naming is NamingConvention.PASCAL:
+         return camelize(underscored)
+
+     return underscored
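
To make the new knobs concrete, here is a hedged usage sketch of the reworked normalize; inputs also pass through string_utils.prettify, so the commented results are illustrative rather than guaranteed:

from datamarket.utils.strings import NamingConvention, NormalizationMode, normalize

# BASIC now keeps case through normalization; LOWER is the new default naming.
normalize("Crème Brûlée!")                                # -> "creme brulee"
normalize("Crème Brûlée!", naming=NamingConvention.NONE)  # -> "Creme Brulee"

# FULL additionally transliterates S*-category symbols to their Unicode names.
normalize("A+B", mode=NormalizationMode.FULL, naming=NamingConvention.SNAKE)  # -> "a_plus_sign_b"

# allowed_symbols shields characters from unidecoding and transliteration.
normalize("A+B", mode=NormalizationMode.FULL, allowed_symbols="+")            # -> "a+b"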
datamarket-0.9.31/src/datamarket/utils/strings/obfuscation.py
@@ -0,0 +1,153 @@
+ ########################################################################################################################
+ # IMPORTS
+
+ import logging
+ import warnings
+ from typing import List, Optional
+
+
+ class PiiDependenciesMissingError(ImportError):
+     pass
+
+
+ class SpacyModelNotFoundError(ImportError):
+     pass
+
+
+ try:
+     import phonenumbers
+     import spacy
+     from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
+     from presidio_analyzer.nlp_engine import NlpEngineProvider
+     from presidio_analyzer.predefined_recognizers import PhoneRecognizer
+     from presidio_anonymizer import AnonymizerEngine
+     from spacy.language import Language
+     from spacy_langdetect import LanguageDetector
+ except ImportError as e:
+     raise PiiDependenciesMissingError(
+         "One or more PII anonymization dependencies are missing. "
+         "Please install them by running: pip install datamarket[pii]\n"
+         f"Original error: {e}"
+     ) from e
+
+
+ ########################################################################################################################
+ # SETTINGS
+
+ logger = logging.getLogger()
+ logging.getLogger("presidio-analyzer").setLevel(logging.ERROR)
+
+ warnings.filterwarnings(
+     "ignore",
+     message=r"\[W108\]",
+     category=UserWarning,
+     module="spacy.pipeline.lemmatizer",
+ )
+
+
+ @Language.factory("language_detector")
+ def get_lang_detector(nlp, name):
+     return LanguageDetector()
+
+
+ ########################################################################################################################
+ # CLASSES
+
+
+ class PiiAnonymizer:
+     SUPPORTED_LANG = ["es", "en"]
+
+     def __init__(self):
+         # Check for required spaCy models
+         required_models = {
+             "en_core_web_md": "python -m spacy download en_core_web_md",
+             "es_core_news_md": "python -m spacy download es_core_news_md",
+         }
+         missing_models_instructions = []
+         for model_name, install_command in required_models.items():
+             if not spacy.util.is_package(model_name):
+                 missing_models_instructions.append(
+                     f"Model '{model_name}' not found. Please install it by running: {install_command}"
+                 )
+
+         if missing_models_instructions:
+             raise SpacyModelNotFoundError("\n".join(missing_models_instructions))
+
+         self.anonymizer = AnonymizerEngine()
+         self.analyzer = self._load_analyzer_engine()
+
+         self.nlp = self._nlp()
+
+     def _nlp(self) -> Language:
+         analyzer_en_model = self.analyzer.nlp_engine.nlp.get("en")
+         shared_vocab = analyzer_en_model.vocab
+         nlp = spacy.blank("en", vocab=shared_vocab)
+
+         if nlp.has_factory("sentencizer"):
+             nlp.add_pipe("sentencizer")
+
+         if nlp.has_factory("language_detector"):
+             nlp.add_pipe("language_detector", last=True)
+
+         return nlp
+
+     @staticmethod
+     def _nlp_config():
+         return {
+             "nlp_engine_name": "spacy",
+             "models": [
+                 {"lang_code": "es", "model_name": "es_core_news_md"},
+                 {"lang_code": "en", "model_name": "en_core_web_md"},
+             ],
+         }
+
+     def _load_analyzer_engine(self) -> AnalyzerEngine:
+         provider = NlpEngineProvider(nlp_configuration=PiiAnonymizer._nlp_config())
+         nlp_engine = provider.create_engine()
+         phone_recognizer_es = PhoneRecognizer(
+             supported_language="es",
+             supported_regions=phonenumbers.SUPPORTED_REGIONS,
+             context=["teléfono", "móvil", "número"],
+         )
+         registry = RecognizerRegistry(supported_languages=self.SUPPORTED_LANG)
+         registry.load_predefined_recognizers(nlp_engine=nlp_engine, languages=self.SUPPORTED_LANG)
+         registry.add_recognizer(phone_recognizer_es)
+
+         analyzer = AnalyzerEngine(
+             registry=registry,
+             nlp_engine=nlp_engine,
+             supported_languages=self.SUPPORTED_LANG,
+         )
+         return analyzer
+
+     def detect_lang(self, text: str) -> str:
+         if hasattr(self, "nlp") and self.nlp:
+             with self.nlp.select_pipes(enable=["tokenizer", "sentencizer", "language_detector"]):
+                 doc = self.nlp(text)
+             return doc._.language["language"]
+         else:
+             logger.error("Language detection NLP model not initialized. Cannot detect language.")
+             return "unknown"
+
+     def anonymize_text(
+         self,
+         text: str,
+         entities: Optional[List[str]] = None,
+         lang: str = "unknown",
+     ) -> str:
+         if lang == "unknown":
+             lang = self.detect_lang(text)
+             if lang not in self.SUPPORTED_LANG:
+                 logger.warning(f"Support for language {lang} is not implemented yet! Fail safe to empty string.")
+                 return ""
+         elif lang not in self.SUPPORTED_LANG:
+             logger.warning(f"Support for language {lang} is not implemented yet! Fail safe to empty string.")
+             return ""
+
+         analyzer_result = self.analyzer.analyze(
+             text=text,
+             entities=entities,
+             language=lang,
+         )
+         anonymizer_result = self.anonymizer.anonymize(text=text, analyzer_results=analyzer_result)
+         return anonymizer_result.text
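
A short usage sketch of the new anonymizer, assuming the `pii` extra and both spaCy models are installed; Presidio's default anonymizer replaces each detected span with a placeholder such as <PERSON>, so the printed output is illustrative:

from datamarket.utils.strings import PiiAnonymizer

anonymizer = PiiAnonymizer()  # raises SpacyModelNotFoundError if a model is missing

# With lang left as "unknown", the language is auto-detected; unsupported
# languages fail safe to an empty string rather than risk leaking PII.
redacted = anonymizer.anonymize_text("Soy Juan, llámame al 612 345 678.")
print(redacted)  # e.g. "Soy <PERSON>, llámame al <PHONE_NUMBER>."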
datamarket-0.9.29/src/datamarket/utils/strings.py
@@ -1,152 +0,0 @@
- ########################################################################################################################
- # IMPORTS
-
- from enum import Enum, auto
- from typing import Any
- import unicodedata
-
- import numpy as np
- from unidecode import unidecode
- from inflection import parameterize, underscore, titleize, camelize
- from string_utils import prettify, strip_html
-
- ########################################################################################################################
- # CLASSES
-
-
- class NormalizationMode(Enum):
-     NONE = auto()
-     BASIC = auto()  # removes accents and converts punctuation to spaces
-     SYMBOLS = auto()  # translates only symbols to Unicode name
-     FULL = auto()  # BASIC + SYMBOLS
-
-
- class NamingConvention(Enum):
-     NONE = auto()  # no style change
-     CONSTANT = auto()  # CONSTANT_CASE (uppercase, underscores)
-     SNAKE = auto()  # snake_case (lowercase, underscores)
-     CAMEL = auto()  # camelCase (capitalize words except first one, no spaces)
-     PASCAL = auto()  # PascalCase (capitalize words including first one, no spaces)
-     PARAM = auto()  # parameterize (hyphens)
-     TITLE = auto()  # titleize (capitalize words)
-
-
- ########################################################################################################################
- # FUNCTIONS
-
-
- def transliterate_symbols(s: str) -> str:
-     """
-     Translates Unicode symbols (category S*) in the input string to their lowercase Unicode names,
-     with spaces replaced by underscores. Other characters remain unchanged.
-
-     Args:
-         s: The input string.
-
-     Returns:
-         The string with symbols transliterated.
-     """
-     out: list[str] = []
-     for c in s:
-         if unicodedata.category(c).startswith("S"):
-             name = unicodedata.name(c, "")
-             if name:
-                 out.append(name.lower().replace(" ", "_"))
-         else:
-             out.append(c)
-     return "".join(out)
-
-
- def normalize(
-     s: Any, mode: NormalizationMode = NormalizationMode.BASIC, naming: NamingConvention = NamingConvention.NONE
- ) -> str:
-     """
-     Normalizes and applies a naming convention to the input.
-
-     Handles None and NaN values by returning an empty string. Converts non-string inputs to strings.
-
-     Normalization is applied according to `mode`:
-     - NONE: Returns the input as a string without any normalization.
-     - BASIC: Removes accents, converts punctuation and spaces to single spaces, and preserves alphanumeric characters.
-     - SYMBOLS: Translates only Unicode symbols (category S*) to their lowercase Unicode names with underscores.
-     - FULL: Applies both BASIC and SYMBOLS normalization.
-
-     After normalization, a naming convention is applied according to `naming`:
-     - NONE: Returns the normalized text.
-     - CONSTANT: Converts to CONSTANT_CASE (uppercase with underscores).
-     - SNAKE: Converts to snake_case (lowercase with underscores).
-     - CAMEL: Converts to camelCase (lowercase first word, capitalize subsequent words, no spaces).
-     - PASCAL: Converts to PascalCase (capitalize all words, no spaces).
-     - PARAM: Converts to parameterize (lowercase with hyphens).
-     - TITLE: Converts to Title Case (capitalize each word).
-
-     Args:
-         s: The input value to normalize and format. Can be any type.
-         mode: The normalization mode to apply. Defaults to NormalizationMode.BASIC.
-         naming: The naming convention to apply. Defaults to NamingConvention.NONE.
-
-     Returns:
-         The normalized and formatted string.
-     """
-     # Parameter mapping
-     if isinstance(mode, str):
-         mode = NormalizationMode[mode]
-     if isinstance(naming, str):
-         naming = NamingConvention[naming]
-
-     # Handling null values
-     if s is None or (isinstance(s, float) and np.isnan(s)):
-         normalized = ""
-     elif not isinstance(s, str):
-         return str(s)
-     else:
-         text = prettify(strip_html(str(s), True))
-         if mode is NormalizationMode.NONE:
-             normalized = text
-         elif mode is NormalizationMode.SYMBOLS:
-             normalized = transliterate_symbols(text)
-         else:
-             # BASIC and FULL: remove accents and lowercase
-             normalized = unidecode(text).lower()
-             tokens: list[str] = []
-             current: list[str] = []
-
-             def flush_current():
-                 nonlocal current
-                 if current:
-                     tokens.append("".join(current))
-                     current.clear()
-
-             for c in normalized:
-                 cat = unicodedata.category(c)
-                 if c.isalnum():
-                     current.append(c)
-                 elif mode is NormalizationMode.FULL and cat.startswith("S"):
-                     flush_current()
-                     name = unicodedata.name(c, "")
-                     if name:
-                         tokens.append(name.lower().replace(" ", "_"))
-                 elif cat.startswith("P") or c.isspace():
-                     flush_current()
-                 # other characters ignored
-
-             flush_current()
-             normalized = " ".join(tokens)
-
-     # Apply naming convention
-     if naming is NamingConvention.NONE:
-         return normalized
-     if naming is NamingConvention.PARAM:
-         return parameterize(normalized)
-     if naming is NamingConvention.TITLE:
-         return titleize(normalized)
-
-     underscored = underscore(parameterize(normalized))
-     if naming is NamingConvention.CONSTANT:
-         return underscored.upper()
-     if naming is NamingConvention.CAMEL:
-         return camelize(underscored, False)
-     if naming is NamingConvention.PASCAL:
-         return camelize(underscored)
-
-     return underscored
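
One behavioral difference between this deleted module and its replacement is worth spelling out: 0.9.29 lowercased inside BASIC/FULL normalization itself and defaulted `naming` to NONE, while 0.9.31 preserves case through normalization and defaults `naming` to the new LOWER convention. Default outputs should therefore match, but an explicit `naming=NamingConvention.NONE` now differs (illustrative sketch, outputs traced from the two implementations above):

normalize("Fútbol Club")                                # both versions -> "futbol club"
normalize("Fútbol Club", naming=NamingConvention.NONE)  # 0.9.29 -> "futbol club"; 0.9.31 -> "Futbol Club"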