datamarket 0.9.25__py3-none-any.whl → 0.9.27__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datamarket might be problematic. Click here for more details.

@@ -0,0 +1,152 @@
1
+ ########################################################################################################################
2
+ # IMPORTS
3
+
4
+ from enum import Enum, auto
5
+ from typing import Any
6
+ import unicodedata
7
+
8
+ import numpy as np
9
+ from unidecode import unidecode
10
+ from inflection import parameterize, underscore, titleize, camelize
11
+ from string_utils import prettify, strip_html
12
+
13
+ ########################################################################################################################
14
+ # CLASSES
15
+
16
+
17
class NormalizationMode(Enum):
    """
    Character-level normalization strategies.

    The numeric values are the ones `auto()` would assign in declaration order.
    """

    # Leave the text untouched.
    NONE = 1
    # Removes accents and converts punctuation to spaces.
    BASIC = 2
    # Translates only symbols to their Unicode name.
    SYMBOLS = 3
    # BASIC + SYMBOLS.
    FULL = 4
22
+
23
+
24
class NamingConvention(Enum):
    """
    Naming styles that can be applied to a normalized string.

    The numeric values are the ones `auto()` would assign in declaration order.
    """

    # No style change.
    NONE = 1
    # CONSTANT_CASE (uppercase, underscores).
    CONSTANT = 2
    # snake_case (lowercase, underscores).
    SNAKE = 3
    # camelCase (capitalize words except first one, no spaces).
    CAMEL = 4
    # PascalCase (capitalize words including first one, no spaces).
    PASCAL = 5
    # parameterize (hyphens).
    PARAM = 6
    # titleize (capitalize words).
    TITLE = 7
32
+
33
+
34
+ ########################################################################################################################
35
+ # FUNCTIONS
36
+
37
+
38
def transliterate_symbols(s: str) -> str:
    """
    Replaces every Unicode symbol (general category S*) in `s` with its Unicode name,
    lowercased and with spaces turned into underscores. Non-symbol characters are
    copied through unchanged; a symbol for which no name is available is dropped.

    Args:
        s: The input string.

    Returns:
        The string with symbols transliterated.
    """

    def render(char: str) -> str:
        # Non-symbols pass through verbatim.
        if not unicodedata.category(char).startswith("S"):
            return char
        # An unnamed symbol yields "", which removes it from the joined output.
        return unicodedata.name(char, "").lower().replace(" ", "_")

    return "".join(render(char) for char in s)
58
+
59
+
60
def normalize(
    s: Any, mode: NormalizationMode = NormalizationMode.BASIC, naming: NamingConvention = NamingConvention.NONE
) -> str:
    """
    Normalizes and applies a naming convention to the input.

    Handles None and NaN values (Python floats and NumPy floating scalars) by treating them as an empty string.
    Any other non-string input is returned as `str(s)` directly: neither normalization nor the naming
    convention is applied to it.

    String inputs are first passed through `strip_html` and `prettify`, then normalized according to `mode`:
    - NONE: No further normalization.
    - BASIC: Removes accents (via `unidecode`), lowercases, and splits on punctuation and whitespace; the
      resulting alphanumeric tokens are joined with single spaces. Any other non-alphanumeric character
      (symbols included) is dropped without splitting the surrounding token.
    - SYMBOLS: Translates only Unicode symbols (category S*) to their lowercase Unicode names with underscores.
    - FULL: Same as BASIC, except that each symbol (category S*) ends the current token and is emitted as its own
      token holding its lowercase Unicode name with underscores.

    After normalization, a naming convention is applied according to `naming`:
    - NONE: Returns the normalized text.
    - CONSTANT: Converts to CONSTANT_CASE (uppercase with underscores).
    - SNAKE: Converts to snake_case (lowercase with underscores).
    - CAMEL: Converts to camelCase (lowercase first word, capitalize subsequent words, no spaces).
    - PASCAL: Converts to PascalCase (capitalize all words, no spaces).
    - PARAM: Converts to parameterize (lowercase with hyphens).
    - TITLE: Converts to Title Case (capitalize each word).

    Args:
        s: The input value to normalize and format. Can be any type.
        mode: The normalization mode to apply, either a NormalizationMode member or its name as a string
            (case-insensitive). Defaults to NormalizationMode.BASIC.
        naming: The naming convention to apply, either a NamingConvention member or its name as a string
            (case-insensitive). Defaults to NamingConvention.NONE.

    Returns:
        The normalized and formatted string.

    Raises:
        KeyError: If `mode` or `naming` is a string that does not name a member of the corresponding enum.
    """
    # Parameter mapping: member names are all uppercase, so upper-casing the key
    # makes the by-name lookup case-insensitive ("basic" == "BASIC").
    if isinstance(mode, str):
        mode = NormalizationMode[mode.upper()]
    if isinstance(naming, str):
        naming = NamingConvention[naming.upper()]

    # Handling null values. np.floating is checked explicitly because not every
    # NumPy float scalar type (e.g. np.float32) subclasses the builtin float.
    if s is None or (isinstance(s, (float, np.floating)) and np.isnan(s)):
        normalized = ""
    elif not isinstance(s, str):
        # Non-string values bypass normalization and naming entirely.
        return str(s)
    else:
        text = prettify(strip_html(str(s), True))
        if mode is NormalizationMode.NONE:
            normalized = text
        elif mode is NormalizationMode.SYMBOLS:
            normalized = transliterate_symbols(text)
        else:
            # BASIC and FULL: remove accents and lowercase
            # NOTE(review): unidecode runs before the symbol-naming step below, so any symbol it rewrites
            # to ASCII text presumably never reaches that step in FULL mode -- confirm this is intended.
            normalized = unidecode(text).lower()
            tokens: list[str] = []
            current: list[str] = []

            def flush_current() -> None:
                # `current` is only mutated in place, never rebound, so no `nonlocal` is required.
                if current:
                    tokens.append("".join(current))
                    current.clear()

            for c in normalized:
                cat = unicodedata.category(c)
                if c.isalnum():
                    current.append(c)
                elif mode is NormalizationMode.FULL and cat.startswith("S"):
                    # A symbol closes the pending token and becomes a token of its own.
                    flush_current()
                    name = unicodedata.name(c, "")
                    if name:
                        tokens.append(name.lower().replace(" ", "_"))
                elif cat.startswith("P") or c.isspace():
                    # Punctuation and whitespace act purely as token separators.
                    flush_current()
                # other characters ignored (they do not end the current token)

            flush_current()
            normalized = " ".join(tokens)

    # Apply naming convention
    if naming is NamingConvention.NONE:
        return normalized
    if naming is NamingConvention.PARAM:
        return parameterize(normalized)
    if naming is NamingConvention.TITLE:
        return titleize(normalized)

    # The remaining conventions all derive from the snake_case form.
    underscored = underscore(parameterize(normalized))
    if naming is NamingConvention.CONSTANT:
        return underscored.upper()
    if naming is NamingConvention.CAMEL:
        return camelize(underscored, False)
    if naming is NamingConvention.PASCAL:
        return camelize(underscored)

    # NamingConvention.SNAKE
    return underscored
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: datamarket
3
- Version: 0.9.25
3
+ Version: 0.9.27
4
4
  Summary: Utilities that integrate advanced scraping knowledge into just one library.
5
5
  License: GPL-3.0-or-later
6
6
  Author: DataMarket
@@ -59,7 +59,6 @@ Provides-Extra: soda-core-postgres
59
59
  Provides-Extra: stem
60
60
  Provides-Extra: tqdm
61
61
  Provides-Extra: undetected-chromedriver
62
- Provides-Extra: unidecode
63
62
  Provides-Extra: xmltodict
64
63
  Requires-Dist: SQLAlchemy (>=2.0.0,<3.0.0) ; extra == "alchemy"
65
64
  Requires-Dist: azure-storage-blob (>=12.0.0,<13.0.0) ; extra == "azure-storage-blob"
@@ -84,10 +83,12 @@ Requires-Dist: google-auth-httplib2 (>=0.2.0,<0.3.0) ; extra == "google-auth-htt
84
83
  Requires-Dist: google-auth-oauthlib (>=1.0.0,<2.0.0) ; extra == "google-auth-oauthlib"
85
84
  Requires-Dist: html2text (>=2024.0.0,<2025.0.0) ; extra == "html2text"
86
85
  Requires-Dist: httpx[http2] (>=0.28.0,<0.29.0) ; extra == "httpx"
86
+ Requires-Dist: inflection (>=0.5.0,<0.6.0)
87
87
  Requires-Dist: jinja2 (>=3.0.0,<4.0.0)
88
88
  Requires-Dist: json5 (>=0.10.0,<0.11.0) ; extra == "json5"
89
89
  Requires-Dist: lxml[html-clean] (>=5.0.0,<6.0.0) ; extra == "lxml"
90
90
  Requires-Dist: nodriver (>=0.44,<0.45) ; extra == "nodriver"
91
+ Requires-Dist: numpy (>=2.0.0,<3.0.0)
91
92
  Requires-Dist: openpyxl (>=3.0.0,<4.0.0) ; extra == "openpyxl"
92
93
  Requires-Dist: pandas (>=2.0.0,<3.0.0) ; extra == "pandas"
93
94
  Requires-Dist: pandera (>=0.22.0,<0.23.0) ; extra == "pandera"
@@ -102,6 +103,7 @@ Requires-Dist: pymupdf (>=1.0.0,<2.0.0) ; extra == "pymupdf"
102
103
  Requires-Dist: pysocks (>=1.0.0,<2.0.0) ; extra == "pysocks"
103
104
  Requires-Dist: pyspark (>=3.0.0,<4.0.0) ; extra == "pyspark"
104
105
  Requires-Dist: pytest (>=8.0.0,<9.0.0) ; extra == "pytest"
106
+ Requires-Dist: python-string-utils (>=1.0.0,<2.0.0)
105
107
  Requires-Dist: rapidfuzz (>=3.0.0,<4.0.0) ; extra == "rapidfuzz"
106
108
  Requires-Dist: requests (>=2.0.0,<3.0.0)
107
109
  Requires-Dist: retry (>=0.9.0,<0.10.0) ; extra == "retry"
@@ -112,7 +114,7 @@ Requires-Dist: stem (>=1.0.0,<2.0.0) ; extra == "stem" or extra == "proxy"
112
114
  Requires-Dist: tenacity (>=9.0.0,<10.0.0)
113
115
  Requires-Dist: tqdm (>=4.0.0,<5.0.0) ; extra == "tqdm"
114
116
  Requires-Dist: typer (>=0.15.0,<0.16.0)
115
- Requires-Dist: unidecode (>=1.0.0,<2.0.0) ; extra == "unidecode"
117
+ Requires-Dist: unidecode (>=1.0.0,<2.0.0)
116
118
  Requires-Dist: xmltodict (>=0.14.0,<0.15.0) ; extra == "xmltodict"
117
119
  Project-URL: Documentation, https://github.com/Data-Market/datamarket
118
120
  Project-URL: Homepage, https://datamarket.es
@@ -16,9 +16,10 @@ datamarket/utils/alchemy.py,sha256=SRq6kgh1aANXVShBPgAuglmNhZssPWwWEY503gKSia8,6
16
16
  datamarket/utils/main.py,sha256=j8wnAxeLvijdRU9M4V6HunWH7vgWWHP4u4xamzkWcUU,7009
17
17
  datamarket/utils/selenium.py,sha256=IMKlbLzXABFhACnWzhHmB0l2hhVzNwHGZwbo14nEewQ,2499
18
18
  datamarket/utils/soda.py,sha256=eZTXFbI1P3WoMd1MM-YjoVTpdjTcDSWuvBb7ViBMhSQ,941
19
+ datamarket/utils/strings.py,sha256=rEX9NeBG4C7RECgT0EQebgoFoxgZMy9-7EcBSxgBANU,5654
19
20
  datamarket/utils/typer.py,sha256=FDF3l6gh3UlAFPsHCtesnekvct2rKz0oFn3uKARBQvE,814
20
21
  datamarket/utils/types.py,sha256=vxdQZdwdXrfPR4Es52gBgol-tMRIOD6oK9cBo3rB0JQ,74
21
- datamarket-0.9.25.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
22
- datamarket-0.9.25.dist-info/METADATA,sha256=NOyJL33qAwMuqJalrtwxe4bzVUpusmMaLVzn0QcBfw8,6459
23
- datamarket-0.9.25.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
24
- datamarket-0.9.25.dist-info/RECORD,,
22
+ datamarket-0.9.27.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
23
+ datamarket-0.9.27.dist-info/METADATA,sha256=ZDopWDfk3f0HeTZSVAKSnAmfOPSBUOJNlos9fqGzKJA,6543
24
+ datamarket-0.9.27.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
25
+ datamarket-0.9.27.dist-info/RECORD,,