datamarket 0.9.25__tar.gz → 0.9.27__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datamarket might be problematic. Click here for more details.
- {datamarket-0.9.25 → datamarket-0.9.27}/PKG-INFO +5 -3
- {datamarket-0.9.25 → datamarket-0.9.27}/pyproject.toml +5 -3
- datamarket-0.9.27/src/datamarket/utils/strings.py +152 -0
- {datamarket-0.9.25 → datamarket-0.9.27}/LICENSE +0 -0
- {datamarket-0.9.25 → datamarket-0.9.27}/README.md +0 -0
- {datamarket-0.9.25 → datamarket-0.9.27}/src/datamarket/__init__.py +0 -0
- {datamarket-0.9.25 → datamarket-0.9.27}/src/datamarket/interfaces/__init__.py +0 -0
- {datamarket-0.9.25 → datamarket-0.9.27}/src/datamarket/interfaces/alchemy.py +0 -0
- {datamarket-0.9.25 → datamarket-0.9.27}/src/datamarket/interfaces/aws.py +0 -0
- {datamarket-0.9.25 → datamarket-0.9.27}/src/datamarket/interfaces/drive.py +0 -0
- {datamarket-0.9.25 → datamarket-0.9.27}/src/datamarket/interfaces/ftp.py +0 -0
- {datamarket-0.9.25 → datamarket-0.9.27}/src/datamarket/interfaces/nominatim.py +0 -0
- {datamarket-0.9.25 → datamarket-0.9.27}/src/datamarket/interfaces/peerdb.py +0 -0
- {datamarket-0.9.25 → datamarket-0.9.27}/src/datamarket/interfaces/proxy.py +0 -0
- {datamarket-0.9.25 → datamarket-0.9.27}/src/datamarket/interfaces/tinybird.py +0 -0
- {datamarket-0.9.25 → datamarket-0.9.27}/src/datamarket/params/__init__.py +0 -0
- {datamarket-0.9.25 → datamarket-0.9.27}/src/datamarket/params/nominatim.py +0 -0
- {datamarket-0.9.25 → datamarket-0.9.27}/src/datamarket/utils/__init__.py +0 -0
- {datamarket-0.9.25 → datamarket-0.9.27}/src/datamarket/utils/airflow.py +0 -0
- {datamarket-0.9.25 → datamarket-0.9.27}/src/datamarket/utils/alchemy.py +0 -0
- {datamarket-0.9.25 → datamarket-0.9.27}/src/datamarket/utils/main.py +0 -0
- {datamarket-0.9.25 → datamarket-0.9.27}/src/datamarket/utils/selenium.py +0 -0
- {datamarket-0.9.25 → datamarket-0.9.27}/src/datamarket/utils/soda.py +0 -0
- {datamarket-0.9.25 → datamarket-0.9.27}/src/datamarket/utils/typer.py +0 -0
- {datamarket-0.9.25 → datamarket-0.9.27}/src/datamarket/utils/types.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: datamarket
|
|
3
|
-
Version: 0.9.25
|
|
3
|
+
Version: 0.9.27
|
|
4
4
|
Summary: Utilities that integrate advanced scraping knowledge into just one library.
|
|
5
5
|
License: GPL-3.0-or-later
|
|
6
6
|
Author: DataMarket
|
|
@@ -59,7 +59,6 @@ Provides-Extra: soda-core-postgres
|
|
|
59
59
|
Provides-Extra: stem
|
|
60
60
|
Provides-Extra: tqdm
|
|
61
61
|
Provides-Extra: undetected-chromedriver
|
|
62
|
-
Provides-Extra: unidecode
|
|
63
62
|
Provides-Extra: xmltodict
|
|
64
63
|
Requires-Dist: SQLAlchemy (>=2.0.0,<3.0.0) ; extra == "alchemy"
|
|
65
64
|
Requires-Dist: azure-storage-blob (>=12.0.0,<13.0.0) ; extra == "azure-storage-blob"
|
|
@@ -84,10 +83,12 @@ Requires-Dist: google-auth-httplib2 (>=0.2.0,<0.3.0) ; extra == "google-auth-htt
|
|
|
84
83
|
Requires-Dist: google-auth-oauthlib (>=1.0.0,<2.0.0) ; extra == "google-auth-oauthlib"
|
|
85
84
|
Requires-Dist: html2text (>=2024.0.0,<2025.0.0) ; extra == "html2text"
|
|
86
85
|
Requires-Dist: httpx[http2] (>=0.28.0,<0.29.0) ; extra == "httpx"
|
|
86
|
+
Requires-Dist: inflection (>=0.5.0,<0.6.0)
|
|
87
87
|
Requires-Dist: jinja2 (>=3.0.0,<4.0.0)
|
|
88
88
|
Requires-Dist: json5 (>=0.10.0,<0.11.0) ; extra == "json5"
|
|
89
89
|
Requires-Dist: lxml[html-clean] (>=5.0.0,<6.0.0) ; extra == "lxml"
|
|
90
90
|
Requires-Dist: nodriver (>=0.44,<0.45) ; extra == "nodriver"
|
|
91
|
+
Requires-Dist: numpy (>=2.0.0,<3.0.0)
|
|
91
92
|
Requires-Dist: openpyxl (>=3.0.0,<4.0.0) ; extra == "openpyxl"
|
|
92
93
|
Requires-Dist: pandas (>=2.0.0,<3.0.0) ; extra == "pandas"
|
|
93
94
|
Requires-Dist: pandera (>=0.22.0,<0.23.0) ; extra == "pandera"
|
|
@@ -102,6 +103,7 @@ Requires-Dist: pymupdf (>=1.0.0,<2.0.0) ; extra == "pymupdf"
|
|
|
102
103
|
Requires-Dist: pysocks (>=1.0.0,<2.0.0) ; extra == "pysocks"
|
|
103
104
|
Requires-Dist: pyspark (>=3.0.0,<4.0.0) ; extra == "pyspark"
|
|
104
105
|
Requires-Dist: pytest (>=8.0.0,<9.0.0) ; extra == "pytest"
|
|
106
|
+
Requires-Dist: python-string-utils (>=1.0.0,<2.0.0)
|
|
105
107
|
Requires-Dist: rapidfuzz (>=3.0.0,<4.0.0) ; extra == "rapidfuzz"
|
|
106
108
|
Requires-Dist: requests (>=2.0.0,<3.0.0)
|
|
107
109
|
Requires-Dist: retry (>=0.9.0,<0.10.0) ; extra == "retry"
|
|
@@ -112,7 +114,7 @@ Requires-Dist: stem (>=1.0.0,<2.0.0) ; extra == "stem" or extra == "proxy"
|
|
|
112
114
|
Requires-Dist: tenacity (>=9.0.0,<10.0.0)
|
|
113
115
|
Requires-Dist: tqdm (>=4.0.0,<5.0.0) ; extra == "tqdm"
|
|
114
116
|
Requires-Dist: typer (>=0.15.0,<0.16.0)
|
|
115
|
-
Requires-Dist: unidecode (>=1.0.0,<2.0.0) ; extra == "unidecode"
|
|
117
|
+
Requires-Dist: unidecode (>=1.0.0,<2.0.0)
|
|
116
118
|
Requires-Dist: xmltodict (>=0.14.0,<0.15.0) ; extra == "xmltodict"
|
|
117
119
|
Project-URL: Documentation, https://github.com/Data-Market/datamarket
|
|
118
120
|
Project-URL: Homepage, https://datamarket.es
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "datamarket"
|
|
3
|
-
version = "0.9.25"
|
|
3
|
+
version = "0.9.27"
|
|
4
4
|
description = "Utilities that integrate advanced scraping knowledge into just one library."
|
|
5
5
|
authors = ["DataMarket <techsupport@datamarket.es>"]
|
|
6
6
|
license = "GPL-3.0-or-later"
|
|
@@ -26,9 +26,12 @@ pendulum = "^3.0.0"
|
|
|
26
26
|
croniter = "^3.0.0"
|
|
27
27
|
dynaconf = "^3.0.0"
|
|
28
28
|
jinja2 = "^3.0.0"
|
|
29
|
+
inflection = "~0.5.0"
|
|
30
|
+
python-string-utils = "^1.0.0"
|
|
31
|
+
unidecode = "^1.0.0"
|
|
32
|
+
numpy = "^2.0.0"
|
|
29
33
|
|
|
30
34
|
boto3 = { version = "~1.35.0", optional = true }
|
|
31
|
-
unidecode = { version = "^1.0.0", optional = true }
|
|
32
35
|
lxml = { extras = ["html-clean"], version = "^5.0.0", optional = true }
|
|
33
36
|
tqdm = { version = "^4.0.0", optional = true }
|
|
34
37
|
pandas = { version = "^2.0.0", optional = true }
|
|
@@ -74,7 +77,6 @@ camoufox = { extras = ["geoip"], version = "~0.4.11", optional = true }
|
|
|
74
77
|
|
|
75
78
|
[tool.poetry.extras]
|
|
76
79
|
boto3 = ["boto3"]
|
|
77
|
-
unidecode = ["unidecode"]
|
|
78
80
|
lxml = ["lxml"]
|
|
79
81
|
tqdm = ["tqdm"]
|
|
80
82
|
pandas = ["pandas"]
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
########################################################################################################################
|
|
2
|
+
# IMPORTS
|
|
3
|
+
|
|
4
|
+
from enum import Enum, auto
|
|
5
|
+
from typing import Any
|
|
6
|
+
import unicodedata
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
from unidecode import unidecode
|
|
10
|
+
from inflection import parameterize, underscore, titleize, camelize
|
|
11
|
+
from string_utils import prettify, strip_html
|
|
12
|
+
|
|
13
|
+
########################################################################################################################
|
|
14
|
+
# CLASSES
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class NormalizationMode(Enum):
    """Amount of clean-up applied to a text before any naming convention is applied."""

    NONE = auto()  # leave the text untouched
    BASIC = auto()  # strip accents; punctuation becomes spaces
    SYMBOLS = auto()  # only replace symbols with their Unicode names
    FULL = auto()  # BASIC and SYMBOLS together
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class NamingConvention(Enum):
    """Output casing / separator style applied after normalization."""

    NONE = auto()  # keep the text as it is
    CONSTANT = auto()  # CONSTANT_CASE: upper case joined by underscores
    SNAKE = auto()  # snake_case: lower case joined by underscores
    CAMEL = auto()  # camelCase: first word lower case, the rest capitalized, no separators
    PASCAL = auto()  # PascalCase: every word capitalized, no separators
    PARAM = auto()  # parameterized: words joined by hyphens
    TITLE = auto()  # Title Case: every word capitalized
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
########################################################################################################################
|
|
35
|
+
# FUNCTIONS
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def transliterate_symbols(s: str) -> str:
    """
    Spell out every Unicode symbol of a string by its Unicode name.

    A character counts as a symbol when its Unicode general category starts with "S". Each symbol is
    replaced by its Unicode name in lower case with spaces turned into underscores (for example "+"
    becomes "plus_sign"). All other characters are copied through unchanged.

    Args:
        s: The input string.

    Returns:
        The string with its symbols replaced by their names.
    """

    def _render(ch: str) -> str:
        if not unicodedata.category(ch).startswith("S"):
            return ch
        # A symbol without a Unicode name yields "", i.e. it disappears from the result.
        return unicodedata.name(ch, "").lower().replace(" ", "_")

    return "".join(map(_render, s))
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def normalize(
    s: Any, mode: NormalizationMode = NormalizationMode.BASIC, naming: NamingConvention = NamingConvention.NONE
) -> str:
    """
    Clean up a value and render it in the requested naming convention.

    Input handling:
        - None and float NaN are treated as an empty text (the naming convention is still applied to it).
        - Any other non-string value is returned as `str(s)` right away; neither `mode` nor `naming`
          is applied to it.
        - Strings are first passed through `strip_html` and `prettify`, then normalized.

    Normalization, selected by `mode`:
        - NONE: no further change.
        - BASIC: the text goes through `unidecode` and is lower-cased; runs of alphanumeric characters
          are kept as words, punctuation and whitespace separate words, every other character is
          dropped. Words are joined by single spaces.
        - SYMBOLS: only Unicode symbols (category S*) are replaced, by their lower-case Unicode names
          with underscores.
        - FULL: like BASIC, but a symbol also ends the current word and contributes its lower-case,
          underscored Unicode name as a word of its own.

    Naming convention, selected by `naming`:
        - NONE: the normalized text as is.
        - CONSTANT: CONSTANT_CASE (upper case with underscores).
        - SNAKE: snake_case (lower case with underscores).
        - CAMEL: camelCase (first word lower case, following words capitalized, no spaces).
        - PASCAL: PascalCase (all words capitalized, no spaces).
        - PARAM: parameterized (lower case with hyphens).
        - TITLE: Title Case (each word capitalized).

    Args:
        s: The value to normalize and format. Can be any type.
        mode: The normalization mode, as a NormalizationMode member or its name as a string.
            Defaults to NormalizationMode.BASIC.
        naming: The naming convention, as a NamingConvention member or its name as a string.
            Defaults to NamingConvention.NONE.

    Returns:
        The normalized and formatted string.

    Raises:
        KeyError: If `mode` or `naming` is a string that is not the name of a member.
    """
    # Enum members may also be given by name
    if isinstance(mode, str):
        mode = NormalizationMode[mode]
    if isinstance(naming, str):
        naming = NamingConvention[naming]

    is_null = s is None or (isinstance(s, float) and np.isnan(s))
    if not is_null and not isinstance(s, str):
        # Non-string values leave here stringified, untouched by mode and naming
        return str(s)

    if is_null:
        normalized = ""
    else:
        text = prettify(strip_html(str(s), True))
        if mode is NormalizationMode.NONE:
            normalized = text
        elif mode is NormalizationMode.SYMBOLS:
            normalized = transliterate_symbols(text)
        else:
            # BASIC and FULL: remove accents and lowercase, then rebuild the text word by word
            # NOTE(review): unidecode runs before the symbol check below, so only characters that are
            # still symbols after unidecode get spelled out in FULL mode - confirm this is intended.
            keep_symbols = mode is NormalizationMode.FULL
            words: list[str] = []
            pending: list[str] = []

            for ch in unidecode(text).lower():
                if ch.isalnum():
                    pending.append(ch)
                    continue

                category = unicodedata.category(ch)
                is_symbol = keep_symbols and category.startswith("S")
                if not (is_symbol or category.startswith("P") or ch.isspace()):
                    # anything else is dropped without ending the current word
                    continue

                if pending:
                    words.append("".join(pending))
                    pending = []
                if is_symbol:
                    symbol_name = unicodedata.name(ch, "")
                    if symbol_name:
                        words.append(symbol_name.lower().replace(" ", "_"))

            if pending:
                words.append("".join(pending))
            normalized = " ".join(words)

    # Apply naming convention
    if naming is NamingConvention.NONE:
        result = normalized
    elif naming is NamingConvention.PARAM:
        result = parameterize(normalized)
    elif naming is NamingConvention.TITLE:
        result = titleize(normalized)
    else:
        # The remaining conventions are all derived from the snake_case form
        snake = underscore(parameterize(normalized))
        if naming is NamingConvention.CONSTANT:
            result = snake.upper()
        elif naming is NamingConvention.CAMEL:
            result = camelize(snake, False)
        elif naming is NamingConvention.PASCAL:
            result = camelize(snake)
        else:
            result = snake
    return result
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|