datamarket 0.9.25__tar.gz → 0.9.26__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datamarket might be problematic. Click here for more details.

Files changed (25) hide show
  1. {datamarket-0.9.25 → datamarket-0.9.26}/PKG-INFO +5 -3
  2. {datamarket-0.9.25 → datamarket-0.9.26}/pyproject.toml +5 -3
  3. datamarket-0.9.26/src/datamarket/utils/strings.py +130 -0
  4. {datamarket-0.9.25 → datamarket-0.9.26}/LICENSE +0 -0
  5. {datamarket-0.9.25 → datamarket-0.9.26}/README.md +0 -0
  6. {datamarket-0.9.25 → datamarket-0.9.26}/src/datamarket/__init__.py +0 -0
  7. {datamarket-0.9.25 → datamarket-0.9.26}/src/datamarket/interfaces/__init__.py +0 -0
  8. {datamarket-0.9.25 → datamarket-0.9.26}/src/datamarket/interfaces/alchemy.py +0 -0
  9. {datamarket-0.9.25 → datamarket-0.9.26}/src/datamarket/interfaces/aws.py +0 -0
  10. {datamarket-0.9.25 → datamarket-0.9.26}/src/datamarket/interfaces/drive.py +0 -0
  11. {datamarket-0.9.25 → datamarket-0.9.26}/src/datamarket/interfaces/ftp.py +0 -0
  12. {datamarket-0.9.25 → datamarket-0.9.26}/src/datamarket/interfaces/nominatim.py +0 -0
  13. {datamarket-0.9.25 → datamarket-0.9.26}/src/datamarket/interfaces/peerdb.py +0 -0
  14. {datamarket-0.9.25 → datamarket-0.9.26}/src/datamarket/interfaces/proxy.py +0 -0
  15. {datamarket-0.9.25 → datamarket-0.9.26}/src/datamarket/interfaces/tinybird.py +0 -0
  16. {datamarket-0.9.25 → datamarket-0.9.26}/src/datamarket/params/__init__.py +0 -0
  17. {datamarket-0.9.25 → datamarket-0.9.26}/src/datamarket/params/nominatim.py +0 -0
  18. {datamarket-0.9.25 → datamarket-0.9.26}/src/datamarket/utils/__init__.py +0 -0
  19. {datamarket-0.9.25 → datamarket-0.9.26}/src/datamarket/utils/airflow.py +0 -0
  20. {datamarket-0.9.25 → datamarket-0.9.26}/src/datamarket/utils/alchemy.py +0 -0
  21. {datamarket-0.9.25 → datamarket-0.9.26}/src/datamarket/utils/main.py +0 -0
  22. {datamarket-0.9.25 → datamarket-0.9.26}/src/datamarket/utils/selenium.py +0 -0
  23. {datamarket-0.9.25 → datamarket-0.9.26}/src/datamarket/utils/soda.py +0 -0
  24. {datamarket-0.9.25 → datamarket-0.9.26}/src/datamarket/utils/typer.py +0 -0
  25. {datamarket-0.9.25 → datamarket-0.9.26}/src/datamarket/utils/types.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: datamarket
3
- Version: 0.9.25
3
+ Version: 0.9.26
4
4
  Summary: Utilities that integrate advanced scraping knowledge into just one library.
5
5
  License: GPL-3.0-or-later
6
6
  Author: DataMarket
@@ -59,7 +59,6 @@ Provides-Extra: soda-core-postgres
59
59
  Provides-Extra: stem
60
60
  Provides-Extra: tqdm
61
61
  Provides-Extra: undetected-chromedriver
62
- Provides-Extra: unidecode
63
62
  Provides-Extra: xmltodict
64
63
  Requires-Dist: SQLAlchemy (>=2.0.0,<3.0.0) ; extra == "alchemy"
65
64
  Requires-Dist: azure-storage-blob (>=12.0.0,<13.0.0) ; extra == "azure-storage-blob"
@@ -84,10 +83,12 @@ Requires-Dist: google-auth-httplib2 (>=0.2.0,<0.3.0) ; extra == "google-auth-htt
84
83
  Requires-Dist: google-auth-oauthlib (>=1.0.0,<2.0.0) ; extra == "google-auth-oauthlib"
85
84
  Requires-Dist: html2text (>=2024.0.0,<2025.0.0) ; extra == "html2text"
86
85
  Requires-Dist: httpx[http2] (>=0.28.0,<0.29.0) ; extra == "httpx"
86
+ Requires-Dist: inflection (>=0.5.0,<0.6.0)
87
87
  Requires-Dist: jinja2 (>=3.0.0,<4.0.0)
88
88
  Requires-Dist: json5 (>=0.10.0,<0.11.0) ; extra == "json5"
89
89
  Requires-Dist: lxml[html-clean] (>=5.0.0,<6.0.0) ; extra == "lxml"
90
90
  Requires-Dist: nodriver (>=0.44,<0.45) ; extra == "nodriver"
91
+ Requires-Dist: numpy (>=2.0.0,<3.0.0)
91
92
  Requires-Dist: openpyxl (>=3.0.0,<4.0.0) ; extra == "openpyxl"
92
93
  Requires-Dist: pandas (>=2.0.0,<3.0.0) ; extra == "pandas"
93
94
  Requires-Dist: pandera (>=0.22.0,<0.23.0) ; extra == "pandera"
@@ -102,6 +103,7 @@ Requires-Dist: pymupdf (>=1.0.0,<2.0.0) ; extra == "pymupdf"
102
103
  Requires-Dist: pysocks (>=1.0.0,<2.0.0) ; extra == "pysocks"
103
104
  Requires-Dist: pyspark (>=3.0.0,<4.0.0) ; extra == "pyspark"
104
105
  Requires-Dist: pytest (>=8.0.0,<9.0.0) ; extra == "pytest"
106
+ Requires-Dist: python-string-utils (>=1.0.0,<2.0.0)
105
107
  Requires-Dist: rapidfuzz (>=3.0.0,<4.0.0) ; extra == "rapidfuzz"
106
108
  Requires-Dist: requests (>=2.0.0,<3.0.0)
107
109
  Requires-Dist: retry (>=0.9.0,<0.10.0) ; extra == "retry"
@@ -112,7 +114,7 @@ Requires-Dist: stem (>=1.0.0,<2.0.0) ; extra == "stem" or extra == "proxy"
112
114
  Requires-Dist: tenacity (>=9.0.0,<10.0.0)
113
115
  Requires-Dist: tqdm (>=4.0.0,<5.0.0) ; extra == "tqdm"
114
116
  Requires-Dist: typer (>=0.15.0,<0.16.0)
115
- Requires-Dist: unidecode (>=1.0.0,<2.0.0) ; extra == "unidecode"
117
+ Requires-Dist: unidecode (>=1.0.0,<2.0.0)
116
118
  Requires-Dist: xmltodict (>=0.14.0,<0.15.0) ; extra == "xmltodict"
117
119
  Project-URL: Documentation, https://github.com/Data-Market/datamarket
118
120
  Project-URL: Homepage, https://datamarket.es
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "datamarket"
3
- version = "0.9.25"
3
+ version = "0.9.26"
4
4
  description = "Utilities that integrate advanced scraping knowledge into just one library."
5
5
  authors = ["DataMarket <techsupport@datamarket.es>"]
6
6
  license = "GPL-3.0-or-later"
@@ -26,9 +26,12 @@ pendulum = "^3.0.0"
26
26
  croniter = "^3.0.0"
27
27
  dynaconf = "^3.0.0"
28
28
  jinja2 = "^3.0.0"
29
+ inflection = "~0.5.0"
30
+ python-string-utils = "^1.0.0"
31
+ unidecode = "^1.0.0"
32
+ numpy = "^2.0.0"
29
33
 
30
34
  boto3 = { version = "~1.35.0", optional = true }
31
- unidecode = { version = "^1.0.0", optional = true }
32
35
  lxml = { extras = ["html-clean"], version = "^5.0.0", optional = true }
33
36
  tqdm = { version = "^4.0.0", optional = true }
34
37
  pandas = { version = "^2.0.0", optional = true }
@@ -74,7 +77,6 @@ camoufox = { extras = ["geoip"], version = "~0.4.11", optional = true }
74
77
 
75
78
  [tool.poetry.extras]
76
79
  boto3 = ["boto3"]
77
- unidecode = ["unidecode"]
78
80
  lxml = ["lxml"]
79
81
  tqdm = ["tqdm"]
80
82
  pandas = ["pandas"]
@@ -0,0 +1,130 @@
1
+ ########################################################################################################################
2
+ # IMPORTS
3
+
4
+ from enum import Enum, auto
5
+ from typing import Any
6
+ import unicodedata
7
+
8
+ import numpy as np
9
+ from unidecode import unidecode
10
+ from inflection import parameterize, underscore, titleize, camelize
11
+ from string_utils import prettify, strip_html
12
+
13
+ ########################################################################################################################
14
+ # CLASSES
15
+
16
+
17
class NormalizationMode(Enum):
    """Selects how much text clean-up is applied before a naming convention."""

    # Leave the characters of the text as they are.
    NONE = 1
    # Remove accents and convert punctuation to spaces.
    BASIC = 2
    # Translate only symbols to their Unicode name.
    SYMBOLS = 3
    # BASIC + SYMBOLS.
    FULL = 4
22
+
23
+
24
class NamingConvention(Enum):
    """Selects the casing/separator style applied to an already normalized text."""

    # No style change.
    NONE = 1
    # CONSTANT_CASE (uppercase, underscores).
    CONSTANT = 2
    # snake_case (lowercase, underscores).
    SNAKE = 3
    # camelCase (capitalize words except first one, no spaces).
    CAMEL = 4
    # PascalCase (capitalize words including first one, no spaces).
    PASCAL = 5
    # parameterize (hyphens).
    PARAM = 6
    # titleize (capitalize words).
    TITLE = 7
32
+
33
+
34
+ ########################################################################################################################
35
+ # FUNCTIONS
36
+
37
+
38
def transliterate_symbols(s: str) -> str:
    """
    Replaces every symbol character (Unicode category S*) with its Unicode name,
    lowercased and with spaces turned into underscores. Symbols that have no
    Unicode name are dropped; all other characters are copied through untouched.
    """

    def _render(ch: str) -> str:
        if not unicodedata.category(ch).startswith("S"):
            return ch
        # An unnamed symbol yields "", which contributes nothing to the result.
        return unicodedata.name(ch, "").lower().replace(" ", "_")

    return "".join(map(_render, s))
52
+
53
+
54
def normalize(
    s: Any, mode: NormalizationMode = NormalizationMode.BASIC, naming: NamingConvention = NamingConvention.NONE
) -> str:
    """
    Normalizes a value into a string and optionally applies a naming convention.

    Input handling:
        - `None` and float NaN are treated as an empty text (the naming step still runs).
        - Any other non-string value is returned as `str(s)` right away; neither
          the normalization mode nor the naming convention is applied to it.
        - Strings are first run through `strip_html(..., True)` and `prettify`.

    1. Normalizes the string according to `mode`:
        - NONE: keeps the text produced by `strip_html` + `prettify` as is.
        - BASIC: transliterates with `unidecode` and lowercases; runs of alphanumeric
          characters become tokens, punctuation (category P*) and whitespace separate
          tokens, every other character is dropped without splitting the token.
          Tokens are joined with single spaces.
        - SYMBOLS: translates only symbols to Unicode name (see `transliterate_symbols`).
        - FULL: like BASIC, but each symbol (category S*) ends the current token and is
          emitted as its own token: the lowercase Unicode name with spaces→underscores.
    2. Applies naming convention according to `naming`:
        - NONE: returns the normalized text.
        - PARAM: parameterize (hyphens).
        - TITLE: titleize (capitalize words).
        - SNAKE: snake_case (underscore, lowercase).
        - CONSTANT: CONSTANT_CASE (underscore, uppercase).
        - CAMEL: camelCase (capitalize words except first one, no separators).
        - PASCAL: PascalCase (capitalize words including first one, no separators).

    `mode` and `naming` may also be given as the member name (e.g. "FULL", "SNAKE").

    Raises:
        KeyError: if `mode` or `naming` is a string that is not a member name.
    """
    # Parameter mapping: allow enum member names passed as plain strings
    if isinstance(mode, str):
        mode = NormalizationMode[mode]
    if isinstance(naming, str):
        naming = NamingConvention[naming]

    # Handling null values
    if s is None or (isinstance(s, float) and np.isnan(s)):
        normalized = ""
    elif not isinstance(s, str):
        return str(s)
    else:
        text = prettify(strip_html(str(s), True))
        if mode is NormalizationMode.NONE:
            normalized = text
        elif mode is NormalizationMode.SYMBOLS:
            normalized = transliterate_symbols(text)
        else:
            # BASIC and FULL: remove accents and lowercase
            normalized = unidecode(text).lower()
            tokens: list[str] = []
            current: list[str] = []

            def flush_current() -> None:
                # `current` is only mutated in place, so no `nonlocal` is required.
                if current:
                    tokens.append("".join(current))
                    current.clear()

            for c in normalized:
                cat = unicodedata.category(c)
                if c.isalnum():
                    current.append(c)
                elif mode is NormalizationMode.FULL and cat.startswith("S"):
                    flush_current()
                    name = unicodedata.name(c, "")
                    if name:
                        tokens.append(name.lower().replace(" ", "_"))
                elif cat.startswith("P") or c.isspace():
                    flush_current()
                # other characters ignored

            flush_current()
            normalized = " ".join(tokens)

    # Apply naming convention
    if naming is NamingConvention.NONE:
        return normalized
    if naming is NamingConvention.PARAM:
        return parameterize(normalized)
    if naming is NamingConvention.TITLE:
        return titleize(normalized)

    underscored = underscore(parameterize(normalized))
    if naming is NamingConvention.CONSTANT:
        return underscored.upper()
    if naming is NamingConvention.CAMEL:
        # camelize(..., False) lowercases string[0], so an empty text (null input
        # or punctuation-only string) would raise IndexError; return "" instead.
        return camelize(underscored, False) if underscored else ""
    if naming is NamingConvention.PASCAL:
        return camelize(underscored)

    return underscored
File without changes
File without changes