datamarket 0.9.24__tar.gz → 0.9.26__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datamarket might be problematic. Click here for more details.
- {datamarket-0.9.24 → datamarket-0.9.26}/PKG-INFO +5 -3
- {datamarket-0.9.24 → datamarket-0.9.26}/pyproject.toml +5 -3
- {datamarket-0.9.24 → datamarket-0.9.26}/src/datamarket/interfaces/alchemy.py +33 -6
- datamarket-0.9.26/src/datamarket/utils/strings.py +130 -0
- {datamarket-0.9.24 → datamarket-0.9.26}/LICENSE +0 -0
- {datamarket-0.9.24 → datamarket-0.9.26}/README.md +0 -0
- {datamarket-0.9.24 → datamarket-0.9.26}/src/datamarket/__init__.py +0 -0
- {datamarket-0.9.24 → datamarket-0.9.26}/src/datamarket/interfaces/__init__.py +0 -0
- {datamarket-0.9.24 → datamarket-0.9.26}/src/datamarket/interfaces/aws.py +0 -0
- {datamarket-0.9.24 → datamarket-0.9.26}/src/datamarket/interfaces/drive.py +0 -0
- {datamarket-0.9.24 → datamarket-0.9.26}/src/datamarket/interfaces/ftp.py +0 -0
- {datamarket-0.9.24 → datamarket-0.9.26}/src/datamarket/interfaces/nominatim.py +0 -0
- {datamarket-0.9.24 → datamarket-0.9.26}/src/datamarket/interfaces/peerdb.py +0 -0
- {datamarket-0.9.24 → datamarket-0.9.26}/src/datamarket/interfaces/proxy.py +0 -0
- {datamarket-0.9.24 → datamarket-0.9.26}/src/datamarket/interfaces/tinybird.py +0 -0
- {datamarket-0.9.24 → datamarket-0.9.26}/src/datamarket/params/__init__.py +0 -0
- {datamarket-0.9.24 → datamarket-0.9.26}/src/datamarket/params/nominatim.py +0 -0
- {datamarket-0.9.24 → datamarket-0.9.26}/src/datamarket/utils/__init__.py +0 -0
- {datamarket-0.9.24 → datamarket-0.9.26}/src/datamarket/utils/airflow.py +0 -0
- {datamarket-0.9.24 → datamarket-0.9.26}/src/datamarket/utils/alchemy.py +0 -0
- {datamarket-0.9.24 → datamarket-0.9.26}/src/datamarket/utils/main.py +0 -0
- {datamarket-0.9.24 → datamarket-0.9.26}/src/datamarket/utils/selenium.py +0 -0
- {datamarket-0.9.24 → datamarket-0.9.26}/src/datamarket/utils/soda.py +0 -0
- {datamarket-0.9.24 → datamarket-0.9.26}/src/datamarket/utils/typer.py +0 -0
- {datamarket-0.9.24 → datamarket-0.9.26}/src/datamarket/utils/types.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: datamarket
|
|
3
|
-
Version: 0.9.24
|
|
3
|
+
Version: 0.9.26
|
|
4
4
|
Summary: Utilities that integrate advanced scraping knowledge into just one library.
|
|
5
5
|
License: GPL-3.0-or-later
|
|
6
6
|
Author: DataMarket
|
|
@@ -59,7 +59,6 @@ Provides-Extra: soda-core-postgres
|
|
|
59
59
|
Provides-Extra: stem
|
|
60
60
|
Provides-Extra: tqdm
|
|
61
61
|
Provides-Extra: undetected-chromedriver
|
|
62
|
-
Provides-Extra: unidecode
|
|
63
62
|
Provides-Extra: xmltodict
|
|
64
63
|
Requires-Dist: SQLAlchemy (>=2.0.0,<3.0.0) ; extra == "alchemy"
|
|
65
64
|
Requires-Dist: azure-storage-blob (>=12.0.0,<13.0.0) ; extra == "azure-storage-blob"
|
|
@@ -84,10 +83,12 @@ Requires-Dist: google-auth-httplib2 (>=0.2.0,<0.3.0) ; extra == "google-auth-htt
|
|
|
84
83
|
Requires-Dist: google-auth-oauthlib (>=1.0.0,<2.0.0) ; extra == "google-auth-oauthlib"
|
|
85
84
|
Requires-Dist: html2text (>=2024.0.0,<2025.0.0) ; extra == "html2text"
|
|
86
85
|
Requires-Dist: httpx[http2] (>=0.28.0,<0.29.0) ; extra == "httpx"
|
|
86
|
+
Requires-Dist: inflection (>=0.5.0,<0.6.0)
|
|
87
87
|
Requires-Dist: jinja2 (>=3.0.0,<4.0.0)
|
|
88
88
|
Requires-Dist: json5 (>=0.10.0,<0.11.0) ; extra == "json5"
|
|
89
89
|
Requires-Dist: lxml[html-clean] (>=5.0.0,<6.0.0) ; extra == "lxml"
|
|
90
90
|
Requires-Dist: nodriver (>=0.44,<0.45) ; extra == "nodriver"
|
|
91
|
+
Requires-Dist: numpy (>=2.0.0,<3.0.0)
|
|
91
92
|
Requires-Dist: openpyxl (>=3.0.0,<4.0.0) ; extra == "openpyxl"
|
|
92
93
|
Requires-Dist: pandas (>=2.0.0,<3.0.0) ; extra == "pandas"
|
|
93
94
|
Requires-Dist: pandera (>=0.22.0,<0.23.0) ; extra == "pandera"
|
|
@@ -102,6 +103,7 @@ Requires-Dist: pymupdf (>=1.0.0,<2.0.0) ; extra == "pymupdf"
|
|
|
102
103
|
Requires-Dist: pysocks (>=1.0.0,<2.0.0) ; extra == "pysocks"
|
|
103
104
|
Requires-Dist: pyspark (>=3.0.0,<4.0.0) ; extra == "pyspark"
|
|
104
105
|
Requires-Dist: pytest (>=8.0.0,<9.0.0) ; extra == "pytest"
|
|
106
|
+
Requires-Dist: python-string-utils (>=1.0.0,<2.0.0)
|
|
105
107
|
Requires-Dist: rapidfuzz (>=3.0.0,<4.0.0) ; extra == "rapidfuzz"
|
|
106
108
|
Requires-Dist: requests (>=2.0.0,<3.0.0)
|
|
107
109
|
Requires-Dist: retry (>=0.9.0,<0.10.0) ; extra == "retry"
|
|
@@ -112,7 +114,7 @@ Requires-Dist: stem (>=1.0.0,<2.0.0) ; extra == "stem" or extra == "proxy"
|
|
|
112
114
|
Requires-Dist: tenacity (>=9.0.0,<10.0.0)
|
|
113
115
|
Requires-Dist: tqdm (>=4.0.0,<5.0.0) ; extra == "tqdm"
|
|
114
116
|
Requires-Dist: typer (>=0.15.0,<0.16.0)
|
|
115
|
-
Requires-Dist: unidecode (>=1.0.0,<2.0.0) ; extra == "unidecode"
|
|
117
|
+
Requires-Dist: unidecode (>=1.0.0,<2.0.0)
|
|
116
118
|
Requires-Dist: xmltodict (>=0.14.0,<0.15.0) ; extra == "xmltodict"
|
|
117
119
|
Project-URL: Documentation, https://github.com/Data-Market/datamarket
|
|
118
120
|
Project-URL: Homepage, https://datamarket.es
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "datamarket"
|
|
3
|
-
version = "0.9.24"
|
|
3
|
+
version = "0.9.26"
|
|
4
4
|
description = "Utilities that integrate advanced scraping knowledge into just one library."
|
|
5
5
|
authors = ["DataMarket <techsupport@datamarket.es>"]
|
|
6
6
|
license = "GPL-3.0-or-later"
|
|
@@ -26,9 +26,12 @@ pendulum = "^3.0.0"
|
|
|
26
26
|
croniter = "^3.0.0"
|
|
27
27
|
dynaconf = "^3.0.0"
|
|
28
28
|
jinja2 = "^3.0.0"
|
|
29
|
+
inflection = "~0.5.0"
|
|
30
|
+
python-string-utils = "^1.0.0"
|
|
31
|
+
unidecode = "^1.0.0"
|
|
32
|
+
numpy = "^2.0.0"
|
|
29
33
|
|
|
30
34
|
boto3 = { version = "~1.35.0", optional = true }
|
|
31
|
-
unidecode = { version = "^1.0.0", optional = true }
|
|
32
35
|
lxml = { extras = ["html-clean"], version = "^5.0.0", optional = true }
|
|
33
36
|
tqdm = { version = "^4.0.0", optional = true }
|
|
34
37
|
pandas = { version = "^2.0.0", optional = true }
|
|
@@ -74,7 +77,6 @@ camoufox = { extras = ["geoip"], version = "~0.4.11", optional = true }
|
|
|
74
77
|
|
|
75
78
|
[tool.poetry.extras]
|
|
76
79
|
boto3 = ["boto3"]
|
|
77
|
-
unidecode = ["unidecode"]
|
|
78
80
|
lxml = ["lxml"]
|
|
79
81
|
tqdm = ["tqdm"]
|
|
80
82
|
pandas = ["pandas"]
|
|
@@ -11,6 +11,7 @@ from sqlalchemy.dialects.postgresql import insert
|
|
|
11
11
|
from sqlalchemy.exc import IntegrityError
|
|
12
12
|
from sqlalchemy.ext.declarative import DeclarativeMeta
|
|
13
13
|
from sqlalchemy.orm import Session, sessionmaker
|
|
14
|
+
from sqlalchemy.sql.expression import ClauseElement
|
|
14
15
|
from enum import Enum
|
|
15
16
|
|
|
16
17
|
########################################################################################################################
|
|
@@ -220,17 +221,43 @@ class AlchemyInterface:
|
|
|
220
221
|
if not silent:
|
|
221
222
|
logger.info(f"upserting {alchemy_obj}")
|
|
222
223
|
|
|
223
|
-
|
|
224
|
-
|
|
224
|
+
table = alchemy_obj.__table__
|
|
225
|
+
primary_keys = list(col.name for col in table.primary_key.columns.values())
|
|
226
|
+
|
|
227
|
+
# Build the dictionary for the INSERT values
|
|
228
|
+
insert_values = {
|
|
229
|
+
col.name: getattr(alchemy_obj, col.name)
|
|
230
|
+
for col in table.columns
|
|
231
|
+
if getattr(alchemy_obj, col.name) is not None # Include all non-None values for insert
|
|
232
|
+
}
|
|
233
|
+
|
|
234
|
+
# Build the dictionary for the UPDATE set clause
|
|
235
|
+
# Start with values from the object, excluding primary keys
|
|
236
|
+
update_set_values = {
|
|
225
237
|
col.name: val
|
|
226
|
-
for col in
|
|
238
|
+
for col in table.columns
|
|
227
239
|
if col.name not in primary_keys and (val := getattr(alchemy_obj, col.name)) is not None
|
|
228
240
|
}
|
|
229
241
|
|
|
242
|
+
# Add columns with SQL-based onupdate values explicitly to the set clause
|
|
243
|
+
for column in table.columns:
|
|
244
|
+
actual_sql_expression = None
|
|
245
|
+
if column.onupdate is not None:
|
|
246
|
+
if hasattr(column.onupdate, "arg") and isinstance(column.onupdate.arg, ClauseElement):
|
|
247
|
+
# This handles wrappers like ColumnElementColumnDefault,
|
|
248
|
+
# where the actual SQL expression is in the .arg attribute.
|
|
249
|
+
actual_sql_expression = column.onupdate.arg
|
|
250
|
+
elif isinstance(column.onupdate, ClauseElement):
|
|
251
|
+
# This handles cases where onupdate might be a direct SQL expression.
|
|
252
|
+
actual_sql_expression = column.onupdate
|
|
253
|
+
|
|
254
|
+
if actual_sql_expression is not None:
|
|
255
|
+
update_set_values[column.name] = actual_sql_expression
|
|
256
|
+
|
|
230
257
|
statement = (
|
|
231
|
-
insert(
|
|
232
|
-
.values(
|
|
233
|
-
.on_conflict_do_update(index_elements=index_elements, set_=
|
|
258
|
+
insert(table)
|
|
259
|
+
.values(insert_values)
|
|
260
|
+
.on_conflict_do_update(index_elements=index_elements, set_=update_set_values)
|
|
234
261
|
)
|
|
235
262
|
|
|
236
263
|
try:
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
########################################################################################################################
|
|
2
|
+
# IMPORTS
|
|
3
|
+
|
|
4
|
+
from enum import Enum, auto
|
|
5
|
+
from typing import Any
|
|
6
|
+
import unicodedata
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
from unidecode import unidecode
|
|
10
|
+
from inflection import parameterize, underscore, titleize, camelize
|
|
11
|
+
from string_utils import prettify, strip_html
|
|
12
|
+
|
|
13
|
+
########################################################################################################################
|
|
14
|
+
# CLASSES
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class NormalizationMode(Enum):
|
|
18
|
+
NONE = auto()
|
|
19
|
+
BASIC = auto() # removes accents and converts punctuation to spaces
|
|
20
|
+
SYMBOLS = auto() # translates only symbols to Unicode name
|
|
21
|
+
FULL = auto() # BASIC + SYMBOLS
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class NamingConvention(Enum):
|
|
25
|
+
NONE = auto() # no style change
|
|
26
|
+
CONSTANT = auto() # CONSTANT_CASE (uppercase, underscores)
|
|
27
|
+
SNAKE = auto() # snake_case (lowercase, underscores)
|
|
28
|
+
CAMEL = auto() # camelCase (capitalize words except first one, no spaces)
|
|
29
|
+
PASCAL = auto() # PascalCase (capitalize words including first one, no spaces)
|
|
30
|
+
PARAM = auto() # parameterize (hyphens)
|
|
31
|
+
TITLE = auto() # titleize (capitalize words)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
########################################################################################################################
|
|
35
|
+
# FUNCTIONS
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def transliterate_symbols(s: str) -> str:
|
|
39
|
+
"""
|
|
40
|
+
Translates symbols (category S*) to lowercase Unicode names,
|
|
41
|
+
with spaces→underscores. The rest of the text remains the same.
|
|
42
|
+
"""
|
|
43
|
+
out: list[str] = []
|
|
44
|
+
for c in s:
|
|
45
|
+
if unicodedata.category(c).startswith("S"):
|
|
46
|
+
name = unicodedata.name(c, "")
|
|
47
|
+
if name:
|
|
48
|
+
out.append(name.lower().replace(" ", "_"))
|
|
49
|
+
else:
|
|
50
|
+
out.append(c)
|
|
51
|
+
return "".join(out)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def normalize(
|
|
55
|
+
s: Any, mode: NormalizationMode = NormalizationMode.BASIC, naming: NamingConvention = NamingConvention.NONE
|
|
56
|
+
) -> str:
|
|
57
|
+
"""
|
|
58
|
+
1. Normalizes the string according to `mode`:
|
|
59
|
+
- NONE: returns the original input as an unprocessed string.
|
|
60
|
+
- BASIC: removes accents, converts punctuation to spaces, preserves alphanumeric characters.
|
|
61
|
+
- SYMBOLS: translates only symbols to Unicode name.
|
|
62
|
+
- FULL: combines BASIC + SYMBOLS.
|
|
63
|
+
2. Applies naming convention according to `naming`:
|
|
64
|
+
- NONE: returns the normalized text.
|
|
65
|
+
- PARAM: parameterize (hyphens).
|
|
66
|
+
- SNAKE: snake_case (underscore, lowercase).
|
|
67
|
+
- CONSTANT: CONSTANT_CASE (underscore, uppercase).
|
|
68
|
+
"""
|
|
69
|
+
# Parameter mapping
|
|
70
|
+
if isinstance(mode, str):
|
|
71
|
+
mode = NormalizationMode[mode]
|
|
72
|
+
if isinstance(naming, str):
|
|
73
|
+
naming = NamingConvention[naming]
|
|
74
|
+
|
|
75
|
+
# Handling null values
|
|
76
|
+
if s is None or (isinstance(s, float) and np.isnan(s)):
|
|
77
|
+
normalized = ""
|
|
78
|
+
elif not isinstance(s, str):
|
|
79
|
+
return str(s)
|
|
80
|
+
else:
|
|
81
|
+
text = prettify(strip_html(str(s), True))
|
|
82
|
+
if mode is NormalizationMode.NONE:
|
|
83
|
+
normalized = text
|
|
84
|
+
elif mode is NormalizationMode.SYMBOLS:
|
|
85
|
+
normalized = transliterate_symbols(text)
|
|
86
|
+
else:
|
|
87
|
+
# BASIC and FULL: remove accents and lowercase
|
|
88
|
+
normalized = unidecode(text).lower()
|
|
89
|
+
tokens: list[str] = []
|
|
90
|
+
current: list[str] = []
|
|
91
|
+
|
|
92
|
+
def flush_current():
|
|
93
|
+
nonlocal current
|
|
94
|
+
if current:
|
|
95
|
+
tokens.append("".join(current))
|
|
96
|
+
current.clear()
|
|
97
|
+
|
|
98
|
+
for c in normalized:
|
|
99
|
+
cat = unicodedata.category(c)
|
|
100
|
+
if c.isalnum():
|
|
101
|
+
current.append(c)
|
|
102
|
+
elif mode is NormalizationMode.FULL and cat.startswith("S"):
|
|
103
|
+
flush_current()
|
|
104
|
+
name = unicodedata.name(c, "")
|
|
105
|
+
if name:
|
|
106
|
+
tokens.append(name.lower().replace(" ", "_"))
|
|
107
|
+
elif cat.startswith("P") or c.isspace():
|
|
108
|
+
flush_current()
|
|
109
|
+
# other characters ignored
|
|
110
|
+
|
|
111
|
+
flush_current()
|
|
112
|
+
normalized = " ".join(tokens)
|
|
113
|
+
|
|
114
|
+
# Apply naming convention
|
|
115
|
+
if naming is NamingConvention.NONE:
|
|
116
|
+
return normalized
|
|
117
|
+
if naming is NamingConvention.PARAM:
|
|
118
|
+
return parameterize(normalized)
|
|
119
|
+
if naming is NamingConvention.TITLE:
|
|
120
|
+
return titleize(normalized)
|
|
121
|
+
|
|
122
|
+
underscored = underscore(parameterize(normalized))
|
|
123
|
+
if naming is NamingConvention.CONSTANT:
|
|
124
|
+
return underscored.upper()
|
|
125
|
+
if naming is NamingConvention.CAMEL:
|
|
126
|
+
return camelize(underscored, False)
|
|
127
|
+
if naming is NamingConvention.PASCAL:
|
|
128
|
+
return camelize(underscored)
|
|
129
|
+
|
|
130
|
+
return underscored
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|