datamarket 0.9.24__tar.gz → 0.9.26__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datamarket might be problematic. Click here for more details.

Files changed (25) hide show
  1. {datamarket-0.9.24 → datamarket-0.9.26}/PKG-INFO +5 -3
  2. {datamarket-0.9.24 → datamarket-0.9.26}/pyproject.toml +5 -3
  3. {datamarket-0.9.24 → datamarket-0.9.26}/src/datamarket/interfaces/alchemy.py +33 -6
  4. datamarket-0.9.26/src/datamarket/utils/strings.py +130 -0
  5. {datamarket-0.9.24 → datamarket-0.9.26}/LICENSE +0 -0
  6. {datamarket-0.9.24 → datamarket-0.9.26}/README.md +0 -0
  7. {datamarket-0.9.24 → datamarket-0.9.26}/src/datamarket/__init__.py +0 -0
  8. {datamarket-0.9.24 → datamarket-0.9.26}/src/datamarket/interfaces/__init__.py +0 -0
  9. {datamarket-0.9.24 → datamarket-0.9.26}/src/datamarket/interfaces/aws.py +0 -0
  10. {datamarket-0.9.24 → datamarket-0.9.26}/src/datamarket/interfaces/drive.py +0 -0
  11. {datamarket-0.9.24 → datamarket-0.9.26}/src/datamarket/interfaces/ftp.py +0 -0
  12. {datamarket-0.9.24 → datamarket-0.9.26}/src/datamarket/interfaces/nominatim.py +0 -0
  13. {datamarket-0.9.24 → datamarket-0.9.26}/src/datamarket/interfaces/peerdb.py +0 -0
  14. {datamarket-0.9.24 → datamarket-0.9.26}/src/datamarket/interfaces/proxy.py +0 -0
  15. {datamarket-0.9.24 → datamarket-0.9.26}/src/datamarket/interfaces/tinybird.py +0 -0
  16. {datamarket-0.9.24 → datamarket-0.9.26}/src/datamarket/params/__init__.py +0 -0
  17. {datamarket-0.9.24 → datamarket-0.9.26}/src/datamarket/params/nominatim.py +0 -0
  18. {datamarket-0.9.24 → datamarket-0.9.26}/src/datamarket/utils/__init__.py +0 -0
  19. {datamarket-0.9.24 → datamarket-0.9.26}/src/datamarket/utils/airflow.py +0 -0
  20. {datamarket-0.9.24 → datamarket-0.9.26}/src/datamarket/utils/alchemy.py +0 -0
  21. {datamarket-0.9.24 → datamarket-0.9.26}/src/datamarket/utils/main.py +0 -0
  22. {datamarket-0.9.24 → datamarket-0.9.26}/src/datamarket/utils/selenium.py +0 -0
  23. {datamarket-0.9.24 → datamarket-0.9.26}/src/datamarket/utils/soda.py +0 -0
  24. {datamarket-0.9.24 → datamarket-0.9.26}/src/datamarket/utils/typer.py +0 -0
  25. {datamarket-0.9.24 → datamarket-0.9.26}/src/datamarket/utils/types.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: datamarket
3
- Version: 0.9.24
3
+ Version: 0.9.26
4
4
  Summary: Utilities that integrate advanced scraping knowledge into just one library.
5
5
  License: GPL-3.0-or-later
6
6
  Author: DataMarket
@@ -59,7 +59,6 @@ Provides-Extra: soda-core-postgres
59
59
  Provides-Extra: stem
60
60
  Provides-Extra: tqdm
61
61
  Provides-Extra: undetected-chromedriver
62
- Provides-Extra: unidecode
63
62
  Provides-Extra: xmltodict
64
63
  Requires-Dist: SQLAlchemy (>=2.0.0,<3.0.0) ; extra == "alchemy"
65
64
  Requires-Dist: azure-storage-blob (>=12.0.0,<13.0.0) ; extra == "azure-storage-blob"
@@ -84,10 +83,12 @@ Requires-Dist: google-auth-httplib2 (>=0.2.0,<0.3.0) ; extra == "google-auth-htt
84
83
  Requires-Dist: google-auth-oauthlib (>=1.0.0,<2.0.0) ; extra == "google-auth-oauthlib"
85
84
  Requires-Dist: html2text (>=2024.0.0,<2025.0.0) ; extra == "html2text"
86
85
  Requires-Dist: httpx[http2] (>=0.28.0,<0.29.0) ; extra == "httpx"
86
+ Requires-Dist: inflection (>=0.5.0,<0.6.0)
87
87
  Requires-Dist: jinja2 (>=3.0.0,<4.0.0)
88
88
  Requires-Dist: json5 (>=0.10.0,<0.11.0) ; extra == "json5"
89
89
  Requires-Dist: lxml[html-clean] (>=5.0.0,<6.0.0) ; extra == "lxml"
90
90
  Requires-Dist: nodriver (>=0.44,<0.45) ; extra == "nodriver"
91
+ Requires-Dist: numpy (>=2.0.0,<3.0.0)
91
92
  Requires-Dist: openpyxl (>=3.0.0,<4.0.0) ; extra == "openpyxl"
92
93
  Requires-Dist: pandas (>=2.0.0,<3.0.0) ; extra == "pandas"
93
94
  Requires-Dist: pandera (>=0.22.0,<0.23.0) ; extra == "pandera"
@@ -102,6 +103,7 @@ Requires-Dist: pymupdf (>=1.0.0,<2.0.0) ; extra == "pymupdf"
102
103
  Requires-Dist: pysocks (>=1.0.0,<2.0.0) ; extra == "pysocks"
103
104
  Requires-Dist: pyspark (>=3.0.0,<4.0.0) ; extra == "pyspark"
104
105
  Requires-Dist: pytest (>=8.0.0,<9.0.0) ; extra == "pytest"
106
+ Requires-Dist: python-string-utils (>=1.0.0,<2.0.0)
105
107
  Requires-Dist: rapidfuzz (>=3.0.0,<4.0.0) ; extra == "rapidfuzz"
106
108
  Requires-Dist: requests (>=2.0.0,<3.0.0)
107
109
  Requires-Dist: retry (>=0.9.0,<0.10.0) ; extra == "retry"
@@ -112,7 +114,7 @@ Requires-Dist: stem (>=1.0.0,<2.0.0) ; extra == "stem" or extra == "proxy"
112
114
  Requires-Dist: tenacity (>=9.0.0,<10.0.0)
113
115
  Requires-Dist: tqdm (>=4.0.0,<5.0.0) ; extra == "tqdm"
114
116
  Requires-Dist: typer (>=0.15.0,<0.16.0)
115
- Requires-Dist: unidecode (>=1.0.0,<2.0.0) ; extra == "unidecode"
117
+ Requires-Dist: unidecode (>=1.0.0,<2.0.0)
116
118
  Requires-Dist: xmltodict (>=0.14.0,<0.15.0) ; extra == "xmltodict"
117
119
  Project-URL: Documentation, https://github.com/Data-Market/datamarket
118
120
  Project-URL: Homepage, https://datamarket.es
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "datamarket"
3
- version = "0.9.24"
3
+ version = "0.9.26"
4
4
  description = "Utilities that integrate advanced scraping knowledge into just one library."
5
5
  authors = ["DataMarket <techsupport@datamarket.es>"]
6
6
  license = "GPL-3.0-or-later"
@@ -26,9 +26,12 @@ pendulum = "^3.0.0"
26
26
  croniter = "^3.0.0"
27
27
  dynaconf = "^3.0.0"
28
28
  jinja2 = "^3.0.0"
29
+ inflection = "~0.5.0"
30
+ python-string-utils = "^1.0.0"
31
+ unidecode = "^1.0.0"
32
+ numpy = "^2.0.0"
29
33
 
30
34
  boto3 = { version = "~1.35.0", optional = true }
31
- unidecode = { version = "^1.0.0", optional = true }
32
35
  lxml = { extras = ["html-clean"], version = "^5.0.0", optional = true }
33
36
  tqdm = { version = "^4.0.0", optional = true }
34
37
  pandas = { version = "^2.0.0", optional = true }
@@ -74,7 +77,6 @@ camoufox = { extras = ["geoip"], version = "~0.4.11", optional = true }
74
77
 
75
78
  [tool.poetry.extras]
76
79
  boto3 = ["boto3"]
77
- unidecode = ["unidecode"]
78
80
  lxml = ["lxml"]
79
81
  tqdm = ["tqdm"]
80
82
  pandas = ["pandas"]
@@ -11,6 +11,7 @@ from sqlalchemy.dialects.postgresql import insert
11
11
  from sqlalchemy.exc import IntegrityError
12
12
  from sqlalchemy.ext.declarative import DeclarativeMeta
13
13
  from sqlalchemy.orm import Session, sessionmaker
14
+ from sqlalchemy.sql.expression import ClauseElement
14
15
  from enum import Enum
15
16
 
16
17
  ########################################################################################################################
@@ -220,17 +221,43 @@ class AlchemyInterface:
220
221
  if not silent:
221
222
  logger.info(f"upserting {alchemy_obj}")
222
223
 
223
- primary_keys = list(col.name for col in alchemy_obj.__table__.primary_key.columns.values())
224
- obj_dict = {
224
+ table = alchemy_obj.__table__
225
+ primary_keys = list(col.name for col in table.primary_key.columns.values())
226
+
227
+ # Build the dictionary for the INSERT values
228
+ insert_values = {
229
+ col.name: getattr(alchemy_obj, col.name)
230
+ for col in table.columns
231
+ if getattr(alchemy_obj, col.name) is not None # Include all non-None values for insert
232
+ }
233
+
234
+ # Build the dictionary for the UPDATE set clause
235
+ # Start with values from the object, excluding primary keys
236
+ update_set_values = {
225
237
  col.name: val
226
- for col in alchemy_obj.__table__.columns
238
+ for col in table.columns
227
239
  if col.name not in primary_keys and (val := getattr(alchemy_obj, col.name)) is not None
228
240
  }
229
241
 
242
+ # Add columns with SQL-based onupdate values explicitly to the set clause
243
+ for column in table.columns:
244
+ actual_sql_expression = None
245
+ if column.onupdate is not None:
246
+ if hasattr(column.onupdate, "arg") and isinstance(column.onupdate.arg, ClauseElement):
247
+ # This handles wrappers like ColumnElementColumnDefault,
248
+ # where the actual SQL expression is in the .arg attribute.
249
+ actual_sql_expression = column.onupdate.arg
250
+ elif isinstance(column.onupdate, ClauseElement):
251
+ # This handles cases where onupdate might be a direct SQL expression.
252
+ actual_sql_expression = column.onupdate
253
+
254
+ if actual_sql_expression is not None:
255
+ update_set_values[column.name] = actual_sql_expression
256
+
230
257
  statement = (
231
- insert(alchemy_obj.__table__)
232
- .values(obj_dict)
233
- .on_conflict_do_update(index_elements=index_elements, set_=obj_dict)
258
+ insert(table)
259
+ .values(insert_values)
260
+ .on_conflict_do_update(index_elements=index_elements, set_=update_set_values)
234
261
  )
235
262
 
236
263
  try:
@@ -0,0 +1,130 @@
1
+ ########################################################################################################################
2
+ # IMPORTS
3
+
4
+ from enum import Enum, auto
5
+ from typing import Any
6
+ import unicodedata
7
+
8
+ import numpy as np
9
+ from unidecode import unidecode
10
+ from inflection import parameterize, underscore, titleize, camelize
11
+ from string_utils import prettify, strip_html
12
+
13
+ ########################################################################################################################
14
+ # CLASSES
15
+
16
+
17
class NormalizationMode(Enum):
    """
    Text clean-up strategies selectable when normalizing a string.

    Members are numbered with ``auto()`` in declaration order, so the
    order below must not change.
    """

    # No normalization step is applied.
    NONE = auto()
    # Removes accents and converts punctuation to spaces.
    BASIC = auto()
    # Translates only symbols to their Unicode name.
    SYMBOLS = auto()
    # BASIC + SYMBOLS.
    FULL = auto()
22
+
23
+
24
class NamingConvention(Enum):
    """
    Output casing styles that can be applied to a normalized string.

    Members are numbered with ``auto()`` in declaration order, so the
    order below must not change.
    """

    # No style change.
    NONE = auto()
    # CONSTANT_CASE (uppercase, underscores).
    CONSTANT = auto()
    # snake_case (lowercase, underscores).
    SNAKE = auto()
    # camelCase (capitalize words except first one, no spaces).
    CAMEL = auto()
    # PascalCase (capitalize words including first one, no spaces).
    PASCAL = auto()
    # parameterize (hyphens).
    PARAM = auto()
    # titleize (capitalize words).
    TITLE = auto()
32
+
33
+
34
+ ########################################################################################################################
35
+ # FUNCTIONS
36
+
37
+
38
def transliterate_symbols(s: str) -> str:
    """
    Replace every symbol character (Unicode category ``S*``) in `s` with its
    lowercase Unicode name, using underscores instead of spaces.

    Characters of any other category are copied through unchanged. A symbol
    for which `unicodedata` reports no name contributes nothing to the result.
    The replacement is inserted in place, with no separator added around it.
    """

    def _render(char: str) -> str:
        if not unicodedata.category(char).startswith("S"):
            return char
        # An unnamed symbol yields "" here, i.e. it is dropped from the output.
        return unicodedata.name(char, "").lower().replace(" ", "_")

    return "".join(map(_render, s))
52
+
53
+
54
def _split_into_tokens(text: str, expand_symbols: bool) -> list[str]:
    """
    Break `text` into tokens for the BASIC / FULL normalization modes.

    - Alphanumeric characters accumulate into the current word.
    - Punctuation (category ``P*``) and whitespace end the current word.
    - When `expand_symbols` is true, a symbol (category ``S*``) ends the current
      word and, if it has a Unicode name, adds that name (lowercase, spaces
      replaced by underscores) as its own token.
    - Any other character is dropped without ending the current word.
    """
    tokens: list[str] = []
    pending: list[str] = []

    for char in text:
        if char.isalnum():
            pending.append(char)
            continue

        category = unicodedata.category(char)
        is_symbol = expand_symbols and category.startswith("S")
        if not (is_symbol or category.startswith("P") or char.isspace()):
            # Neither a separator nor a symbol to expand: skip it silently.
            continue

        if pending:
            tokens.append("".join(pending))
            pending = []

        if is_symbol:
            symbol_name = unicodedata.name(char, "")
            if symbol_name:
                tokens.append(symbol_name.lower().replace(" ", "_"))

    if pending:
        tokens.append("".join(pending))
    return tokens


def normalize(
    s: Any, mode: NormalizationMode = NormalizationMode.BASIC, naming: NamingConvention = NamingConvention.NONE
) -> str:
    """
    Normalize `s` and optionally rewrite it in a naming convention.

    `mode` and `naming` accept either the enum member or its name as a string
    (e.g. ``"FULL"``, ``"SNAKE"``); an unknown name raises ``KeyError``.

    Input handling:
        - ``None`` and float NaN are treated as the empty string (the naming
          convention is still applied to it).
        - Any other non-string value is returned immediately as ``str(s)``,
          with neither normalization nor naming convention applied.
        - Strings are first passed through ``strip_html`` and ``prettify``.

    1. Normalization according to `mode`:
        - NONE: keeps the stripped/prettified text as is.
        - SYMBOLS: translates only symbols to their Unicode name.
        - BASIC: applies ``unidecode`` and lowercases, then keeps alphanumeric
          runs joined by single spaces; punctuation and whitespace act as
          separators, other characters are dropped.
        - FULL: like BASIC, but symbols become separate tokens holding their
          Unicode name.
    2. Naming convention according to `naming`:
        - NONE: returns the normalized text.
        - PARAM: parameterize (hyphens).
        - TITLE: titleize (capitalize words).
        - SNAKE: snake_case (underscore, lowercase).
        - CONSTANT: CONSTANT_CASE (underscore, uppercase).
        - CAMEL: camelCase.
        - PASCAL: PascalCase.
    """
    # Enum members may be given by name.
    mode = NormalizationMode[mode] if isinstance(mode, str) else mode
    naming = NamingConvention[naming] if isinstance(naming, str) else naming

    is_missing = s is None or (isinstance(s, float) and np.isnan(s))

    # Non-string, non-null values bypass the whole pipeline.
    if not is_missing and not isinstance(s, str):
        return str(s)

    if is_missing:
        normalized = ""
    else:
        cleaned = prettify(strip_html(str(s), True))
        if mode is NormalizationMode.NONE:
            normalized = cleaned
        elif mode is NormalizationMode.SYMBOLS:
            normalized = transliterate_symbols(cleaned)
        else:
            # BASIC and FULL: remove accents and lowercase, then tokenize.
            ascii_lower = unidecode(cleaned).lower()
            normalized = " ".join(_split_into_tokens(ascii_lower, mode is NormalizationMode.FULL))

    # Conventions that work directly on the normalized text.
    if naming is NamingConvention.NONE:
        return normalized
    if naming is NamingConvention.PARAM:
        return parameterize(normalized)
    if naming is NamingConvention.TITLE:
        return titleize(normalized)

    # Remaining conventions all derive from the snake_case form.
    snake = underscore(parameterize(normalized))
    if naming is NamingConvention.CONSTANT:
        result = snake.upper()
    elif naming is NamingConvention.CAMEL:
        result = camelize(snake, False)
    elif naming is NamingConvention.PASCAL:
        result = camelize(snake)
    else:
        result = snake
    return result
File without changes
File without changes