datamarket-0.9.30-py3-none-any.whl → datamarket-0.9.32-py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datamarket might be problematic. More details were linked from the original page; the link target is not preserved in this text.

@@ -63,7 +63,9 @@ class Nominatim:
63
63
  if key in raw_json:
64
64
  return raw_json[key]
65
65
 
66
- def _calculate_distance(self, lat_str: Optional[str], lon_str: Optional[str], input_coords: Tuple[float, float]) -> float:
66
+ def _calculate_distance(
67
+ self, lat_str: Optional[str], lon_str: Optional[str], input_coords: Tuple[float, float]
68
+ ) -> float:
67
69
  dist = float("inf")
68
70
  if lat_str and lon_str:
69
71
  try:
@@ -139,7 +141,10 @@ class Nominatim:
139
141
  }
140
142
 
141
143
  def _select_postcode_and_derived_province(
142
- self, parsed_nominatim_result: Dict[str, Optional[str]], parsed_geonames_result: Dict[str, Optional[str]], nominatim_address_province_raw: Optional[str]
144
+ self,
145
+ parsed_nominatim_result: Dict[str, Optional[str]],
146
+ parsed_geonames_result: Dict[str, Optional[str]],
147
+ nominatim_address_province_raw: Optional[str],
143
148
  ) -> Tuple[Optional[str], Optional[str]]:
144
149
  """
145
150
  Determines the postcode and its derived province based on comparisons
@@ -210,7 +215,11 @@ class Nominatim:
210
215
  elif dist_geonames < dist_nominatim and dist_geonames != float("inf"):
211
216
  final_result = parsed_geonames_result
212
217
  final_result["postcode"] = authoritative_postcode
213
- final_result["province"] = authoritative_province_from_postcode
218
+ final_result["province"] = (
219
+ nominatim_address_province_raw
220
+ if authoritative_postcode is None
221
+ else authoritative_province_from_postcode
222
+ )
214
223
  else:
215
224
  final_result = self._get_empty_address_result()
216
225
  return final_result
@@ -3,7 +3,7 @@
3
3
 
4
4
  import unicodedata
5
5
  from enum import Enum, auto
6
- from typing import Any
6
+ from typing import Any, Optional, Set
7
7
 
8
8
  import numpy as np
9
9
  from inflection import camelize, parameterize, titleize, underscore
@@ -23,6 +23,8 @@ class NormalizationMode(Enum):
23
23
 
24
24
  class NamingConvention(Enum):
25
25
  NONE = auto() # no style change
26
+ LOWER = auto() # lowercase
27
+ UPPER = auto() # UPPERCASE
26
28
  CONSTANT = auto() # CONSTANT_CASE (uppercase, underscores)
27
29
  SNAKE = auto() # snake_case (lowercase, underscores)
28
30
  CAMEL = auto() # camelCase (capitalize words except first one, no spaces)
@@ -35,20 +37,52 @@ class NamingConvention(Enum):
35
37
  # FUNCTIONS
36
38
 
37
39
 
38
- def transliterate_symbols(s: str) -> str:
40
+ def get_unidecoded_text(input_text: str, allowed_chars: Set[str], apply_lowercase: bool = False) -> str:
41
+ """
42
+ Processes a string by unidecoding characters, optionally lowercasing them,
43
+ while preserving a specified set of allowed characters.
44
+
45
+ Args:
46
+ input_text: The string to process.
47
+ allowed_chars: A set of characters to preserve in their original form.
48
+ apply_lowercase: Whether to convert unidecoded characters to lowercase. Defaults to False.
49
+
50
+ Returns:
51
+ The processed string.
52
+ """
53
+ chars_list: list[str] = []
54
+ for char_original in input_text:
55
+ if char_original in allowed_chars:
56
+ chars_list.append(char_original)
57
+ else:
58
+ decoded_segment = unidecode(char_original)
59
+ for dc in decoded_segment: # unidecode can return multiple chars
60
+ if apply_lowercase:
61
+ chars_list.append(dc.lower())
62
+ else:
63
+ chars_list.append(dc)
64
+ return "".join(chars_list)
65
+
66
+
67
+ def transliterate_symbols(s: str, allowed_symbols_set: Optional[Set[str]] = None) -> str:
39
68
  """
40
69
  Translates Unicode symbols (category S*) in the input string to their lowercase Unicode names,
41
- with spaces replaced by underscores. Other characters remain unchanged.
70
+ with spaces replaced by underscores. Other characters, or characters in allowed_symbols_set, remain unchanged.
42
71
 
43
72
  Args:
44
73
  s: The input string.
74
+ allowed_symbols_set: A set of characters to preserve without transliteration.
45
75
 
46
76
  Returns:
47
- The string with symbols transliterated.
77
+ The string with symbols transliterated or preserved.
48
78
  """
79
+ if allowed_symbols_set is None:
80
+ allowed_symbols_set = set()
49
81
  out: list[str] = []
50
82
  for c in s:
51
- if unicodedata.category(c).startswith("S"):
83
+ if c in allowed_symbols_set:
84
+ out.append(c)
85
+ elif unicodedata.category(c).startswith("S"):
52
86
  name = unicodedata.name(c, "")
53
87
  if name:
54
88
  out.append(name.lower().replace(" ", "_"))
@@ -58,21 +92,32 @@ def transliterate_symbols(s: str) -> str:
58
92
 
59
93
 
60
94
  def normalize(
61
- s: Any, mode: NormalizationMode = NormalizationMode.BASIC, naming: NamingConvention = NamingConvention.NONE
95
+ s: Any,
96
+ mode: NormalizationMode = NormalizationMode.BASIC,
97
+ naming: NamingConvention = NamingConvention.LOWER,
98
+ allowed_symbols: Optional[str] = None,
62
99
  ) -> str:
63
100
  """
64
101
  Normalizes and applies a naming convention to the input.
65
102
 
66
103
  Handles None and NaN values by returning an empty string. Converts non-string inputs to strings.
67
104
 
68
- Normalization is applied according to `mode`:
69
- - NONE: Returns the input as a string without any normalization.
70
- - BASIC: Removes accents, converts punctuation and spaces to single spaces, and preserves alphanumeric characters.
105
+ Normalization (controlled by `mode`) occurs first, followed by naming convention application.
106
+ - NONE: Returns the input as a string without any normalization. Case is preserved.
107
+ - BASIC: Removes accents (via unidecode). Punctuation and spaces typically become single spaces between tokens.
108
+ Case is preserved from the unidecode step by default.
71
109
  - SYMBOLS: Translates only Unicode symbols (category S*) to their lowercase Unicode names with underscores.
72
- - FULL: Applies both BASIC and SYMBOLS normalization.
110
+ Other characters are preserved, including their case.
111
+ - FULL: Applies unidecode (case-preserved by default) and then SYMBOLS-like transliteration for S* category
112
+ characters not otherwise handled.
113
+
114
+ The `allowed_symbols` parameter can be used to specify characters that should be preserved in their original form
115
+ throughout the normalization process. These characters will not be unidecoded or transliterated by the symbol logic.
73
116
 
74
- After normalization, a naming convention is applied according to `naming`:
75
- - NONE: Returns the normalized text.
117
+ After normalization, a naming convention (controlled by `naming`) is applied:
118
+ - NONE: Returns the normalized text, preserving its case from the normalization step.
119
+ - LOWER: Converts the normalized text to lowercase. (Default)
120
+ - UPPER: Converts the normalized text to UPPERCASE.
76
121
  - CONSTANT: Converts to CONSTANT_CASE (uppercase with underscores).
77
122
  - SNAKE: Converts to snake_case (lowercase with underscores).
78
123
  - CAMEL: Converts to camelCase (lowercase first word, capitalize subsequent words, no spaces).
@@ -83,7 +128,8 @@ def normalize(
83
128
  Args:
84
129
  s: The input value to normalize and format. Can be any type.
85
130
  mode: The normalization mode to apply. Defaults to NormalizationMode.BASIC.
86
- naming: The naming convention to apply. Defaults to NamingConvention.NONE.
131
+ naming: The naming convention to apply. Defaults to NamingConvention.LOWER.
132
+ allowed_symbols: A string of characters to preserve during normalization.
87
133
 
88
134
  Returns:
89
135
  The normalized and formatted string.
@@ -94,6 +140,8 @@ def normalize(
94
140
  if isinstance(naming, str):
95
141
  naming = NamingConvention[naming]
96
142
 
143
+ _allowed_symbols_set: Set[str] = set(allowed_symbols) if allowed_symbols else set()
144
+
97
145
  # Handling null values
98
146
  if s is None or (isinstance(s, float) and np.isnan(s)):
99
147
  normalized = ""
@@ -101,41 +149,52 @@ def normalize(
101
149
  return str(s)
102
150
  else:
103
151
  text = prettify(strip_html(str(s), True))
152
+
104
153
  if mode is NormalizationMode.NONE:
105
154
  normalized = text
106
155
  elif mode is NormalizationMode.SYMBOLS:
107
- normalized = transliterate_symbols(text)
156
+ normalized = transliterate_symbols(text, _allowed_symbols_set)
108
157
  else:
109
- # BASIC and FULL: remove accents and lowercase
110
- normalized = unidecode(text).lower()
158
+ # BASIC and FULL modes
159
+ intermediate_text = get_unidecoded_text(text, _allowed_symbols_set)
160
+
161
+ # Now, tokenize the intermediate_text for BASIC and FULL
111
162
  tokens: list[str] = []
112
- current: list[str] = []
163
+ current_token_chars: list[str] = []
113
164
 
114
- def flush_current():
115
- nonlocal current
116
- if current:
117
- tokens.append("".join(current))
118
- current.clear()
165
+ def flush_current_token():
166
+ nonlocal current_token_chars
167
+ if current_token_chars:
168
+ tokens.append("".join(current_token_chars))
169
+ current_token_chars.clear()
119
170
 
120
- for c in normalized:
171
+ for c in intermediate_text:
121
172
  cat = unicodedata.category(c)
122
- if c.isalnum():
123
- current.append(c)
173
+ if c in _allowed_symbols_set: # Allowed symbols are part of tokens
174
+ current_token_chars.append(c)
175
+ elif c.isalnum():
176
+ current_token_chars.append(c)
124
177
  elif mode is NormalizationMode.FULL and cat.startswith("S"):
125
- flush_current()
178
+ # Transliterate S* category symbols not in allowed_symbols
179
+ flush_current_token()
126
180
  name = unicodedata.name(c, "")
127
181
  if name:
128
182
  tokens.append(name.lower().replace(" ", "_"))
129
183
  elif cat.startswith("P") or c.isspace():
130
- flush_current()
131
- # other characters ignored
184
+ # Punctuation (not allowed) or space acts as a separator
185
+ flush_current_token()
186
+ # Other characters are ignored
132
187
 
133
- flush_current()
188
+ flush_current_token()
134
189
  normalized = " ".join(tokens)
135
190
 
136
191
  # Apply naming convention
137
192
  if naming is NamingConvention.NONE:
138
193
  return normalized
194
+ if naming is NamingConvention.LOWER:
195
+ return normalized.lower()
196
+ if naming is NamingConvention.UPPER:
197
+ return normalized.upper()
139
198
  if naming is NamingConvention.PARAM:
140
199
  return parameterize(normalized)
141
200
  if naming is NamingConvention.TITLE:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamarket
3
- Version: 0.9.30
3
+ Version: 0.9.32
4
4
  Summary: Utilities that integrate advanced scraping knowledge into just one library.
5
5
  Home-page: https://datamarket.es
6
6
  License: GPL-3.0-or-later
@@ -4,7 +4,7 @@ datamarket/interfaces/alchemy.py,sha256=4q_gLKCKPK437VKOpdBKSrCyy42P_yWxIhE7KuvH
4
4
  datamarket/interfaces/aws.py,sha256=7KLUeBxmPN7avEMPsu5HC_KHB1N7W6Anp2X8fo43mlw,2383
5
5
  datamarket/interfaces/drive.py,sha256=shbV5jpQVe_KPE-8Idx6Z9te5Zu1SmVfrvSAyd9ZIgE,2915
6
6
  datamarket/interfaces/ftp.py,sha256=o0KlJxtksbop9OjCiQRzyAa2IeG_ExVXagS6apwrAQo,1881
7
- datamarket/interfaces/nominatim.py,sha256=rUnodcRKyZ_reBtyfFFjXNqP1TN0NMScW7zSGiJQ10I,12380
7
+ datamarket/interfaces/nominatim.py,sha256=xTSx7kivndKg8OQmhq2mgmQWGxxxPddLD8RsKnLPlU4,12553
8
8
  datamarket/interfaces/peerdb.py,sha256=cwYwvO740GyaPo9zLAwJsf3UeJDGDiYzjQVM9Q6s-_g,23652
9
9
  datamarket/interfaces/proxy.py,sha256=updoOStKd8-nQBbxWbnD9eOt6HksnYi-5dQ0rEySf5M,3152
10
10
  datamarket/interfaces/tinybird.py,sha256=AYrcRGNOCoCt7ojilkWa27POROee9sTCwZ61GGHEPeM,2698
@@ -17,11 +17,11 @@ datamarket/utils/main.py,sha256=j8wnAxeLvijdRU9M4V6HunWH7vgWWHP4u4xamzkWcUU,7009
17
17
  datamarket/utils/selenium.py,sha256=IMKlbLzXABFhACnWzhHmB0l2hhVzNwHGZwbo14nEewQ,2499
18
18
  datamarket/utils/soda.py,sha256=eZTXFbI1P3WoMd1MM-YjoVTpdjTcDSWuvBb7ViBMhSQ,941
19
19
  datamarket/utils/strings/__init__.py,sha256=RmyN3hKGXmUym8w5tn28yWkw2uM-b5OvntB4D0lU1eo,84
20
- datamarket/utils/strings/normalization.py,sha256=337M2UPwEETvhVTOnP4w_igTXpHUHoaD8e7x_-L-Bpk,5654
20
+ datamarket/utils/strings/normalization.py,sha256=z2dDXFVQ-nVqPDRR1T4HWmELpJuXUVt_P6leHqPVheY,8666
21
21
  datamarket/utils/strings/obfuscation.py,sha256=8gMepfjPq0N4_IpKR6i2dy_9VJugQ3qJiRiRvKavB3s,5246
22
22
  datamarket/utils/typer.py,sha256=FDF3l6gh3UlAFPsHCtesnekvct2rKz0oFn3uKARBQvE,814
23
23
  datamarket/utils/types.py,sha256=vxdQZdwdXrfPR4Es52gBgol-tMRIOD6oK9cBo3rB0JQ,74
24
- datamarket-0.9.30.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
25
- datamarket-0.9.30.dist-info/METADATA,sha256=zzhHMrHhBf_CfBLwjj4melul8sCkcO8np-nmay0jKOQ,6871
26
- datamarket-0.9.30.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
27
- datamarket-0.9.30.dist-info/RECORD,,
24
+ datamarket-0.9.32.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
25
+ datamarket-0.9.32.dist-info/METADATA,sha256=Y6-CNLWhDhPQmV_z6a3KyWJIpjPXDLp5juhrzgTEIcg,6871
26
+ datamarket-0.9.32.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
27
+ datamarket-0.9.32.dist-info/RECORD,,