sibi-dst 2025.1.13__py3-none-any.whl → 2025.8.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/__init__.py +7 -1
- sibi_dst/df_helper/__init__.py +3 -2
- sibi_dst/df_helper/_artifact_updater_async.py +238 -0
- sibi_dst/df_helper/_artifact_updater_threaded.py +195 -0
- sibi_dst/df_helper/_df_helper.py +418 -118
- sibi_dst/df_helper/_parquet_artifact.py +275 -283
- sibi_dst/df_helper/_parquet_reader.py +9 -10
- sibi_dst/df_helper/backends/parquet/_parquet_options.py +8 -4
- sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +68 -107
- sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py +15 -0
- sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +105 -255
- sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +90 -42
- sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py +192 -0
- sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +122 -72
- sibi_dst/osmnx_helper/route_path_builder.py +45 -46
- sibi_dst/utils/__init__.py +2 -0
- sibi_dst/utils/base.py +235 -100
- sibi_dst/utils/business_days.py +248 -0
- sibi_dst/utils/clickhouse_writer.py +472 -206
- sibi_dst/utils/data_utils.py +139 -186
- sibi_dst/utils/data_wrapper.py +392 -88
- sibi_dst/utils/date_utils.py +711 -393
- sibi_dst/utils/df_utils.py +193 -213
- sibi_dst/utils/file_age_checker.py +301 -0
- sibi_dst/utils/file_utils.py +3 -2
- sibi_dst/utils/filepath_generator.py +314 -152
- sibi_dst/utils/log_utils.py +581 -242
- sibi_dst/utils/manifest_manager.py +60 -76
- sibi_dst/utils/parquet_saver.py +33 -27
- sibi_dst/utils/periods.py +42 -0
- sibi_dst/utils/phone_formatter.py +88 -95
- sibi_dst/utils/update_planner.py +180 -178
- sibi_dst/utils/webdav_client.py +116 -166
- {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.2.dist-info}/METADATA +1 -1
- {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.2.dist-info}/RECORD +36 -30
- sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +0 -422
- {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.2.dist-info}/WHEEL +0 -0
sibi_dst/utils/manifest_manager.py
CHANGED
@@ -3,50 +3,45 @@ import fsspec
 import threading
 import uuid
 from typing import List, Optional, Set, Dict, Any
-
+
 from sibi_dst.utils import Logger


 class MissingManifestManager:
     """
-
-
-
-
-
-    orphaned temporary files from previous runs.
-
-    Attributes:
-        fs (fsspec.AbstractFileSystem): The filesystem object to interact with.
-        manifest_path (str): The full path to the manifest file.
-        clear_existing (bool): If True, any existing manifest will be overwritten
-            on the first save operation of this instance's lifecycle.
-        logger (Logger): A logger instance for logging messages.
+    Thread-safe manager for a Parquet file manifest of missing partitions.
+
+    - Atomic writes via temp → copy → remove
+    - Cleans up orphan temp files (best-effort)
+    - Stores a simple table with a single column: 'path'
     """

     def __init__(
-
-
-
-
-
+        self,
+        fs: fsspec.AbstractFileSystem,
+        manifest_path: str,
+        clear_existing: bool = False,
+        **kwargs: Any,
     ):
         self.fs: fsspec.AbstractFileSystem = fs
         self.manifest_path: str = manifest_path.rstrip("/")
         self.clear_existing: bool = clear_existing
-
+        self.clear_existing: bool = clear_existing
+        self.ignore_missing: bool = kwargs.get("ignore_missing", False)
+        if self.clear_existing:
+            self.ignore_missing = False
         self.debug: bool = kwargs.get("debug", False)
         self.logger: Logger = kwargs.get(
             "logger",
-            Logger.default_logger(logger_name="missing_manifest_manager")
+            Logger.default_logger(logger_name="missing_manifest_manager"),
         )
         self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)

         self._new_records: List[Dict[str, str]] = []
         self._loaded_paths: Optional[Set[str]] = None
-        self._lock = threading.Lock()
+        self._lock = threading.Lock()

-        # Clean up any orphaned temp files from previous failed runs
+        # Clean up any orphaned temp files from previous failed runs (best-effort)
         self._cleanup_orphaned_files()

     def _safe_exists(self, path: str) -> bool:
@@ -59,13 +54,8 @@ class MissingManifestManager:

     def load_existing(self) -> Set[str]:
         """
-        Loads the set of paths from the existing manifest file.
-
-        The result is cached in memory. If the manifest does not exist or fails
-        to load, an empty set is returned. This operation is thread-safe.
-
-        Returns:
-            A set of strings, where each string is a path from the manifest.
+        Loads the set of paths from the existing manifest file into memory.
+        Returns an empty set if not found or unreadable.
         """
         with self._lock:
             if self._loaded_paths is not None:
@@ -77,7 +67,6 @@

         try:
             df = pd.read_parquet(self.manifest_path, filesystem=self.fs)
-            # Robustly extract non-empty, non-null paths
             paths = (
                 df.get("path", pd.Series(dtype=str))
                 .dropna().astype(str)
@@ -96,9 +85,6 @@
     def record(self, full_path: str) -> None:
         """
         Records a new path to be added to the manifest upon the next save.
-
-        Args:
-            full_path: The path to record.
         """
         if not full_path or not isinstance(full_path, str):
             return
@@ -107,12 +93,7 @@

     def save(self) -> None:
         """
-        Saves all new records to the manifest file.
-
-        This method merges new records with existing ones (unless `clear_existing`
-        is True), removes duplicates, and writes the result back to the manifest.
-        The write operation is performed atomically by writing to a temporary file
-        first, then renaming or copying it to the final destination.
+        Saves all new records to the manifest file atomically.
         """
         with self._lock:
             if not self._new_records and not self.clear_existing:
@@ -143,65 +124,68 @@

             # Ensure parent directory exists
             parent = self.manifest_path.rsplit("/", 1)[0]
-
+            try:
+                self.fs.makedirs(parent, exist_ok=True)
+            except TypeError:
+                try:
+                    self.fs.makedirs(parent)
+                except FileExistsError:
+                    pass

             # Perform an atomic write using a temporary file
             temp_path = f"{self.manifest_path}.tmp-{uuid.uuid4().hex}"
             try:
                 out_df.to_parquet(temp_path, filesystem=self.fs, index=False)
-
-                self.fs
-
+                # some fs lack atomic rename; copy then remove
+                if hasattr(self.fs, "rename"):
+                    try:
+                        self.fs.rename(temp_path, self.manifest_path)
+                    except Exception:
+                        self.fs.copy(temp_path, self.manifest_path)
+                        self.fs.rm_file(temp_path)
+                else:
+                    self.fs.copy(temp_path, self.manifest_path)
+                    self.fs.rm_file(temp_path)
+                self.logger.info(f"Wrote manifest to {self.manifest_path}")
             except Exception as e:
                 self.logger.error(f"Failed to write or move manifest: {e}")
-                #
-                #raise
+                # not re-raising to avoid breaking the ETL run
             finally:
-                #
-
-
-                self.
-
-
+                # Always try to clean temp leftovers
+                try:
+                    if self._safe_exists(temp_path):
+                        if hasattr(self.fs, "rm_file"):
+                            self.fs.rm_file(temp_path)
+                        else:
+                            self.fs.rm(temp_path, recursive=False)
+                except Exception:
+                    pass

             # Reset internal state
             self._new_records.clear()
-
-
+            try:
+                self._loaded_paths = set(out_df["path"].tolist())
+            except Exception:
+                self._loaded_paths = None
             self.clear_existing = False

     def _cleanup_orphaned_files(self) -> None:
-        """
-        self.logger.debug("Checking for orphaned temporary files...")
-        if not hasattr(self.fs, "s3"):
-            self.logger.info("Filesystem is not s3fs; skipping temp cleanup.")
-            return
+        """Best-effort removal of leftover temporary manifest files."""
         try:
-
-            # Use glob to find all files matching the temp pattern in a filesystem-agnostic way
            temp_file_pattern = f"{self.manifest_path}.tmp-*"
            orphaned_files = self.fs.glob(temp_file_pattern)
-
            if not orphaned_files:
-                self.logger.debug("No orphaned files found.")
                return

-            self.logger.info(f"Found {orphaned_files} orphaned temp manifest(s). Cleaning up...")
            for f_path in orphaned_files:
                try:
-                    self.fs
+                    if hasattr(self.fs, "rm_file"):
+                        self.fs.rm_file(f_path)
+                    else:
+                        self.fs.rm(f_path, recursive=False)
                    self.logger.info(f"Deleted orphaned file: {f_path}")
                except Exception as e:
                    self.logger.warning(f"Failed to delete orphaned temp file '{f_path}': {e}")
        except Exception as e:
-            #
-            self.logger.
-
-    @staticmethod
-    def _parse_s3_path(s3_path: str):
-        if not s3_path.startswith("s3://"):
-            raise ValueError("Invalid S3 path. Must start with 's3://'.")
-        path_parts = s3_path[5:].split("/", 1)
-        bucket_name = path_parts[0]
-        prefix = path_parts[1] if len(path_parts) > 1 else ""
-        return bucket_name, prefix
+            # Non-critical
+            self.logger.debug(f"Temp cleanup skipped: {e}")
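A quick usage sketch of the refactored manager (not part of the diff): it assumes a local fsspec filesystem and that the class is importable from `sibi_dst.utils.manifest_manager`; the constructor arguments mirror the new signature shown above.

```python
import fsspec

# Import path assumed from the package layout; adjust if the class is re-exported elsewhere.
from sibi_dst.utils.manifest_manager import MissingManifestManager

fs = fsspec.filesystem("file")
mgr = MissingManifestManager(
    fs=fs,
    manifest_path="/tmp/manifests/missing.parquet",
    clear_existing=False,
    debug=True,
)

known = mgr.load_existing()                              # empty set on the first run
mgr.record("/data/partitions/2025/08/01/part.parquet")   # queue a missing partition path
mgr.save()                                               # atomic write: temp file -> rename/copy -> remove temp
```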
sibi_dst/utils/parquet_saver.py
CHANGED
@@ -1,10 +1,7 @@
-import logging
 import warnings
-from typing import Optional
 
 import dask.dataframe as dd
 import pyarrow as pa
-from fsspec import AbstractFileSystem
 
 from . import ManagedResource
 
@@ -14,19 +11,20 @@ warnings.filterwarnings("ignore", message="Passing 'overwrite=True' to to_parque
 class ParquetSaver(ManagedResource):
     """
     Saves Dask DataFrames to Parquet, with a workaround for S3-compatible
-    storage that
+    storage providers that misbehave on batch delete operations.
+
+    Assumes `df_result` is a Dask DataFrame.
     """

     def __init__(
-
-
-
-
+        self,
+        df_result: dd.DataFrame,
+        parquet_storage_path: str,
+        **kwargs,
     ):
         super().__init__(**kwargs)
         self.df_result = df_result
         self.parquet_storage_path = parquet_storage_path.rstrip("/")
-        # Determine protocol for special handling (e.g., 's3')
         if not self.fs:
             raise ValueError("File system (fs) must be provided to ParquetSaver.")

@@ -36,7 +34,7 @@ class ParquetSaver(ManagedResource):

     def save_to_parquet(self, output_directory_name: str = "default_output", overwrite: bool = True):
         """
-        Saves the DataFrame to a Parquet dataset.
+        Saves the Dask DataFrame to a Parquet dataset.

         If overwrite is True, it manually clears the destination directory before
         writing to avoid issues with certain S3-compatible storage providers.
@@ -52,18 +50,18 @@

         schema = self._define_schema()
         self.logger.info(f"Saving DataFrame to Parquet dataset at: {full_path}")
-
+
+        # persist then write (lets the graph be shared if the caller reuses it)
+        ddf = self.df_result.persist()
+
         try:
-
-            # handled the directory clearing manually.
-            self.df_result.to_parquet(
+            ddf.to_parquet(
                 path=full_path,
                 engine="pyarrow",
                 schema=schema,
-                overwrite=False,
+                overwrite=False,  # we've handled deletion already
                 filesystem=self.fs,
                 write_index=False,
-                compute=True,  # Use compute=True over persisted ddf for immediate execution.
             )
             self.logger.info(f"Successfully saved Parquet dataset to: {full_path}")
         except Exception as e:
@@ -73,8 +71,8 @@ class ParquetSaver(ManagedResource):
     def _clear_directory_safely(self, directory: str):
         """
         Clears the contents of a directory robustly.
-        - For S3,
-        - For other filesystems,
+        - For S3, deletes files one-by-one to bypass brittle multi-delete.
+        - For other filesystems, uses the standard recursive remove.
         """
         if self.protocol == "s3":
             self.logger.warning(
@@ -82,15 +80,23 @@ class ParquetSaver(ManagedResource):
                 "This may be slow for directories with many files."
             )
             # Glob all contents (files and subdirs) and delete them individually.
-            # Calling fs.rm() on a single file path should trigger a single
-            # DeleteObject call, avoiding the faulty batch operation.
-            # We sort by length descending to delete contents of subdirectories first.
             all_paths = self.fs.glob(f"{directory}/**")
-
-
-            for path in paths_to_delete:
+            # delete contents (deepest first)
+            for path in sorted([p for p in all_paths if p != directory], key=len, reverse=True):
                 self.logger.debug(f"Deleting: {path}")
-
+                try:
+                    # prefer rm_file if available (minio, s3fs expose it)
+                    if hasattr(self.fs, "rm_file"):
+                        self.fs.rm_file(path)
+                    else:
+                        self.fs.rm(path, recursive=False)
+                except Exception as e:
+                    self.logger.warning(f"Failed to delete '{path}': {e}")
+            # remove the (now empty) directory if present
+            try:
+                self.fs.rm(directory, recursive=False)
+            except Exception:
+                pass
         else:
             # Standard, fast deletion for other filesystems (local, etc.)
             self.fs.rm(directory, recursive=True)
@@ -98,6 +104,7 @@ class ParquetSaver(ManagedResource):
     def _define_schema(self) -> pa.Schema:
         """
         Defines a PyArrow schema dynamically based on DataFrame's column types.
+        Works for Dask by using known dtypes on the collection.
         """
         pandas_dtype_to_pa = {
             "object": pa.string(), "string": pa.string(),
@@ -113,5 +120,4 @@ class ParquetSaver(ManagedResource):
             pa.field(c, pandas_dtype_to_pa.get(str(d), pa.string()))
             for c, d in self.df_result.dtypes.items()
         ]
-        return pa.schema(fields)
-
+        return pa.schema(fields)
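A hedged usage sketch for the updated `ParquetSaver` (illustrative only): it assumes `ManagedResource` wires `fs` through `**kwargs`, as the `if not self.fs` check above implies, and the import path is taken from the module location in this wheel.

```python
import dask.dataframe as dd
import fsspec
import pandas as pd

# Import path assumed; the class lives in sibi_dst/utils/parquet_saver.py.
from sibi_dst.utils.parquet_saver import ParquetSaver

fs = fsspec.filesystem("file")
ddf = dd.from_pandas(pd.DataFrame({"id": [1, 2], "value": ["a", "b"]}), npartitions=1)

# fs is passed as a keyword and is expected to be picked up by ManagedResource.
saver = ParquetSaver(df_result=ddf, parquet_storage_path="/tmp/datasets", fs=fs)
saver.save_to_parquet(output_directory_name="example_output", overwrite=True)
# Data lands in /tmp/datasets/example_output/ as a pyarrow-backed Parquet dataset.
```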
sibi_dst/utils/periods.py
ADDED
@@ -0,0 +1,42 @@
+# sibi_dst/periods.py
+from __future__ import annotations
+import datetime as dt
+from typing import Dict, Tuple
+
+# Map all user-facing labels to canonical keys your orchestrators expect.
+CANON: Dict[str, str] = {
+    "ytd": "ytd",
+    "itd": "itd",
+    "current_month": "current_month",
+    "today": "today",
+    "custom": "custom",  # generic custom range
+    # labels that imply a date RANGE
+    "last_3_days": "custom",
+    "last_7_days": "custom",
+    "last_14_days": "custom",
+}
+
+def normalize_period(user_period: str) -> str:
+    """
+    Normalize a user-facing period label to your canonical key.
+    Raises ValueError with allowed labels if unsupported.
+    """
+    try:
+        return CANON[user_period]
+    except KeyError:
+        allowed = ", ".join(sorted(CANON))
+        raise ValueError(f"Unsupported period '{user_period}'. Allowed: {allowed}")
+
+def compute_range_days(label: str, *, today: dt.date | None = None) -> Tuple[dt.date, dt.date]:
+    """
+    Convert 'last_N_days' label to an inclusive (start_date, end_date).
+    Example: last_3_days with today=2025-08-11 -> (2025-08-08, 2025-08-11)
+    """
+    today = today or dt.date.today()
+    try:
+        # label format: 'last_<N>_days'
+        days = int(label.split("_")[1])
+    except Exception as e:
+        raise ValueError(f"Invalid range label '{label}'. Expected 'last_<N>_days'.") from e
+    start = today - dt.timedelta(days=days)
+    return (start, today)
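The new helpers are small enough to demonstrate directly; the example below assumes the module is importable as `sibi_dst.utils.periods` (the in-file comment says `sibi_dst/periods.py`, while the wheel records it under `sibi_dst/utils/`).

```python
import datetime as dt

from sibi_dst.utils.periods import compute_range_days, normalize_period  # import path assumed

normalize_period("last_7_days")   # -> "custom"
normalize_period("ytd")           # -> "ytd"

start, end = compute_range_days("last_3_days", today=dt.date(2025, 8, 11))
# start == dt.date(2025, 8, 8), end == dt.date(2025, 8, 11)  (inclusive range)
```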
sibi_dst/utils/phone_formatter.py
CHANGED
@@ -1,127 +1,120 @@
 import re
 from enum import Enum
-from typing import Optional, Union, Callable
+from typing import Optional, Union, Callable, Tuple, Iterable
 
-class CountryCode(Enum):
-    """Enum for supported country codes, including phone number length and formatting rules."""

-
-
-    FRANCE = ("33", 9, lambda number: f"{number[:1]} {number[1:3]} {number[3:5]} {number[5:]}")
-    SPAIN = ("34", 9, lambda number: f"{number[:2]} {number[2:5]} {number[5:]}")
-    DEFAULT = ("506", 8, lambda number: f"{number[:4]}-{number[4:]}")
+def _only_digits(s: str) -> str:
+    return re.sub(r"\D", "", s)

-    def __init__(self, code: str, length: int, formatter: Callable[[str], str]):
-        """
-        Initialize a CountryCode enum member.
-
-        :param code: The country code.
-        :type code: str
-        :param length: The expected length of the phone number (excluding the country code).
-        :type length: int
-        :param formatter: A function to format the phone number.
-        :type formatter: Callable[[str], str]
-        """
-        self.code = code
-        self.length = length
-        self.formatter = formatter

-
-
-
-
+def _normalize_raw_input(phone: Union[str, int, float]) -> str:
+    """
+    Normalize raw input to just digits, preserving leading zeros for strings.
+    Reject floats because they lose leading zeros and can be formatted (e.g., 1e10).
+    """
+    if isinstance(phone, float):
+        # Floats are unsafe for phone numbers; caller should pass string or int
+        raise ValueError("Phone numbers as float are ambiguous; pass a string or int.")
+    if isinstance(phone, int):
+        # int loses leading zeros by definition, but this matches your original behavior
+        return str(phone)
+    if not isinstance(phone, str):
+        raise TypeError("phone_number must be str|int")
+
+    phone = phone.strip()
+    # Allow leading '+' or '00' international format; we'll strip them before digit normalization
+    if phone.startswith("+"):
+        phone = phone[1:]
+    elif phone.startswith("00"):
+        phone = phone[2:]
+    return _only_digits(phone)

-        :return: The country code.
-        :rtype: str
-        """
-        return self.code

-
-
-
+class CountryCode(Enum):
+    """
+    Supported countries with:
+    - dial_code: country calling code
+    - nsn_length: expected National Significant Number length (no country code)
+    - formatter: formats the national number
+    - trunk_prefix: '0' for countries that commonly include a trunk code domestically (strip if present)
+    """

-
-
-
-
-
-
+    USA = ("1", 10, lambda n: f"({n[:3]}) {n[3:6]}-{n[6:]}", "")
+    UK = ("44", 10, lambda n: f"{n[:2]} {n[2:6]} {n[6:]}", "0")
+    FRANCE= ("33", 9, lambda n: f"{n[:1]} {n[1:3]} {n[3:5]} {n[5:]}", "0")
+    SPAIN = ("34", 9, lambda n: f"{n[:2]} {n[2:5]} {n[5:]}", "")
+    # Default to Costa Rica in your original code
+    DEFAULT = ("506", 8, lambda n: f"{n[:4]}-{n[4:]}", "")

-    def
-
-
+    def __init__(self, dial_code: str, nsn_length: int, formatter: Callable[[str], str], trunk_prefix: str):
+        self.dial_code = dial_code
+        self.nsn_length = nsn_length
+        self.formatter = formatter
+        self.trunk_prefix = trunk_prefix
+
+    def validate_length(self, nsn: str) -> bool:
+        return len(nsn) == self.nsn_length
+
+    def strip_trunk(self, nsn: str) -> str:
+        if self.trunk_prefix and nsn.startswith(self.trunk_prefix) and len(nsn) > self.nsn_length:
+            # If someone passed trunk + nsn (e.g., '0' + 10 digits for UK),
+            # remove only a single leading trunk.
+            return nsn[1:]
+        return nsn
+
+    def format_number(self, nsn: str) -> str:
+        return self.formatter(nsn)

-        :param number: The phone number part to format.
-        :type number: str
-        :return: The formatted number.
-        :rtype: str
-        """
-        return self.formatter(number)

 class PhoneNumberFormatter:
     """
-
-
-    The class supports phone numbers for the UK, USA, France, and Spain. It detects the country code
-    from the input or uses a default country code if missing. Phone numbers are formatted according
-    to country-specific rules.
+    Validate and format a phone number into E.164-like string with country-specific formatting of the NSN.
+    Keeps backward compatibility with your previous API.
     """

     def __init__(self, default_country_code: CountryCode = CountryCode.DEFAULT):
-        """
-        Initialize the PhoneNumberFormatter with a default country code.
-
-        :param default_country_code: The default country code to use if missing.
-        :type default_country_code: CountryCode
-        """
         self.default_country_code = default_country_code

     def format_phone_number(self, phone_number: Union[str, int, float]) -> Optional[str]:
         """
-
-
-        If the input is numeric (e.g., an integer or float), it will be converted to a string.
-        If the country code is missing, the default country code will be used. The phone number
-        will be formatted according to the detected country's rules.
-
-        :param phone_number: The phone number to validate and format. Can be a string, integer, or float.
-        :type phone_number: Union[str, int, float]
-        :return: The formatted phone number, or None if the input is invalid.
-        :rtype: Optional[str]
+        Returns: "+<country_code> <pretty national format>" or None if invalid.
         """
-
-
-
-
-        # Remove all non-digit characters
-        digits = re.sub(r"\D", "", phone_number)
+        try:
+            digits = _normalize_raw_input(phone_number)
+        except (TypeError, ValueError):
+            return None

-
-        if not digits or len(digits) < 7:  # Minimum length for a valid phone number
+        if not digits or len(digits) < 7:  # minimal sanity check
             return None

-
-        country_code, number = self._detect_country_code(digits)
+        country, nsn = self._detect_country_code(digits)

-        #
-
-            return None
+        # Strip a single trunk prefix if present (e.g., UK/FR leading '0' before the NSN)
+        nsn = country.strip_trunk(nsn)

-
-
+        if not country.validate_length(nsn):
+            return None

-
+        pretty = country.format_number(nsn)
+        return f"+{country.dial_code} {pretty}"

-    def _detect_country_code(self, digits: str) ->
+    def _detect_country_code(self, digits: str) -> Tuple[CountryCode, str]:
         """
-        Detect the country
-
-        :param digits: The phone number digits (without non-digit characters).
-        :type digits: str
-        :return: A tuple containing the detected country code and the remaining number.
-        :rtype: tuple[CountryCode, str]
+        Detect the country by trying the longest dial codes first to avoid prefix collisions.
+        Falls back to default if none matches.
         """
-        for
-
-
+        # Iterate members excluding DEFAULT for detection, sorted by dial_code length desc
+        candidates: Iterable[CountryCode] = (
+            c for c in sorted(
+                (m for m in CountryCode if m is not CountryCode.DEFAULT),
+                key=lambda m: len(m.dial_code),
+                reverse=True,
+            )
+        )
+
+        for country in candidates:
+            if digits.startswith(country.dial_code):
+                return country, digits[len(country.dial_code):]
+
+        # No match → assume default country; entire string is NSN
         return self.default_country_code, digits
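A brief usage sketch of the rewritten formatter, assuming it is importable from `sibi_dst.utils.phone_formatter`:

```python
from sibi_dst.utils.phone_formatter import CountryCode, PhoneNumberFormatter  # import path assumed

fmt = PhoneNumberFormatter(default_country_code=CountryCode.DEFAULT)

fmt.format_phone_number("+44 020 7946 0958")   # UK: trunk '0' stripped -> "+44 20 7946 0958"
fmt.format_phone_number(88881234)              # no dial code detected -> default (+506) -> "+506 8888-1234"
fmt.format_phone_number("abc")                 # too few digits -> None
```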