datamarket 0.6.0__py3-none-any.whl → 0.10.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of datamarket might be problematic.

Files changed (38)
  1. datamarket/__init__.py +0 -1
  2. datamarket/exceptions/__init__.py +1 -0
  3. datamarket/exceptions/main.py +118 -0
  4. datamarket/interfaces/alchemy.py +1934 -25
  5. datamarket/interfaces/aws.py +81 -14
  6. datamarket/interfaces/azure.py +127 -0
  7. datamarket/interfaces/drive.py +60 -10
  8. datamarket/interfaces/ftp.py +37 -14
  9. datamarket/interfaces/llm.py +1220 -0
  10. datamarket/interfaces/nominatim.py +314 -42
  11. datamarket/interfaces/peerdb.py +272 -104
  12. datamarket/interfaces/proxy.py +354 -50
  13. datamarket/interfaces/tinybird.py +7 -15
  14. datamarket/params/nominatim.py +439 -0
  15. datamarket/utils/__init__.py +1 -1
  16. datamarket/utils/airflow.py +10 -7
  17. datamarket/utils/alchemy.py +2 -1
  18. datamarket/utils/logs.py +88 -0
  19. datamarket/utils/main.py +138 -10
  20. datamarket/utils/nominatim.py +201 -0
  21. datamarket/utils/playwright/__init__.py +0 -0
  22. datamarket/utils/playwright/async_api.py +274 -0
  23. datamarket/utils/playwright/sync_api.py +281 -0
  24. datamarket/utils/requests.py +655 -0
  25. datamarket/utils/selenium.py +6 -12
  26. datamarket/utils/strings/__init__.py +1 -0
  27. datamarket/utils/strings/normalization.py +217 -0
  28. datamarket/utils/strings/obfuscation.py +153 -0
  29. datamarket/utils/strings/standardization.py +40 -0
  30. datamarket/utils/typer.py +2 -1
  31. datamarket/utils/types.py +1 -0
  32. datamarket-0.10.3.dist-info/METADATA +172 -0
  33. datamarket-0.10.3.dist-info/RECORD +38 -0
  34. {datamarket-0.6.0.dist-info → datamarket-0.10.3.dist-info}/WHEEL +1 -2
  35. datamarket-0.6.0.dist-info/METADATA +0 -49
  36. datamarket-0.6.0.dist-info/RECORD +0 -24
  37. datamarket-0.6.0.dist-info/top_level.txt +0 -1
  38. {datamarket-0.6.0.dist-info → datamarket-0.10.3.dist-info/licenses}/LICENSE +0 -0
datamarket/interfaces/aws.py (+81 -14)

@@ -3,6 +3,8 @@
 
 import io
 import logging
+from typing import Any, Dict, List, Optional
+
 import boto3
 
 ########################################################################################################################
@@ -12,34 +14,44 @@ logger = logging.getLogger(__name__)
 
 
 class AWSInterface:
-    def __init__(self, config):
-        self.profiles = []
+    def __init__(self, config) -> None:
+        self.profiles: List[Dict[str, Any]] = []
         self.config = config
 
-        for section in self.config.sections():
+        for section in getattr(self.config, "sections", lambda: [])():
             if section.startswith("aws:"):
                 profile_name = section.split(":", 1)[1]
+                bucket_value = self.config[section].get("buckets", "")
+                buckets = [b.strip() for b in bucket_value.split(",") if b.strip()]
+                session = boto3.Session(profile_name=profile_name)
+
                 self.profiles.append(
                     {
                         "profile": profile_name,
-                        "bucket": self.config[section]["bucket"],
-                        "session": boto3.Session(profile_name=profile_name),
+                        "buckets": buckets,
+                        "session": session,
                     }
                 )
 
         if not self.profiles:
             logger.warning("No AWS profiles found in config file")
 
-        self.current_profile = self.profiles[0] if self.profiles else None
+        self.current_profile: Optional[Dict[str, Any]] = self.profiles[0] if self.profiles else None
         self._update_resources()
 
-    def _update_resources(self):
+    def _update_resources(self) -> None:
+        """Refresh S3 resources for the current profile and set default bucket (first in list)"""
         if self.current_profile:
             self.s3 = self.current_profile["session"].resource("s3")
             self.s3_client = self.s3.meta.client
-            self.bucket = self.current_profile["bucket"]
+            buckets = self.current_profile.get("buckets", [])
+            self.bucket = buckets[0] if buckets else None
+        else:
+            self.s3 = None
+            self.s3_client = None
+            self.bucket = None
 
-    def switch_profile(self, profile_name):
+    def switch_profile(self, profile_name: str) -> None:
         for profile in self.profiles:
             if profile["profile"] == profile_name:
                 self.current_profile = profile
@@ -47,14 +59,69 @@ class AWSInterface:
                 return
         logger.warning(f"Profile {profile_name} not found")
 
-    def get_file(self, s3_path):
+    def switch_bucket(self, bucket: str) -> None:
+        if not self.current_profile:
+            logger.warning("No current AWS profile to switch bucket on")
+            return
+
+        buckets = self.current_profile.get("buckets") or []
+        if bucket not in buckets:
+            logger.warning(f"Bucket {bucket} not found in profile {self.current_profile.get('profile')}")
+            return
+
+        self.bucket = bucket
+
+    def switch_bucket_for_profile(self, profile_name: str, bucket: str) -> None:
+        """
+        Select a profile and then switch its active bucket.
+        """
+        for profile in self.profiles:
+            if profile["profile"] == profile_name:
+                self.current_profile = profile
+                self._update_resources()  # sets default bucket & s3 clients
+                self.switch_bucket(bucket)  # only sets self.bucket if valid
+                return
+        logger.warning(f"Profile {profile_name} not found")
+
+    def get_bucket_url(self) -> Optional[str]:
+        """Return active bucket URL."""
+        if not self.bucket:
+            logger.warning("No active bucket selected")
+            return None
+        region = self.s3_client.meta.region_name
+        return f"https://{self.bucket}.s3.{region}.amazonaws.com"
+
+    def get_file(self, s3_path: str):
+        if not self.bucket:
+            logger.warning("No active bucket selected")
+            return None
         try:
             return self.s3.Object(self.bucket, s3_path).get()
         except self.s3_client.exceptions.NoSuchKey:
             logger.info(f"{s3_path} does not exist")
+            return None
+
+    def file_exists(self, s3_path: str) -> bool:
+        if not self.bucket:
+            logger.warning("No active bucket selected")
+            return False
+        try:
+            self.s3_client.head_object(Bucket=self.bucket, Key=s3_path)
+            return True
+        except self.s3_client.exceptions.NoSuchKey:
+            return False
+        except Exception as e:
+            logger.error(f"Error checking existence of {s3_path}: {e}")
+            raise
 
-    def read_file_as_bytes(self, s3_path):
-        return io.BytesIO(self.get_file(s3_path)["Body"].read())
+    def read_file_as_bytes(self, s3_path: str) -> Optional[io.BytesIO]:
+        obj = self.get_file(s3_path)
+        if not obj:
+            return None
+        return io.BytesIO(obj["Body"].read())
 
-    def upload_file(self, local_path, s3_path):
-        self.s3.Bucket(self.bucket).upload_file(local_path, s3_path)
+    def upload_file(self, local_path: str, s3_path: str, **kwargs) -> None:
+        if not self.bucket:
+            logger.warning("No active bucket selected")
+            return
+        self.s3.Bucket(self.bucket).upload_file(local_path, s3_path, **kwargs)
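Taken together, the aws.py changes replace the old singular bucket key with a comma-separated buckets list per aws:<profile> section; the first listed bucket becomes the default, and switch_bucket / switch_bucket_for_profile only accept buckets declared for the profile. A minimal usage sketch under that reading (profile and bucket names are hypothetical, and boto3 still resolves credentials from the named local AWS profile):

# Hypothetical INI consumed by AWSInterface; the section name maps to a boto3 profile.
import configparser

config = configparser.ConfigParser()
config.read_string("""
[aws:analytics]
buckets = raw-data, curated-data
""")

# iface = AWSInterface(config)          # builds one boto3.Session per profile
# iface.bucket                          # "raw-data" (first bucket is the default)
# iface.switch_bucket("curated-data")   # warns and keeps the old bucket if not listed
# iface.get_bucket_url()                # "https://curated-data.s3.<region>.amazonaws.com"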
datamarket/interfaces/azure.py (new file, +127 -0)

@@ -0,0 +1,127 @@
+########################################################################################################################
+# IMPORTS
+
+import logging
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+from azure.storage.blob import BlobServiceClient, ContainerClient
+from pendulum import now
+
+########################################################################################################################
+# CLASSES
+
+logger = logging.getLogger(__name__)
+
+
+class AzureBlobInterface:
+    def __init__(self, config):
+        self.profiles: List[Dict[str, Any]] = []
+        self.config = config
+
+        for section in getattr(self.config, "sections", lambda: [])():
+            if section.startswith("azure:"):
+                profile_name = section.split(":", 1)[1]
+                connection_string = self.config[section].get("connection_string")
+                container_name = self.config[section].get("container_name")
+                sas_container_url = self.config[section].get("sas_container_url")
+
+                if sas_container_url:
+                    session = ContainerClient.from_container_url(sas_container_url)
+                elif connection_string and container_name:
+                    session = BlobServiceClient.from_connection_string(connection_string).get_container_client(
+                        container_name
+                    )
+
+                self.profiles.append(
+                    {
+                        "profile": profile_name,
+                        "container_name": container_name,
+                        "session": session,
+                    }
+                )
+
+        if not self.profiles:
+            logger.warning("No Azure profiles found in config file")
+        self.current_profile: Optional[Dict[str, Any]] = self.profiles[0] if self.profiles else None
+
+    def switch_profile(self, profile_name: str) -> None:
+        for profile in self.profiles:
+            if profile["profile"] == profile_name:
+                self.current_profile = profile
+                return
+        logger.warning(f"Profile {profile_name} not found")
+
+    def upload_file(
+        self,
+        local_file,
+        remote_folder,
+        remote_file=None,
+        upload_file_info=False,
+        **file_info_data,
+    ):
+        if not remote_file:
+            remote_file = Path(local_file).name
+
+        remote_path = f"{remote_folder}/{remote_file}" if remote_folder else remote_file
+
+        blob_client = self.current_profile["session"].get_blob_client(remote_path)
+        with open(local_file, "rb") as data:
+            blob_client.upload_blob(data, overwrite=True)
+
+        if upload_file_info:
+            self.upload_file_info(remote_path, **file_info_data)
+
+    def upload_file_info(self, remote_path, **file_info_data):
+        summary_file = remote_path.split(".")[0] + "_resumen.csv"
+        blob_client = self.current_profile["session"].get_blob_client(summary_file)
+
+        new_record = {
+            "file": remote_path,
+            "num_rows": file_info_data.get("num_rows"),
+            "schema_version": file_info_data.get("schema_version"),
+            "upload_date": now(tz="Europe/Madrid").to_datetime_string(),
+        }
+
+        new_record_str = "file,num_rows,schema_version,upload_date\n"
+        new_record_str += ",".join([str(v) for v in new_record.values()]) + "\n"
+
+        blob_client.upload_blob(new_record_str, overwrite=True)
+
+    def download_file(self, local_file, remote_path):
+        blob_client = self.current_profile["session"].get_blob_client(remote_path)
+        blob_data = blob_client.download_blob()
+        with open(local_file, "wb") as f:
+            blob_data.readinto(f)
+
+    def check_file_exists_and_not_empty(self, remote_file, remote_folder):
+        """
+        Checks if a blob exists in the specified folder and has a size greater than 100 bytes.
+
+        Args:
+            remote_file (str): The name of the file (blob) to check.
+            remote_folder (str): The folder (prefix) where the file is located.
+
+        Returns:
+            bool: True if the blob exists and has a size greater than 100, False otherwise.
+        """
+
+        remote_path = f"{remote_folder}/{remote_file}" if remote_folder else remote_file
+
+        try:
+            blob_client = self.current_profile["session"].get_blob_client(remote_path)
+            if blob_client.exists():
+                properties = blob_client.get_blob_properties()
+                if properties.size > 100:  # Check if size is greater than 100 bytes
+                    logger.debug(f"Blob '{remote_path}' exists and is not empty (size: {properties.size}).")
+                    return True
+                else:
+                    logger.debug(f"Blob '{remote_path}' exists but size ({properties.size}) is not > 100 bytes.")
+                    return False
+            else:
+                logger.debug(f"Blob '{remote_path}' does not exist.")
+                return False
+        except Exception as e:
+            logger.error(f"Error checking blob '{remote_path}': {e}")
+            # In case of error, assume it doesn't exist or is empty to allow upload attempt
+            return False
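Read together, each azure:<profile> section authenticates in one of two ways, with sas_container_url taking precedence over the connection_string/container_name pair; note that a section providing neither leaves session unassigned, so the constructor would fail on that profile. A minimal config sketch (all values are placeholders, not working credentials):

# Hypothetical INI consumed by AzureBlobInterface; values are placeholders.
import configparser

config = configparser.ConfigParser()
config.read_string("""
[azure:reports]
connection_string = DefaultEndpointsProtocol=https;AccountName=myaccount;AccountKey=PLACEHOLDER
container_name = reports

[azure:partner]
sas_container_url = https://myaccount.blob.core.windows.net/partner?sv=PLACEHOLDER
""")

# iface = AzureBlobInterface(config)   # first profile ("reports") becomes current
# iface.switch_profile("partner")
# iface.upload_file("out.csv", "monthly", upload_file_info=True, num_rows=42, schema_version="v2")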
datamarket/interfaces/drive.py (+60 -10)

@@ -18,13 +18,9 @@ class DriveInterface:
         if "drive" in config:
             self.config = config["drive"]
 
-            GoogleAuth.DEFAULT_SETTINGS[
-                "client_config_file"
-            ] = f'{self.config["config_path"]}/credentials.json'
+            GoogleAuth.DEFAULT_SETTINGS["client_config_file"] = f"{self.config['config_path']}/credentials.json"
 
-            self.gauth = GoogleAuth(
-                settings_file=f'{self.config["config_path"]}/settings.yaml'
-            )
+            self.gauth = GoogleAuth(settings_file=f"{self.config['config_path']}/settings.yaml")
             self.gauth.LocalWebserverAuth()
 
             self.drive = GoogleDrive(self.gauth)
@@ -48,24 +44,78 @@ class DriveInterface:
             logger.info(f"deleting old {filename}...")
             drive_file.Delete(param={"supportsTeamDrives": True})
 
+    def _create_remote_dir_tree(self, base_folder_id, path_parts):
+        """
+        Ensure the nested folders described by path_parts exist under base_folder_id.
+        Returns the folder_id of the deepest folder (or base_folder_id if path_parts is empty).
+        """
+        parent_id = base_folder_id
+        for part in path_parts:
+            part = part.strip()
+            if not part:
+                continue
+
+            query = (
+                f"'{parent_id}' in parents and title = '{part}'"
+                " and mimeType = 'application/vnd.google-apps.folder' and trashed=false"
+            )
+            results = self.drive.ListFile(
+                {
+                    "q": query,
+                    "corpora": "teamDrive",
+                    "teamDriveId": self.team_id,
+                    "includeTeamDriveItems": True,
+                    "supportsTeamDrives": True,
+                }
+            ).GetList()
+
+            if results:
+                parent_id = results[0]["id"]
+            else:
+                folder_metadata = {
+                    "title": part,
+                    "mimeType": "application/vnd.google-apps.folder",
+                    "parents": [
+                        {
+                            "kind": "drive#fileLink",
+                            "teamDriveId": self.team_id,
+                            "id": parent_id,
+                        }
+                    ],
+                }
+                folder = self.drive.CreateFile(folder_metadata)
+                folder.Upload(param={"supportsTeamDrives": True})
+                parent_id = folder["id"]
+
+        return parent_id
+
     def upload_file(self, local_filename, drive_filename, folder_id):
-        self.delete_old_files(drive_filename, folder_id)
+        drive_filename = drive_filename.strip("/")
+        parts = drive_filename.split("/")
+        if len(parts) > 1:
+            *folders, filename = parts
+            target_folder_id = self._create_remote_dir_tree(folder_id, folders)
+        else:
+            filename = parts[0]
+            target_folder_id = folder_id
+
+        self.delete_old_files(filename, target_folder_id)
 
         f = self.drive.CreateFile(
             {
-                "title": drive_filename,
+                "title": filename,
                 "parents": [
                     {
                         "kind": "drive#fileLink",
                         "teamDriveId": self.team_id,
-                        "id": folder_id,
+                        "id": target_folder_id,
                     }
                 ],
             }
         )
         f.SetContentFile(local_filename)
 
-        logger.info(f"uploading {drive_filename} to folder: {folder_id}...")
+        logger.info(f"uploading {drive_filename} to folder: {target_folder_id}...")
         f.Upload(param={"supportsTeamDrives": True})
 
     def validate_file(self, filename, folder_id):
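The net effect in drive.py is that drive_filename may now carry a slash-separated path: upload_file splits it, materializes any missing folders under the base folder via _create_remote_dir_tree, deletes a pre-existing file of the same name, and uploads into the deepest folder. A hedged sketch, assuming iface is an already-authenticated DriveInterface and using a made-up folder ID:

# Creates (or reuses) "2024", then "05", under the base Shared Drive folder,
# removes any previous "report.csv" there, and uploads the new file.
iface.upload_file(
    local_filename="report.csv",
    drive_filename="2024/05/report.csv",
    folder_id="0ABCdefGHIjklMNO",  # hypothetical Shared Drive folder ID
)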
datamarket/interfaces/ftp.py (+37 -14)

@@ -4,6 +4,7 @@
 import logging
 from ftplib import FTP, FTP_TLS
 from pathlib import Path
+from typing import Any, Dict, List, Optional
 
 ########################################################################################################################
 # CLASSES
@@ -13,29 +14,37 @@ logger = logging.getLogger(__name__)
 
 class FTPInterface:
     def __init__(self, config):
-        if "ftp" in config:
-            self.config = config["ftp"]
+        self.profiles: List[Dict[str, Any]] = []
+        self.config = config
+        for section in getattr(self.config, "sections", lambda: [])():
+            if section.startswith("ftp:"):
+                profile_name = section.split(":", 1)[1]
+                ftps = self.config[section]["ftps"].lower() == "true"
+                ftp_conn = FTP_TLS(self.config[section]["server"]) if ftps else FTP(self.config[section]["server"])  # noqa: S321
+                ftp_conn.login(self.config[section]["username"], self.config[section]["password"])
+                self.profiles.append({"profile": profile_name, "session": ftp_conn})
 
-            self.ftp = self.get_ftp()
-        else:
+        if not self.profiles:
             logger.warning("no ftp section in config")
 
-    def get_ftp(self):
-        if self.config["ftps"].lower() == "true":
-            ftp_conn = FTP_TLS(self.config["server"])
+        self.current_profile: Optional[Dict[str, Any]] = self.profiles[0] if self.profiles else None
+        self.ftp = self.current_profile["session"] if self.current_profile else None
 
-        else:
-            ftp_conn = FTP(self.config["server"])
-
-        ftp_conn.login(self.config["username"], self.config["password"])
-
-        return ftp_conn
+    def switch_profile(self, profile_name: str) -> None:
+        for profile in self.profiles:
+            if profile["profile"] == profile_name:
+                self.current_profile = profile
+                self.ftp = profile["session"]
+                return
+        logger.warning(f"Profile {profile_name} not found")
 
     def upload_file(self, local_file, remote_folder, remote_file=None):
         if not remote_file:
             remote_file = Path(local_file).name
 
-        self.ftp.cwd(remote_folder)
+        self._create_remote_dir_tree(full_path=f"/{remote_folder}{remote_file}")
+
+        self.ftp.cwd(f"/{remote_folder}")
 
         with open(local_file, "rb") as f:
             self.ftp.storbinary(f"STOR {remote_file}", f)
@@ -43,3 +52,17 @@ class FTPInterface:
     def download_file(self, local_file, remote_file):
         with open(local_file, "wb") as f:
             self.ftp.retrbinary(f"RETR {remote_file}", f.write)
+
+    def _create_remote_dir_tree(self, full_path):
+        dir_tree = full_path.split("/")[0:-1]  # Exclude filename
+
+        for part in dir_tree:
+            if not part:
+                continue
+
+            try:
+                self.ftp.cwd(part)
+            except Exception as e:
+                logger.warning(f"Error while creating remote directory: {e}")
+                self.ftp.mkd(part)
+                self.ftp.cwd(part)
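With this, FTPInterface follows the same multi-profile pattern as the AWS and Azure interfaces: one ftp:<name> section per server, with every connection opened and logged in eagerly in the constructor. Since upload_file builds the remote path as /{remote_folder}{remote_file}, remote_folder should end with a trailing slash. A minimal sketch with placeholder credentials:

# Hypothetical INI consumed by FTPInterface; all values are placeholders.
import configparser

config = configparser.ConfigParser()
config.read_string("""
[ftp:archive]
server = ftp.example.com
ftps = true
username = uploader
password = PLACEHOLDER
""")

# iface = FTPInterface(config)                    # connects and logs in immediately
# iface.upload_file("dump.csv", "exports/2024/")  # creates exports/2024 if missing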