datamarket 0.9.5.tar.gz → 0.9.7.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (23)
  1. {datamarket-0.9.5 → datamarket-0.9.7}/PKG-INFO +1 -1
  2. {datamarket-0.9.5 → datamarket-0.9.7}/pyproject.toml +1 -1
  3. {datamarket-0.9.5 → datamarket-0.9.7}/src/datamarket/interfaces/aws.py +2 -3
  4. {datamarket-0.9.5 → datamarket-0.9.7}/src/datamarket/interfaces/ftp.py +2 -2
  5. {datamarket-0.9.5 → datamarket-0.9.7}/src/datamarket/interfaces/peerdb.py +28 -18
  6. {datamarket-0.9.5 → datamarket-0.9.7}/src/datamarket/utils/main.py +30 -12
  7. {datamarket-0.9.5 → datamarket-0.9.7}/LICENSE +0 -0
  8. {datamarket-0.9.5 → datamarket-0.9.7}/README.md +0 -0
  9. {datamarket-0.9.5 → datamarket-0.9.7}/src/datamarket/__init__.py +0 -0
  10. {datamarket-0.9.5 → datamarket-0.9.7}/src/datamarket/interfaces/__init__.py +0 -0
  11. {datamarket-0.9.5 → datamarket-0.9.7}/src/datamarket/interfaces/alchemy.py +0 -0
  12. {datamarket-0.9.5 → datamarket-0.9.7}/src/datamarket/interfaces/drive.py +0 -0
  13. {datamarket-0.9.5 → datamarket-0.9.7}/src/datamarket/interfaces/nominatim.py +0 -0
  14. {datamarket-0.9.5 → datamarket-0.9.7}/src/datamarket/interfaces/proxy.py +0 -0
  15. {datamarket-0.9.5 → datamarket-0.9.7}/src/datamarket/interfaces/tinybird.py +0 -0
  16. {datamarket-0.9.5 → datamarket-0.9.7}/src/datamarket/params/__init__.py +0 -0
  17. {datamarket-0.9.5 → datamarket-0.9.7}/src/datamarket/params/nominatim.py +0 -0
  18. {datamarket-0.9.5 → datamarket-0.9.7}/src/datamarket/utils/__init__.py +0 -0
  19. {datamarket-0.9.5 → datamarket-0.9.7}/src/datamarket/utils/airflow.py +0 -0
  20. {datamarket-0.9.5 → datamarket-0.9.7}/src/datamarket/utils/alchemy.py +0 -0
  21. {datamarket-0.9.5 → datamarket-0.9.7}/src/datamarket/utils/selenium.py +0 -0
  22. {datamarket-0.9.5 → datamarket-0.9.7}/src/datamarket/utils/soda.py +0 -0
  23. {datamarket-0.9.5 → datamarket-0.9.7}/src/datamarket/utils/typer.py +0 -0
--- datamarket-0.9.5/PKG-INFO
+++ datamarket-0.9.7/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: datamarket
-Version: 0.9.5
+Version: 0.9.7
 Summary: Utilities that integrate advanced scraping knowledge into just one library.
 License: GPL-3.0-or-later
 Author: DataMarket
--- datamarket-0.9.5/pyproject.toml
+++ datamarket-0.9.7/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "datamarket"
-version = "0.9.5"
+version = "0.9.7"
 description = "Utilities that integrate advanced scraping knowledge into just one library."
 authors = ["DataMarket <techsupport@datamarket.es>"]
 license = "GPL-3.0-or-later"
--- datamarket-0.9.5/src/datamarket/interfaces/aws.py
+++ datamarket-0.9.7/src/datamarket/interfaces/aws.py
@@ -4,8 +4,7 @@
 import io
 import logging
 import boto3
-
-from ..utils.main import Config
+from dynaconf import Dynaconf
 
 ########################################################################################################################
 # CLASSES
@@ -14,7 +13,7 @@ logger = logging.getLogger(__name__)
 
 
 class AWSInterface:
-    def __init__(self, config: Config) -> None:
+    def __init__(self, config: Dynaconf) -> None:
         self.profiles = []
         self.config = config
 
--- datamarket-0.9.5/src/datamarket/interfaces/ftp.py
+++ datamarket-0.9.7/src/datamarket/interfaces/ftp.py
@@ -5,7 +5,7 @@ import logging
 from ftplib import FTP, FTP_TLS
 from pathlib import Path
 
-from ..utils.main import Config
+from dynaconf import Dynaconf
 
 ########################################################################################################################
 # CLASSES
@@ -13,7 +13,7 @@ from ..utils.main import Config
 logger = logging.getLogger(__name__)
 
 class FTPInterface:
-    def __init__(self, config: Config):
+    def __init__(self, config: Dynaconf):
         if "ftp" in config:
             self.config = config["ftp"]
 
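The aws.py and ftp.py hunks above are the same refactor: both constructors now take a Dynaconf settings object directly, since the Config alias (Union[RawConfigParser, Dynaconf]) is removed from utils/main.py in this release. A minimal sketch of the new call pattern, assuming a settings file named config.toml with an [ftp] section (file name and keys here are illustrative, not from the package):

    from dynaconf import Dynaconf

    from datamarket.interfaces.ftp import FTPInterface

    # Hypothetical settings file with an [ftp] section; adapt to your project.
    config = Dynaconf(settings_files=["config.toml"])

    # The constructor reads config["ftp"] when that section exists.
    ftp = FTPInterface(config)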
--- datamarket-0.9.5/src/datamarket/interfaces/peerdb.py
+++ datamarket-0.9.7/src/datamarket/interfaces/peerdb.py
@@ -15,6 +15,13 @@ from tenacity import before_sleep_log, retry, stop_after_attempt, wait_exponential
 
 from .alchemy import AlchemyInterface
 
+########################################################################################################################
+# EXCEPTIONS
+
+class DatabaseNotConnectedError(Exception):
+    """Custom error for when database is not connected."""
+    pass
+
 ########################################################################################################################
 # CLASSES
 
@@ -105,32 +112,32 @@ class ClickhousePeer:
     def __init__(self, config):
         if "clickhouse" in config:
             self.config = config["clickhouse"]
-            self.ensure_database_exists()
-            self.client = self._create_client(database=self.config["database"])
         else:
             logger.warning("no clickhouse section in config")
 
-    def _create_client(self, database=None):
-        client_config = {
-            "host": self.config["host"],
-            "port": self.config["port"],
-            "user": self.config["user"],
-            "password": self.config["password"],
-        }
-        if database:
-            client_config["database"] = database
-        return clickhouse_driver.Client(**client_config)
+    def connect(self, database):
+        if not database:
+            return
+
+        self.ensure_database_exists(database)
+        self.config["database"] = database
+        self.client = clickhouse_driver.Client(**self.config)
+
+    def _check_connection(self):
+        if self.client is None:
+            raise DatabaseNotConnectedError("Database not connected. Call connect() method first.")
 
-    def ensure_database_exists(self):
-        logger.info(f"Checking if database '{self.config['database']}' exists in Clickhouse")
-        temp_client = self._create_client()
+    def ensure_database_exists(self, database):
+        logger.info(f"Checking if database '{database}' exists in Clickhouse")
+        temp_client = clickhouse_driver.Client(**self.config)
         databases = temp_client.execute("SHOW DATABASES")
-        if self.config["database"] not in [db[0] for db in databases]:
-            logger.info(f"Creating database '{self.config['database']}'")
-            temp_client.execute(f"CREATE DATABASE IF NOT EXISTS {self.config['database']}")
+        if database not in [db[0] for db in databases]:
+            logger.info(f"Creating database '{database}'")
+            temp_client.execute(f"CREATE DATABASE IF NOT EXISTS {database}")
         temp_client.disconnect()
 
     def delete_existing_tables(self, table_names):
+        self._check_connection()
         logger.info(f"Deleting existing tables in Clickhouse for database: {self.config['database']}")
 
         all_tables = self.client.execute("SHOW TABLES")
@@ -156,6 +163,7 @@ class ClickhousePeer:
         logger.info("Finished deleting existing tables in Clickhouse")
 
     def create_row_policies(self, schema_name, table_names):
+        self._check_connection()
        logger.info(f"Creating row policies for schema: {schema_name}")
         for table_name in table_names:
             policy_name = "non_deleted"
@@ -167,6 +175,7 @@
             logger.info(f"Created row policy '{policy_name}' for table '{table_name}'")
 
     def execute_sql_file(self, file_path):
+        self._check_connection()
         try:
             with file_path.open("r") as sql_file:
                 sql_content = sql_file.read()
@@ -501,6 +510,7 @@ class PeerDBInterface:
         self.source.create_user(peerdb_user, peerdb_pwd)
         self.source.grant_permissions(schema_name, peerdb_user)
         self.source.create_publication(schema_name, mirror_tablenames)
+        self.destination.connect(schema_name)
         self.create_postgres_peer()
         self.create_clickhouse_peer(schema_name)
         self.pre_init(schema_name, mirror_tablenames, clickhouse_sql_path, resync, hard_resync)
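Taken together, the peerdb.py hunks replace eager connection in ClickhousePeer.__init__ with an explicit connect(database) step: connect() ensures the database exists, stores its name in the config, and opens the client, while server-touching methods now guard themselves with _check_connection() and raise the new DatabaseNotConnectedError. PeerDBInterface wires this up by calling self.destination.connect(schema_name), so the ClickHouse database is named after the replicated schema. A minimal sketch of the new call order, with placeholder connection settings:

    from datamarket.interfaces.peerdb import ClickhousePeer

    config = {
        "clickhouse": {  # placeholder host/credentials
            "host": "localhost",
            "port": 9000,
            "user": "default",
            "password": "",
        }
    }

    peer = ClickhousePeer(config)                 # stores config only; no connection yet
    peer.connect("analytics")                     # CREATE DATABASE IF NOT EXISTS, then open client
    peer.delete_existing_tables(["stale_table"])  # guarded by _check_connection()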
--- datamarket-0.9.5/src/datamarket/utils/main.py
+++ datamarket-0.9.7/src/datamarket/utils/main.py
@@ -2,6 +2,7 @@
 # IMPORTS
 
 import asyncio
+from dataclasses import dataclass
 import inspect
 import logging
 import random
@@ -11,23 +12,35 @@ import shutil
 import subprocess
 import time
 from pathlib import Path
-from typing import Literal, Union
+from typing import Literal, Optional, Union
 
 import pendulum
 from croniter import croniter
 from configparser import RawConfigParser
 from dynaconf import Dynaconf, add_converter
 
+logger = logging.getLogger(__name__)
+
 ########################################################################################################################
-# FUNCTIONS
+# CLASSES
+
+
+@dataclass
+class ProjectMetadata:
+    cmd_prefix: str
+    package_name: str
+    env_name: str
+    path: Path
+    config_path: Path
 
-logger = logging.getLogger(__name__)
 
-Config = Union[RawConfigParser, Dynaconf]
+########################################################################################################################
+# FUNCTIONS
 
 
 def get_granular_date(
-    granularity: Union[Literal["monthly", "biweekly", "weekly", "daily"], str], tz: str = "Europe/Madrid"
+    granularity: Union[Literal["monthly", "biweekly", "weekly", "daily"], str],
+    tz: str = "Europe/Madrid",
 ) -> pendulum.DateTime:
     """
     Returns the most recent date based on the given granularity or a custom cron expression.
@@ -67,8 +80,10 @@ def read_converter(path_str: str):
 
 
 def get_config(
-    config_file: Path, tz: str = "Europe/Madrid"
-) -> Union[RawConfigParser, Dynaconf]:
+    config_file: Optional[Path] = None, tz: str = "Europe/Madrid"
+) -> Dynaconf:
+    config_file = config_file or get_project_metadata().config_path
+
     if Path(config_file).suffix == ".ini":
         logger.warning("Using legacy INI config reader. Please migrate to TOML")
         cfg = RawConfigParser()
@@ -106,17 +121,20 @@ def get_config(
 
     return config
 
-
-def get_project_metadata():
+def get_project_metadata() -> ProjectMetadata:
     caller_frame = inspect.stack()[1]
     current_file_parts = Path(caller_frame.filename).resolve().parts
     src_index = current_file_parts.index("src")
+
     cmd_prefix = "dix vnc run --" if shutil.which("dix") else ""
-    pkg_name = current_file_parts[src_index + 1]
-    env_name = f"{pkg_name}_env"
+    package_name = current_file_parts[src_index + 1]
+    env_name = f"{package_name}_env"
     project_path = Path(*current_file_parts[:src_index])
+    config_path = project_path / "config.toml"
 
-    return {"cmd_prefix": cmd_prefix, "pkg_name": pkg_name, "env_name": env_name, "project_path": project_path}
+    return ProjectMetadata(
+        cmd_prefix, package_name, env_name, project_path, config_path
+    )
 
 
 def set_logger(level):
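The main.py hunks replace the dict returned by get_project_metadata() with the typed ProjectMetadata dataclass (note the rename pkg_name → package_name and the new config_path field), and get_config() now defaults its config_file argument to that path while always returning Dynaconf. A minimal usage sketch; it assumes the calling module lives under a src/<package>/ layout, because the helper locates the project root from the caller's file path:

    from datamarket.utils.main import get_config, get_project_metadata

    # Call from e.g. src/myproject/jobs/run.py (hypothetical path).
    meta = get_project_metadata()
    print(meta.package_name)  # "myproject"
    print(meta.env_name)      # "myproject_env"
    print(meta.config_path)   # <project root>/config.toml

    config = get_config()     # falls back to meta.config_path when no file is passed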