datamarket 0.7.41__py3-none-any.whl → 0.7.125__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamarket/__init__.py +0 -1
- datamarket/exceptions/__init__.py +1 -0
- datamarket/exceptions/main.py +118 -0
- datamarket/interfaces/alchemy.py +206 -64
- datamarket/interfaces/aws.py +81 -14
- datamarket/interfaces/azure.py +127 -0
- datamarket/interfaces/drive.py +60 -10
- datamarket/interfaces/ftp.py +23 -16
- datamarket/interfaces/nominatim.py +312 -39
- datamarket/interfaces/peerdb.py +40 -5
- datamarket/interfaces/proxy.py +354 -50
- datamarket/interfaces/tinybird.py +4 -12
- datamarket/params/nominatim.py +439 -0
- datamarket/utils/__init__.py +1 -1
- datamarket/utils/airflow.py +10 -7
- datamarket/utils/alchemy.py +2 -1
- datamarket/utils/main.py +127 -6
- datamarket/utils/nominatim.py +201 -0
- datamarket/utils/playwright/__init__.py +0 -0
- datamarket/utils/playwright/async_api.py +274 -0
- datamarket/utils/playwright/sync_api.py +281 -0
- datamarket/utils/requests.py +653 -0
- datamarket/utils/selenium.py +6 -12
- datamarket/utils/strings/__init__.py +1 -0
- datamarket/utils/strings/normalization.py +217 -0
- datamarket/utils/strings/obfuscation.py +153 -0
- datamarket/utils/strings/standardization.py +40 -0
- datamarket/utils/typer.py +2 -1
- datamarket/utils/types.py +1 -0
- {datamarket-0.7.41.dist-info → datamarket-0.7.125.dist-info}/METADATA +38 -18
- datamarket-0.7.125.dist-info/RECORD +36 -0
- {datamarket-0.7.41.dist-info → datamarket-0.7.125.dist-info}/WHEEL +1 -1
- datamarket-0.7.41.dist-info/RECORD +0 -23
- {datamarket-0.7.41.dist-info → datamarket-0.7.125.dist-info}/LICENSE +0 -0
datamarket/__init__.py
CHANGED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
# init file
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .main import * # noqa: F403
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
########################################################################################################################
|
|
2
|
+
# CLASSES
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
7
|
+
from requests import Request, Response
|
|
8
|
+
from requests.exceptions import HTTPError
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class ManagedHTTPError(HTTPError):
    """Signal that this HTTP status was handled and should not be retried.

    Args:
        message: Human-readable description. When empty or omitted, a default of
            the form ``HTTP <status> for <url>`` is built from ``response`` and
            ``request`` (``unknown`` is used for whatever is missing).
        response: The response associated with the error, if any.
        request: The request associated with the error. Falls back to
            ``response.request`` when not supplied.
        *args: Extra positional arguments forwarded to ``HTTPError``.
        **kwargs: Extra keyword arguments forwarded to ``HTTPError``.
    """

    def __init__(
        self,
        message: Optional[str] = None,
        response: Optional[Response] = None,
        request: Optional[Request] = None,
        *args,
        **kwargs,
    ):
        resolved_request = request or getattr(response, "request", None)

        # Build a safe default message
        if not message:
            status = getattr(response, "status_code", "unknown")
            url = getattr(resolved_request, "url", "unknown")
            message = f"HTTP {status} for {url}"

        self.message = message

        # The requests base exception assigns self.response and self.request from
        # its keyword arguments. The request must therefore be forwarded
        # explicitly; otherwise a caller-supplied request is silently replaced by
        # None (or by response.request) once the base initializer runs.
        super().__init__(message, *args, response=response, request=resolved_request, **kwargs)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class IgnoredHTTPError(ManagedHTTPError):
    """Managed HTTP error that retry logic is expected to ignore.

    Adds no behavior of its own; it only provides a distinct type to match on.
    """
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class NotFoundError(ManagedHTTPError):
    """Managed HTTP error whose default message falls back to status 404."""

    def __init__(
        self,
        message: Optional[str] = None,
        response: Optional[Response] = None,
        request: Optional[Request] = None,
        *args,
        **kwargs,
    ):
        """Build the error, deriving ``HTTP <status> for <url>`` when no message is given."""
        if message:
            final_message = message
        else:
            # Prefer the explicit request, otherwise the one attached to the response.
            source_request = request or getattr(response, "request", None)
            target_url = getattr(source_request, "url", "unknown")
            status_code = getattr(response, "status_code", 404)
            final_message = f"HTTP {status_code} for {target_url}"
        super().__init__(final_message, response, request, *args, **kwargs)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class BadRequestError(ManagedHTTPError):
    """Managed HTTP error whose default message falls back to status 400."""

    def __init__(
        self,
        message: Optional[str] = None,
        response: Optional[Response] = None,
        request: Optional[Request] = None,
        *args,
        **kwargs,
    ):
        """Build the error, deriving ``HTTP <status> for <url>`` when no message is given."""
        if message:
            final_message = message
        else:
            # Prefer the explicit request, otherwise the one attached to the response.
            source_request = request or getattr(response, "request", None)
            target_url = getattr(source_request, "url", "unknown")
            status_code = getattr(response, "status_code", 400)
            final_message = f"HTTP {status_code} for {target_url}"
        super().__init__(final_message, response, request, *args, **kwargs)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
class EmptyResponseError(ManagedHTTPError):
    """Managed HTTP error whose default message is ``Empty response for <url>``."""

    def __init__(
        self,
        message: Optional[str] = None,
        response: Optional[Response] = None,
        request: Optional[Request] = None,
        *args,
        **kwargs,
    ):
        """Build the error, deriving the default message from the request URL when needed."""
        if message:
            final_message = message
        else:
            # Prefer the explicit request, otherwise the one attached to the response.
            source_request = request or getattr(response, "request", None)
            target_url = getattr(source_request, "url", "unknown")
            final_message = f"Empty response for {target_url}"
        super().__init__(final_message, response, request, *args, **kwargs)
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
class RedirectionDetectedError(ManagedHTTPError):
    """Managed HTTP error whose default message falls back to status 300."""

    def __init__(
        self,
        message: Optional[str] = None,
        response: Optional[Response] = None,
        request: Optional[Request] = None,
        *args,
        **kwargs,
    ):
        """Build the error, deriving ``HTTP <status> for <url>`` when no message is given."""
        if message:
            final_message = message
        else:
            # Prefer the explicit request, otherwise the one attached to the response.
            source_request = request or getattr(response, "request", None)
            target_url = getattr(source_request, "url", "unknown")
            status_code = getattr(response, "status_code", 300)
            final_message = f"HTTP {status_code} for {target_url}"
        super().__init__(final_message, response, request, *args, **kwargs)
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
class NoWorkingProxiesError(Exception):
    """Exception exposing its text through a ``message`` attribute.

    The text defaults to "No working proxies available".
    """

    def __init__(self, message="No working proxies available"):
        super().__init__(message)
        self.message = message
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
class EnsureNewIPTimeoutError(Exception):
    """Exception exposing its text through a ``message`` attribute.

    The text defaults to "Timed out waiting for new IP".
    """

    def __init__(self, message="Timed out waiting for new IP"):
        super().__init__(message)
        self.message = message
|
datamarket/interfaces/alchemy.py
CHANGED
|
@@ -3,14 +3,16 @@
|
|
|
3
3
|
|
|
4
4
|
import logging
|
|
5
5
|
from collections.abc import MutableMapping
|
|
6
|
-
from
|
|
6
|
+
from enum import Enum, auto
|
|
7
|
+
from typing import Any, Iterator, List, Optional, Type, TypeVar, Union
|
|
7
8
|
from urllib.parse import quote_plus
|
|
8
9
|
|
|
9
|
-
from sqlalchemy import DDL, FrozenResult, Result, Select, SQLColumnExpression, create_engine,
|
|
10
|
+
from sqlalchemy import DDL, FrozenResult, Result, Select, SQLColumnExpression, create_engine, text
|
|
10
11
|
from sqlalchemy.dialects.postgresql import insert
|
|
11
12
|
from sqlalchemy.exc import IntegrityError
|
|
12
13
|
from sqlalchemy.ext.declarative import DeclarativeMeta
|
|
13
|
-
from sqlalchemy.orm import sessionmaker
|
|
14
|
+
from sqlalchemy.orm import Session, sessionmaker
|
|
15
|
+
from sqlalchemy.sql.expression import ClauseElement
|
|
14
16
|
|
|
15
17
|
########################################################################################################################
|
|
16
18
|
# CLASSES
|
|
@@ -20,6 +22,11 @@ logger = logging.getLogger(__name__)
|
|
|
20
22
|
ModelType = TypeVar("ModelType", bound=DeclarativeMeta)
|
|
21
23
|
|
|
22
24
|
|
|
25
|
+
class CommitStrategy(Enum):
    """Policies for ending the per-window session in ``AlchemyInterface.windowed_query``."""

    # Commit only when the window was yielded and the generator resumed normally;
    # otherwise the session is rolled back.
    COMMIT_ON_SUCCESS = auto()

    # Always attempt a commit when the window's session is stopped.
    FORCE_COMMIT = auto()
|
|
28
|
+
|
|
29
|
+
|
|
23
30
|
class MockContext:
|
|
24
31
|
def __init__(self, column: SQLColumnExpression) -> None:
|
|
25
32
|
self.current_parameters = {}
|
|
@@ -29,16 +36,58 @@ class MockContext:
|
|
|
29
36
|
|
|
30
37
|
class AlchemyInterface:
|
|
31
38
|
def __init__(self, config: MutableMapping) -> None:
    """Prepare the engine and session factory from ``config["db"]``.

    No session is opened here; ``self.session`` stays ``None`` until ``start()``
    is called. When ``config`` has no ``"db"`` key a warning is logged and
    neither ``engine`` nor ``Session`` is set.
    """
    self.session: Optional[Session] = None

    if "db" not in config:
        logger.warning("no db section in config")
        return

    self.config = config["db"]
    self.engine = create_engine(self.get_conn_str())
    self.Session = sessionmaker(bind=self.engine)
|
|
41
46
|
|
|
47
|
+
def __enter__(self) -> "AlchemyInterface":
    """Start a session via ``start()`` and return this interface for the ``with`` body."""
    self.start()
    return self
|
|
51
|
+
|
|
52
|
+
def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
    """Stop the session: commit if the ``with`` body raised nothing, otherwise roll back."""
    self.stop(commit=exc_type is None)
|
|
56
|
+
|
|
57
|
+
def start(self) -> None:
    """Open a new SQLAlchemy session and store it on ``self.session``.

    Raises:
        AttributeError: If no session factory exists (``Session`` was never set).
        RuntimeError: If a session is already active.
    """
    if not hasattr(self, "Session"):
        raise AttributeError("Database configuration not initialized. Cannot create session.")
    if self.session is not None:
        raise RuntimeError("Session already active.")

    session_factory = self.Session
    self.session = session_factory()
    logger.debug("SQLAlchemy session started manually.")
|
|
65
|
+
|
|
66
|
+
def stop(self, commit: bool = True) -> None:
    """Finish and close the session opened by ``start()``.

    Args:
        commit: Commit pending work when true, roll it back when false.

    Raises:
        Exception: Whatever the commit or rollback raised, re-raised after a
            best-effort rollback. The session is closed and cleared either way.
    """
    active_session = self.session
    if active_session is None:
        logger.warning("No active session to stop.")
        return

    try:
        if not commit:
            logger.debug("Rolling back SQLAlchemy session before stopping.")
            active_session.rollback()
        else:
            logger.debug("Committing SQLAlchemy session before stopping.")
            active_session.commit()
    except Exception as stop_exc:
        logger.error(f"Exception during session commit/rollback on stop: {stop_exc}", exc_info=True)
        # Best effort: a failure here is logged but must not hide the original error.
        try:
            active_session.rollback()
        except Exception as rollback_exc:
            logger.error(f"Exception during secondary rollback attempt on stop: {rollback_exc}", exc_info=True)
        raise
    finally:
        logger.debug("Closing SQLAlchemy session.")
        active_session.close()
        self.session = None
|
|
90
|
+
|
|
42
91
|
def get_conn_str(self):
|
|
43
92
|
return (
|
|
44
93
|
f"{self.config['engine']}://"
|
|
@@ -110,6 +159,16 @@ class AlchemyInterface:
|
|
|
110
159
|
self.create_tables(tables)
|
|
111
160
|
|
|
112
161
|
def reset_column(self, query_results: List[Result[Any]], column_name: str) -> None:
|
|
162
|
+
"""
|
|
163
|
+
Reset a column to its default value for a list of query results.
|
|
164
|
+
|
|
165
|
+
Args:
|
|
166
|
+
query_results: List of query results to update
|
|
167
|
+
column_name: Name of the column to reset
|
|
168
|
+
"""
|
|
169
|
+
if self.session is None:
|
|
170
|
+
raise RuntimeError("Session not active. Use 'with AlchemyInterface(...):' or call start()")
|
|
171
|
+
|
|
113
172
|
if not query_results:
|
|
114
173
|
logger.warning("No objects to reset column for.")
|
|
115
174
|
return
|
|
@@ -124,109 +183,192 @@ class AlchemyInterface:
|
|
|
124
183
|
|
|
125
184
|
column = table.columns[column_name]
|
|
126
185
|
|
|
186
|
+
# Determine the default value to use
|
|
127
187
|
if column.server_default is not None:
|
|
128
|
-
|
|
188
|
+
default_value = text("DEFAULT")
|
|
129
189
|
elif column.default is not None:
|
|
130
190
|
default_value = column.default.arg
|
|
131
191
|
if callable(default_value):
|
|
132
192
|
default_value = default_value(MockContext(column))
|
|
133
|
-
query_results.update({column_name: default_value}, synchronize_session=False)
|
|
134
193
|
else:
|
|
135
194
|
raise ValueError(f"Column '{column_name}' doesn't have a default value defined.")
|
|
136
195
|
|
|
137
|
-
|
|
196
|
+
query_results.update({column_name: default_value}, synchronize_session=False)
|
|
138
197
|
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
198
|
+
@staticmethod
def _log_integrity_error(ex: IntegrityError, alchemy_obj, action="insert"):
    """
    Log an IntegrityError as one readable line keyed on its SQLSTATE code.

    Unique violations (23505) are logged at INFO; every other code is logged at
    ERROR together with the raw database message.
    Consult https://www.postgresql.org/docs/current/errcodes-appendix.html for details.

    Args:
        ex: The caught IntegrityError; its ``orig.pgcode`` is read when present.
        alchemy_obj: The object being written, interpolated into the log line.
        action: Verb used in the log line (for example "insert" or "upsert").
    """
    sqlstate_labels = {
        "23000": "Integrity constraint violation",
        "23001": "Restrict violation",
        "23502": "NOT NULL violation",
        "23503": "Foreign key violation",
        "23505": "Unique violation",
        "23514": "Check constraint violation",
        "23P01": "Exclusion constraint violation",
    }
    sqlstate = getattr(ex.orig, "pgcode", None)
    description = sqlstate_labels.get(sqlstate, "Integrity error (unspecified)")
    summary = f"{description} trying to {action} {alchemy_obj}"

    if sqlstate != "23505":
        logger.error(f"{summary}\nPostgreSQL message: {ex.orig}")
        return

    # A simple info log for unique violations
    logger.info(summary)
|
|
222
|
+
|
|
223
|
+
def insert_alchemy_obj(self, alchemy_obj: ModelType, silent: bool = False) -> bool:
    """Add ``alchemy_obj`` to the active session inside a savepoint.

    Args:
        alchemy_obj: The mapped object to add.
        silent: Suppress the "adding" log line and the integrity-error log.

    Returns:
        True when the savepoint block completed, False when an IntegrityError
        was raised (it is logged unless ``silent`` and not re-raised).

    Raises:
        RuntimeError: If no session is active.
    """
    session = self.session
    if session is None:
        raise RuntimeError("Session not active. Use 'with AlchemyInterface(...):' or call start()")

    try:
        # Savepoint (nested transaction): its context manager rolls back on error,
        # so the outer transaction stays usable.
        with session.begin_nested():
            if not silent:
                logger.info(f"adding {alchemy_obj}...")
            session.add(alchemy_obj)
    except IntegrityError as integrity_exc:
        if not silent:
            self._log_integrity_error(integrity_exc, alchemy_obj, action="insert")
        # Not re-raised so the caller's transaction/loop can continue.
        return False
    else:
        return True
|
|
150
241
|
|
|
151
|
-
|
|
242
|
+
def upsert_alchemy_obj(self, alchemy_obj: ModelType, index_elements: List[str], silent: bool = False) -> bool:
    """Insert ``alchemy_obj`` or update the conflicting row, inside a savepoint.

    Args:
        alchemy_obj: The mapped object whose non-None column values are written.
        index_elements: Column names identifying the conflict target.
        silent: Suppress the "upserting" log line and the integrity-error log.

    Returns:
        True when the statement executed, False when an IntegrityError was
        raised (it is logged unless ``silent`` and not re-raised).

    Raises:
        RuntimeError: If no session is active.
    """
    session = self.session
    if session is None:
        raise RuntimeError("Session not active. Use 'with AlchemyInterface(...):' or call start()")

    if not silent:
        logger.info(f"upserting {alchemy_obj}")

    table = alchemy_obj.__table__
    pk_names = [pk_col.name for pk_col in table.primary_key.columns.values()]

    # INSERT values: every column whose attribute on the object is not None.
    insert_values = {}
    for col in table.columns:
        value = getattr(alchemy_obj, col.name)
        if value is not None:
            insert_values[col.name] = value

    # UPDATE SET values: the same, minus primary-key columns.
    update_set_values = {}
    for col in table.columns:
        if col.name in pk_names:
            continue
        value = getattr(alchemy_obj, col.name)
        if value is not None:
            update_set_values[col.name] = value

    # Columns with a SQL-expression onupdate are set explicitly in the SET clause.
    for col in table.columns:
        onupdate = col.onupdate
        if onupdate is None:
            continue
        wrapped_expression = getattr(onupdate, "arg", None)
        if isinstance(wrapped_expression, ClauseElement):
            # Wrapper objects keep the actual SQL expression in ``.arg``.
            update_set_values[col.name] = wrapped_expression
        elif isinstance(onupdate, ClauseElement):
            # onupdate is itself a SQL expression.
            update_set_values[col.name] = onupdate

    upsert_stmt = insert(table).values(insert_values)
    upsert_stmt = upsert_stmt.on_conflict_do_update(index_elements=index_elements, set_=update_set_values)

    try:
        # Savepoint (nested transaction): its context manager rolls back on error.
        with session.begin_nested():
            session.execute(upsert_stmt)
    except IntegrityError as integrity_exc:
        if not silent:
            self._log_integrity_error(integrity_exc, alchemy_obj, action="upsert")
        # Not re-raised so the caller's transaction/loop can continue.
        return False
    else:
        return True
|
|
178
300
|
|
|
179
301
|
def windowed_query(
    self,
    stmt: Select[Any],
    order_by: List[SQLColumnExpression[Any]],
    windowsize: int,
    commit_strategy: Union[CommitStrategy, str] = CommitStrategy.COMMIT_ON_SUCCESS,
) -> Iterator[Result[Any]]:
    """
    Executes a windowed query, fetching each window in a separate, short-lived session.

    Each window selects rows whose ``id`` is greater than the last ``id`` seen,
    applies ``order_by`` and limits the result to ``windowsize`` rows.

    Args:
        stmt: The SQL select statement to execute.
        order_by: The columns to use for ordering.
        windowsize: The number of rows to fetch in each window.
        commit_strategy: The strategy to use for committing the session after each window.
            Either a `CommitStrategy` member or its name as a string (case-insensitive).
            Defaults to `CommitStrategy.COMMIT_ON_SUCCESS`.

    Returns:
        An iterator of Result objects, each containing a window of data.
        The session used to fetch a Result is stopped when the consumer asks for
        the next window (or closes the generator), not while the window is in use.

    Raises:
        KeyError: If ``commit_strategy`` is a string that names no `CommitStrategy` member.
        Exception: If the first entity selected by ``stmt`` exposes no ``id`` attribute.
        RuntimeError: From ``start()``, if a session is already active.

    More info: https://github.com/sqlalchemy/sqlalchemy/wiki/RangeQuery-and-WindowedRangeQuery
    """
    # Parameter mapping
    if isinstance(commit_strategy, str):
        commit_strategy = CommitStrategy[commit_strategy.upper()]

    # Find id column in stmt. Only the first entity's id is used for windowing, so
    # that is the one validated; a missing id now raises the intended error instead
    # of a bare AttributeError from the attribute lookup.
    descriptions = stmt.column_descriptions
    first_entity = descriptions[0].get("entity") if descriptions else None
    id_column = getattr(first_entity, "id", None)
    if id_column is None:
        raise Exception("Column 'id' not found in any entity of the query.")

    # NOTE(review): windows advance on ``id > last_id`` starting from 0, which
    # presumably assumes positive, orderable ids and an ``order_by`` consistent
    # with ascending id; confirm against callers.
    last_id = 0
    while True:
        session_active = False
        commit_needed = False
        try:
            self.start()
            session_active = True

            # Restrict to rows past the previous window, then order and limit
            current_query = stmt.where(id_column > last_id).order_by(order_by[0], *order_by[1:]).limit(windowsize)
            result = self.session.execute(current_query)

            # Create a FrozenResult to allow peeking at the data without consuming
            frozen_result: FrozenResult = result.freeze()
            chunk = frozen_result().all()

            if not chunk:
                break

            # Update for next iteration
            # NOTE(review): assumes each row exposes ``.id``; verify for entity selects.
            last_id = chunk[-1].id

            # Create a new Result object from the FrozenResult
            yield_result = frozen_result()

            yield yield_result
            # Only reached when the consumer resumed the generator without error.
            commit_needed = True

        finally:
            if session_active and self.session:
                if commit_strategy == CommitStrategy.FORCE_COMMIT:
                    # For forced commit, always attempt to commit.
                    # The self.stop() method already handles potential exceptions during commit/rollback.
                    self.stop(commit=True)
                elif commit_strategy == CommitStrategy.COMMIT_ON_SUCCESS:
                    # Commit only if the window was yielded and consumed without an exception.
                    self.stop(commit=commit_needed)
                else:
                    # Fallback or error for unknown strategy, though type hinting should prevent this.
                    # For safety, default to rollback.
                    logger.warning(f"Unknown commit strategy: {commit_strategy}. Defaulting to rollback.")
                    self.stop(commit=False)
|
datamarket/interfaces/aws.py
CHANGED
|
@@ -3,6 +3,8 @@
|
|
|
3
3
|
|
|
4
4
|
import io
|
|
5
5
|
import logging
|
|
6
|
+
from typing import Any, Dict, List, Optional
|
|
7
|
+
|
|
6
8
|
import boto3
|
|
7
9
|
|
|
8
10
|
########################################################################################################################
|
|
@@ -12,34 +14,44 @@ logger = logging.getLogger(__name__)
|
|
|
12
14
|
|
|
13
15
|
|
|
14
16
|
class AWSInterface:
|
|
15
|
-
def __init__(self, config):
|
|
16
|
-
self.profiles = []
|
|
17
|
+
def __init__(self, config) -> None:
    """Collect AWS profiles from ``aws:<profile>`` config sections.

    For each such section a boto3 session is created for the profile and the
    comma-separated ``buckets`` value is parsed into a list. The first profile
    found becomes the current one, and its S3 resources are initialized.
    Logs a warning when no profile is found.
    """
    self.profiles: List[Dict[str, Any]] = []
    self.config = config

    # Config objects without a ``sections`` method contribute no sections.
    section_names = getattr(self.config, "sections", lambda: [])()
    for section in section_names:
        if not section.startswith("aws:"):
            continue

        profile_name = section.partition(":")[2]
        raw_buckets = self.config[section].get("buckets", "")
        stripped_names = (part.strip() for part in raw_buckets.split(","))
        bucket_names = [name for name in stripped_names if name]
        profile_session = boto3.Session(profile_name=profile_name)

        profile_entry = {
            "profile": profile_name,
            "buckets": bucket_names,
            "session": profile_session,
        }
        self.profiles.append(profile_entry)

    if not self.profiles:
        logger.warning("No AWS profiles found in config file")

    self.current_profile: Optional[Dict[str, Any]] = self.profiles[0] if self.profiles else None
    self._update_resources()
|
|
35
41
|
|
|
36
|
-
def _update_resources(self):
|
|
42
|
+
def _update_resources(self) -> None:
    """Rebuild ``s3``, ``s3_client`` and ``bucket`` from the current profile.

    The active bucket becomes the first bucket listed for the profile, or
    ``None`` when it lists none. Without a current profile all three are ``None``.
    """
    profile = self.current_profile
    if not profile:
        self.s3 = None
        self.s3_client = None
        self.bucket = None
        return

    self.s3 = profile["session"].resource("s3")
    self.s3_client = self.s3.meta.client
    available = profile.get("buckets", [])
    self.bucket = available[0] if available else None
|
|
41
53
|
|
|
42
|
-
def switch_profile(self, profile_name):
|
|
54
|
+
def switch_profile(self, profile_name: str) -> None:
|
|
43
55
|
for profile in self.profiles:
|
|
44
56
|
if profile["profile"] == profile_name:
|
|
45
57
|
self.current_profile = profile
|
|
@@ -47,14 +59,69 @@ class AWSInterface:
|
|
|
47
59
|
return
|
|
48
60
|
logger.warning(f"Profile {profile_name} not found")
|
|
49
61
|
|
|
50
|
-
def
|
|
62
|
+
def switch_bucket(self, bucket: str) -> None:
    """Make ``bucket`` the active bucket if the current profile lists it.

    Logs a warning and leaves the active bucket unchanged when there is no
    current profile or the bucket is not among the profile's buckets.
    """
    profile = self.current_profile
    if not profile:
        logger.warning("No current AWS profile to switch bucket on")
        return

    known_buckets = profile.get("buckets") or []
    if bucket in known_buckets:
        self.bucket = bucket
        return

    logger.warning(f"Bucket {bucket} not found in profile {self.current_profile.get('profile')}")
|
|
73
|
+
|
|
74
|
+
def switch_bucket_for_profile(self, profile_name: str, bucket: str) -> None:
    """
    Select a profile and then switch its active bucket.

    The profile's resources are refreshed first (which selects its default
    bucket); ``bucket`` then replaces that default only if ``switch_bucket``
    accepts it. Logs a warning when no profile has the given name.
    """
    selected = next((entry for entry in self.profiles if entry["profile"] == profile_name), None)
    if selected is None:
        logger.warning(f"Profile {profile_name} not found")
        return

    self.current_profile = selected
    self._update_resources()  # sets default bucket & s3 clients
    self.switch_bucket(bucket)  # only sets self.bucket if valid
|
|
85
|
+
|
|
86
|
+
def get_bucket_url(self) -> Optional[str]:
    """Return ``https://<bucket>.s3.<region>.amazonaws.com`` for the active bucket.

    The region is taken from the S3 client. Returns ``None`` (after logging a
    warning) when no bucket is active.
    """
    if self.bucket:
        client_region = self.s3_client.meta.region_name
        return f"https://{self.bucket}.s3.{client_region}.amazonaws.com"

    logger.warning("No active bucket selected")
    return None
|
|
93
|
+
|
|
94
|
+
def get_file(self, s3_path: str):
    """Fetch the object at ``s3_path`` from the active bucket.

    Returns:
        The result of the S3 object's ``get()`` call, or ``None`` when no bucket
        is active or the key does not exist (both cases are logged).
    """
    if not self.bucket:
        logger.warning("No active bucket selected")
        return None

    s3_object = self.s3.Object(self.bucket, s3_path)
    try:
        return s3_object.get()
    except self.s3_client.exceptions.NoSuchKey:
        logger.info(f"{s3_path} does not exist")

    return None
|
|
103
|
+
|
|
104
|
+
def file_exists(self, s3_path: str) -> bool:
    """Report whether ``s3_path`` exists in the active bucket.

    Args:
        s3_path: Key of the object to look up.

    Returns:
        True when the HEAD request succeeds; False when no bucket is active or
        the key is reported as missing.

    Raises:
        Exception: Any other failure of the HEAD request, logged and re-raised.
    """
    if not self.bucket:
        logger.warning("No active bucket selected")
        return False
    try:
        self.s3_client.head_object(Bucket=self.bucket, Key=s3_path)
        return True
    except self.s3_client.exceptions.ClientError as e:
        # A HEAD response has no body, so a missing key arrives as a generic
        # ClientError with code "404" rather than the modeled NoSuchKey error.
        # Catching only NoSuchKey made missing keys fall through to the
        # log-and-raise branch instead of returning False.
        error_info = getattr(e, "response", None) or {}
        error_code = str(error_info.get("Error", {}).get("Code", ""))
        if error_code in ("404", "NoSuchKey", "NotFound"):
            return False
        logger.error(f"Error checking existence of {s3_path}: {e}")
        raise
    except Exception as e:
        logger.error(f"Error checking existence of {s3_path}: {e}")
        raise
|
|
55
116
|
|
|
56
|
-
def read_file_as_bytes(self, s3_path):
|
|
57
|
-
|
|
117
|
+
def read_file_as_bytes(self, s3_path: str) -> Optional[io.BytesIO]:
    """Return the object's body wrapped in a ``BytesIO``.

    Returns ``None`` when ``get_file`` yields nothing for ``s3_path``.
    """
    fetched = self.get_file(s3_path)
    return io.BytesIO(fetched["Body"].read()) if fetched else None
|
|
58
122
|
|
|
59
|
-
def upload_file(self, local_path, s3_path):
|
|
60
|
-
self.
|
|
123
|
+
def upload_file(self, local_path: str, s3_path: str, **kwargs) -> None:
    """Upload ``local_path`` to ``s3_path`` in the active bucket.

    Extra keyword arguments are passed through to the bucket's ``upload_file``.
    Logs a warning and does nothing when no bucket is active.
    """
    if self.bucket:
        destination = self.s3.Bucket(self.bucket)
        destination.upload_file(local_path, s3_path, **kwargs)
        return

    logger.warning("No active bucket selected")
|