datamarket 0.9.19__py3-none-any.whl → 0.9.21__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datamarket might be problematic. Click here for more details.

datamarket/interfaces/alchemy.py CHANGED
@@ -3,14 +3,14 @@
3
3
 
4
4
  import logging
5
5
  from collections.abc import MutableMapping
6
- from typing import Any, Iterator, List, Type, TypeVar
6
+ from typing import Any, Iterator, List, Optional, Type, TypeVar
7
7
  from urllib.parse import quote_plus
8
8
 
9
9
  from sqlalchemy import DDL, FrozenResult, Result, Select, SQLColumnExpression, create_engine, func, select, text
10
10
  from sqlalchemy.dialects.postgresql import insert
11
11
  from sqlalchemy.exc import IntegrityError
12
12
  from sqlalchemy.ext.declarative import DeclarativeMeta
13
- from sqlalchemy.orm import sessionmaker
13
+ from sqlalchemy.orm import Session, sessionmaker
14
14
 
15
15
  ########################################################################################################################
16
16
  # CLASSES
@@ -29,16 +29,58 @@ class MockContext:
29
29
 
30
30
  class AlchemyInterface:
31
31
  def __init__(self, config: MutableMapping) -> None:
32
+ self.session: Optional[Session] = None
32
33
  if "db" in config:
33
34
  self.config = config["db"]
34
-
35
35
  self.engine = create_engine(self.get_conn_str())
36
- self.session = sessionmaker(bind=self.engine)()
37
- self.cursor = self.session.connection().connection.cursor()
38
-
36
+ self.Session = sessionmaker(bind=self.engine)
39
37
  else:
40
38
  logger.warning("no db section in config")
41
39
 
40
+ def __enter__(self) -> "AlchemyInterface":
41
+ """Enter the runtime context related to this object (starts session)."""
42
+ self.start()
43
+ return self
44
+
45
+ def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
46
+ """Exit the runtime context related to this object (stops session)."""
47
+ should_commit = exc_type is None
48
+ self.stop(commit=should_commit)
49
+
50
+ def start(self) -> None:
51
+ """Starts a new SQLAlchemy session manually."""
52
+ if not hasattr(self, "Session"):
53
+ raise AttributeError("Database configuration not initialized. Cannot create session.")
54
+ if self.session is not None:
55
+ raise RuntimeError("Session already active.")
56
+ self.session = self.Session()
57
+ logger.debug("SQLAlchemy session started manually.")
58
+
59
+ def stop(self, commit: bool = True) -> None:
60
+ """Stops the manually started SQLAlchemy session."""
61
+ if self.session is None:
62
+ logger.warning("No active session to stop.")
63
+ return
64
+
65
+ try:
66
+ if commit:
67
+ logger.debug("Committing SQLAlchemy session before stopping.")
68
+ self.session.commit()
69
+ else:
70
+ logger.debug("Rolling back SQLAlchemy session before stopping.")
71
+ self.session.rollback()
72
+ except Exception as e:
73
+ logger.error(f"Exception during session commit/rollback on stop: {e}", exc_info=True)
74
+ try:
75
+ self.session.rollback()
76
+ except Exception as rb_exc:
77
+ logger.error(f"Exception during secondary rollback attempt on stop: {rb_exc}", exc_info=True)
78
+ raise
79
+ finally:
80
+ logger.debug("Closing SQLAlchemy session.")
81
+ self.session.close()
82
+ self.session = None
83
+
42
84
  def get_conn_str(self):
43
85
  return (
44
86
  f"{self.config['engine']}://"
@@ -110,6 +152,16 @@ class AlchemyInterface:
110
152
  self.create_tables(tables)
111
153
 
112
154
  def reset_column(self, query_results: List[Result[Any]], column_name: str) -> None:
155
+ """
156
+ Reset a column to its default value for a list of query results.
157
+
158
+ Args:
159
+ query_results: List of query results to update
160
+ column_name: Name of the column to reset
161
+ """
162
+ if self.session is None:
163
+ raise RuntimeError("Session not active. Use 'with AlchemyInterface(...):' or call start()")
164
+
113
165
  if not query_results:
114
166
  logger.warning("No objects to reset column for.")
115
167
  return
@@ -124,33 +176,38 @@ class AlchemyInterface:
124
176
 
125
177
  column = table.columns[column_name]
126
178
 
179
+ # Determine the default value to use
127
180
  if column.server_default is not None:
128
- query_results.update({column_name: text("DEFAULT")}, synchronize_session=False)
181
+ default_value = text("DEFAULT")
129
182
  elif column.default is not None:
130
183
  default_value = column.default.arg
131
184
  if callable(default_value):
132
185
  default_value = default_value(MockContext(column))
133
- query_results.update({column_name: default_value}, synchronize_session=False)
134
186
  else:
135
187
  raise ValueError(f"Column '{column_name}' doesn't have a default value defined.")
136
188
 
137
- self.session.commit()
189
+ query_results.update({column_name: default_value}, synchronize_session=False)
138
190
 
139
191
  def insert_alchemy_obj(self, alchemy_obj: ModelType, silent: bool = False) -> None:
140
- try:
141
- if not silent:
142
- logger.info(f"adding {alchemy_obj}...")
143
-
144
- self.session.add(alchemy_obj)
145
- self.session.commit()
192
+ if self.session is None:
193
+ raise RuntimeError("Session not active. Use 'with AlchemyInterface(...):' or call start()")
146
194
 
195
+ try:
196
+ # Use a savepoint (nested transaction)
197
+ with self.session.begin_nested():
198
+ if not silent:
199
+ logger.info(f"adding {alchemy_obj}...")
200
+ self.session.add(alchemy_obj)
147
201
  except IntegrityError:
202
+ # Rollback is handled automatically by begin_nested() context manager on error
148
203
  if not silent:
149
- logger.info(f"{alchemy_obj} already in db")
150
-
151
- self.session.rollback()
204
+ logger.info(f"{alchemy_obj} already in db (savepoint rolled back)")
205
+ # Do not re-raise, allow outer transaction/loop to continue
152
206
 
153
207
  def upsert_alchemy_obj(self, alchemy_obj: ModelType, index_elements: List[str], silent: bool = False) -> None:
208
+ if self.session is None:
209
+ raise RuntimeError("Session not active. Use 'with AlchemyInterface(...):' or call start()")
210
+
154
211
  if not silent:
155
212
  logger.info(f"upserting {alchemy_obj}")
156
213
 
@@ -168,13 +225,14 @@ class AlchemyInterface:
168
225
  )
169
226
 
170
227
  try:
171
- self.session.execute(statement)
172
- self.session.commit()
228
+ # Use a savepoint (nested transaction)
229
+ with self.session.begin_nested():
230
+ self.session.execute(statement)
173
231
  except IntegrityError:
232
+ # Rollback is handled automatically by begin_nested() context manager on error
174
233
  if not silent:
175
- logger.info(f"could not upsert {alchemy_obj}")
176
-
177
- self.session.rollback()
234
+ logger.info(f"could not upsert {alchemy_obj} (savepoint rolled back)")
235
+ # Do not re-raise, allow outer transaction/loop to continue
178
236
 
179
237
  def windowed_query(
180
238
  self,
@@ -183,7 +241,7 @@ class AlchemyInterface:
183
241
  windowsize: int,
184
242
  ) -> Iterator[Result[Any]]:
185
243
  """
186
- Executes a windowed query on the given statement.
244
+ Executes a windowed query, fetching each window in a separate, short-lived session.
187
245
 
188
246
  Args:
189
247
  stmt: The SQL select statement to execute.
@@ -192,14 +250,15 @@ class AlchemyInterface:
192
250
 
193
251
  Returns:
194
252
  An iterator of Result objects, each containing a window of data.
253
+ The session used to fetch the Result is closed immediately after yielding.
195
254
 
196
255
  More info: https://github.com/sqlalchemy/sqlalchemy/wiki/RangeQuery-and-WindowedRangeQuery
197
256
  """
198
257
  # Add row_number over the specified order
199
258
  row_number = func.row_number().over(order_by=order_by).label("row_number")
200
259
 
201
- # Add the windowing column to the statement and sort by it for deterministic results
202
- inner_stmt = stmt.add_columns(row_number).order_by(row_number)
260
+ # Add the windowing column to the statement
261
+ inner_stmt = stmt.with_session(None).add_columns(row_number).order_by(row_number)
203
262
  subq = inner_stmt.subquery()
204
263
 
205
264
  # Create an outer query that selects from the subquery
@@ -209,24 +268,37 @@ class AlchemyInterface:
209
268
  last_row_number = 0
210
269
 
211
270
  while True:
212
- # Filter on row_number in the outer query
213
- current_query = outer_query.where(subq.c.row_number > last_row_number).limit(windowsize)
214
- result = self.session.execute(current_query)
271
+ session_active = False
272
+ commit_needed = False
273
+ try:
274
+ self.start()
275
+ session_active = True
276
+
277
+ # Filter on row_number in the outer query
278
+ current_query = outer_query.where(subq.c.row_number > last_row_number).limit(windowsize)
279
+ result = self.session.execute(current_query)
280
+
281
+ # Create a FrozenResult to allow peeking at the data without consuming
282
+ frozen_result: FrozenResult = result.freeze()
283
+ chunk = frozen_result().all()
215
284
 
216
- # Create a FrozenResult to allow peeking at the data without consuming
217
- frozen_result: FrozenResult = result.freeze()
218
- chunk = frozen_result().all()
285
+ if not chunk:
286
+ break
219
287
 
220
- if not chunk:
221
- break
288
+ # Update for next iteration
289
+ last_row_number = chunk[-1].row_number
222
290
 
223
- # Update for next iteration
224
- last_row_number = chunk[-1].row_number - 1
291
+ # Create a new Result object from the FrozenResult
292
+ yield_result = frozen_result()
225
293
 
226
- # Create a new Result object from the FrozenResult
227
- yield_result = frozen_result()
294
+ # Remove row_number from result before yielding
295
+ # Ensure we don't yield the row_number column itself
296
+ original_col_count = len(cols) - 1
297
+ yield_result = yield_result.columns(*range(original_col_count))
228
298
 
229
- # Remove row_number from result
230
- yield_result = yield_result.columns(*range(len(cols) - 1))
299
+ yield yield_result
300
+ commit_needed = True
231
301
 
232
- yield yield_result
302
+ finally:
303
+ if session_active and self.session:
304
+ self.stop(commit=commit_needed)
datamarket/utils/main.py CHANGED
@@ -2,8 +2,6 @@
2
2
  # IMPORTS
3
3
 
4
4
  import asyncio
5
- from dataclasses import dataclass
6
- import inspect
7
5
  import logging
8
6
  import random
9
7
  import re
@@ -12,26 +10,25 @@ import shutil
12
10
  import subprocess
13
11
  import time
14
12
  from pathlib import Path
15
- from typing import Literal, Optional, Union
13
+ from typing import Any, Literal, Self, Union
16
14
 
17
15
  import pendulum
18
16
  from croniter import croniter
19
- from configparser import RawConfigParser
20
17
  from dynaconf import Dynaconf, add_converter
21
18
 
22
19
  logger = logging.getLogger(__name__)
23
20
 
24
- ########################################################################################################################
25
- # CLASSES
21
+
22
+ class NoProjectFoundError(Exception):
23
+ def __init__(self):
24
+ super().__init__("Could not detect project. Did you add the config file?")
26
25
 
27
26
 
28
- @dataclass
29
- class ProjectMetadata:
30
- cmd_prefix: str
31
- package_name: str
32
- env_name: str
33
- path: Path
34
- config_path: Path
27
+ class NoPackageFoundError(Exception):
28
+ def __init__(self):
29
+ super().__init__(
30
+ "A project was detected but it has no packages inside the 'src' directory"
31
+ )
35
32
 
36
33
 
37
34
  ########################################################################################################################
@@ -79,62 +76,80 @@ def read_converter(path_str: str):
79
76
  return f.read()
80
77
 
81
78
 
82
- def get_config(
83
- config_file: Optional[Path] = None, tz: str = "Europe/Madrid"
84
- ) -> Dynaconf:
85
- config_file = config_file or get_project_metadata().config_path
86
-
87
- if Path(config_file).suffix == ".ini":
88
- logger.warning("Using legacy INI config reader. Please migrate to TOML")
89
- cfg = RawConfigParser()
90
- cfg.read(config_file)
91
- return cfg
92
-
93
- add_converter("read", read_converter)
94
-
95
- dt_now = get_granular_date("now", tz)
96
- dt_weekly = get_granular_date("weekly", tz)
97
- dt_biweekly = get_granular_date("biweekly", tz)
98
-
99
- config = Dynaconf(
100
- environments=True,
101
- env_switcher="SYSTYPE",
102
- )
103
-
104
- config.load_file(path=config_file)
105
- config.load_file(path=Path.home() / config_file.name)
106
-
107
- config.vars = {
108
- "year": dt_now.strftime("%Y"),
109
- "month": dt_now.strftime("%m"),
110
- "day": dt_now.strftime("%d"),
111
- "now": dt_now.strftime("%Y-%m-%d %H:%M:%S"),
112
- "now_stripped": dt_now.strftime("%Y%m%d%H%M%S"),
113
- "today": dt_now.strftime("%Y-%m-%d"),
114
- "today_stripped": dt_now.strftime("%Y%m%d"),
115
- "weekly_date": dt_weekly.strftime("%Y-%m-%d"),
116
- "weekly_date_stripped": dt_weekly.strftime("%Y%m%d"),
117
- "biweekly_date": dt_biweekly.strftime("%Y-%m-%d"),
118
- "biweekly_date_stripped": dt_biweekly.strftime("%Y%m%d"),
119
- "dynaconf_merge": True,
120
- }
121
-
122
- return config
123
-
124
- def get_project_metadata() -> ProjectMetadata:
125
- caller_frame = inspect.stack()[1]
126
- current_file_parts = Path(caller_frame.filename).resolve().parts
127
- src_index = current_file_parts.index("src")
128
-
129
- cmd_prefix = "dix vnc run --" if shutil.which("dix") else ""
130
- package_name = current_file_parts[src_index + 1]
131
- env_name = f"{package_name}_env"
132
- project_path = Path(*current_file_parts[:src_index])
133
- config_path = project_path / "config.toml"
134
-
135
- return ProjectMetadata(
136
- cmd_prefix, package_name, env_name, project_path, config_path
137
- )
79
+ class Project:
80
+ CONFIG_FILE_NAME = "config.toml"
81
+
82
+ def __init__(self, path: Path):
83
+ self.path = path
84
+
85
+ try:
86
+ self.pkg_name = next((self.path / "src").glob("*")).name
87
+ except StopIteration:
88
+ raise NoPackageFoundError()
89
+
90
+ self.env_name = f"{self.pkg_name}_env"
91
+ self.config_path = self.path / self.CONFIG_FILE_NAME
92
+ self.tests_path = self.path / "tests"
93
+ self.cmd_prefix = "dix vnc run --" if shutil.which("dix") else ""
94
+
95
+ @classmethod
96
+ def from_file(cls, file: Path | str) -> Self:
97
+ file_path = Path(file).resolve()
98
+
99
+ while file_path != file_path.parent:
100
+ config_file = file_path / cls.CONFIG_FILE_NAME
101
+
102
+ if not config_file.is_file():
103
+ file_path = file_path.parent
104
+ continue
105
+
106
+ if file_path.parent == Path("/home"):
107
+ raise NoProjectFoundError()
108
+
109
+ return cls(file_path)
110
+
111
+ raise NoProjectFoundError()
112
+
113
+ def to_dict(self) -> dict[str, Any]:
114
+ return {
115
+ "project_path": self.path,
116
+ "cmd_prefix": self.cmd_prefix,
117
+ "env_name": self.env_name,
118
+ "pkg_name": self.pkg_name,
119
+ "config_path": self.config_path,
120
+ }
121
+
122
+ def get_config(self, tz: str = "Europe/Madrid") -> Dynaconf:
123
+ add_converter("read", read_converter)
124
+
125
+ dt_now = get_granular_date("now", tz)
126
+ dt_weekly = get_granular_date("weekly", tz)
127
+ dt_biweekly = get_granular_date("biweekly", tz)
128
+
129
+ config = Dynaconf(
130
+ environments=True,
131
+ env_switcher="SYSTYPE",
132
+ )
133
+
134
+ config.load_file(path=self.config_path)
135
+ config.load_file(path=Path.home() / self.config_path.name)
136
+
137
+ config.vars = {
138
+ "year": dt_now.strftime("%Y"),
139
+ "month": dt_now.strftime("%m"),
140
+ "day": dt_now.strftime("%d"),
141
+ "now": dt_now.strftime("%Y-%m-%d %H:%M:%S"),
142
+ "now_stripped": dt_now.strftime("%Y%m%d%H%M%S"),
143
+ "today": dt_now.strftime("%Y-%m-%d"),
144
+ "today_stripped": dt_now.strftime("%Y%m%d"),
145
+ "weekly_date": dt_weekly.strftime("%Y-%m-%d"),
146
+ "weekly_date_stripped": dt_weekly.strftime("%Y%m%d"),
147
+ "biweekly_date": dt_biweekly.strftime("%Y-%m-%d"),
148
+ "biweekly_date_stripped": dt_biweekly.strftime("%Y%m%d"),
149
+ "dynaconf_merge": True,
150
+ }
151
+
152
+ return config
138
153
 
139
154
 
140
155
  def set_logger(level):
datamarket-0.9.21.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: datamarket
3
- Version: 0.9.19
3
+ Version: 0.9.21
4
4
  Summary: Utilities that integrate advanced scraping knowledge into just one library.
5
5
  License: GPL-3.0-or-later
6
6
  Author: DataMarket
datamarket-0.9.21.dist-info/RECORD CHANGED
@@ -1,6 +1,6 @@
1
1
  datamarket/__init__.py,sha256=FHS77P9qNewKMoN-p0FLEUEC60oWIYup1QkbJZP4ays,12
2
2
  datamarket/interfaces/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
- datamarket/interfaces/alchemy.py,sha256=z7VarlKZ-JfsXWtuDXCYnNp_pSzEYo5IvrD7wqoRpbI,8891
3
+ datamarket/interfaces/alchemy.py,sha256=Qxgi_-TSzlDlUPy9SV1GoEfg1VzLx4KnPcSRVT-BFdc,12365
4
4
  datamarket/interfaces/aws.py,sha256=7KLUeBxmPN7avEMPsu5HC_KHB1N7W6Anp2X8fo43mlw,2383
5
5
  datamarket/interfaces/drive.py,sha256=shbV5jpQVe_KPE-8Idx6Z9te5Zu1SmVfrvSAyd9ZIgE,2915
6
6
  datamarket/interfaces/ftp.py,sha256=o0KlJxtksbop9OjCiQRzyAa2IeG_ExVXagS6apwrAQo,1881
@@ -13,11 +13,11 @@ datamarket/params/nominatim.py,sha256=pBYRfoBkkLBg2INbFymefmYSzaAVujQSpEro5c1hD_
13
13
  datamarket/utils/__init__.py,sha256=8D5a8oKgqd6WA1RUkiKCn4l_PVemtyuckxQut0vDHXM,20
14
14
  datamarket/utils/airflow.py,sha256=al0vc0YUikNu3Oy51VSn52I7pMU40akFBOl_UlHa2E4,795
15
15
  datamarket/utils/alchemy.py,sha256=SRq6kgh1aANXVShBPgAuglmNhZssPWwWEY503gKSia8,635
16
- datamarket/utils/main.py,sha256=O6rX-65h4h0j2zs9dofdTPlly5reKDnvgLtTwbLmbWg,6529
16
+ datamarket/utils/main.py,sha256=j8wnAxeLvijdRU9M4V6HunWH7vgWWHP4u4xamzkWcUU,7009
17
17
  datamarket/utils/selenium.py,sha256=IMKlbLzXABFhACnWzhHmB0l2hhVzNwHGZwbo14nEewQ,2499
18
18
  datamarket/utils/soda.py,sha256=eZTXFbI1P3WoMd1MM-YjoVTpdjTcDSWuvBb7ViBMhSQ,941
19
19
  datamarket/utils/typer.py,sha256=FDF3l6gh3UlAFPsHCtesnekvct2rKz0oFn3uKARBQvE,814
20
- datamarket-0.9.19.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
21
- datamarket-0.9.19.dist-info/METADATA,sha256=yMqp6i6mEyKhwB6VOxXF5Gg1sldJSbkuB9HH2w3NdPQ,6459
22
- datamarket-0.9.19.dist-info/WHEEL,sha256=fGIA9gx4Qxk2KDKeNJCbOEwSrmLtjWCwzBz351GyrPQ,88
23
- datamarket-0.9.19.dist-info/RECORD,,
20
+ datamarket-0.9.21.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
21
+ datamarket-0.9.21.dist-info/METADATA,sha256=OQQsXVewgsgXDBi86Ld6OddXq7VHTSZXA-zbZf-yH-U,6459
22
+ datamarket-0.9.21.dist-info/WHEEL,sha256=fGIA9gx4Qxk2KDKeNJCbOEwSrmLtjWCwzBz351GyrPQ,88
23
+ datamarket-0.9.21.dist-info/RECORD,,