datamarket 0.9.22__tar.gz → 0.9.24__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datamarket might be problematic. Click here for more details.

Files changed (24)
  1. {datamarket-0.9.22 → datamarket-0.9.24}/PKG-INFO +1 -1
  2. {datamarket-0.9.22 → datamarket-0.9.24}/pyproject.toml +1 -1
  3. {datamarket-0.9.22 → datamarket-0.9.24}/src/datamarket/interfaces/alchemy.py +37 -22
  4. {datamarket-0.9.22 → datamarket-0.9.24}/LICENSE +0 -0
  5. {datamarket-0.9.22 → datamarket-0.9.24}/README.md +0 -0
  6. {datamarket-0.9.22 → datamarket-0.9.24}/src/datamarket/__init__.py +0 -0
  7. {datamarket-0.9.22 → datamarket-0.9.24}/src/datamarket/interfaces/__init__.py +0 -0
  8. {datamarket-0.9.22 → datamarket-0.9.24}/src/datamarket/interfaces/aws.py +0 -0
  9. {datamarket-0.9.22 → datamarket-0.9.24}/src/datamarket/interfaces/drive.py +0 -0
  10. {datamarket-0.9.22 → datamarket-0.9.24}/src/datamarket/interfaces/ftp.py +0 -0
  11. {datamarket-0.9.22 → datamarket-0.9.24}/src/datamarket/interfaces/nominatim.py +0 -0
  12. {datamarket-0.9.22 → datamarket-0.9.24}/src/datamarket/interfaces/peerdb.py +0 -0
  13. {datamarket-0.9.22 → datamarket-0.9.24}/src/datamarket/interfaces/proxy.py +0 -0
  14. {datamarket-0.9.22 → datamarket-0.9.24}/src/datamarket/interfaces/tinybird.py +0 -0
  15. {datamarket-0.9.22 → datamarket-0.9.24}/src/datamarket/params/__init__.py +0 -0
  16. {datamarket-0.9.22 → datamarket-0.9.24}/src/datamarket/params/nominatim.py +0 -0
  17. {datamarket-0.9.22 → datamarket-0.9.24}/src/datamarket/utils/__init__.py +0 -0
  18. {datamarket-0.9.22 → datamarket-0.9.24}/src/datamarket/utils/airflow.py +0 -0
  19. {datamarket-0.9.22 → datamarket-0.9.24}/src/datamarket/utils/alchemy.py +0 -0
  20. {datamarket-0.9.22 → datamarket-0.9.24}/src/datamarket/utils/main.py +0 -0
  21. {datamarket-0.9.22 → datamarket-0.9.24}/src/datamarket/utils/selenium.py +0 -0
  22. {datamarket-0.9.22 → datamarket-0.9.24}/src/datamarket/utils/soda.py +0 -0
  23. {datamarket-0.9.22 → datamarket-0.9.24}/src/datamarket/utils/typer.py +0 -0
  24. {datamarket-0.9.22 → datamarket-0.9.24}/src/datamarket/utils/types.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: datamarket
3
- Version: 0.9.22
3
+ Version: 0.9.24
4
4
  Summary: Utilities that integrate advanced scraping knowledge into just one library.
5
5
  License: GPL-3.0-or-later
6
6
  Author: DataMarket
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "datamarket"
3
- version = "0.9.22"
3
+ version = "0.9.24"
4
4
  description = "Utilities that integrate advanced scraping knowledge into just one library."
5
5
  authors = ["DataMarket <techsupport@datamarket.es>"]
6
6
  license = "GPL-3.0-or-later"
@@ -6,11 +6,12 @@ from collections.abc import MutableMapping
6
6
  from typing import Any, Iterator, List, Optional, Type, TypeVar
7
7
  from urllib.parse import quote_plus
8
8
 
9
- from sqlalchemy import DDL, FrozenResult, Result, Select, SQLColumnExpression, create_engine, func, select, text
9
+ from sqlalchemy import DDL, FrozenResult, Result, Select, SQLColumnExpression, create_engine, text
10
10
  from sqlalchemy.dialects.postgresql import insert
11
11
  from sqlalchemy.exc import IntegrityError
12
12
  from sqlalchemy.ext.declarative import DeclarativeMeta
13
13
  from sqlalchemy.orm import Session, sessionmaker
14
+ from enum import Enum
14
15
 
15
16
  ########################################################################################################################
16
17
  # CLASSES
@@ -20,6 +21,11 @@ logger = logging.getLogger(__name__)
20
21
  ModelType = TypeVar("ModelType", bound=DeclarativeMeta)
21
22
 
22
23
 
24
+ class CommitStrategy(Enum):
25
+ COMMIT_ON_SUCCESS = "commit_on_success"
26
+ FORCE_COMMIT = "force_commit"
27
+
28
+
23
29
  class MockContext:
24
30
  def __init__(self, column: SQLColumnExpression) -> None:
25
31
  self.current_parameters = {}
@@ -188,7 +194,7 @@ class AlchemyInterface:
188
194
 
189
195
  query_results.update({column_name: default_value}, synchronize_session=False)
190
196
 
191
- def insert_alchemy_obj(self, alchemy_obj: ModelType, silent: bool = False) -> None:
197
+ def insert_alchemy_obj(self, alchemy_obj: ModelType, silent: bool = False) -> bool:
192
198
  if self.session is None:
193
199
  raise RuntimeError("Session not active. Use 'with AlchemyInterface(...):' or call start()")
194
200
 
@@ -203,8 +209,11 @@ class AlchemyInterface:
203
209
  if not silent:
204
210
  logger.info(f"{alchemy_obj} already in db (savepoint rolled back)")
205
211
  # Do not re-raise, allow outer transaction/loop to continue
212
+ return False
206
213
 
207
- def upsert_alchemy_obj(self, alchemy_obj: ModelType, index_elements: List[str], silent: bool = False) -> None:
214
+ return True
215
+
216
+ def upsert_alchemy_obj(self, alchemy_obj: ModelType, index_elements: List[str], silent: bool = False) -> bool:
208
217
  if self.session is None:
209
218
  raise RuntimeError("Session not active. Use 'with AlchemyInterface(...):' or call start()")
210
219
 
@@ -233,12 +242,16 @@ class AlchemyInterface:
233
242
  if not silent:
234
243
  logger.info(f"could not upsert {alchemy_obj} (savepoint rolled back)")
235
244
  # Do not re-raise, allow outer transaction/loop to continue
245
+ return False
246
+
247
+ return True
236
248
 
237
249
  def windowed_query(
238
250
  self,
239
251
  stmt: Select[Any],
240
252
  order_by: List[SQLColumnExpression[Any]],
241
253
  windowsize: int,
254
+ commit_strategy: CommitStrategy = CommitStrategy.COMMIT_ON_SUCCESS,
242
255
  ) -> Iterator[Result[Any]]:
243
256
  """
244
257
  Executes a windowed query, fetching each window in a separate, short-lived session.
@@ -247,6 +260,8 @@ class AlchemyInterface:
247
260
  stmt: The SQL select statement to execute.
248
261
  order_by: The columns to use for ordering.
249
262
  windowsize: The number of rows to fetch in each window.
263
+ commit_strategy: The strategy to use for committing the session after each window.
264
+ Defaults to CommitStrategy.COMMIT_ON_SUCCESS.
250
265
 
251
266
  Returns:
252
267
  An iterator of Result objects, each containing a window of data.
@@ -254,19 +269,13 @@ class AlchemyInterface:
254
269
 
255
270
  More info: https://github.com/sqlalchemy/sqlalchemy/wiki/RangeQuery-and-WindowedRangeQuery
256
271
  """
257
- # Add row_number over the specified order
258
- row_number = func.row_number().over(order_by=order_by).label("row_number")
259
272
 
260
- # Add the windowing column to the statement
261
- inner_stmt = stmt.with_session(None).add_columns(row_number).order_by(row_number)
262
- subq = inner_stmt.subquery()
263
-
264
- # Create an outer query that selects from the subquery
265
- cols = [c for c in subq.c]
266
- outer_query = select(*cols)
267
-
268
- last_row_number = 0
273
+ # Find id column in stmt
274
+ if not any(column.get("entity").id for column in stmt.column_descriptions):
275
+ raise Exception("Column 'id' not found in any entity of the query.")
276
+ id_column = stmt.column_descriptions[0]["entity"].id
269
277
 
278
+ last_id = 0
270
279
  while True:
271
280
  session_active = False
272
281
  commit_needed = False
@@ -275,7 +284,7 @@ class AlchemyInterface:
275
284
  session_active = True
276
285
 
277
286
  # Filter on row_number in the outer query
278
- current_query = outer_query.where(subq.c.row_number > last_row_number).limit(windowsize)
287
+ current_query = stmt.where(id_column > last_id).order_by(order_by[0], *order_by[1:]).limit(windowsize)
279
288
  result = self.session.execute(current_query)
280
289
 
281
290
  # Create a FrozenResult to allow peeking at the data without consuming
@@ -286,19 +295,25 @@ class AlchemyInterface:
286
295
  break
287
296
 
288
297
  # Update for next iteration
289
- last_row_number = chunk[-1].row_number
298
+ last_id = chunk[-1].id
290
299
 
291
300
  # Create a new Result object from the FrozenResult
292
301
  yield_result = frozen_result()
293
302
 
294
- # Remove row_number from result before yielding
295
- # Ensure we don't yield the row_number column itself
296
- original_col_count = len(cols) - 1
297
- yield_result = yield_result.columns(*range(original_col_count))
298
-
299
303
  yield yield_result
300
304
  commit_needed = True
301
305
 
302
306
  finally:
303
307
  if session_active and self.session:
304
- self.stop(commit=commit_needed)
308
+ if commit_strategy == CommitStrategy.FORCE_COMMIT:
309
+ # For forced commit, always attempt to commit.
310
+ # The self.stop() method already handles potential exceptions during commit/rollback.
311
+ self.stop(commit=True)
312
+ elif commit_strategy == CommitStrategy.COMMIT_ON_SUCCESS:
313
+ # Commit only if no exception occurred before yielding the result.
314
+ self.stop(commit=commit_needed)
315
+ else:
316
+ # Fallback or error for unknown strategy, though type hinting should prevent this.
317
+ # For safety, default to rollback.
318
+ logger.warning(f"Unknown commit strategy: {commit_strategy}. Defaulting to rollback.")
319
+ self.stop(commit=False)
File without changes
File without changes