datamarket 0.9.14__py3-none-any.whl → 0.9.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datamarket might be problematic. Click here for more details.

@@ -2,11 +2,14 @@
2
2
  # IMPORTS
3
3
 
4
4
  import logging
5
+ from collections.abc import MutableMapping
6
+ from typing import Any, Iterator, List, Type, TypeVar
5
7
  from urllib.parse import quote_plus
6
8
 
7
- from sqlalchemy import DDL, create_engine, text
9
+ from sqlalchemy import DDL, FrozenResult, Result, Select, SQLColumnExpression, create_engine, func, select, text
8
10
  from sqlalchemy.dialects.postgresql import insert
9
11
  from sqlalchemy.exc import IntegrityError
12
+ from sqlalchemy.ext.declarative import DeclarativeMeta
10
13
  from sqlalchemy.orm import sessionmaker
11
14
 
12
15
  ########################################################################################################################
@@ -14,16 +17,18 @@ from sqlalchemy.orm import sessionmaker
14
17
 
15
18
  logger = logging.getLogger(__name__)
16
19
 
20
+ ModelType = TypeVar("ModelType", bound=DeclarativeMeta)
21
+
17
22
 
18
23
class MockContext:
    """Bare stand-in for the context object passed to a column's callable default.

    ``AlchemyInterface.reset_column`` builds one of these and hands it to a
    callable Python-side default so the default can be evaluated outside a
    real statement execution.
    """

    def __init__(self, column: SQLColumnExpression) -> None:
        # There is no live connection and there are no bound parameters here;
        # only the column whose default is being evaluated is known.
        self.connection = None
        self.current_column = column
        self.current_parameters = {}
23
28
 
24
29
 
25
30
  class AlchemyInterface:
26
- def __init__(self, config):
31
+ def __init__(self, config: MutableMapping) -> None:
27
32
  if "db" in config:
28
33
  self.config = config["db"]
29
34
 
@@ -43,7 +48,7 @@ class AlchemyInterface:
43
48
  )
44
49
 
45
50
  @staticmethod
46
- def get_schema_from_table(table):
51
+ def get_schema_from_table(table: Type[ModelType]) -> str:
47
52
  schema = "public"
48
53
 
49
54
  if isinstance(table.__table_args__, tuple):
@@ -59,7 +64,7 @@ class AlchemyInterface:
59
64
 
60
65
  return schema
61
66
 
62
- def create_tables(self, tables):
67
+ def create_tables(self, tables: List[Type[ModelType]]) -> None:
63
68
  for table in tables:
64
69
  schema = self.get_schema_from_table(table)
65
70
 
@@ -82,7 +87,7 @@ class AlchemyInterface:
82
87
  else:
83
88
  logger.info(f"table {table.__tablename__} already exists")
84
89
 
85
- def drop_tables(self, tables):
90
+ def drop_tables(self, tables: List[Type[ModelType]]) -> None:
86
91
  for table in tables:
87
92
  schema = self.get_schema_from_table(table)
88
93
 
@@ -98,13 +103,40 @@ class AlchemyInterface:
98
103
  conn.execute(DDL(f"DROP TABLE {schema}.{table.__tablename__} CASCADE"))
99
104
  conn.commit()
100
105
 
101
def reset_db(self, tables: List[Type[ModelType]], drop: bool = False) -> None:
    """Create the given tables, optionally dropping them first.

    Args:
        tables: Model classes, passed unchanged to ``drop_tables`` and
            ``create_tables``.
        drop: When truthy, ``drop_tables`` runs before ``create_tables``.
    """
    if not drop:
        self.create_tables(tables)
        return

    self.drop_tables(tables)
    self.create_tables(tables)
106
111
 
107
- def insert_alchemy_obj(self, alchemy_obj, silent=False):
112
def reset_column(self, query_results: List[Result[Any]], column_name: str) -> None:
    """Reset ``column_name`` to its declared default on every row matched by ``query_results``.

    NOTE(review): despite the annotation, the body calls
    ``query_results.update(..., synchronize_session=False)``, so the argument
    has to behave like an ORM ``Query`` rather than a plain list; confirm
    against callers and adjust the annotation if so.

    Args:
        query_results: Query-like object selecting the rows to reset. The
            mapped class is taken from its first row.
        column_name: Name of the column to reset; must exist on the mapped table.

    Raises:
        ValueError: If the column has neither a server default nor a
            Python-side default.
    """
    # A Query-like object is always truthy, so the truthiness test alone cannot
    # detect an empty result set; probe for the first row explicitly instead of
    # indexing with [0], which raises IndexError when nothing matches.
    first_obj = query_results.first() if query_results else None
    if first_obj is None:
        logger.warning("No objects to reset column for.")
        return

    table = type(first_obj).__table__

    if column_name not in table.columns:
        logger.warning(f"Column {column_name} does not exist in table {table.name}.")
        return

    column = table.columns[column_name]

    if column.server_default is not None:
        # Let the database apply its own server-side default.
        new_value = text("DEFAULT")
    elif column.default is not None:
        new_value = column.default.arg
        if callable(new_value):
            # Callable defaults expect an execution context; supply a minimal stand-in.
            new_value = new_value(MockContext(column))
    else:
        raise ValueError(f"Column '{column_name}' doesn't have a default value defined.")

    query_results.update({column_name: new_value}, synchronize_session=False)
    self.session.commit()
138
+
139
+ def insert_alchemy_obj(self, alchemy_obj: ModelType, silent: bool = False) -> None:
108
140
  try:
109
141
  if not silent:
110
142
  logger.info(f"adding {alchemy_obj}...")
@@ -118,7 +150,7 @@ class AlchemyInterface:
118
150
 
119
151
  self.session.rollback()
120
152
 
121
- def upsert_alchemy_obj(self, alchemy_obj, index_elements, silent=False):
153
+ def upsert_alchemy_obj(self, alchemy_obj: ModelType, index_elements: List[str], silent: bool = False) -> None:
122
154
  if not silent:
123
155
  logger.info(f"upserting {alchemy_obj}")
124
156
 
@@ -144,29 +176,57 @@ class AlchemyInterface:
144
176
 
145
177
  self.session.rollback()
146
178
 
147
- def reset_column(self, query_results, column_name):
148
- if not query_results:
149
- logger.warning("No objects to reset column for.")
150
- return
179
def windowed_query(
    self,
    stmt: Select[Any],
    order_by: List[SQLColumnExpression[Any]],
    windowsize: int,
) -> Iterator[Result[Any]]:
    """
    Executes a windowed query on the given statement.

    The statement is wrapped in a subquery that numbers its rows with
    ``row_number()`` over ``order_by``. Successive windows are then fetched by
    filtering on that number, each one starting right after the last row of
    the previous window.

    Args:
        stmt: The SQL select statement to execute.
        order_by: The columns to use for ordering.
        windowsize: The number of rows to fetch in each window.

    Returns:
        An iterator of Result objects, each containing a window of data.
        The helper ``row_number`` column is stripped from every yielded Result.

    More info: https://github.com/sqlalchemy/sqlalchemy/wiki/RangeQuery-and-WindowedRangeQuery
    """
    # Add row_number over the specified order
    row_number = func.row_number().over(order_by=order_by).label("row_number")

    # Add the windowing column to the statement
    inner_stmt = stmt.add_columns(row_number).order_by(row_number)
    subq = inner_stmt.subquery()

    # Create an outer query that selects from the subquery. The ORDER BY inside
    # the subquery does not bind the outer SELECT, so order the outer query
    # explicitly; otherwise LIMIT may return arbitrary rows past the cursor and
    # chunk[-1] would not be the highest row number fetched.
    cols = list(subq.c)
    outer_query = select(*cols).order_by(subq.c.row_number)

    last_row_number = 0

    while True:
        # Filter on row_number in the outer query
        current_query = outer_query.where(subq.c.row_number > last_row_number).limit(windowsize)
        result = self.session.execute(current_query)

        # Create a FrozenResult to allow peeking at the data without consuming
        frozen_result: FrozenResult = result.freeze()
        chunk = frozen_result().all()

        if not chunk:
            break

        # Advance the cursor to the last row actually returned. Subtracting one
        # here would re-fetch that row in the next window (and never terminate
        # when windowsize == 1).
        last_row_number = chunk[-1].row_number

        # Build a fresh Result from the FrozenResult and drop the trailing
        # row_number column, which is always the last one added above.
        yield frozen_result().columns(*range(len(cols) - 1))
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: datamarket
3
- Version: 0.9.14
3
+ Version: 0.9.16
4
4
  Summary: Utilities that integrate advanced scraping knowledge into just one library.
5
5
  License: GPL-3.0-or-later
6
6
  Author: DataMarket
@@ -1,6 +1,6 @@
1
1
  datamarket/__init__.py,sha256=FHS77P9qNewKMoN-p0FLEUEC60oWIYup1QkbJZP4ays,12
2
2
  datamarket/interfaces/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
- datamarket/interfaces/alchemy.py,sha256=Gkb-55kgHz-07wpfxQfCHP-jCtj7k-WcRAaQWqDaNRs,6377
3
+ datamarket/interfaces/alchemy.py,sha256=z7VarlKZ-JfsXWtuDXCYnNp_pSzEYo5IvrD7wqoRpbI,8891
4
4
  datamarket/interfaces/aws.py,sha256=7KLUeBxmPN7avEMPsu5HC_KHB1N7W6Anp2X8fo43mlw,2383
5
5
  datamarket/interfaces/drive.py,sha256=shbV5jpQVe_KPE-8Idx6Z9te5Zu1SmVfrvSAyd9ZIgE,2915
6
6
  datamarket/interfaces/ftp.py,sha256=o0KlJxtksbop9OjCiQRzyAa2IeG_ExVXagS6apwrAQo,1881
@@ -17,7 +17,7 @@ datamarket/utils/main.py,sha256=O6rX-65h4h0j2zs9dofdTPlly5reKDnvgLtTwbLmbWg,6529
17
17
  datamarket/utils/selenium.py,sha256=IMKlbLzXABFhACnWzhHmB0l2hhVzNwHGZwbo14nEewQ,2499
18
18
  datamarket/utils/soda.py,sha256=eZTXFbI1P3WoMd1MM-YjoVTpdjTcDSWuvBb7ViBMhSQ,941
19
19
  datamarket/utils/typer.py,sha256=FDF3l6gh3UlAFPsHCtesnekvct2rKz0oFn3uKARBQvE,814
20
- datamarket-0.9.14.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
21
- datamarket-0.9.14.dist-info/METADATA,sha256=NnDtsHllSavKwk68xUBM9slmP8w6ZzvZgGjyg2uOrGQ,6363
22
- datamarket-0.9.14.dist-info/WHEEL,sha256=fGIA9gx4Qxk2KDKeNJCbOEwSrmLtjWCwzBz351GyrPQ,88
23
- datamarket-0.9.14.dist-info/RECORD,,
20
+ datamarket-0.9.16.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
21
+ datamarket-0.9.16.dist-info/METADATA,sha256=DsHOm4WsenKNFphVyqbLI3A3WluNWGGgVFZRlE8U8NI,6363
22
+ datamarket-0.9.16.dist-info/WHEEL,sha256=fGIA9gx4Qxk2KDKeNJCbOEwSrmLtjWCwzBz351GyrPQ,88
23
+ datamarket-0.9.16.dist-info/RECORD,,