datamarket 0.9.14__py3-none-any.whl → 0.9.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datamarket might be problematic. Click here for more details.

@@ -3,27 +3,33 @@
3
3
 
4
4
  import logging
5
5
  from urllib.parse import quote_plus
6
+ from typing import Any, Iterator, List, Type, TypeVar
7
+ from collections.abc import MutableMapping
6
8
 
7
- from sqlalchemy import DDL, create_engine, text
9
+ from sqlalchemy import DDL, Result, Select, SQLColumnExpression, create_engine, text
8
10
  from sqlalchemy.dialects.postgresql import insert
9
11
  from sqlalchemy.exc import IntegrityError
10
12
  from sqlalchemy.orm import sessionmaker
13
+ from sqlalchemy import FrozenResult
14
+ from sqlalchemy.ext.declarative import DeclarativeMeta
11
15
 
12
16
  ########################################################################################################################
13
17
  # CLASSES
14
18
 
15
19
  logger = logging.getLogger(__name__)
16
20
 
21
+ ModelType = TypeVar("ModelType", bound=DeclarativeMeta)
22
+
17
23
 
18
24
class MockContext:
    """
    Minimal stand-in context handed to callable column defaults.

    Within this module it is constructed with a column and passed as the single
    argument to a column's callable default when no statement is executing.
    NOTE(review): the attribute names presumably mirror the ones SQLAlchemy's
    execution context exposes to default callables -- confirm against callers.
    """

    def __init__(self, column: SQLColumnExpression) -> None:
        # No statement is being executed, so there are no bound parameters yet.
        self.current_parameters = dict()
        # The column whose default is being evaluated.
        self.current_column = column
        # There is no live connection behind this context.
        self.connection = None
23
29
 
24
30
 
25
31
  class AlchemyInterface:
26
- def __init__(self, config):
32
+ def __init__(self, config: MutableMapping) -> None:
27
33
  if "db" in config:
28
34
  self.config = config["db"]
29
35
 
@@ -43,7 +49,7 @@ class AlchemyInterface:
43
49
  )
44
50
 
45
51
  @staticmethod
46
- def get_schema_from_table(table):
52
+ def get_schema_from_table(table: Type[ModelType]) -> str:
47
53
  schema = "public"
48
54
 
49
55
  if isinstance(table.__table_args__, tuple):
@@ -59,7 +65,7 @@ class AlchemyInterface:
59
65
 
60
66
  return schema
61
67
 
62
- def create_tables(self, tables):
68
+ def create_tables(self, tables: List[Type[ModelType]]) -> None:
63
69
  for table in tables:
64
70
  schema = self.get_schema_from_table(table)
65
71
 
@@ -82,7 +88,7 @@ class AlchemyInterface:
82
88
  else:
83
89
  logger.info(f"table {table.__tablename__} already exists")
84
90
 
85
- def drop_tables(self, tables):
91
+ def drop_tables(self, tables: List[Type[ModelType]]) -> None:
86
92
  for table in tables:
87
93
  schema = self.get_schema_from_table(table)
88
94
 
@@ -98,13 +104,40 @@ class AlchemyInterface:
98
104
  conn.execute(DDL(f"DROP TABLE {schema}.{table.__tablename__} CASCADE"))
99
105
  conn.commit()
100
106
 
101
- def reset_db(self, tables, drop):
107
def reset_db(self, tables: List[Type[ModelType]], drop: bool = False) -> None:
    """
    (Re)create the given tables, optionally dropping them first.

    Args:
        tables: Model classes forwarded unchanged to ``drop_tables`` and
            ``create_tables``.
        drop: When truthy, ``drop_tables`` runs before ``create_tables``;
            otherwise only ``create_tables`` runs.
    """
    # Dropping is opt-in; creation always happens last.
    steps = (self.drop_tables, self.create_tables) if drop else (self.create_tables,)

    for step in steps:
        step(tables)
106
112
 
107
- def insert_alchemy_obj(self, alchemy_obj, silent=False):
113
def reset_column(self, query_results: List[Result[Any]], column_name: str) -> None:
    """
    Reset one column to its declared default for every row matched by a query.

    The first matched object is used to find the mapped table; the column is
    then bulk-updated through ``query_results.update(...)`` and the session is
    committed.

    Args:
        query_results: Despite the annotation, this must be a query-like object
            that supports ``[0]`` indexing and
            ``.update(values, synchronize_session=False)``.
        column_name: Name of the column to reset, as keyed in
            ``__table__.columns``.

    Raises:
        ValueError: If the column has neither a server default nor a
            Python-side default.
    """
    try:
        # An ORM query object is always truthy, so the truthiness test alone
        # cannot detect an empty result; indexing an empty query raises
        # IndexError instead. Treat both cases as "nothing to do".
        first_obj = query_results[0] if query_results else None
    except IndexError:
        first_obj = None

    if first_obj is None:
        logger.warning("No objects to reset column for.")
        return

    table = first_obj.__class__.__table__

    if column_name not in table.columns:
        logger.warning(f"Column {column_name} does not exist in table {table.name}.")
        return

    column = table.columns[column_name]

    if column.server_default is not None:
        # Let the database apply its own default expression.
        query_results.update({column_name: text("DEFAULT")}, synchronize_session=False)
    elif column.default is not None:
        default_value = column.default.arg
        if callable(default_value):
            # Callable defaults are invoked with a stand-in context because no
            # statement is executing here.
            default_value = default_value(MockContext(column))
        query_results.update({column_name: default_value}, synchronize_session=False)
    else:
        raise ValueError(f"Column '{column_name}' doesn't have a default value defined.")

    self.session.commit()
139
+
140
+ def insert_alchemy_obj(self, alchemy_obj: ModelType, silent: bool = False) -> None:
108
141
  try:
109
142
  if not silent:
110
143
  logger.info(f"adding {alchemy_obj}...")
@@ -118,7 +151,7 @@ class AlchemyInterface:
118
151
 
119
152
  self.session.rollback()
120
153
 
121
- def upsert_alchemy_obj(self, alchemy_obj, index_elements, silent=False):
154
+ def upsert_alchemy_obj(self, alchemy_obj: ModelType, index_elements: List[str], silent: bool = False) -> None:
122
155
  if not silent:
123
156
  logger.info(f"upserting {alchemy_obj}")
124
157
 
@@ -144,29 +177,52 @@ class AlchemyInterface:
144
177
 
145
178
  self.session.rollback()
146
179
 
147
- def reset_column(self, query_results, column_name):
148
- if not query_results:
149
- logger.warning("No objects to reset column for.")
150
- return
180
def windowed_query(
    self,
    stmt: Select[Any],
    column: SQLColumnExpression[Any],
    windowsize: int,
) -> Iterator[Result[Any]]:
    """
    Executes a windowed query on the given statement.

    The statement is run repeatedly with ``LIMIT windowsize``; each run is
    restricted to rows whose ``column`` value is strictly greater than the
    last value seen in the previous window. Because the filter is a strict
    ``>``, rows sharing the boundary value of a window are not returned again,
    so ``column`` should be unique for every row to be visited.

    Args:
        stmt: The SQL select statement to execute.
        column: The column to use for windowing and sorting.
        windowsize: The number of rows to fetch in each window.

    Yields:
        One Result per non-empty window, exposing only the columns of the
        original statement (the appended windowing column is stripped).

    Source: https://github.com/sqlalchemy/sqlalchemy/wiki/RangeQuery-and-WindowedRangeQuery
    """

    # Add the windowing column to the statement and sort by it for deterministic results
    stmt = stmt.add_columns(column).order_by(column)
    last_id = None

    while True:
        subq = stmt

        if last_id is not None:
            subq = subq.filter(column > last_id)

        result: Result = self.session.execute(subq.limit(windowsize))

        # Create a FrozenResult to allow peeking at the data without consuming
        frozen_result: FrozenResult = result.freeze()
        chunk = frozen_result().all()

        if not chunk:
            break

        # Width is the number of columns in a row, NOT the number of rows in
        # the chunk; using len(chunk) would strip the wrong set of columns.
        result_width = len(chunk[0])
        # The windowing column was appended last, so it is the final element.
        last_id = chunk[-1][-1]

        # Create a new Result object from the FrozenResult
        yield_result: Result = frozen_result()

        # Remove the windowing column from the result
        yield_result = yield_result.columns(*range(result_width - 1))

        yield yield_result
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: datamarket
3
- Version: 0.9.14
3
+ Version: 0.9.15
4
4
  Summary: Utilities that integrate advanced scraping knowledge into just one library.
5
5
  License: GPL-3.0-or-later
6
6
  Author: DataMarket
@@ -1,6 +1,6 @@
1
1
  datamarket/__init__.py,sha256=FHS77P9qNewKMoN-p0FLEUEC60oWIYup1QkbJZP4ays,12
2
2
  datamarket/interfaces/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
- datamarket/interfaces/alchemy.py,sha256=Gkb-55kgHz-07wpfxQfCHP-jCtj7k-WcRAaQWqDaNRs,6377
3
+ datamarket/interfaces/alchemy.py,sha256=Y1wKSEBlWXCE3-6JypdaUL2i_7FAGa9PK4_zQHeM2SE,8563
4
4
  datamarket/interfaces/aws.py,sha256=7KLUeBxmPN7avEMPsu5HC_KHB1N7W6Anp2X8fo43mlw,2383
5
5
  datamarket/interfaces/drive.py,sha256=shbV5jpQVe_KPE-8Idx6Z9te5Zu1SmVfrvSAyd9ZIgE,2915
6
6
  datamarket/interfaces/ftp.py,sha256=o0KlJxtksbop9OjCiQRzyAa2IeG_ExVXagS6apwrAQo,1881
@@ -17,7 +17,7 @@ datamarket/utils/main.py,sha256=O6rX-65h4h0j2zs9dofdTPlly5reKDnvgLtTwbLmbWg,6529
17
17
  datamarket/utils/selenium.py,sha256=IMKlbLzXABFhACnWzhHmB0l2hhVzNwHGZwbo14nEewQ,2499
18
18
  datamarket/utils/soda.py,sha256=eZTXFbI1P3WoMd1MM-YjoVTpdjTcDSWuvBb7ViBMhSQ,941
19
19
  datamarket/utils/typer.py,sha256=FDF3l6gh3UlAFPsHCtesnekvct2rKz0oFn3uKARBQvE,814
20
- datamarket-0.9.14.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
21
- datamarket-0.9.14.dist-info/METADATA,sha256=NnDtsHllSavKwk68xUBM9slmP8w6ZzvZgGjyg2uOrGQ,6363
22
- datamarket-0.9.14.dist-info/WHEEL,sha256=fGIA9gx4Qxk2KDKeNJCbOEwSrmLtjWCwzBz351GyrPQ,88
23
- datamarket-0.9.14.dist-info/RECORD,,
20
+ datamarket-0.9.15.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
21
+ datamarket-0.9.15.dist-info/METADATA,sha256=LzfDrq6DA-PUfIHU19ZOy0n_x_txIMqXu6PKl-pmmxI,6363
22
+ datamarket-0.9.15.dist-info/WHEEL,sha256=fGIA9gx4Qxk2KDKeNJCbOEwSrmLtjWCwzBz351GyrPQ,88
23
+ datamarket-0.9.15.dist-info/RECORD,,