datamarket 0.9.13__py3-none-any.whl → 0.9.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datamarket might be problematic. Click here for more details.
- datamarket/interfaces/alchemy.py +86 -30
- {datamarket-0.9.13.dist-info → datamarket-0.9.15.dist-info}/METADATA +2 -2
- {datamarket-0.9.13.dist-info → datamarket-0.9.15.dist-info}/RECORD +5 -5
- {datamarket-0.9.13.dist-info → datamarket-0.9.15.dist-info}/LICENSE +0 -0
- {datamarket-0.9.13.dist-info → datamarket-0.9.15.dist-info}/WHEEL +0 -0
datamarket/interfaces/alchemy.py
CHANGED
|
@@ -3,27 +3,33 @@
|
|
|
3
3
|
|
|
4
4
|
import logging
|
|
5
5
|
from urllib.parse import quote_plus
|
|
6
|
+
from typing import Any, Iterator, List, Type, TypeVar
|
|
7
|
+
from collections.abc import MutableMapping
|
|
6
8
|
|
|
7
|
-
from sqlalchemy import DDL, create_engine, text
|
|
9
|
+
from sqlalchemy import DDL, Result, Select, SQLColumnExpression, create_engine, text
|
|
8
10
|
from sqlalchemy.dialects.postgresql import insert
|
|
9
11
|
from sqlalchemy.exc import IntegrityError
|
|
10
12
|
from sqlalchemy.orm import sessionmaker
|
|
13
|
+
from sqlalchemy import FrozenResult
|
|
14
|
+
from sqlalchemy.ext.declarative import DeclarativeMeta
|
|
11
15
|
|
|
12
16
|
########################################################################################################################
|
|
13
17
|
# CLASSES
|
|
14
18
|
|
|
15
19
|
logger = logging.getLogger(__name__)
|
|
16
20
|
|
|
21
|
+
ModelType = TypeVar("ModelType", bound=DeclarativeMeta)
|
|
22
|
+
|
|
17
23
|
|
|
18
24
|
class MockContext:
|
|
19
|
-
def __init__(self, column):
|
|
25
|
+
def __init__(self, column: SQLColumnExpression) -> None:
|
|
20
26
|
self.current_parameters = {}
|
|
21
27
|
self.current_column = column
|
|
22
28
|
self.connection = None
|
|
23
29
|
|
|
24
30
|
|
|
25
31
|
class AlchemyInterface:
|
|
26
|
-
def __init__(self, config):
|
|
32
|
+
def __init__(self, config: MutableMapping) -> None:
|
|
27
33
|
if "db" in config:
|
|
28
34
|
self.config = config["db"]
|
|
29
35
|
|
|
@@ -43,7 +49,7 @@ class AlchemyInterface:
|
|
|
43
49
|
)
|
|
44
50
|
|
|
45
51
|
@staticmethod
|
|
46
|
-
def get_schema_from_table(table):
|
|
52
|
+
def get_schema_from_table(table: Type[ModelType]) -> str:
|
|
47
53
|
schema = "public"
|
|
48
54
|
|
|
49
55
|
if isinstance(table.__table_args__, tuple):
|
|
@@ -59,7 +65,7 @@ class AlchemyInterface:
|
|
|
59
65
|
|
|
60
66
|
return schema
|
|
61
67
|
|
|
62
|
-
def create_tables(self, tables):
|
|
68
|
+
def create_tables(self, tables: List[Type[ModelType]]) -> None:
|
|
63
69
|
for table in tables:
|
|
64
70
|
schema = self.get_schema_from_table(table)
|
|
65
71
|
|
|
@@ -82,7 +88,7 @@ class AlchemyInterface:
|
|
|
82
88
|
else:
|
|
83
89
|
logger.info(f"table {table.__tablename__} already exists")
|
|
84
90
|
|
|
85
|
-
def drop_tables(self, tables):
|
|
91
|
+
def drop_tables(self, tables: List[Type[ModelType]]) -> None:
|
|
86
92
|
for table in tables:
|
|
87
93
|
schema = self.get_schema_from_table(table)
|
|
88
94
|
|
|
@@ -98,13 +104,40 @@ class AlchemyInterface:
|
|
|
98
104
|
conn.execute(DDL(f"DROP TABLE {schema}.{table.__tablename__} CASCADE"))
|
|
99
105
|
conn.commit()
|
|
100
106
|
|
|
101
|
-
def reset_db(self, tables, drop):
|
|
107
|
+
def reset_db(self, tables: List[Type[ModelType]], drop: bool = False) -> None:
|
|
102
108
|
if drop:
|
|
103
109
|
self.drop_tables(tables)
|
|
104
110
|
|
|
105
111
|
self.create_tables(tables)
|
|
106
112
|
|
|
107
|
-
def insert_alchemy_obj(self, alchemy_obj, silent=False):
|
|
113
|
+
def reset_column(self, query_results: List[Result[Any]], column_name: str) -> None:
|
|
114
|
+
if not query_results:
|
|
115
|
+
logger.warning("No objects to reset column for.")
|
|
116
|
+
return
|
|
117
|
+
|
|
118
|
+
first_obj = query_results[0]
|
|
119
|
+
model_class = first_obj.__class__
|
|
120
|
+
table = model_class.__table__
|
|
121
|
+
|
|
122
|
+
if column_name not in table.columns:
|
|
123
|
+
logger.warning(f"Column {column_name} does not exist in table {table.name}.")
|
|
124
|
+
return
|
|
125
|
+
|
|
126
|
+
column = table.columns[column_name]
|
|
127
|
+
|
|
128
|
+
if column.server_default is not None:
|
|
129
|
+
query_results.update({column_name: text("DEFAULT")}, synchronize_session=False)
|
|
130
|
+
elif column.default is not None:
|
|
131
|
+
default_value = column.default.arg
|
|
132
|
+
if callable(default_value):
|
|
133
|
+
default_value = default_value(MockContext(column))
|
|
134
|
+
query_results.update({column_name: default_value}, synchronize_session=False)
|
|
135
|
+
else:
|
|
136
|
+
raise ValueError(f"Column '{column_name}' doesn't have a default value defined.")
|
|
137
|
+
|
|
138
|
+
self.session.commit()
|
|
139
|
+
|
|
140
|
+
def insert_alchemy_obj(self, alchemy_obj: ModelType, silent: bool = False) -> None:
|
|
108
141
|
try:
|
|
109
142
|
if not silent:
|
|
110
143
|
logger.info(f"adding {alchemy_obj}...")
|
|
@@ -118,7 +151,7 @@ class AlchemyInterface:
|
|
|
118
151
|
|
|
119
152
|
self.session.rollback()
|
|
120
153
|
|
|
121
|
-
def upsert_alchemy_obj(self, alchemy_obj, index_elements, silent=False):
|
|
154
|
+
def upsert_alchemy_obj(self, alchemy_obj: ModelType, index_elements: List[str], silent: bool = False) -> None:
|
|
122
155
|
if not silent:
|
|
123
156
|
logger.info(f"upserting {alchemy_obj}")
|
|
124
157
|
|
|
@@ -144,29 +177,52 @@ class AlchemyInterface:
|
|
|
144
177
|
|
|
145
178
|
self.session.rollback()
|
|
146
179
|
|
|
147
|
-
def reset_column(self, query_results, column_name):
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
180
|
+
def windowed_query(
|
|
181
|
+
self,
|
|
182
|
+
stmt: Select[Any],
|
|
183
|
+
column: SQLColumnExpression[Any],
|
|
184
|
+
windowsize: int,
|
|
185
|
+
) -> Iterator[Result[Any]]:
|
|
186
|
+
"""
|
|
187
|
+
Executes a windowed query on the given statement.
|
|
151
188
|
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
189
|
+
Args:
|
|
190
|
+
stmt: The SQL select statement to execute.
|
|
191
|
+
column: The column to use for windowing and sorting.
|
|
192
|
+
windowsize: The number of rows to fetch in each window.
|
|
155
193
|
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
return
|
|
194
|
+
Returns:
|
|
195
|
+
An iterator of Result objects, each containing a window of data.
|
|
159
196
|
|
|
160
|
-
|
|
197
|
+
Source: https://github.com/sqlalchemy/sqlalchemy/wiki/RangeQuery-and-WindowedRangeQuery
|
|
198
|
+
"""
|
|
161
199
|
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
default_value = column.default.arg
|
|
166
|
-
if callable(default_value):
|
|
167
|
-
default_value = default_value(MockContext(column))
|
|
168
|
-
query_results.update({column_name: default_value}, synchronize_session=False)
|
|
169
|
-
else:
|
|
170
|
-
raise ValueError(f"Column '{column_name}' doesn't have a default value defined.")
|
|
200
|
+
# Add the windowing column to the statement and sort by it for deterministic results
|
|
201
|
+
stmt = stmt.add_columns(column).order_by(column)
|
|
202
|
+
last_id = None
|
|
171
203
|
|
|
172
|
-
|
|
204
|
+
while True:
|
|
205
|
+
subq = stmt
|
|
206
|
+
|
|
207
|
+
if last_id is not None:
|
|
208
|
+
subq = subq.filter(column > last_id)
|
|
209
|
+
|
|
210
|
+
result: Result = self.session.execute(subq.limit(windowsize))
|
|
211
|
+
|
|
212
|
+
# Create a FrozenResult to allow peeking at the data without consuming
|
|
213
|
+
frozen_result: FrozenResult = result.freeze()
|
|
214
|
+
chunk = frozen_result().all()
|
|
215
|
+
|
|
216
|
+
if not chunk:
|
|
217
|
+
break
|
|
218
|
+
|
|
219
|
+
result_width = len(chunk)
|
|
220
|
+
last_id = chunk[-1][-1]
|
|
221
|
+
|
|
222
|
+
# Create a new Result object from the FrozenResult
|
|
223
|
+
yield_result: Result = frozen_result()
|
|
224
|
+
|
|
225
|
+
# Remove the windowing column from the result
|
|
226
|
+
yield_result = yield_result.columns(*list(range(0, result_width - 1)))
|
|
227
|
+
|
|
228
|
+
yield yield_result
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: datamarket
|
|
3
|
-
Version: 0.9.13
|
|
3
|
+
Version: 0.9.15
|
|
4
4
|
Summary: Utilities that integrate advanced scraping knowledge into just one library.
|
|
5
5
|
License: GPL-3.0-or-later
|
|
6
6
|
Author: DataMarket
|
|
@@ -85,7 +85,7 @@ Requires-Dist: httpx[http2] (>=0.28.0,<0.29.0) ; extra == "httpx"
|
|
|
85
85
|
Requires-Dist: jinja2 (>=3.0.0,<4.0.0)
|
|
86
86
|
Requires-Dist: json5 (>=0.10.0,<0.11.0) ; extra == "json5"
|
|
87
87
|
Requires-Dist: lxml[html-clean] (>=5.0.0,<6.0.0) ; extra == "lxml"
|
|
88
|
-
Requires-Dist: nodriver (
|
|
88
|
+
Requires-Dist: nodriver (>=0.44,<0.45) ; extra == "nodriver"
|
|
89
89
|
Requires-Dist: openpyxl (>=3.0.0,<4.0.0) ; extra == "openpyxl"
|
|
90
90
|
Requires-Dist: pandas (>=2.0.0,<3.0.0) ; extra == "pandas"
|
|
91
91
|
Requires-Dist: pandera (>=0.22.0,<0.23.0) ; extra == "pandera"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
datamarket/__init__.py,sha256=FHS77P9qNewKMoN-p0FLEUEC60oWIYup1QkbJZP4ays,12
|
|
2
2
|
datamarket/interfaces/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
|
-
datamarket/interfaces/alchemy.py,sha256=
|
|
3
|
+
datamarket/interfaces/alchemy.py,sha256=Y1wKSEBlWXCE3-6JypdaUL2i_7FAGa9PK4_zQHeM2SE,8563
|
|
4
4
|
datamarket/interfaces/aws.py,sha256=7KLUeBxmPN7avEMPsu5HC_KHB1N7W6Anp2X8fo43mlw,2383
|
|
5
5
|
datamarket/interfaces/drive.py,sha256=shbV5jpQVe_KPE-8Idx6Z9te5Zu1SmVfrvSAyd9ZIgE,2915
|
|
6
6
|
datamarket/interfaces/ftp.py,sha256=o0KlJxtksbop9OjCiQRzyAa2IeG_ExVXagS6apwrAQo,1881
|
|
@@ -17,7 +17,7 @@ datamarket/utils/main.py,sha256=O6rX-65h4h0j2zs9dofdTPlly5reKDnvgLtTwbLmbWg,6529
|
|
|
17
17
|
datamarket/utils/selenium.py,sha256=IMKlbLzXABFhACnWzhHmB0l2hhVzNwHGZwbo14nEewQ,2499
|
|
18
18
|
datamarket/utils/soda.py,sha256=eZTXFbI1P3WoMd1MM-YjoVTpdjTcDSWuvBb7ViBMhSQ,941
|
|
19
19
|
datamarket/utils/typer.py,sha256=FDF3l6gh3UlAFPsHCtesnekvct2rKz0oFn3uKARBQvE,814
|
|
20
|
-
datamarket-0.9.13.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
21
|
-
datamarket-0.9.
|
|
22
|
-
datamarket-0.9.13.dist-info/WHEEL,sha256=fGIA9gx4Qxk2KDKeNJCbOEwSrmLtjWCwzBz351GyrPQ,88
|
|
23
|
-
datamarket-0.9.13.dist-info/RECORD,,
|
|
20
|
+
datamarket-0.9.15.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
21
|
+
datamarket-0.9.15.dist-info/METADATA,sha256=LzfDrq6DA-PUfIHU19ZOy0n_x_txIMqXu6PKl-pmmxI,6363
|
|
22
|
+
datamarket-0.9.15.dist-info/WHEEL,sha256=fGIA9gx4Qxk2KDKeNJCbOEwSrmLtjWCwzBz351GyrPQ,88
|
|
23
|
+
datamarket-0.9.15.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|