udata-hydra 2.2.2.dev7611__tar.gz → 2.2.2.dev7633__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/PKG-INFO +1 -1
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/pyproject.toml +1 -1
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/analysis/csv.py +29 -59
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/analysis/helpers.py +0 -22
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/utils/__init__.py +0 -2
- udata_hydra-2.2.2.dev7633/udata_hydra/utils/parquet.py +14 -0
- udata_hydra-2.2.2.dev7611/udata_hydra/utils/parquet.py +0 -29
- udata_hydra-2.2.2.dev7611/udata_hydra/utils/reader.py +0 -69
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/README.md +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/__init__.py +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/analysis/__init__.py +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/analysis/geojson.py +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/analysis/resource.py +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/app.py +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/cli.py +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/config_default.toml +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/context.py +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/crawl/__init__.py +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/crawl/calculate_next_check.py +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/crawl/check_resources.py +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/crawl/helpers.py +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/crawl/preprocess_check_data.py +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/crawl/select_batch.py +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/db/__init__.py +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/db/check.py +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/db/resource.py +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/db/resource_exception.py +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/logger.py +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/__init__.py +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/csv/20221205_initial_up_rev1.sql +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/csv/20230130_drop_migrations.sql +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/csv/20230206_datetime_aware.sql +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/csv/20240827_add_indexes_column_to_tables_index_table.sql +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/csv/20250610_migrate_resources_exception.sql +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/csv/20250626_delete_datetime_iso_references.sql +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/main/20221205_initial_up_rev1.sql +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/main/20221206_rev1_up_rev2.sql +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/main/20221206_rev2_up_rev3.sql +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/main/20221208_rev3_up_rev4.sql +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/main/20221208_rev4_up_rev5.sql +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/main/20230119_rev5_up_rev6.sql +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/main/20230121_rev6_up_rev7.sql +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/main/20230121_rev7_up_rev8.sql +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/main/20230130_drop_migrations.sql +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/main/20230206_datetime_aware.sql +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/main/20230515_rev8_up_rev9.sql +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/main/20230606_rev9_up_rev10.sql +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/main/20231102_drop_csv_analysis.sql +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/main/20240827_add_resources_exceptions_table.sql +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/main/20240926_add_indexes.sql +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/main/20241004_add_comment_column_to_resources_exceptions.sql +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/main/20241021_add_parquet_columns.sql +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/main/20241023_alter_foreign_key.sql +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/main/20241025_add_next_check_column.sql +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/main/20250108_add_indexes.sql +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/main/20250130_add_pmtiles_fields.sql +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/main/20250519_add_format_column_catalog.sql +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/main/20250610_migrate_resources_exception.sql +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/main/20250611_add_status_since_catalog.sql +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/routes/__init__.py +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/routes/checks.py +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/routes/resources.py +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/routes/resources_exceptions.py +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/routes/status.py +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/schemas/__init__.py +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/schemas/check.py +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/schemas/resource.py +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/schemas/resource_exception.py +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/utils/auth.py +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/utils/csv.py +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/utils/db.py +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/utils/errors.py +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/utils/file.py +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/utils/geojson.py +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/utils/http.py +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/utils/minio.py +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/utils/queue.py +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/utils/timer.py +0 -0
- {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/worker.py +0 -0
|
@@ -5,12 +5,14 @@ import logging
|
|
|
5
5
|
import os
|
|
6
6
|
import sys
|
|
7
7
|
from datetime import datetime, timezone
|
|
8
|
-
from
|
|
8
|
+
from math import isnan
|
|
9
|
+
from typing import Iterator
|
|
9
10
|
|
|
11
|
+
import pandas as pd
|
|
10
12
|
from asyncpg import Record
|
|
13
|
+
from csv_detective import routine as csv_detective_routine
|
|
14
|
+
from csv_detective import validate_then_detect
|
|
11
15
|
from csv_detective.detection.engine import engine_to_file
|
|
12
|
-
from csv_detective.explore_csv import routine as csv_detective_routine
|
|
13
|
-
from csv_detective.explore_csv import validate_then_detect
|
|
14
16
|
from progressist import ProgressBar
|
|
15
17
|
from slugify import slugify
|
|
16
18
|
from sqlalchemy import (
|
|
@@ -28,8 +30,6 @@ from sqlalchemy import (
|
|
|
28
30
|
)
|
|
29
31
|
from sqlalchemy.dialects.postgresql import asyncpg
|
|
30
32
|
from sqlalchemy.schema import CreateIndex, CreateTable, Index
|
|
31
|
-
from str2bool import str2bool
|
|
32
|
-
from str2float import str2float
|
|
33
33
|
|
|
34
34
|
from udata_hydra import config, context
|
|
35
35
|
from udata_hydra.analysis import helpers
|
|
@@ -40,7 +40,6 @@ from udata_hydra.db.resource_exception import ResourceException
|
|
|
40
40
|
from udata_hydra.utils import (
|
|
41
41
|
IOException,
|
|
42
42
|
ParseException,
|
|
43
|
-
Reader,
|
|
44
43
|
Timer,
|
|
45
44
|
detect_tabular_from_headers,
|
|
46
45
|
handle_parse_exception,
|
|
@@ -71,17 +70,6 @@ PYTHON_TYPE_TO_PG = {
|
|
|
71
70
|
"datetime_aware": DateTime(timezone=True),
|
|
72
71
|
}
|
|
73
72
|
|
|
74
|
-
PYTHON_TYPE_TO_PY = {
|
|
75
|
-
"string": str,
|
|
76
|
-
"float": float,
|
|
77
|
-
"int": int,
|
|
78
|
-
"bool": bool,
|
|
79
|
-
"json": helpers.to_json,
|
|
80
|
-
"date": helpers.to_date,
|
|
81
|
-
"datetime": helpers.to_datetime,
|
|
82
|
-
"datetime_aware": helpers.to_datetime,
|
|
83
|
-
}
|
|
84
|
-
|
|
85
73
|
RESERVED_COLS = ("__id", "cmin", "cmax", "collation", "ctid", "tableoid", "xmin", "xmax")
|
|
86
74
|
minio_client = MinIOClient(bucket=config.MINIO_PARQUET_BUCKET, folder=config.MINIO_PARQUET_FOLDER)
|
|
87
75
|
|
|
@@ -130,17 +118,21 @@ async def analyse_csv(
|
|
|
130
118
|
try:
|
|
131
119
|
previous_analysis: dict | None = await get_previous_analysis(resource_id=resource_id)
|
|
132
120
|
if previous_analysis:
|
|
133
|
-
csv_inspection
|
|
121
|
+
csv_inspection, df = validate_then_detect(
|
|
134
122
|
file_path=tmp_file.name,
|
|
135
123
|
previous_analysis=previous_analysis,
|
|
136
124
|
output_profile=True,
|
|
125
|
+
output_df=True,
|
|
126
|
+
cast_json=False,
|
|
137
127
|
num_rows=-1,
|
|
138
128
|
save_results=False,
|
|
139
129
|
)
|
|
140
130
|
else:
|
|
141
|
-
csv_inspection
|
|
131
|
+
csv_inspection, df = csv_detective_routine(
|
|
142
132
|
file_path=tmp_file.name,
|
|
143
133
|
output_profile=True,
|
|
134
|
+
output_df=True,
|
|
135
|
+
cast_json=False,
|
|
144
136
|
num_rows=-1,
|
|
145
137
|
save_results=False,
|
|
146
138
|
)
|
|
@@ -151,7 +143,7 @@ async def analyse_csv(
|
|
|
151
143
|
timer.mark("csv-inspection")
|
|
152
144
|
|
|
153
145
|
await csv_to_db(
|
|
154
|
-
|
|
146
|
+
df=df,
|
|
155
147
|
inspection=csv_inspection,
|
|
156
148
|
table_name=table_name,
|
|
157
149
|
table_indexes=table_indexes,
|
|
@@ -162,7 +154,7 @@ async def analyse_csv(
|
|
|
162
154
|
|
|
163
155
|
try:
|
|
164
156
|
parquet_args: tuple[str, int] | None = await csv_to_parquet(
|
|
165
|
-
|
|
157
|
+
df=df,
|
|
166
158
|
inspection=csv_inspection,
|
|
167
159
|
resource_id=resource_id,
|
|
168
160
|
)
|
|
@@ -219,26 +211,6 @@ async def get_previous_analysis(resource_id: str) -> dict | None:
|
|
|
219
211
|
return analysis
|
|
220
212
|
|
|
221
213
|
|
|
222
|
-
def smart_cast(_type: str, value, failsafe: bool = False) -> Any:
|
|
223
|
-
try:
|
|
224
|
-
if value is None or value == "":
|
|
225
|
-
return None
|
|
226
|
-
if _type == "bool":
|
|
227
|
-
return str2bool(value)
|
|
228
|
-
return PYTHON_TYPE_TO_PY[_type](value)
|
|
229
|
-
except ValueError as e:
|
|
230
|
-
if _type == "int":
|
|
231
|
-
_value = str2float(value, default=None)
|
|
232
|
-
if _value:
|
|
233
|
-
return int(_value)
|
|
234
|
-
elif _type == "float":
|
|
235
|
-
return str2float(value, default=None)
|
|
236
|
-
if not failsafe:
|
|
237
|
-
raise e
|
|
238
|
-
log.warning(f'Could not convert "{value}" to {_type}, defaulting to null')
|
|
239
|
-
return None
|
|
240
|
-
|
|
241
|
-
|
|
242
214
|
def compute_create_table_query(
|
|
243
215
|
table_name: str, columns: dict, indexes: dict[str, str] | None = None
|
|
244
216
|
) -> str:
|
|
@@ -255,7 +227,8 @@ def compute_create_table_query(
|
|
|
255
227
|
for col_name, index_type in indexes.items():
|
|
256
228
|
if index_type not in config.SQL_INDEXES_TYPES_SUPPORTED:
|
|
257
229
|
log.error(
|
|
258
|
-
f'Index type "{index_type}" is unknown or not supported yet!
|
|
230
|
+
f'Index type "{index_type}" is unknown or not supported yet! '
|
|
231
|
+
f"Index for column {col_name} was not created."
|
|
259
232
|
)
|
|
260
233
|
continue
|
|
261
234
|
|
|
@@ -267,7 +240,8 @@ def compute_create_table_query(
|
|
|
267
240
|
table.append_constraint(Index(index_name, col_name))
|
|
268
241
|
except KeyError:
|
|
269
242
|
raise KeyError(
|
|
270
|
-
f'Error creating index "{index_name}" on column "{col_name}".
|
|
243
|
+
f'Error creating index "{index_name}" on column "{col_name}". '
|
|
244
|
+
f'Does the column "{col_name}" exist in the table?'
|
|
271
245
|
)
|
|
272
246
|
# TODO: other index types. Not easy with sqlalchemy, maybe use raw sql?
|
|
273
247
|
|
|
@@ -289,17 +263,15 @@ def compute_create_table_query(
|
|
|
289
263
|
return query
|
|
290
264
|
|
|
291
265
|
|
|
292
|
-
def generate_records(
|
|
293
|
-
#
|
|
294
|
-
#
|
|
295
|
-
|
|
296
|
-
for
|
|
297
|
-
if line:
|
|
298
|
-
yield [smart_cast(t, v, failsafe=True) for t, v in zip(columns.values(), line)]
|
|
266
|
+
def generate_records(df: pd.DataFrame) -> Iterator[list]:
|
|
267
|
+
# pandas cannot have None in columns typed as int so we have to cast
|
|
268
|
+
# NaN-int values to None for db insertion, and we also change NaN to None
|
|
269
|
+
for row in df.values:
|
|
270
|
+
yield tuple(cell if not pd.isna(cell) else None for cell in row)
|
|
299
271
|
|
|
300
272
|
|
|
301
273
|
async def csv_to_parquet(
|
|
302
|
-
|
|
274
|
+
df: pd.DataFrame,
|
|
303
275
|
inspection: dict,
|
|
304
276
|
resource_id: str | None = None,
|
|
305
277
|
) -> tuple[str, int] | None:
|
|
@@ -334,11 +306,9 @@ async def csv_to_parquet(
|
|
|
334
306
|
# Update resource status to CONVERTING_TO_PARQUET
|
|
335
307
|
await Resource.update(resource_id, {"status": "CONVERTING_TO_PARQUET"})
|
|
336
308
|
|
|
337
|
-
columns = {c: v["python_type"] for c, v in inspection["columns"].items()}
|
|
338
309
|
# save the file as parquet and store it on Minio instance
|
|
339
310
|
parquet_file, _ = save_as_parquet(
|
|
340
|
-
|
|
341
|
-
columns=columns,
|
|
311
|
+
df=df,
|
|
342
312
|
output_filename=resource_id,
|
|
343
313
|
)
|
|
344
314
|
parquet_size: int = os.path.getsize(parquet_file)
|
|
@@ -347,7 +317,7 @@ async def csv_to_parquet(
|
|
|
347
317
|
|
|
348
318
|
|
|
349
319
|
async def csv_to_db(
|
|
350
|
-
|
|
320
|
+
df: pd.DataFrame,
|
|
351
321
|
inspection: dict,
|
|
352
322
|
table_name: str,
|
|
353
323
|
table_indexes: dict[str, str] | None = None,
|
|
@@ -401,8 +371,8 @@ async def csv_to_db(
|
|
|
401
371
|
try:
|
|
402
372
|
await db.copy_records_to_table(
|
|
403
373
|
table_name,
|
|
404
|
-
records=generate_records(
|
|
405
|
-
columns=columns.keys(),
|
|
374
|
+
records=generate_records(df),
|
|
375
|
+
columns=list(columns.keys()),
|
|
406
376
|
)
|
|
407
377
|
except Exception as e: # I know what I'm doing, pinky swear
|
|
408
378
|
raise ParseException(
|
|
@@ -411,8 +381,8 @@ async def csv_to_db(
|
|
|
411
381
|
# this inserts rows from iterator one by one, slow but useful for debugging
|
|
412
382
|
else:
|
|
413
383
|
bar = ProgressBar(total=inspection["total_lines"])
|
|
414
|
-
for r in bar.iter(generate_records(
|
|
415
|
-
data = {k: v for k, v in zip(columns
|
|
384
|
+
for r in bar.iter(generate_records(df)):
|
|
385
|
+
data = {k: v for k, v in zip(df.columns, r)}
|
|
416
386
|
# NB: possible sql injection here, but should not be used in prod
|
|
417
387
|
q = compute_insert_query(table_name=table_name, data=data, returning="__id")
|
|
418
388
|
await db.execute(q, *data.values())
|
|
@@ -1,34 +1,12 @@
|
|
|
1
1
|
import json
|
|
2
|
-
from datetime import date, datetime
|
|
3
2
|
from typing import IO
|
|
4
3
|
|
|
5
4
|
from asyncpg import Record
|
|
6
|
-
from dateparser import parse as date_parser
|
|
7
|
-
from dateutil.parser import ParserError
|
|
8
|
-
from dateutil.parser import parse as dateutil_parser
|
|
9
5
|
|
|
10
6
|
from udata_hydra import config
|
|
11
7
|
from udata_hydra.utils import UdataPayload, download_resource, queue, send
|
|
12
8
|
|
|
13
9
|
|
|
14
|
-
def to_json(value: str) -> str:
|
|
15
|
-
"""Convenience method, should be casted from string directly by postgres"""
|
|
16
|
-
return value
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
def to_datetime(value: str) -> datetime | None:
|
|
20
|
-
"""For performance reasons, we try first with dateutil and fallback on dateparser"""
|
|
21
|
-
try:
|
|
22
|
-
return dateutil_parser(value)
|
|
23
|
-
except ParserError:
|
|
24
|
-
return date_parser(value)
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
def to_date(value: str) -> date | None:
|
|
28
|
-
parsed = to_datetime(value)
|
|
29
|
-
return parsed.date() if parsed else None
|
|
30
|
-
|
|
31
|
-
|
|
32
10
|
def get_python_type(column: dict) -> str:
|
|
33
11
|
"""Outsourcing the distinction of aware datetimes"""
|
|
34
12
|
return (
|
|
@@ -5,7 +5,6 @@ from .file import compute_checksum_from_file, download_resource
|
|
|
5
5
|
from .geojson import detect_geojson_from_headers_or_catalog
|
|
6
6
|
from .http import UdataPayload, get_request_params, send
|
|
7
7
|
from .queue import enqueue
|
|
8
|
-
from .reader import Reader
|
|
9
8
|
from .timer import Timer
|
|
10
9
|
|
|
11
10
|
__all__ = [
|
|
@@ -21,6 +20,5 @@ __all__ = [
|
|
|
21
20
|
"get_request_params",
|
|
22
21
|
"send",
|
|
23
22
|
"enqueue",
|
|
24
|
-
"Reader",
|
|
25
23
|
"Timer",
|
|
26
24
|
]
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
from io import BytesIO
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def save_as_parquet(
|
|
7
|
+
df: pd.DataFrame,
|
|
8
|
+
output_filename: str | None = None,
|
|
9
|
+
) -> tuple[str, BytesIO | None]:
|
|
10
|
+
bytes = df.to_parquet(
|
|
11
|
+
f"{output_filename}.parquet" if output_filename else None,
|
|
12
|
+
compression="zstd", # best compression to date
|
|
13
|
+
)
|
|
14
|
+
return f"{output_filename}.parquet", bytes
|
|
@@ -1,29 +0,0 @@
|
|
|
1
|
-
from typing import Iterator
|
|
2
|
-
|
|
3
|
-
import pyarrow as pa
|
|
4
|
-
import pyarrow.parquet as pq
|
|
5
|
-
|
|
6
|
-
PYTHON_TYPE_TO_PA = {
|
|
7
|
-
"string": pa.string(),
|
|
8
|
-
"float": pa.float64(),
|
|
9
|
-
"int": pa.int64(),
|
|
10
|
-
"bool": pa.bool_(),
|
|
11
|
-
"json": pa.string(),
|
|
12
|
-
"date": pa.date32(),
|
|
13
|
-
"datetime": pa.date64(),
|
|
14
|
-
}
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
def save_as_parquet(
|
|
18
|
-
records: Iterator[list],
|
|
19
|
-
columns: dict,
|
|
20
|
-
output_filename: str | None = None,
|
|
21
|
-
) -> tuple[str, pa.Table]:
|
|
22
|
-
# the "output_name = None" case is only used in tests
|
|
23
|
-
table = pa.Table.from_pylist(
|
|
24
|
-
[{c: v for c, v in zip(columns, values)} for values in records],
|
|
25
|
-
schema=pa.schema([pa.field(c, PYTHON_TYPE_TO_PA[columns[c]]) for c in columns]),
|
|
26
|
-
)
|
|
27
|
-
if output_filename:
|
|
28
|
-
pq.write_table(table, f"{output_filename}.parquet")
|
|
29
|
-
return f"{output_filename}.parquet", table
|
|
@@ -1,69 +0,0 @@
|
|
|
1
|
-
import csv as stdcsv
|
|
2
|
-
from io import BytesIO
|
|
3
|
-
from typing import Generator
|
|
4
|
-
|
|
5
|
-
import openpyxl
|
|
6
|
-
import xlrd
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
def generate_dialect(inspection: dict) -> stdcsv.Dialect:
|
|
10
|
-
class CustomDialect(stdcsv.unix_dialect):
|
|
11
|
-
# TODO: it would be nice to have more info from csvdetective to feed the dialect
|
|
12
|
-
# in the meantime we might want to sniff the file a bit
|
|
13
|
-
delimiter = inspection["separator"]
|
|
14
|
-
|
|
15
|
-
return CustomDialect()
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
class Reader:
|
|
19
|
-
def __init__(self, file_path, inspection):
|
|
20
|
-
self.file_path = file_path
|
|
21
|
-
self.inspection = inspection
|
|
22
|
-
self.nb_skip = self.inspection["header_row_idx"]
|
|
23
|
-
self.mapping = {
|
|
24
|
-
"openpyxl": "iter_rows",
|
|
25
|
-
"xlrd": "get_rows",
|
|
26
|
-
}
|
|
27
|
-
self.nb_columns = len(self.inspection["header"])
|
|
28
|
-
self.reader = None
|
|
29
|
-
|
|
30
|
-
def __enter__(self):
|
|
31
|
-
if self.inspection.get("engine") == "openpyxl":
|
|
32
|
-
with open(self.file_path, "rb") as f:
|
|
33
|
-
content = BytesIO(f.read())
|
|
34
|
-
self.file = openpyxl.load_workbook(content)
|
|
35
|
-
self.sheet = self.file[self.inspection["sheet_name"]]
|
|
36
|
-
self.reader = self._excel_reader()
|
|
37
|
-
|
|
38
|
-
elif self.inspection.get("engine") == "xlrd":
|
|
39
|
-
self.file = xlrd.open_workbook(self.file_path)
|
|
40
|
-
self.sheet = self.file[self.inspection["sheet_name"]]
|
|
41
|
-
self.reader = self._excel_reader()
|
|
42
|
-
|
|
43
|
-
else:
|
|
44
|
-
self.file = open(self.file_path, encoding=self.inspection["encoding"])
|
|
45
|
-
self.reader = stdcsv.reader(
|
|
46
|
-
self._skip_rows(), dialect=generate_dialect(self.inspection)
|
|
47
|
-
)
|
|
48
|
-
return self
|
|
49
|
-
|
|
50
|
-
def __exit__(self, exc_type, exc_value, traceback):
|
|
51
|
-
if self.file is not None and hasattr(self.file, "close"):
|
|
52
|
-
self.file.close()
|
|
53
|
-
|
|
54
|
-
def _skip_rows(self):
|
|
55
|
-
# skipping header
|
|
56
|
-
for _ in range(self.nb_skip + 1):
|
|
57
|
-
next(self.file)
|
|
58
|
-
return self.file
|
|
59
|
-
|
|
60
|
-
def _excel_reader(self) -> Generator:
|
|
61
|
-
_method = getattr(self.sheet, self.mapping[self.inspection["engine"]])
|
|
62
|
-
for idx, row in enumerate(_method()):
|
|
63
|
-
# skipping header
|
|
64
|
-
if idx <= self.nb_skip:
|
|
65
|
-
continue
|
|
66
|
-
yield [c.value for c in row]
|
|
67
|
-
|
|
68
|
-
def __iter__(self):
|
|
69
|
-
return self.reader
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/crawl/calculate_next_check.py
RENAMED
|
File without changes
|
{udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/crawl/check_resources.py
RENAMED
|
File without changes
|
|
File without changes
|
{udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/crawl/preprocess_check_data.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/db/resource_exception.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/routes/resources_exceptions.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/schemas/resource_exception.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|