udata-hydra 2.2.2.dev7611__tar.gz → 2.2.2.dev7633__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/PKG-INFO +1 -1
  2. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/pyproject.toml +1 -1
  3. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/analysis/csv.py +29 -59
  4. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/analysis/helpers.py +0 -22
  5. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/utils/__init__.py +0 -2
  6. udata_hydra-2.2.2.dev7633/udata_hydra/utils/parquet.py +14 -0
  7. udata_hydra-2.2.2.dev7611/udata_hydra/utils/parquet.py +0 -29
  8. udata_hydra-2.2.2.dev7611/udata_hydra/utils/reader.py +0 -69
  9. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/README.md +0 -0
  10. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/__init__.py +0 -0
  11. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/analysis/__init__.py +0 -0
  12. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/analysis/geojson.py +0 -0
  13. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/analysis/resource.py +0 -0
  14. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/app.py +0 -0
  15. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/cli.py +0 -0
  16. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/config_default.toml +0 -0
  17. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/context.py +0 -0
  18. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/crawl/__init__.py +0 -0
  19. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/crawl/calculate_next_check.py +0 -0
  20. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/crawl/check_resources.py +0 -0
  21. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/crawl/helpers.py +0 -0
  22. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/crawl/preprocess_check_data.py +0 -0
  23. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/crawl/select_batch.py +0 -0
  24. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/db/__init__.py +0 -0
  25. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/db/check.py +0 -0
  26. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/db/resource.py +0 -0
  27. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/db/resource_exception.py +0 -0
  28. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/logger.py +0 -0
  29. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/__init__.py +0 -0
  30. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/csv/20221205_initial_up_rev1.sql +0 -0
  31. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/csv/20230130_drop_migrations.sql +0 -0
  32. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/csv/20230206_datetime_aware.sql +0 -0
  33. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/csv/20240827_add_indexes_column_to_tables_index_table.sql +0 -0
  34. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/csv/20250610_migrate_resources_exception.sql +0 -0
  35. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/csv/20250626_delete_datetime_iso_references.sql +0 -0
  36. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/main/20221205_initial_up_rev1.sql +0 -0
  37. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/main/20221206_rev1_up_rev2.sql +0 -0
  38. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/main/20221206_rev2_up_rev3.sql +0 -0
  39. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/main/20221208_rev3_up_rev4.sql +0 -0
  40. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/main/20221208_rev4_up_rev5.sql +0 -0
  41. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/main/20230119_rev5_up_rev6.sql +0 -0
  42. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/main/20230121_rev6_up_rev7.sql +0 -0
  43. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/main/20230121_rev7_up_rev8.sql +0 -0
  44. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/main/20230130_drop_migrations.sql +0 -0
  45. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/main/20230206_datetime_aware.sql +0 -0
  46. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/main/20230515_rev8_up_rev9.sql +0 -0
  47. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/main/20230606_rev9_up_rev10.sql +0 -0
  48. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/main/20231102_drop_csv_analysis.sql +0 -0
  49. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/main/20240827_add_resources_exceptions_table.sql +0 -0
  50. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/main/20240926_add_indexes.sql +0 -0
  51. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/main/20241004_add_comment_column_to_resources_exceptions.sql +0 -0
  52. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/main/20241021_add_parquet_columns.sql +0 -0
  53. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/main/20241023_alter_foreign_key.sql +0 -0
  54. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/main/20241025_add_next_check_column.sql +0 -0
  55. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/main/20250108_add_indexes.sql +0 -0
  56. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/main/20250130_add_pmtiles_fields.sql +0 -0
  57. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/main/20250519_add_format_column_catalog.sql +0 -0
  58. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/main/20250610_migrate_resources_exception.sql +0 -0
  59. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/migrations/main/20250611_add_status_since_catalog.sql +0 -0
  60. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/routes/__init__.py +0 -0
  61. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/routes/checks.py +0 -0
  62. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/routes/resources.py +0 -0
  63. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/routes/resources_exceptions.py +0 -0
  64. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/routes/status.py +0 -0
  65. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/schemas/__init__.py +0 -0
  66. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/schemas/check.py +0 -0
  67. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/schemas/resource.py +0 -0
  68. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/schemas/resource_exception.py +0 -0
  69. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/utils/auth.py +0 -0
  70. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/utils/csv.py +0 -0
  71. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/utils/db.py +0 -0
  72. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/utils/errors.py +0 -0
  73. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/utils/file.py +0 -0
  74. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/utils/geojson.py +0 -0
  75. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/utils/http.py +0 -0
  76. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/utils/minio.py +0 -0
  77. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/utils/queue.py +0 -0
  78. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/utils/timer.py +0 -0
  79. {udata_hydra-2.2.2.dev7611 → udata_hydra-2.2.2.dev7633}/udata_hydra/worker.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: udata-hydra
-Version: 2.2.2.dev7611
+Version: 2.2.2.dev7633
 Summary: Async crawler and parsing service for data.gouv.fr
 License: MIT
 Author: Opendata Team
pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "udata-hydra"
-version = "2.2.2.dev7611"
+version = "2.2.2.dev7633"
 description = "Async crawler and parsing service for data.gouv.fr"
 authors = [{ name = "Opendata Team", email = "opendatateam@data.gouv.fr" }]
 dependencies = [
udata_hydra/analysis/csv.py
@@ -5,12 +5,14 @@ import logging
 import os
 import sys
 from datetime import datetime, timezone
-from typing import Any, Iterator
+from math import isnan
+from typing import Iterator
 
+import pandas as pd
 from asyncpg import Record
+from csv_detective import routine as csv_detective_routine
+from csv_detective import validate_then_detect
 from csv_detective.detection.engine import engine_to_file
-from csv_detective.explore_csv import routine as csv_detective_routine
-from csv_detective.explore_csv import validate_then_detect
 from progressist import ProgressBar
 from slugify import slugify
 from sqlalchemy import (
@@ -28,8 +30,6 @@ from sqlalchemy import (
 )
 from sqlalchemy.dialects.postgresql import asyncpg
 from sqlalchemy.schema import CreateIndex, CreateTable, Index
-from str2bool import str2bool
-from str2float import str2float
 
 from udata_hydra import config, context
 from udata_hydra.analysis import helpers
@@ -40,7 +40,6 @@ from udata_hydra.db.resource_exception import ResourceException
 from udata_hydra.utils import (
     IOException,
     ParseException,
-    Reader,
     Timer,
     detect_tabular_from_headers,
     handle_parse_exception,
@@ -71,17 +70,6 @@ PYTHON_TYPE_TO_PG = {
     "datetime_aware": DateTime(timezone=True),
 }
 
-PYTHON_TYPE_TO_PY = {
-    "string": str,
-    "float": float,
-    "int": int,
-    "bool": bool,
-    "json": helpers.to_json,
-    "date": helpers.to_date,
-    "datetime": helpers.to_datetime,
-    "datetime_aware": helpers.to_datetime,
-}
-
 RESERVED_COLS = ("__id", "cmin", "cmax", "collation", "ctid", "tableoid", "xmin", "xmax")
 minio_client = MinIOClient(bucket=config.MINIO_PARQUET_BUCKET, folder=config.MINIO_PARQUET_FOLDER)
 
@@ -130,17 +118,21 @@ async def analyse_csv(
     try:
         previous_analysis: dict | None = await get_previous_analysis(resource_id=resource_id)
         if previous_analysis:
-            csv_inspection: dict = validate_then_detect(
+            csv_inspection, df = validate_then_detect(
                 file_path=tmp_file.name,
                 previous_analysis=previous_analysis,
                 output_profile=True,
+                output_df=True,
+                cast_json=False,
                 num_rows=-1,
                 save_results=False,
             )
         else:
-            csv_inspection: dict | None = csv_detective_routine(
+            csv_inspection, df = csv_detective_routine(
                 file_path=tmp_file.name,
                 output_profile=True,
+                output_df=True,
+                cast_json=False,
                 num_rows=-1,
                 save_results=False,
             )
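Note: with output_df=True, csv_detective now returns the typed DataFrame alongside the inspection report, so the file no longer needs to be re-read and re-cast downstream. A minimal standalone sketch of the new call shape (assuming a csv_detective version that exposes routine at the package top level, as imported above; "example.csv" is a hypothetical file):

    from csv_detective import routine as csv_detective_routine

    # Returns the inspection report and a DataFrame whose columns are
    # already cast to the detected types (JSON columns left as strings
    # because cast_json=False).
    inspection, df = csv_detective_routine(
        file_path="example.csv",
        output_profile=True,
        output_df=True,
        cast_json=False,
        num_rows=-1,  # analyse the whole file, not a sample
        save_results=False,
    )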
@@ -151,7 +143,7 @@ async def analyse_csv(
         timer.mark("csv-inspection")
 
         await csv_to_db(
-            file_path=tmp_file.name,
+            df=df,
             inspection=csv_inspection,
             table_name=table_name,
             table_indexes=table_indexes,
@@ -162,7 +154,7 @@ async def analyse_csv(
 
     try:
         parquet_args: tuple[str, int] | None = await csv_to_parquet(
-            file_path=tmp_file.name,
+            df=df,
             inspection=csv_inspection,
             resource_id=resource_id,
         )
@@ -219,26 +211,6 @@ async def get_previous_analysis(resource_id: str) -> dict | None:
     return analysis
 
 
-def smart_cast(_type: str, value, failsafe: bool = False) -> Any:
-    try:
-        if value is None or value == "":
-            return None
-        if _type == "bool":
-            return str2bool(value)
-        return PYTHON_TYPE_TO_PY[_type](value)
-    except ValueError as e:
-        if _type == "int":
-            _value = str2float(value, default=None)
-            if _value:
-                return int(_value)
-        elif _type == "float":
-            return str2float(value, default=None)
-        if not failsafe:
-            raise e
-        log.warning(f'Could not convert "{value}" to {_type}, defaulting to null')
-        return None
-
-
 def compute_create_table_query(
     table_name: str, columns: dict, indexes: dict[str, str] | None = None
 ) -> str:
@@ -255,7 +227,8 @@ def compute_create_table_query(
     for col_name, index_type in indexes.items():
         if index_type not in config.SQL_INDEXES_TYPES_SUPPORTED:
             log.error(
-                f'Index type "{index_type}" is unknown or not supported yet! Index for column {col_name} was not created.'
+                f'Index type "{index_type}" is unknown or not supported yet! '
+                f"Index for column {col_name} was not created."
             )
             continue
 
@@ -267,7 +240,8 @@ def compute_create_table_query(
                 table.append_constraint(Index(index_name, col_name))
             except KeyError:
                 raise KeyError(
-                    f'Error creating index "{index_name}" on column "{col_name}". Does the column "{col_name}" exist in the table?'
+                    f'Error creating index "{index_name}" on column "{col_name}". '
+                    f'Does the column "{col_name}" exist in the table?'
                 )
     # TODO: other index types. Not easy with sqlalchemy, maybe use raw sql?
 
@@ -289,17 +263,15 @@ def compute_create_table_query(
     return query
 
 
-def generate_records(file_path: str, inspection: dict, columns: dict) -> Iterator[list]:
-    # because we need the iterator twice, not possible to
-    # handle parquet and db through the same iteration
-    with Reader(file_path, inspection) as reader:
-        for line in reader:
-            if line:
-                yield [smart_cast(t, v, failsafe=True) for t, v in zip(columns.values(), line)]
+def generate_records(df: pd.DataFrame) -> Iterator[list]:
+    # pandas cannot have None in columns typed as int so we have to cast
+    # NaN-int values to None for db insertion, and we also change NaN to None
+    for row in df.values:
+        yield tuple(cell if not pd.isna(cell) else None for cell in row)
 
 
 async def csv_to_parquet(
-    file_path: str,
+    df: pd.DataFrame,
     inspection: dict,
     resource_id: str | None = None,
 ) -> tuple[str, int] | None:
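The rewritten generate_records exists because the db driver expects SQL NULL as None, while pandas represents missing values as NaN (and cannot hold None in int-typed columns, as the comment above notes). A runnable illustration of that normalisation, with hypothetical sample data:

    import pandas as pd

    df = pd.DataFrame({"num": [1, None, 3], "label": ["x", "y", None]})

    def generate_records(df):
        # Map every NaN/NaT cell to None so the driver sees NULLs
        for row in df.values:
            yield tuple(cell if not pd.isna(cell) else None for cell in row)

    print(list(generate_records(df)))
    # [(1.0, 'x'), (None, 'y'), (3.0, None)]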
@@ -334,11 +306,9 @@ async def csv_to_parquet(
     # Update resource status to CONVERTING_TO_PARQUET
     await Resource.update(resource_id, {"status": "CONVERTING_TO_PARQUET"})
 
-    columns = {c: v["python_type"] for c, v in inspection["columns"].items()}
     # save the file as parquet and store it on Minio instance
     parquet_file, _ = save_as_parquet(
-        records=generate_records(file_path, inspection, columns),
-        columns=columns,
+        df=df,
         output_filename=resource_id,
     )
     parquet_size: int = os.path.getsize(parquet_file)
@@ -347,7 +317,7 @@ async def csv_to_parquet(
 
 
 async def csv_to_db(
-    file_path: str,
+    df: pd.DataFrame,
     inspection: dict,
     table_name: str,
     table_indexes: dict[str, str] | None = None,
@@ -401,8 +371,8 @@ async def csv_to_db(
     try:
         await db.copy_records_to_table(
             table_name,
-            records=generate_records(file_path, inspection, columns),
-            columns=columns.keys(),
+            records=generate_records(df),
+            columns=list(columns.keys()),
         )
     except Exception as e:  # I know what I'm doing, pinky swear
         raise ParseException(
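For reference, asyncpg's copy_records_to_table streams an iterable of tuples through the COPY protocol, which is why generate_records yields tuples with None standing in for NULL. A hedged usage sketch (the DSN, table, and data are hypothetical):

    import asyncio
    import asyncpg

    async def main():
        conn = await asyncpg.connect("postgresql://localhost/example")
        # Each record is a tuple in column order; None becomes SQL NULL
        await conn.copy_records_to_table(
            "my_table",
            records=[(1, "a"), (None, "b")],
            columns=["num", "label"],
        )
        await conn.close()

    asyncio.run(main())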
@@ -411,8 +381,8 @@ async def csv_to_db(
     # this inserts rows from iterator one by one, slow but useful for debugging
     else:
         bar = ProgressBar(total=inspection["total_lines"])
-        for r in bar.iter(generate_records(file_path, inspection, columns)):
-            data = {k: v for k, v in zip(columns.keys(), r)}
+        for r in bar.iter(generate_records(df)):
+            data = {k: v for k, v in zip(df.columns, r)}
             # NB: possible sql injection here, but should not be used in prod
             q = compute_insert_query(table_name=table_name, data=data, returning="__id")
             await db.execute(q, *data.values())
udata_hydra/analysis/helpers.py
@@ -1,34 +1,12 @@
 import json
-from datetime import date, datetime
 from typing import IO
 
 from asyncpg import Record
-from dateparser import parse as date_parser
-from dateutil.parser import ParserError
-from dateutil.parser import parse as dateutil_parser
 
 from udata_hydra import config
 from udata_hydra.utils import UdataPayload, download_resource, queue, send
 
 
-def to_json(value: str) -> str:
-    """Convenience method, should be casted from string directly by postgres"""
-    return value
-
-
-def to_datetime(value: str) -> datetime | None:
-    """For performance reasons, we try first with dateutil and fallback on dateparser"""
-    try:
-        return dateutil_parser(value)
-    except ParserError:
-        return date_parser(value)
-
-
-def to_date(value: str) -> date | None:
-    parsed = to_datetime(value)
-    return parsed.date() if parsed else None
-
-
 def get_python_type(column: dict) -> str:
     """Outsourcing the distinction of aware datetimes"""
     return (
udata_hydra/utils/__init__.py
@@ -5,7 +5,6 @@ from .file import compute_checksum_from_file, download_resource
 from .geojson import detect_geojson_from_headers_or_catalog
 from .http import UdataPayload, get_request_params, send
 from .queue import enqueue
-from .reader import Reader
 from .timer import Timer
 
 __all__ = [
  __all__ = [
@@ -21,6 +20,5 @@ __all__ = [
21
20
  "get_request_params",
22
21
  "send",
23
22
  "enqueue",
24
- "Reader",
25
23
  "Timer",
26
24
  ]
udata_hydra-2.2.2.dev7633/udata_hydra/utils/parquet.py (new)
@@ -0,0 +1,14 @@
+from io import BytesIO
+
+import pandas as pd
+
+
+def save_as_parquet(
+    df: pd.DataFrame,
+    output_filename: str | None = None,
+) -> tuple[str, BytesIO | None]:
+    bytes = df.to_parquet(
+        f"{output_filename}.parquet" if output_filename else None,
+        compression="zstd",  # best compression to date
+    )
+    return f"{output_filename}.parquet", bytes
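Note: pandas' DataFrame.to_parquet writes to disk when given a path and returns the serialized bytes when the path is None, which is what the new helper relies on. A usage sketch (hypothetical data and filename; requires a parquet engine such as pyarrow with zstd support):

    import pandas as pd
    from udata_hydra.utils.parquet import save_as_parquet

    df = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]})
    # Writes resource-1234.parquet to disk; second return value is None here
    path, raw = save_as_parquet(df, output_filename="resource-1234")
    assert pd.read_parquet(path).equals(df)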
udata_hydra-2.2.2.dev7611/udata_hydra/utils/parquet.py (deleted)
@@ -1,29 +0,0 @@
-from typing import Iterator
-
-import pyarrow as pa
-import pyarrow.parquet as pq
-
-PYTHON_TYPE_TO_PA = {
-    "string": pa.string(),
-    "float": pa.float64(),
-    "int": pa.int64(),
-    "bool": pa.bool_(),
-    "json": pa.string(),
-    "date": pa.date32(),
-    "datetime": pa.date64(),
-}
-
-
-def save_as_parquet(
-    records: Iterator[list],
-    columns: dict,
-    output_filename: str | None = None,
-) -> tuple[str, pa.Table]:
-    # the "output_name = None" case is only used in tests
-    table = pa.Table.from_pylist(
-        [{c: v for c, v in zip(columns, values)} for values in records],
-        schema=pa.schema([pa.field(c, PYTHON_TYPE_TO_PA[columns[c]]) for c in columns]),
-    )
-    if output_filename:
-        pq.write_table(table, f"{output_filename}.parquet")
-    return f"{output_filename}.parquet", table
udata_hydra-2.2.2.dev7611/udata_hydra/utils/reader.py (deleted)
@@ -1,69 +0,0 @@
-import csv as stdcsv
-from io import BytesIO
-from typing import Generator
-
-import openpyxl
-import xlrd
-
-
-def generate_dialect(inspection: dict) -> stdcsv.Dialect:
-    class CustomDialect(stdcsv.unix_dialect):
-        # TODO: it would be nice to have more info from csvdetective to feed the dialect
-        # in the meantime we might want to sniff the file a bit
-        delimiter = inspection["separator"]
-
-    return CustomDialect()
-
-
-class Reader:
-    def __init__(self, file_path, inspection):
-        self.file_path = file_path
-        self.inspection = inspection
-        self.nb_skip = self.inspection["header_row_idx"]
-        self.mapping = {
-            "openpyxl": "iter_rows",
-            "xlrd": "get_rows",
-        }
-        self.nb_columns = len(self.inspection["header"])
-        self.reader = None
-
-    def __enter__(self):
-        if self.inspection.get("engine") == "openpyxl":
-            with open(self.file_path, "rb") as f:
-                content = BytesIO(f.read())
-            self.file = openpyxl.load_workbook(content)
-            self.sheet = self.file[self.inspection["sheet_name"]]
-            self.reader = self._excel_reader()
-
-        elif self.inspection.get("engine") == "xlrd":
-            self.file = xlrd.open_workbook(self.file_path)
-            self.sheet = self.file[self.inspection["sheet_name"]]
-            self.reader = self._excel_reader()
-
-        else:
-            self.file = open(self.file_path, encoding=self.inspection["encoding"])
-            self.reader = stdcsv.reader(
-                self._skip_rows(), dialect=generate_dialect(self.inspection)
-            )
-        return self
-
-    def __exit__(self, exc_type, exc_value, traceback):
-        if self.file is not None and hasattr(self.file, "close"):
-            self.file.close()
-
-    def _skip_rows(self):
-        # skipping header
-        for _ in range(self.nb_skip + 1):
-            next(self.file)
-        return self.file
-
-    def _excel_reader(self) -> Generator:
-        _method = getattr(self.sheet, self.mapping[self.inspection["engine"]])
-        for idx, row in enumerate(_method()):
-            # skipping header
-            if idx <= self.nb_skip:
-                continue
-            yield [c.value for c in row]
-
-    def __iter__(self):
-        return self.reader
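For context, the removed Reader was consumed as a context manager yielding raw rows, as the old generate_records above shows; csv_detective's DataFrame output now replaces this manual iteration and the smart_cast casting layer:

    # Old usage pattern (removed in this release):
    with Reader(file_path, inspection) as reader:
        for line in reader:
            ...  # cast each cell with smart_cast before insertion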