etlplus 0.9.0__py3-none-any.whl → 0.10.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
etlplus/cli/commands.py CHANGED
@@ -443,9 +443,9 @@ def extract_cmd(
443
443
  Source (JSON payload, file/folder path, URL/URI, or - for STDIN)
444
444
  from which to extract data. Default is ``-``.
445
445
  source_format : SourceFormatOption, optional
446
- Source data format. Overrides the inferred format (``csv``, ``json``,
447
- ``parquet``, ``xml``) based on filename extension or STDIN content.
448
- Default is ``None``.
446
+ Data source format. Overrides the inferred format (``csv``, ``json``,
447
+ etc.) based on filename extension or STDIN content. Default is
448
+ ``None``.
449
449
  source_type : SourceTypeOption, optional
450
450
  Data source type. Overrides the inferred type (``api``, ``database``,
451
451
  ``file``, ``folder``) based on URI/URL schema. Default is ``None``.
@@ -523,15 +523,15 @@ def load_cmd(
523
523
  ctx : typer.Context
524
524
  The Typer context.
525
525
  source_format : SourceFormatOption, optional
526
- Source data format. Overrides the inferred format (``csv``, ``json``,
527
- ``parquet``, ``xml``) based on STDIN content. Default is ``None``.
526
+ Data source format. Overrides the inferred format (``csv``, ``json``,
527
+ etc.) based on filename extension or STDIN content. Default is
528
+ ``None``.
528
529
  target : TargetArg, optional
529
530
  Target (file/folder path, URL/URI, or - for STDOUT) into which to load
530
531
  data. Default is ``-``.
531
532
  target_format : TargetFormatOption, optional
532
- Format of the target data. Overrides the inferred format (``csv``,
533
- ``json``, ``parquet``, ``xml``) based on filename extension. Default is
534
- ``None``.
533
+ Target data format. Overrides the inferred format (``csv``, ``json``,
534
+ etc.) based on filename extension. Default is ``None``.
535
535
  target_type : TargetTypeOption, optional
536
536
  Data target type. Overrides the inferred type (``api``, ``database``,
537
537
  ``file``, ``folder``) based on URI/URL schema. Default is ``None``.
@@ -760,9 +760,9 @@ def transform_cmd(
760
760
  Source (JSON payload, file/folder path, URL/URI, or - for STDIN) from
761
761
  which to extract data. Default is ``-``.
762
762
  source_format : SourceFormatOption, optional
763
- Source data format. Overrides the inferred format (``csv``, ``json``,
764
- ``parquet``, ``xml``) based on filename extension or STDIN content.
765
- Default is ``None``.
763
+ Data source format. Overrides the inferred format (``csv``, ``json``,
764
+ etc.) based on filename extension or STDIN content. Default is
765
+ ``None``.
766
766
  source_type : SourceTypeOption, optional
767
767
  Data source type. Overrides the inferred type (``api``, ``database``,
768
768
  ``file``, ``folder``) based on URI/URL schema. Default is ``None``.
@@ -770,9 +770,8 @@ def transform_cmd(
770
770
  Target (file/folder path, URL/URI, or - for STDOUT) into which to load
771
771
  data. Default is ``-``.
772
772
  target_format : TargetFormatOption, optional
773
- Format of the target data. Overrides the inferred format (``csv``,
774
- ``json``, ``parquet``, ``xml``) based on filename extension. Default is
775
- ``None``.
773
+ Target data format. Overrides the inferred format (``csv``, ``json``,
774
+ etc.) based on filename extension. Default is ``None``.
776
775
  target_type : TargetTypeOption, optional
777
776
  Data target type. Overrides the inferred type (``api``, ``database``,
778
777
  ``file``, ``folder``) based on URI/URL schema. Default is ``None``.
@@ -876,11 +875,12 @@ def validate_cmd(
876
875
  source : SourceArg
877
876
  Data source to validate (path, JSON payload, or - for STDIN).
878
877
  source_format : SourceFormatOption, optional
879
- Format of the source. Overrides filename-based inference when provided.
880
- Default is ``None``.
881
- source_type : SourceTypeOption, optional
882
- Override the inferred source type (file, database, api). Default is
878
+ Data source format. Overrides the inferred format (``csv``, ``json``,
879
+ etc.) based on filename extension or STDIN content. Default is
883
880
  ``None``.
881
+ source_type : SourceTypeOption, optional
882
+ Data source type. Overrides the inferred type (``api``, ``database``,
883
+ ``file``, ``folder``) based on URI/URL schema. Default is ``None``.
884
884
  output : OutputOption, optional
885
885
  Output file for validated output (- for STDOUT). Default is ``None``.
886
886
 
etlplus/enums.py CHANGED
@@ -8,6 +8,7 @@ from __future__ import annotations
8
8
 
9
9
  import enum
10
10
  import operator as _op
11
+ from pathlib import PurePath
11
12
  from statistics import fmean
12
13
  from typing import Self
13
14
 
@@ -19,16 +20,21 @@ from .types import StrStrMap
19
20
 
20
21
 
21
22
  __all__ = [
23
+ # Enums
22
24
  'AggregateName',
23
25
  'CoercibleStrEnum',
26
+ 'CompressionFormat',
24
27
  'DataConnectorType',
25
28
  'FileFormat',
26
29
  'HttpMethod',
27
30
  'OperatorName',
28
31
  'PipelineStep',
32
+ # Functions
33
+ 'coerce_compression_format',
29
34
  'coerce_data_connector_type',
30
35
  'coerce_file_format',
31
36
  'coerce_http_method',
37
+ 'infer_file_format_and_compression',
32
38
  ]
33
39
 
34
40
 
@@ -172,6 +178,39 @@ class AggregateName(CoercibleStrEnum):
172
178
  return lambda xs, n: (fmean(xs) if xs else 0.0)
173
179
 
174
180
 
181
class CompressionFormat(CoercibleStrEnum):
    """Compression formats recognized for data files (gzip and zip)."""

    # -- Constants -- #

    GZ = 'gz'
    ZIP = 'zip'

    # -- Class Methods -- #

    @classmethod
    def aliases(cls) -> StrStrMap:
        """
        Return the alternate spellings accepted for each compression format.

        Returns
        -------
        StrStrMap
            A mapping of dotted file extensions and MIME types to the
            canonical member value (``'gz'`` or ``'zip'``).
        """
        # Dotted filename extensions.
        by_extension = {
            '.gz': 'gz',
            '.gzip': 'gz',
            '.zip': 'zip',
        }
        # MIME types.
        by_mime_type = {
            'application/gzip': 'gz',
            'application/x-gzip': 'gz',
            'application/zip': 'zip',
            'application/x-zip-compressed': 'zip',
        }
        return {**by_extension, **by_mime_type}
212
+
213
+
175
214
  class DataConnectorType(CoercibleStrEnum):
176
215
  """Supported data connector types."""
177
216
 
@@ -208,8 +247,19 @@ class FileFormat(CoercibleStrEnum):
208
247
 
209
248
  # -- Constants -- #
210
249
 
250
+ AVRO = 'avro'
211
251
  CSV = 'csv'
252
+ FEATHER = 'feather'
253
+ GZ = 'gz'
212
254
  JSON = 'json'
255
+ NDJSON = 'ndjson'
256
+ ORC = 'orc'
257
+ PARQUET = 'parquet'
258
+ TSV = 'tsv'
259
+ TXT = 'txt'
260
+ XLS = 'xls'
261
+ XLSX = 'xlsx'
262
+ ZIP = 'zip'
213
263
  XML = 'xml'
214
264
  YAML = 'yaml'
215
265
 
@@ -227,11 +277,61 @@ class FileFormat(CoercibleStrEnum):
227
277
  """
228
278
  return {
229
279
  # Common shorthand
280
+ 'parq': 'parquet',
230
281
  'yml': 'yaml',
282
+ # File extensions
283
+ '.avro': 'avro',
284
+ '.csv': 'csv',
285
+ '.feather': 'feather',
286
+ '.gz': 'gz',
287
+ '.json': 'json',
288
+ '.jsonl': 'ndjson',
289
+ '.ndjson': 'ndjson',
290
+ '.orc': 'orc',
291
+ '.parquet': 'parquet',
292
+ '.pq': 'parquet',
293
+ '.tsv': 'tsv',
294
+ '.txt': 'txt',
295
+ '.xls': 'xls',
296
+ '.xlsx': 'xlsx',
297
+ '.zip': 'zip',
298
+ '.xml': 'xml',
299
+ '.yaml': 'yaml',
300
+ '.yml': 'yaml',
231
301
  # MIME types
232
- 'text/csv': 'csv',
302
+ 'application/avro': 'avro',
303
+ 'application/csv': 'csv',
304
+ 'application/feather': 'feather',
305
+ 'application/gzip': 'gz',
233
306
  'application/json': 'json',
307
+ 'application/jsonlines': 'ndjson',
308
+ 'application/ndjson': 'ndjson',
309
+ 'application/orc': 'orc',
310
+ 'application/parquet': 'parquet',
311
+ 'application/vnd.apache.avro': 'avro',
312
+ 'application/vnd.apache.parquet': 'parquet',
313
+ 'application/vnd.apache.arrow.file': 'feather',
314
+ 'application/vnd.apache.orc': 'orc',
315
+ 'application/vnd.ms-excel': 'xls',
316
+ (
317
+ 'application/vnd.openxmlformats-'
318
+ 'officedocument.spreadsheetml.sheet'
319
+ ): 'xlsx',
320
+ 'application/x-avro': 'avro',
321
+ 'application/x-csv': 'csv',
322
+ 'application/x-feather': 'feather',
323
+ 'application/x-orc': 'orc',
324
+ 'application/x-ndjson': 'ndjson',
325
+ 'application/x-parquet': 'parquet',
326
+ 'application/x-yaml': 'yaml',
234
327
  'application/xml': 'xml',
328
+ 'application/zip': 'zip',
329
+ 'text/csv': 'csv',
330
+ 'text/plain': 'txt',
331
+ 'text/tab-separated-values': 'tsv',
332
+ 'text/tsv': 'tsv',
333
+ 'text/xml': 'xml',
334
+ 'text/yaml': 'yaml',
235
335
  }
236
336
 
237
337
 
@@ -365,6 +465,13 @@ class PipelineStep(CoercibleStrEnum):
365
465
  # SECTION: INTERNAL CONSTANTS ============================================== #
366
466
 
367
467
 
468
# ``FileFormat`` members that denote a compression container rather than a
# data format.
_COMPRESSION_FILE_FORMATS: set[FileFormat] = {FileFormat.GZ, FileFormat.ZIP}
473
+
474
+
368
475
  # Precomputed order index for PipelineStep; avoids recomputing on each access.
369
476
  _PIPELINE_ORDER_INDEX: dict[PipelineStep, int] = {
370
477
  PipelineStep.FILTER: 0,
@@ -402,6 +509,18 @@ def coerce_file_format(
402
509
  return FileFormat.coerce(file_format)
403
510
 
404
511
 
512
def coerce_compression_format(
    compression_format: CompressionFormat | str,
) -> CompressionFormat:
    """
    Convert a compression format value to :class:`CompressionFormat`.

    Thin convenience wrapper that delegates to
    :meth:`CompressionFormat.coerce`; prefer calling that method directly.

    Parameters
    ----------
    compression_format : CompressionFormat | str
        An enum member or a textual compression format value.

    Returns
    -------
    CompressionFormat
        The result of :meth:`CompressionFormat.coerce`.
    """
    coerced = CompressionFormat.coerce(compression_format)
    return coerced
522
+
523
+
405
524
  def coerce_http_method(
406
525
  http_method: HttpMethod | str,
407
526
  ) -> HttpMethod:
@@ -412,3 +531,78 @@ def coerce_http_method(
412
531
  :meth:`HttpMethod.coerce` going forward.
413
532
  """
414
533
  return HttpMethod.coerce(http_method)
534
+
535
+
536
def infer_file_format_and_compression(
    value: object,
    filename: object | None = None,
) -> tuple[FileFormat | None, CompressionFormat | None]:
    """
    Infer data format and compression from a filename, extension, or MIME type.

    Parameters
    ----------
    value : object
        A filename, extension, MIME type, or existing enum member.
    filename : object | None, optional
        A filename to consult for extension-based inference (e.g. when
        ``value`` is ``application/octet-stream``). When given, its suffixes
        are inspected instead of those of ``value``. Default is ``None``.

    Returns
    -------
    tuple[FileFormat | None, CompressionFormat | None]
        The inferred data format and compression, if any. File formats that
        are really compression containers (``gz``, ``zip``) are reported as
        compression, never as the data format.
    """
    # Enum members need no parsing; compression-style file formats are
    # reported in the compression slot.
    if isinstance(value, FileFormat):
        if value in _COMPRESSION_FILE_FORMATS:
            return None, CompressionFormat.coerce(value.value)
        return value, None
    if isinstance(value, CompressionFormat):
        return None, value

    text = str(value).strip()
    if not text:
        return None, None

    # Drop MIME parameters (e.g. ``; charset=utf-8``) before matching.
    mime = text.casefold().split(';', 1)[0].strip()

    # ``application/octet-stream`` says nothing about the data format, so
    # only the compression lookup is attempted for it.
    is_octet_stream = mime == 'application/octet-stream'
    compression = CompressionFormat.try_coerce(mime)
    fmt = None if is_octet_stream else FileFormat.try_coerce(mime)

    is_mime = mime.startswith(
        (
            'application/',
            'text/',
            'audio/',
            'image/',
            'video/',
            'multipart/',
        ),
    )

    # An explicit filename always supplies the suffixes; a bare MIME type
    # has no extension worth inspecting.
    suffix_source: object | None
    if filename is not None:
        suffix_source = filename
    elif is_mime:
        suffix_source = None
    else:
        suffix_source = text

    suffixes = (
        [suffix.casefold() for suffix in PurePath(str(suffix_source)).suffixes]
        if suffix_source is not None
        else []
    )
    if suffixes:
        # Only peel off the last suffix when that suffix itself names a
        # compression format. Compression learned from the MIME type must
        # not hide a data-format suffix such as ``.csv``.
        suffix_compression = CompressionFormat.try_coerce(suffixes[-1])
        if suffix_compression is not None:
            compression = suffix_compression
            suffixes = suffixes[:-1]
        if suffixes:
            fmt = FileFormat.try_coerce(suffixes[-1]) or fmt

    # A "format" that is really a compression container is moved to the
    # compression slot (without overriding one that was already found).
    if fmt in _COMPRESSION_FILE_FORMATS:
        compression = compression or CompressionFormat.coerce(fmt.value)
        fmt = None

    return fmt, compression
etlplus/file.py CHANGED
@@ -16,6 +16,7 @@ from typing import Any
16
16
  from typing import cast
17
17
 
18
18
  from .enums import FileFormat
19
+ from .enums import infer_file_format_and_compression
19
20
  from .types import JSONData
20
21
  from .types import JSONDict
21
22
  from .types import JSONList
@@ -33,15 +34,6 @@ __all__ = ['File']
33
34
 
34
35
  _DEFAULT_XML_ROOT = 'root'
35
36
 
36
- # Map common filename extensions to FileFormat (used for inference)
37
- _EXT_TO_FORMAT: dict[str, FileFormat] = {
38
- 'csv': FileFormat.CSV,
39
- 'json': FileFormat.JSON,
40
- 'xml': FileFormat.XML,
41
- 'yaml': FileFormat.YAML,
42
- 'yml': FileFormat.YAML,
43
- }
44
-
45
37
  # Optional YAML support (lazy-loaded to avoid hard dependency)
46
38
  # Cached access function to avoid global statements.
47
39
  _YAML_CACHE: dict[str, Any] = {}
@@ -246,14 +238,17 @@ class File:
246
238
  ValueError
247
239
  If the extension is unknown or unsupported.
248
240
  """
249
- ext = self.path.suffix.lstrip('.').casefold()
250
- try:
251
- return _EXT_TO_FORMAT[ext]
252
- except KeyError as e:
241
+ fmt, compression = infer_file_format_and_compression(self.path)
242
+ if fmt is not None:
243
+ return fmt
244
+ if compression is not None:
253
245
  raise ValueError(
254
- 'Cannot infer file format from '
255
- f'extension {self.path.suffix!r}',
256
- ) from e
246
+ 'Cannot infer file format from compressed file '
247
+ f'{self.path!r} with compression {compression.value!r}',
248
+ )
249
+ raise ValueError(
250
+ f'Cannot infer file format from extension {self.path.suffix!r}',
251
+ )
257
252
 
258
253
  # -- Instance Methods (Generic API) -- #
259
254
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: etlplus
3
- Version: 0.9.0
3
+ Version: 0.10.4
4
4
  Summary: A Swiss Army knife for simple ETL operations
5
5
  Home-page: https://github.com/Dagitali/ETLPlus
6
6
  Author: ETLPlus Team
@@ -1,9 +1,9 @@
1
1
  etlplus/__init__.py,sha256=M2gScnyir6WOMAh_EuoQIiAzdcTls0_5hbd_Q6of8I0,1021
2
2
  etlplus/__main__.py,sha256=btoROneNiigyfBU7BSzPKZ1R9gzBMpxcpsbPwmuHwTM,479
3
3
  etlplus/__version__.py,sha256=1E0GMK_yUWCMQFKxXjTvyMwofi0qT2k4CDNiHWiymWE,327
4
- etlplus/enums.py,sha256=V_j18Ud2BCXpFsBk2pZGrvCVrvAMJ7uja1z9fppFGso,10175
4
+ etlplus/enums.py,sha256=8hzprOLyeCCzlHaXpG4VfgmxPSEdlZeOnHLFzBneKNs,15969
5
5
  etlplus/extract.py,sha256=f44JdHhNTACxgn44USx05paKTwq7LQY-V4wANCW9hVM,6173
6
- etlplus/file.py,sha256=RxIAsGDN4f_vNA2B5-ct88JNd_ISAyYbooIRE5DstS8,17972
6
+ etlplus/file.py,sha256=B-zebTrIFDKaaKzA9Fq5-L0JwDNYa2T--_6veR3N03s,17939
7
7
  etlplus/load.py,sha256=R_y0_vtsEo1bwxWVQu2bfhB5ZIJoIoWu2ycCdvY4RnE,8737
8
8
  etlplus/mixins.py,sha256=ifGpHwWv7U00yqGf-kN93vJax2IiK4jaGtTsPsO3Oak,1350
9
9
  etlplus/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -31,7 +31,7 @@ etlplus/api/rate_limiting/__init__.py,sha256=ZySB1dZettEDnWvI1EHf_TZ9L08M_kKsNR-
31
31
  etlplus/api/rate_limiting/config.py,sha256=2b4wIynblN-1EyMqI4aXa71SljzSjXYh5N1Nngr3jOg,9406
32
32
  etlplus/api/rate_limiting/rate_limiter.py,sha256=Uxozqd_Ej5Lsj-M-mLT2WexChgWh7x35_YP10yqYPQA,7159
33
33
  etlplus/cli/__init__.py,sha256=J97-Rv931IL1_b4AXnB7Fbbd7HKnHBpx18NQfC_kE6c,299
34
- etlplus/cli/commands.py,sha256=_nias9eSMZoTBiicXDNEkWLYfzd4-CcO2j_xPPxghls,24632
34
+ etlplus/cli/commands.py,sha256=BK2qmFsser6AXOgEvpiadrYMIiwviAzqkSxMlBhRXRw,24670
35
35
  etlplus/cli/constants.py,sha256=KIZj7J2tNf5mJbkqAdZmu5FXYW2FQmxwgeOKWc3-3Hg,1944
36
36
  etlplus/cli/handlers.py,sha256=K0GazvrPgocJ-63HZqF0xhyJk8TB1Gcj-eIbWltXKRU,17759
37
37
  etlplus/cli/io.py,sha256=7sldiZz4-Geomge5IO_XYykXPa6UiORfUWzLCdQePG8,7846
@@ -57,9 +57,9 @@ etlplus/templates/ddl.sql.j2,sha256=s8fMWvcb4eaJVXkifuib1aQPljtZ8buuyB_uA-ZdU3Q,
57
57
  etlplus/templates/view.sql.j2,sha256=Iy8DHfhq5yyvrUKDxqp_aHIEXY4Tm6j4wT7YDEFWAhk,2180
58
58
  etlplus/validation/__init__.py,sha256=Pe5Xg1_EA4uiNZGYu5WTF3j7odjmyxnAJ8rcioaplSQ,1254
59
59
  etlplus/validation/utils.py,sha256=Mtqg449VIke0ziy_wd2r6yrwJzQkA1iulZC87FzXMjo,10201
60
- etlplus-0.9.0.dist-info/licenses/LICENSE,sha256=MuNO63i6kWmgnV2pbP2SLqP54mk1BGmu7CmbtxMmT-U,1069
61
- etlplus-0.9.0.dist-info/METADATA,sha256=ynMgjG7Wv_xkP0fBaAOj3-rpUgJHwZ7UOzCxoU8CBeE,21035
62
- etlplus-0.9.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
63
- etlplus-0.9.0.dist-info/entry_points.txt,sha256=6w-2-jzuPa55spzK34h-UKh2JTEShh38adFRONNP9QE,45
64
- etlplus-0.9.0.dist-info/top_level.txt,sha256=aWWF-udn_sLGuHTM6W6MLh99ArS9ROkUWO8Mi8y1_2U,8
65
- etlplus-0.9.0.dist-info/RECORD,,
60
+ etlplus-0.10.4.dist-info/licenses/LICENSE,sha256=MuNO63i6kWmgnV2pbP2SLqP54mk1BGmu7CmbtxMmT-U,1069
61
+ etlplus-0.10.4.dist-info/METADATA,sha256=M_lQUZ5o-JaD1KuZk_t0LeHbaOj_SdqqaJQSbDCW-zY,21036
62
+ etlplus-0.10.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
63
+ etlplus-0.10.4.dist-info/entry_points.txt,sha256=6w-2-jzuPa55spzK34h-UKh2JTEShh38adFRONNP9QE,45
64
+ etlplus-0.10.4.dist-info/top_level.txt,sha256=aWWF-udn_sLGuHTM6W6MLh99ArS9ROkUWO8Mi8y1_2U,8
65
+ etlplus-0.10.4.dist-info/RECORD,,