etlplus 0.10.4__py3-none-any.whl → 0.12.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. etlplus/README.md +37 -0
  2. etlplus/api/README.md +20 -3
  3. etlplus/cli/README.md +40 -0
  4. etlplus/cli/commands.py +1 -1
  5. etlplus/cli/constants.py +1 -1
  6. etlplus/cli/handlers.py +1 -1
  7. etlplus/cli/io.py +2 -2
  8. etlplus/config/README.md +52 -0
  9. etlplus/config/pipeline.py +2 -2
  10. etlplus/database/README.md +48 -0
  11. etlplus/database/ddl.py +1 -1
  12. etlplus/database/engine.py +1 -1
  13. etlplus/database/schema.py +1 -1
  14. etlplus/enums.py +2 -270
  15. etlplus/extract.py +5 -7
  16. etlplus/file/README.md +105 -0
  17. etlplus/file/__init__.py +25 -0
  18. etlplus/file/avro.py +198 -0
  19. etlplus/file/core.py +287 -0
  20. etlplus/file/csv.py +91 -0
  21. etlplus/file/enums.py +238 -0
  22. etlplus/file/feather.py +144 -0
  23. etlplus/file/gz.py +123 -0
  24. etlplus/file/json.py +98 -0
  25. etlplus/file/ndjson.py +109 -0
  26. etlplus/file/orc.py +142 -0
  27. etlplus/file/parquet.py +146 -0
  28. etlplus/file/tsv.py +91 -0
  29. etlplus/file/txt.py +99 -0
  30. etlplus/file/xls.py +132 -0
  31. etlplus/file/xlsx.py +142 -0
  32. etlplus/file/xml.py +174 -0
  33. etlplus/file/yaml.py +136 -0
  34. etlplus/file/zip.py +175 -0
  35. etlplus/load.py +9 -12
  36. etlplus/run.py +6 -9
  37. etlplus/templates/README.md +46 -0
  38. etlplus/validation/README.md +50 -0
  39. {etlplus-0.10.4.dist-info → etlplus-0.12.2.dist-info}/METADATA +58 -14
  40. {etlplus-0.10.4.dist-info → etlplus-0.12.2.dist-info}/RECORD +44 -20
  41. etlplus/file.py +0 -652
  42. {etlplus-0.10.4.dist-info → etlplus-0.12.2.dist-info}/WHEEL +0 -0
  43. {etlplus-0.10.4.dist-info → etlplus-0.12.2.dist-info}/entry_points.txt +0 -0
  44. {etlplus-0.10.4.dist-info → etlplus-0.12.2.dist-info}/licenses/LICENSE +0 -0
  45. {etlplus-0.10.4.dist-info → etlplus-0.12.2.dist-info}/top_level.txt +0 -0
etlplus/README.md ADDED
@@ -0,0 +1,37 @@
+ # etlplus package
+
+ The `etlplus` package provides a unified Python API and CLI for ETL operations: extraction,
+ validation, transformation, and loading of data from files, APIs, and databases.
+
+ - Top-level entry points for extract, validate, transform, and load
+ - Utilities for pipeline orchestration and helpers
+ - Exposes all subpackages for advanced usage
+
+ Back to project overview: see the top-level [README](../README.md).
+
+ ## Subpackages
+
+ - [etlplus.api](api/README.md): Lightweight HTTP client and paginated REST helpers
+ - [etlplus.file](file/README.md): Unified file format support and helpers
+ - [etlplus.config](config/README.md): Configuration helpers for connectors, pipelines, jobs, and
+ profiles
+ - [etlplus.cli](cli/README.md): Command-line interface for ETLPlus workflows
+ - [etlplus.database](database/README.md): Database engine, schema, and ORM helpers
+ - [etlplus.templates](templates/README.md): SQL and DDL template helpers
+ - [etlplus.validation](validation/README.md): Data validation utilities and helpers
+
+ ## Quickstart
+
+ ```python
+ from etlplus import extract, validate, transform, load
+
+ data = extract("file", "input.csv")
+ filtered = transform(data, {"filter": {"field": "age", "op": "gt", "value": 25}})
+ assert validate(filtered, {"age": {"type": "number", "min": 0}})["valid"]
+ load(filtered, "file", "output.json", file_format="json")
+ ```
+
+ ## See Also
+
+ - [Top-level project README](../README.md)
+ - [API reference](../docs/README.md)
etlplus/api/README.md CHANGED
@@ -1,7 +1,7 @@
- # etlplus.api module.
+ # etlplus.api subpackage

- Focused documentation for the `etlplus.api` subpackage: a lightweight HTTP client and helpers for
- paginated REST endpoints.
+ Documentation for the `etlplus.api` subpackage: a lightweight HTTP client and helpers for paginated
+ REST endpoints.

  - Provides a small `EndpointClient` for calling JSON APIs
  - Supports page-, offset-, and cursor-based pagination via `PaginationConfig`
@@ -12,6 +12,20 @@ paginated REST endpoints.

  Back to project overview: see the top-level [README](../../README.md).

+ - [etlplus.api subpackage](#etlplusapi-subpackage)
+ - [Installation](#installation)
+ - [Quickstart](#quickstart)
+ - [Overriding Rate Limits Per Call](#overriding-rate-limits-per-call)
+ - [Choosing `records_path` and `cursor_path`](#choosing-records_path-and-cursor_path)
+ - [Cursor-Based Pagination Example](#cursor-based-pagination-example)
+ - [Offset-based pagination example](#offset-based-pagination-example)
+ - [Authentication](#authentication)
+ - [Errors and Rate Limiting](#errors-and-rate-limiting)
+ - [Types and Transport](#types-and-transport)
+ - [Supporting Modules](#supporting-modules)
+ - [Minimal Contract](#minimal-contract)
+ - [See also](#see-also)
+
  ## Installation

  `etlplus.api` ships as part of the `etlplus` package. Install the package as usual:
@@ -233,3 +247,6 @@ providers can fall back to their own defaults. If you already possess a static t
  ## See also

  - Top-level CLI and library usage in the main [README](../../README.md)
+
+
+ [def]: #installation
etlplus/cli/README.md ADDED
@@ -0,0 +1,40 @@
+ # etlplus.cli subpackage
+
+ Documentation for the `etlplus.cli` subpackage: command-line interface for ETLPlus workflows.
+
+ - Provides a CLI for running ETL pipelines, jobs, and utilities
+ - Supports commands for running, validating, and inspecting pipelines
+ - Includes options for configuration, state, and output control
+ - Exposes handlers for custom command integration
+
+ Back to project overview: see the top-level [README](../../README.md).
+
+ - [etlplus.cli subpackage](#etlpluscli-subpackage)
+ - [Available Commands](#available-commands)
+ - [Command Options](#command-options)
+ - [Example: Running a Pipeline](#example-running-a-pipeline)
+ - [See Also](#see-also)
+
+ ## Available Commands
+
+ - **run**: Execute a pipeline or job
+ - **validate**: Validate pipeline or config files
+ - **inspect**: Show pipeline/job details
+
+ ## Command Options
+
+ - `--config`: Path to config file
+ - `--state`: Path to state file
+ - `--output`: Output file or format
+
+ ## Example: Running a Pipeline
+
+ ```bash
+ etlplus run --config configs/pipeline.yml --output results.json
+ ```
+
+ ## See Also
+
+ - Top-level CLI and library usage in the main [README](../../README.md)
+ - Command handlers in [handlers.py](handlers.py)
+ - Command options in [options.py](options.py)
etlplus/cli/commands.py CHANGED
@@ -36,7 +36,7 @@ from typing import cast
  import typer

  from .. import __version__
- from ..enums import FileFormat
+ from ..file import FileFormat
  from . import handlers
  from .constants import CLI_DESCRIPTION
  from .constants import CLI_EPILOG
etlplus/cli/constants.py CHANGED
@@ -9,7 +9,7 @@ from __future__ import annotations
  from typing import Final

  from ..enums import DataConnectorType
- from ..enums import FileFormat
+ from ..file import FileFormat

  # SECTION: EXPORTS ========================================================== #

etlplus/cli/handlers.py CHANGED
@@ -570,7 +570,7 @@ def transform_handler(
  data = transform(payload, cast(TransformOperations, operations_payload))

  if target and target != '-':
- File.write_file(target, data, file_format=target_format)
+ File(target, file_format=target_format).write(data)
  print(f'Data transformed and saved to {target}')
  return 0

etlplus/cli/io.py CHANGED
@@ -15,8 +15,8 @@ from pathlib import Path
  from typing import Any
  from typing import cast

- from ..enums import FileFormat
  from ..file import File
+ from ..file import FileFormat
  from ..types import JSONData
  from ..utils import print_json

@@ -331,6 +331,6 @@ def write_json_output(
  """
  if not output_path or output_path == '-':
  return False
- File(Path(output_path), FileFormat.JSON).write_json(data)
+ File(Path(output_path), FileFormat.JSON).write(data)
  print(f'{success_message} {output_path}')
  return True
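The `handlers.py` and `io.py` hunks above replace the class-level `File.write_file(...)` call with an instance-based `File(...).write(...)` call. A minimal sketch of the new pattern, using only the constructor and `write()` signatures visible in this diff (the payload and path are illustrative):

```python
from pathlib import Path

from etlplus.file import File, FileFormat

data = {"rows": [{"id": 1, "name": "Ada"}]}

# 0.10.4 style (removed in this release): File.write_file(target, data, file_format=...)
# 0.12.2 style shown above: bind the path and format first, then write the payload.
File(Path("output.json"), FileFormat.JSON).write(data)
```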
etlplus/config/README.md ADDED
@@ -0,0 +1,52 @@
+ # etlplus.config subpackage
+
+ Documentation for the `etlplus.config` subpackage: configuration helpers for connectors, pipelines,
+ jobs, and profiles.
+
+ - Provides classes and utilities for managing ETL pipeline configuration
+ - Supports YAML/JSON config loading and validation
+ - Includes helpers for connectors, jobs, pipelines, and profiles
+ - Exposes type definitions for config schemas
+
+ Back to project overview: see the top-level [README](../../README.md).
+
+ - [etlplus.config subpackage](#etlplusconfig-subpackage)
+ - [Supported Configuration Types](#supported-configuration-types)
+ - [Loading and Validating Configs](#loading-and-validating-configs)
+ - [Example: Loading a Pipeline Config](#example-loading-a-pipeline-config)
+ - [See Also](#see-also)
+
+ ## Supported Configuration Types
+
+ - **Connector**: Connection details for databases, files, or APIs
+ - **Job**: ETL job definitions and scheduling
+ - **Pipeline**: End-to-end pipeline configuration
+ - **Profile**: User or environment-specific settings
+
+ ## Loading and Validating Configs
+
+ Use the provided classes to load and validate configuration files:
+
+ ```python
+ from etlplus.config import PipelineConfig
+
+ cfg = PipelineConfig.from_yaml("pipeline.yml")
+ ```
+
+ - Supports YAML and JSON formats
+ - Validates against expected schema
+
+ ## Example: Loading a Pipeline Config
+
+ ```python
+ from etlplus.config import PipelineConfig
+
+ pipeline = PipelineConfig.from_yaml("configs/pipeline.yml")
+ print(pipeline)
+ ```
+
+ ## See Also
+
+ - Top-level CLI and library usage in the main [README](../../README.md)
+ - Config type definitions in [types.py](types.py)
+ - Config utilities in [utils.py](utils.py)
etlplus/config/pipeline.py CHANGED
@@ -24,8 +24,8 @@ from typing import Any
  from typing import Self

  from ..api import ApiConfig
- from ..enums import FileFormat
  from ..file import File
+ from ..file import FileFormat
  from ..types import StrAnyMap
  from ..utils import coerce_dict
  from ..utils import maybe_mapping
@@ -246,7 +246,7 @@ class PipelineConfig:
  TypeError
  If the YAML root is not a mapping/object.
  """
- raw = File(Path(path), FileFormat.YAML).read_yaml()
+ raw = File(Path(path), FileFormat.YAML).read()
  if not isinstance(raw, dict):
  raise TypeError('Pipeline YAML must have a mapping/object root')

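In `PipelineConfig.from_yaml`, the format-specific `read_yaml()` call likewise gives way to the unified `read()`, with the format carried by the `File` instance. A small sketch of that pattern under the same assumptions (the config path is illustrative; the mapping-root check mirrors the surrounding code):

```python
from pathlib import Path

from etlplus.file import File, FileFormat

# The format travels with the File instance; read() dispatches on it.
raw = File(Path("configs/pipeline.yml"), FileFormat.YAML).read()

# pipeline.py still requires a mapping at the YAML root before building the config.
if not isinstance(raw, dict):
    raise TypeError("Pipeline YAML must have a mapping/object root")
```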
etlplus/database/README.md ADDED
@@ -0,0 +1,48 @@
+ # etlplus.database subpackage
+
+ Documentation for the `etlplus.database` subpackage: database engine, schema, and ORM helpers.
+
+ - Provides database engine and connection management
+ - Supports schema definition and DDL generation
+ - Includes lightweight ORM utilities for tabular data
+ - Exposes type definitions for database objects
+
+ Back to project overview: see the top-level [README](../../README.md).
+
+ - [etlplus.database subpackage](#etlplusdatabase-subpackage)
+ - [Database Engine and Connections](#database-engine-and-connections)
+ - [Schema and DDL Helpers](#schema-and-ddl-helpers)
+ - [ORM Utilities](#orm-utilities)
+ - [Example: Creating a Table](#example-creating-a-table)
+ - [See Also](#see-also)
+
+ ## Database Engine and Connections
+
+ - Manage connections to supported databases
+ - Configure engines for different backends
+
+ ## Schema and DDL Helpers
+
+ - Define table schemas and columns
+ - Generate DDL statements for supported databases
+
+ ## ORM Utilities
+
+ - Map rows to Python objects
+ - Simple CRUD helpers for tabular data
+
+ ## Example: Creating a Table
+
+ ```python
+ from etlplus.database import Schema, Engine
+
+ engine = Engine.connect("sqlite:///example.db")
+ schema = Schema.from_dict({"name": "users", "columns": [ ... ]})
+ engine.create_table(schema)
+ ```
+
+ ## See Also
+
+ - Top-level CLI and library usage in the main [README](../../README.md)
+ - Schema helpers in [schema.py](schema.py)
+ - ORM utilities in [orm.py](orm.py)
etlplus/database/ddl.py CHANGED
@@ -203,7 +203,7 @@ def load_table_spec(
  raise ValueError('Spec must be .json, .yml, or .yaml')

  try:
- spec = File.read_file(spec_path)
+ spec = File(spec_path).read()
  except ImportError as e:
  if suffix in {'.yml', '.yaml'}:
  raise RuntimeError(
etlplus/database/engine.py CHANGED
@@ -113,7 +113,7 @@ def load_database_url_from_config(
  ValueError
  If no connection string/URL/DSN is found for the specified entry.
  """
- cfg = File.read_file(Path(path))
+ cfg = File(Path(path)).read()
  if not isinstance(cfg, Mapping):
  raise TypeError('Database config must be a mapping')

etlplus/database/schema.py CHANGED
@@ -260,7 +260,7 @@ def load_table_specs(
  list[TableSpec]
  A list of TableSpec instances parsed from the YAML file.
  """
- data = File.read_file(Path(path))
+ data = File(Path(path)).read()
  if not data:
  return []

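The three database hunks above drop `File.read_file(path)` in favour of constructing a `File` with no explicit format and calling `read()`; judging by the comment in the `extract.py` hunk further down ("let File infer from extension"), the format is then inferred from the file suffix. A hedged sketch of that usage (the spec path is illustrative):

```python
from pathlib import Path

from etlplus.file import File

# No FileFormat is passed, so the format is presumably inferred from the
# .yaml suffix, mirroring the load_table_spec / load_table_specs hunks.
spec = File(Path("tables/users.yaml")).read()

# Callers in this diff still validate the result's shape themselves,
# e.g. requiring a mapping for database configs.
print(type(spec))
```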
etlplus/enums.py CHANGED
@@ -8,7 +8,6 @@ from __future__ import annotations

  import enum
  import operator as _op
- from pathlib import PurePath
  from statistics import fmean
  from typing import Self

@@ -23,18 +22,10 @@ __all__ = [
  # Enums
  'AggregateName',
  'CoercibleStrEnum',
- 'CompressionFormat',
  'DataConnectorType',
- 'FileFormat',
  'HttpMethod',
  'OperatorName',
  'PipelineStep',
- # Functions
- 'coerce_compression_format',
- 'coerce_data_connector_type',
- 'coerce_file_format',
- 'coerce_http_method',
- 'infer_file_format_and_compression',
  ]


@@ -178,39 +169,6 @@ class AggregateName(CoercibleStrEnum):
  return lambda xs, n: (fmean(xs) if xs else 0.0)


- class CompressionFormat(CoercibleStrEnum):
- """Supported compression formats for data files."""
-
- # -- Constants -- #
-
- GZ = 'gz'
- ZIP = 'zip'
-
- # -- Class Methods -- #
-
- @classmethod
- def aliases(cls) -> StrStrMap:
- """
- Return a mapping of common aliases for each enum member.
-
- Returns
- -------
- StrStrMap
- A mapping of alias names to their corresponding enum member names.
- """
- return {
- # File extensions
- '.gz': 'gz',
- '.gzip': 'gz',
- '.zip': 'zip',
- # MIME types
- 'application/gzip': 'gz',
- 'application/x-gzip': 'gz',
- 'application/zip': 'zip',
- 'application/x-zip-compressed': 'zip',
- }
-
-
  class DataConnectorType(CoercibleStrEnum):
  """Supported data connector types."""

@@ -242,99 +200,6 @@ class DataConnectorType(CoercibleStrEnum):
  }


- class FileFormat(CoercibleStrEnum):
- """Supported file formats for extraction."""
-
- # -- Constants -- #
-
- AVRO = 'avro'
- CSV = 'csv'
- FEATHER = 'feather'
- GZ = 'gz'
- JSON = 'json'
- NDJSON = 'ndjson'
- ORC = 'orc'
- PARQUET = 'parquet'
- TSV = 'tsv'
- TXT = 'txt'
- XLS = 'xls'
- XLSX = 'xlsx'
- ZIP = 'zip'
- XML = 'xml'
- YAML = 'yaml'
-
- # -- Class Methods -- #
-
- @classmethod
- def aliases(cls) -> StrStrMap:
- """
- Return a mapping of common aliases for each enum member.
-
- Returns
- -------
- StrStrMap
- A mapping of alias names to their corresponding enum member names.
- """
- return {
- # Common shorthand
- 'parq': 'parquet',
- 'yml': 'yaml',
- # File extensions
- '.avro': 'avro',
- '.csv': 'csv',
- '.feather': 'feather',
- '.gz': 'gz',
- '.json': 'json',
- '.jsonl': 'ndjson',
- '.ndjson': 'ndjson',
- '.orc': 'orc',
- '.parquet': 'parquet',
- '.pq': 'parquet',
- '.tsv': 'tsv',
- '.txt': 'txt',
- '.xls': 'xls',
- '.xlsx': 'xlsx',
- '.zip': 'zip',
- '.xml': 'xml',
- '.yaml': 'yaml',
- '.yml': 'yaml',
- # MIME types
- 'application/avro': 'avro',
- 'application/csv': 'csv',
- 'application/feather': 'feather',
- 'application/gzip': 'gz',
- 'application/json': 'json',
- 'application/jsonlines': 'ndjson',
- 'application/ndjson': 'ndjson',
- 'application/orc': 'orc',
- 'application/parquet': 'parquet',
- 'application/vnd.apache.avro': 'avro',
- 'application/vnd.apache.parquet': 'parquet',
- 'application/vnd.apache.arrow.file': 'feather',
- 'application/vnd.apache.orc': 'orc',
- 'application/vnd.ms-excel': 'xls',
- (
- 'application/vnd.openxmlformats-'
- 'officedocument.spreadsheetml.sheet'
- ): 'xlsx',
- 'application/x-avro': 'avro',
- 'application/x-csv': 'csv',
- 'application/x-feather': 'feather',
- 'application/x-orc': 'orc',
- 'application/x-ndjson': 'ndjson',
- 'application/x-parquet': 'parquet',
- 'application/x-yaml': 'yaml',
- 'application/xml': 'xml',
- 'application/zip': 'zip',
- 'text/csv': 'csv',
- 'text/plain': 'txt',
- 'text/tab-separated-values': 'tsv',
- 'text/tsv': 'tsv',
- 'text/xml': 'xml',
- 'text/yaml': 'yaml',
- }
-
-
  class HttpMethod(CoercibleStrEnum):
  """Supported HTTP verbs that accept JSON payloads."""

@@ -360,8 +225,8 @@ class HttpMethod(CoercibleStrEnum):
  Notes
  -----
  - RFCs do not strictly forbid bodies on some other methods (e.g.,
- ``DELETE``), but many servers/clients do not expect them. We mark
- ``POST``, ``PUT``, and ``PATCH`` as True.
+ ``DELETE``), but many servers/clients do not expect them. We mark
+ ``POST``, ``PUT``, and ``PATCH`` as True.
  """
  return self in {HttpMethod.POST, HttpMethod.PUT, HttpMethod.PATCH}

@@ -465,13 +330,6 @@ class PipelineStep(CoercibleStrEnum):
  # SECTION: INTERNAL CONSTANTS ============================================== #


- # Compression formats that are also file formats.
- _COMPRESSION_FILE_FORMATS: set[FileFormat] = {
- FileFormat.GZ,
- FileFormat.ZIP,
- }
-
-
  # Precomputed order index for PipelineStep; avoids recomputing on each access.
  _PIPELINE_ORDER_INDEX: dict[PipelineStep, int] = {
  PipelineStep.FILTER: 0,
@@ -480,129 +338,3 @@ _PIPELINE_ORDER_INDEX: dict[PipelineStep, int] = {
  PipelineStep.SORT: 3,
  PipelineStep.AGGREGATE: 4,
  }
-
-
- # SECTION: FUNCTIONS ======================================================== #
-
-
- def coerce_data_connector_type(
- connector: DataConnectorType | str,
- ) -> DataConnectorType:
- """
- Normalize textual data connector values to :class:`DataConnectorType`.
-
- This thin wrapper is kept for backward compatibility; prefer
- :meth:`DataConnectorType.coerce` going forward.
- """
- return DataConnectorType.coerce(connector)
-
-
- def coerce_file_format(
- file_format: FileFormat | str,
- ) -> FileFormat:
- """
- Normalize textual file format values to :class:`FileFormat`.
-
- This thin wrapper is kept for backward compatibility; prefer
- :meth:`FileFormat.coerce` going forward.
- """
- return FileFormat.coerce(file_format)
-
-
- def coerce_compression_format(
- compression_format: CompressionFormat | str,
- ) -> CompressionFormat:
- """
- Normalize textual compression format values to :class:`CompressionFormat`.
-
- This thin wrapper is kept for backward compatibility; prefer
- :meth:`CompressionFormat.coerce` going forward.
- """
- return CompressionFormat.coerce(compression_format)
-
-
- def coerce_http_method(
- http_method: HttpMethod | str,
- ) -> HttpMethod:
- """
- Normalize textual HTTP method values to :class:`HttpMethod`.
-
- This thin wrapper is kept for backward compatibility; prefer
- :meth:`HttpMethod.coerce` going forward.
- """
- return HttpMethod.coerce(http_method)
-
-
- def infer_file_format_and_compression(
- value: object,
- filename: object | None = None,
- ) -> tuple[FileFormat | None, CompressionFormat | None]:
- """
- Infer data format and compression from a filename, extension, or MIME type.
-
- Parameters
- ----------
- value : object
- A filename, extension, MIME type, or existing enum member.
- filename : object | None, optional
- A filename to consult for extension-based inference (e.g. when
- ``value`` is ``application/octet-stream``).
-
- Returns
- -------
- tuple[FileFormat | None, CompressionFormat | None]
- The inferred data format and compression, if any.
- """
- if isinstance(value, FileFormat):
- if value in _COMPRESSION_FILE_FORMATS:
- return None, CompressionFormat.coerce(value.value)
- return value, None
- if isinstance(value, CompressionFormat):
- return None, value
-
- text = str(value).strip()
- if not text:
- return None, None
-
- normalized = text.casefold()
- mime = normalized.split(';', 1)[0].strip()
-
- is_octet_stream = mime == 'application/octet-stream'
- compression = CompressionFormat.try_coerce(mime)
- fmt = None if is_octet_stream else FileFormat.try_coerce(mime)
-
- is_mime = mime.startswith(
- (
- 'application/',
- 'text/',
- 'audio/',
- 'image/',
- 'video/',
- 'multipart/',
- ),
- )
- suffix_source: object | None = filename if filename is not None else text
- if is_mime and filename is None:
- suffix_source = None
-
- suffixes = (
- PurePath(str(suffix_source)).suffixes
- if suffix_source is not None
- else []
- )
- if suffixes:
- normalized_suffixes = [suffix.casefold() for suffix in suffixes]
- compression = (
- CompressionFormat.try_coerce(normalized_suffixes[-1])
- or compression
- )
- if compression is not None:
- normalized_suffixes = normalized_suffixes[:-1]
- if normalized_suffixes:
- fmt = FileFormat.try_coerce(normalized_suffixes[-1]) or fmt
-
- if fmt in _COMPRESSION_FILE_FORMATS:
- compression = compression or CompressionFormat.coerce(fmt.value)
- fmt = None
-
- return fmt, compression
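The net effect of the `enums.py` hunks is that `FileFormat`, `CompressionFormat`, and the format/compression helpers leave `etlplus.enums`; the import changes elsewhere in this diff show `FileFormat` now coming from `etlplus.file` (plausibly via the new `etlplus/file/enums.py`, whose contents are not shown here). A minimal sketch of the updated import and coercion, limited to calls that appear in this diff:

```python
from etlplus.file import FileFormat

# 0.10.4 exposed module-level wrappers such as coerce_file_format();
# 0.12.2 calls the classmethod on the enum itself, as extract.py now does.
fmt = FileFormat.coerce("json")
assert fmt is FileFormat.JSON

# Whether the relocated enum keeps the alias table from the removed code
# (e.g. "yml" -> "yaml", ".pq" -> "parquet") is an assumption based on the
# unchanged call sites, not something this diff confirms.
```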
etlplus/extract.py CHANGED
@@ -13,11 +13,9 @@ from typing import cast
  import requests  # type: ignore[import]

  from .enums import DataConnectorType
- from .enums import FileFormat
  from .enums import HttpMethod
- from .enums import coerce_data_connector_type
- from .enums import coerce_file_format
  from .file import File
+ from .file import FileFormat
  from .types import JSONData
  from .types import JSONDict
  from .types import JSONList
@@ -55,7 +53,7 @@ def extract_from_file(
  # If no explicit format is provided, let File infer from extension.
  if file_format is None:
  return File(path, None).read()
- fmt = coerce_file_format(file_format)
+ fmt = FileFormat.coerce(file_format)

  # Let file module perform existence and format validation.
  return File(path, fmt).read()
@@ -202,7 +200,7 @@ def extract(
  ValueError
  If `source_type` is not one of the supported values.
  """
- match coerce_data_connector_type(source_type):
+ match DataConnectorType.coerce(source_type):
  case DataConnectorType.FILE:
  # Prefer explicit format if provided, else infer from filename.
  return extract_from_file(source, file_format)
@@ -213,6 +211,6 @@ def extract(
  # ``file_format`` is ignored for APIs.
  return extract_from_api(str(source), **kwargs)
  case _:
- # ``coerce_data_connector_type`` covers invalid entries, but keep
- # explicit guard for defensive programming.
+ # :meth:`coerce` already raises for invalid connector types, but
+ # keep explicit guard for defensive programming.
  raise ValueError(f'Invalid source type: {source_type}')
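Consistent with the removal of the thin wrapper functions from `enums.py`, `extract.py` now calls the `coerce` classmethods directly, matching the "prefer :meth:`coerce` going forward" note in the deleted docstrings. A short sketch of the dispatch, exercising only the connector values visible in these hunks (the fallthrough mirrors the defensive guard above):

```python
from etlplus.enums import DataConnectorType

# 0.10.4: coerce_data_connector_type("file"); 0.12.2: the classmethod.
connector = DataConnectorType.coerce("file")

match connector:
    case DataConnectorType.FILE:
        print("dispatch to extract_from_file()")
    case _:
        # extract() keeps an explicit guard even though coerce() already
        # rejects unknown connector types.
        raise ValueError(f"Invalid source type: {connector}")
```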