etlplus 0.9.2__py3-none-any.whl → 0.10.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (120) hide show
  1. etlplus/__init__.py +26 -1
  2. etlplus/api/README.md +3 -51
  3. etlplus/api/__init__.py +0 -10
  4. etlplus/api/config.py +28 -39
  5. etlplus/api/endpoint_client.py +3 -3
  6. etlplus/api/pagination/client.py +1 -1
  7. etlplus/api/rate_limiting/config.py +1 -13
  8. etlplus/api/rate_limiting/rate_limiter.py +11 -8
  9. etlplus/api/request_manager.py +6 -11
  10. etlplus/api/transport.py +2 -14
  11. etlplus/api/types.py +6 -96
  12. etlplus/cli/commands.py +43 -76
  13. etlplus/cli/constants.py +1 -1
  14. etlplus/cli/handlers.py +12 -40
  15. etlplus/cli/io.py +2 -2
  16. etlplus/cli/main.py +1 -1
  17. etlplus/cli/state.py +7 -4
  18. etlplus/{workflow → config}/__init__.py +23 -10
  19. etlplus/{workflow → config}/connector.py +44 -58
  20. etlplus/{workflow → config}/jobs.py +32 -105
  21. etlplus/{workflow → config}/pipeline.py +51 -59
  22. etlplus/{workflow → config}/profile.py +5 -8
  23. etlplus/config/types.py +204 -0
  24. etlplus/config/utils.py +120 -0
  25. etlplus/database/ddl.py +1 -1
  26. etlplus/database/engine.py +3 -19
  27. etlplus/database/orm.py +0 -2
  28. etlplus/database/schema.py +1 -1
  29. etlplus/enums.py +266 -0
  30. etlplus/{ops/extract.py → extract.py} +99 -81
  31. etlplus/file.py +652 -0
  32. etlplus/{ops/load.py → load.py} +101 -78
  33. etlplus/{ops/run.py → run.py} +127 -159
  34. etlplus/{api/utils.py → run_helpers.py} +153 -209
  35. etlplus/{ops/transform.py → transform.py} +68 -75
  36. etlplus/types.py +4 -5
  37. etlplus/utils.py +2 -136
  38. etlplus/{ops/validate.py → validate.py} +12 -22
  39. etlplus/validation/__init__.py +44 -0
  40. etlplus/{ops → validation}/utils.py +17 -53
  41. {etlplus-0.9.2.dist-info → etlplus-0.10.1.dist-info}/METADATA +17 -210
  42. etlplus-0.10.1.dist-info/RECORD +65 -0
  43. {etlplus-0.9.2.dist-info → etlplus-0.10.1.dist-info}/WHEEL +1 -1
  44. etlplus/README.md +0 -37
  45. etlplus/api/enums.py +0 -51
  46. etlplus/cli/README.md +0 -40
  47. etlplus/database/README.md +0 -48
  48. etlplus/file/README.md +0 -105
  49. etlplus/file/__init__.py +0 -25
  50. etlplus/file/_imports.py +0 -141
  51. etlplus/file/_io.py +0 -160
  52. etlplus/file/accdb.py +0 -78
  53. etlplus/file/arrow.py +0 -78
  54. etlplus/file/avro.py +0 -176
  55. etlplus/file/bson.py +0 -77
  56. etlplus/file/cbor.py +0 -78
  57. etlplus/file/cfg.py +0 -79
  58. etlplus/file/conf.py +0 -80
  59. etlplus/file/core.py +0 -322
  60. etlplus/file/csv.py +0 -79
  61. etlplus/file/dat.py +0 -78
  62. etlplus/file/dta.py +0 -77
  63. etlplus/file/duckdb.py +0 -78
  64. etlplus/file/enums.py +0 -343
  65. etlplus/file/feather.py +0 -111
  66. etlplus/file/fwf.py +0 -77
  67. etlplus/file/gz.py +0 -123
  68. etlplus/file/hbs.py +0 -78
  69. etlplus/file/hdf5.py +0 -78
  70. etlplus/file/ini.py +0 -79
  71. etlplus/file/ion.py +0 -78
  72. etlplus/file/jinja2.py +0 -78
  73. etlplus/file/json.py +0 -98
  74. etlplus/file/log.py +0 -78
  75. etlplus/file/mat.py +0 -78
  76. etlplus/file/mdb.py +0 -78
  77. etlplus/file/msgpack.py +0 -78
  78. etlplus/file/mustache.py +0 -78
  79. etlplus/file/nc.py +0 -78
  80. etlplus/file/ndjson.py +0 -108
  81. etlplus/file/numbers.py +0 -75
  82. etlplus/file/ods.py +0 -79
  83. etlplus/file/orc.py +0 -111
  84. etlplus/file/parquet.py +0 -113
  85. etlplus/file/pb.py +0 -78
  86. etlplus/file/pbf.py +0 -77
  87. etlplus/file/properties.py +0 -78
  88. etlplus/file/proto.py +0 -77
  89. etlplus/file/psv.py +0 -79
  90. etlplus/file/rda.py +0 -78
  91. etlplus/file/rds.py +0 -78
  92. etlplus/file/sas7bdat.py +0 -78
  93. etlplus/file/sav.py +0 -77
  94. etlplus/file/sqlite.py +0 -78
  95. etlplus/file/stub.py +0 -84
  96. etlplus/file/sylk.py +0 -77
  97. etlplus/file/tab.py +0 -81
  98. etlplus/file/toml.py +0 -78
  99. etlplus/file/tsv.py +0 -80
  100. etlplus/file/txt.py +0 -102
  101. etlplus/file/vm.py +0 -78
  102. etlplus/file/wks.py +0 -77
  103. etlplus/file/xls.py +0 -88
  104. etlplus/file/xlsm.py +0 -79
  105. etlplus/file/xlsx.py +0 -99
  106. etlplus/file/xml.py +0 -185
  107. etlplus/file/xpt.py +0 -78
  108. etlplus/file/yaml.py +0 -95
  109. etlplus/file/zip.py +0 -175
  110. etlplus/file/zsav.py +0 -77
  111. etlplus/ops/README.md +0 -50
  112. etlplus/ops/__init__.py +0 -61
  113. etlplus/templates/README.md +0 -46
  114. etlplus/workflow/README.md +0 -52
  115. etlplus/workflow/dag.py +0 -105
  116. etlplus/workflow/types.py +0 -115
  117. etlplus-0.9.2.dist-info/RECORD +0 -134
  118. {etlplus-0.9.2.dist-info → etlplus-0.10.1.dist-info}/entry_points.txt +0 -0
  119. {etlplus-0.9.2.dist-info → etlplus-0.10.1.dist-info}/licenses/LICENSE +0 -0
  120. {etlplus-0.9.2.dist-info → etlplus-0.10.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,204 @@
1
+ """
2
+ :mod:`etlplus.config.types` module.
3
+
4
+ Type aliases and editor-only TypedDicts for :mod:`etlplus.config`.
5
+
6
+ These types improve IDE autocomplete and static analysis while the runtime
7
+ parsers remain permissive.
8
+
9
+ Notes
10
+ -----
11
+ - TypedDicts in this module are intentionally ``total=False`` and are not
12
+ enforced at runtime.
13
+ - ``*.from_obj`` constructors accept ``Mapping[str, Any]`` and perform
14
+ tolerant parsing and light casting. This keeps the runtime permissive while
15
+ improving autocomplete and static analysis for contributors.
16
+
17
+ Examples
18
+ --------
19
>>> from etlplus.config import Connector
>>> src: Connector = {
...     "type": "file",
...     "path": "/data/input.csv",
... }
>>> tgt: Connector = {
...     "type": "database",
...     "connection_string": "postgresql://user:pass@localhost/db",
... }
>>> from etlplus.api import RetryPolicy
>>> rp: RetryPolicy = {"max_attempts": 3, "backoff": 0.5}
30
+ """
31
+
32
+ from __future__ import annotations
33
+
34
+ from collections.abc import Mapping
35
+ from typing import Any
36
+ from typing import Literal
37
+ from typing import TypedDict
38
+
39
+ from ..api import PaginationConfigMap
40
+ from ..api import RateLimitConfigMap
41
+ from ..types import StrAnyMap
42
+
43
+ # SECTION: EXPORTS ========================================================= #
44
+
45
+
46
+ __all__ = [
47
+ # Type aliases
48
+ 'ConnectorType',
49
+ # 'PaginationType',
50
+ # TypedDicts
51
+ 'ApiProfileDefaultsMap',
52
+ 'ApiProfileConfigMap',
53
+ 'ApiConfigMap',
54
+ 'EndpointMap',
55
+ 'ConnectorApiConfigMap',
56
+ 'ConnectorDbConfigMap',
57
+ 'ConnectorFileConfigMap',
58
+ ]
59
+
60
+
61
# SECTION: TYPE ALIASES ===================================================== #


# Closed set of connector kinds accepted by the config parsers.
type ConnectorType = Literal['api', 'database', 'file']

# Pagination kinds (kept commented for reference; not currently exported).
# type PaginationType = Literal['page', 'offset', 'cursor']
69
+
70
+
71
+ # SECTION: TYPED DICTS ====================================================== #
72
+
73
+
74
class ApiConfigMap(TypedDict, total=False):
    """
    Top-level API config shape parsed by ``ApiConfig.from_obj``.

    Either provide a 'base_url' with optional 'headers' and 'endpoints', or
    provide 'profiles' with at least one profile having a 'base_url'.

    Notes
    -----
    All keys are optional (``total=False``); this shape is editor-only and
    not enforced at runtime.

    See Also
    --------
    - etlplus.config.api.ApiConfig.from_obj: parses this mapping
    """

    # Root URL used when configuring the API directly (no profiles).
    base_url: str
    # Header name -> value mapping (values intentionally permissive).
    headers: StrAnyMap
    # Endpoint name -> full endpoint mapping, or a bare string
    # (presumably a path/URL shorthand — confirm in EndpointConfig.from_obj).
    endpoints: Mapping[str, EndpointMap | str]
    # Profile name -> per-profile configuration block.
    profiles: Mapping[str, ApiProfileConfigMap]
90
+
91
+
92
class ApiProfileConfigMap(TypedDict, total=False):
    """
    Shape accepted for a profile entry under ``ApiConfigMap.profiles``.

    Notes
    -----
    - All keys are optional here (``total=False``), but ``base_url`` is
      required at runtime when profiles are provided.
    - Not enforced at runtime; used for autocomplete/static analysis only.

    See Also
    --------
    - etlplus.config.api.ApiProfileConfig.from_obj: parses this mapping
    """

    # Root URL for requests made through this profile.
    base_url: str
    # Header name -> value mapping for this profile.
    headers: StrAnyMap
    # Path prefix joined onto base_url — presumably; confirm in from_obj.
    base_path: str
    # Authentication settings; structure is parser-defined.
    auth: StrAnyMap
    # Default headers/pagination/rate-limit applied to the profile.
    defaults: ApiProfileDefaultsMap
110
+
111
+
112
class ApiProfileDefaultsMap(TypedDict, total=False):
    """
    Defaults block available under a profile (all keys optional).

    Notes
    -----
    - Runtime expects header values to be str; typing remains permissive.
    - ``pagination``/``rate_limit`` accept either the structured TypedDict
      shape or a raw mapping, since the runtime parsers are tolerant.

    See Also
    --------
    - etlplus.config.api.ApiProfileConfig.from_obj: consumes this block
    - etlplus.config.pagination.PaginationConfig.from_obj: parses pagination
    - etlplus.api.rate_limiting.RateLimitConfig.from_obj: parses rate_limit
    """

    # Default headers for requests under the profile.
    headers: StrAnyMap
    # Structured pagination config, or a loose mapping parsed at runtime.
    pagination: PaginationConfigMap | StrAnyMap
    # Structured rate-limit config, or a loose mapping parsed at runtime.
    rate_limit: RateLimitConfigMap | StrAnyMap
130
+
131
+
132
class ConnectorApiConfigMap(TypedDict, total=False):
    """
    Shape accepted by ``ConnectorApi.from_obj`` (all keys optional).

    Notes
    -----
    Editor-only; not enforced at runtime.

    See Also
    --------
    - etlplus.config.connector.ConnectorApi.from_obj
    """

    # Human-readable connector name.
    name: str
    # Connector discriminator; expected to be 'api' for this shape.
    type: ConnectorType
    # Absolute request URL when not using a named api/endpoint reference.
    url: str
    # HTTP method name (case handling is up to the runtime parser).
    method: str
    # Request headers mapping.
    headers: StrAnyMap
    # Query-string parameters mapping.
    query_params: StrAnyMap
    # Pagination settings for this connector.
    pagination: PaginationConfigMap
    # Rate-limit settings for this connector.
    rate_limit: RateLimitConfigMap
    # Presumably references a named API config defined elsewhere —
    # confirm against ConnectorApi.from_obj.
    api: str
    # Presumably references a named endpoint of that API — confirm likewise.
    endpoint: str
151
+
152
+
153
class ConnectorDbConfigMap(TypedDict, total=False):
    """
    Shape accepted by ``ConnectorDb.from_obj`` (all keys optional).

    Notes
    -----
    Editor-only; not enforced at runtime.

    See Also
    --------
    - etlplus.config.connector.ConnectorDb.from_obj
    """

    # Human-readable connector name.
    name: str
    # Connector discriminator; expected to be 'database' for this shape.
    type: ConnectorType
    # SQLAlchemy-style connection string / DSN.
    connection_string: str
    # SQL text to execute — presumably for extraction; confirm in from_obj.
    query: str
    # Target table name — presumably for loading; confirm in from_obj.
    table: str
    # Write mode (e.g. append/replace) — semantics defined by the runtime.
    mode: str
168
+
169
+
170
class ConnectorFileConfigMap(TypedDict, total=False):
    """
    Shape accepted by ``ConnectorFile.from_obj`` (all keys optional).

    Notes
    -----
    Editor-only; not enforced at runtime.

    See Also
    --------
    - etlplus.config.connector.ConnectorFile.from_obj
    """

    # Human-readable connector name.
    name: str
    # Connector discriminator; expected to be 'file' for this shape.
    type: ConnectorType
    # File format name (e.g. 'csv', 'json') — parsed leniently at runtime.
    format: str
    # Filesystem path to read from or write to.
    path: str
    # Format-specific reader/writer options.
    options: StrAnyMap
184
+
185
+
186
class EndpointMap(TypedDict, total=False):
    """
    Shape accepted by ``EndpointConfig.from_obj``.

    One of 'path' or 'url' should be provided.

    Notes
    -----
    Editor-only; not enforced at runtime.

    See Also
    --------
    - etlplus.config.api.EndpointConfig.from_obj: parses this mapping
    """

    # Path relative to the owning API's base URL.
    path: str
    # Absolute URL overriding base_url/path composition.
    url: str
    # HTTP method name.
    method: str
    # Values substituted into templated path segments.
    path_params: StrAnyMap
    # Query-string parameters mapping.
    query_params: StrAnyMap
    # Request body; any JSON-serializable value — TODO confirm constraint.
    body: Any
    # Per-endpoint pagination settings.
    pagination: PaginationConfigMap
    # Per-endpoint rate-limit settings.
    rate_limit: RateLimitConfigMap
@@ -0,0 +1,120 @@
1
+ """
2
+ :mod:`etlplus.config.utils` module.
3
+
4
+ A module defining utility helpers for ETL pipeline configuration.
5
+
6
+ Notes
7
+ -----
8
+ - Inputs to parsers favor ``Mapping[str, Any]`` to remain permissive and
9
+ avoid unnecessary copies; normalization returns concrete types.
10
+ - Substitution is shallow for strings and recursive for containers.
11
+ - Numeric coercion helpers are intentionally forgiving: invalid values
12
+ become ``None`` rather than raising.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ from collections.abc import Iterable
18
+ from collections.abc import Mapping
19
+ from typing import Any
20
+
21
+ from ..types import StrAnyMap
22
+
23
+ # SECTION: EXPORTS ========================================================== #
24
+
25
+
26
+ __all__ = [
27
+ # Functions
28
+ 'deep_substitute',
29
+ ]
30
+
31
+
32
+ # SECTION: FUNCTIONS ======================================================== #
33
+
34
+
35
+ def deep_substitute(
36
+ value: Any,
37
+ vars_map: StrAnyMap | None,
38
+ env_map: Mapping[str, str] | None,
39
+ ) -> Any:
40
+ """
41
+ Recursively substitute ``${VAR}`` tokens in nested structures.
42
+
43
+ Only strings are substituted; other types are returned as-is.
44
+
45
+ Parameters
46
+ ----------
47
+ value : Any
48
+ The value to perform substitutions on.
49
+ vars_map : StrAnyMap | None
50
+ Mapping of variable names to replacement values (lower precedence).
51
+ env_map : Mapping[str, str] | None
52
+ Mapping of environment variables overriding ``vars_map`` values (higher
53
+ precedence).
54
+
55
+ Returns
56
+ -------
57
+ Any
58
+ New structure with substitutions applied where tokens were found.
59
+ """
60
+ substitutions = _prepare_substitutions(vars_map, env_map)
61
+
62
+ def _apply(node: Any) -> Any:
63
+ match node:
64
+ case str():
65
+ return _replace_tokens(node, substitutions)
66
+ case Mapping():
67
+ return {k: _apply(v) for k, v in node.items()}
68
+ case list() | tuple() as seq:
69
+ apply = [_apply(item) for item in seq]
70
+ return apply if isinstance(seq, list) else tuple(apply)
71
+ case set():
72
+ return {_apply(item) for item in node}
73
+ case frozenset():
74
+ return frozenset(_apply(item) for item in node)
75
+ case _:
76
+ return node
77
+
78
+ return _apply(value)
79
+
80
+
81
+ # SECTION: INTERNAL FUNCTIONS ============================================== #
82
+
83
+
84
+ def _prepare_substitutions(
85
+ vars_map: StrAnyMap | None,
86
+ env_map: Mapping[str, Any] | None,
87
+ ) -> tuple[tuple[str, Any], ...]:
88
+ """Merge variable and environment maps into an ordered substitutions list.
89
+
90
+ Parameters
91
+ ----------
92
+ vars_map : StrAnyMap | None
93
+ Mapping of variable names to replacement values (lower precedence).
94
+ env_map : Mapping[str, Any] | None
95
+ Environment-backed values that override entries from ``vars_map``.
96
+
97
+ Returns
98
+ -------
99
+ tuple[tuple[str, Any], ...]
100
+ Immutable sequence of ``(name, value)`` pairs suitable for token
101
+ replacement.
102
+ """
103
+ if not vars_map and not env_map:
104
+ return ()
105
+ merged: dict[str, Any] = {**(vars_map or {}), **(env_map or {})}
106
+ return tuple(merged.items())
107
+
108
+
109
+ def _replace_tokens(
110
+ text: str,
111
+ substitutions: Iterable[tuple[str, Any]],
112
+ ) -> str:
113
+ if not substitutions:
114
+ return text
115
+ out = text
116
+ for name, replacement in substitutions:
117
+ token = f'${{{name}}}'
118
+ if token in out:
119
+ out = out.replace(token, str(replacement))
120
+ return out
etlplus/database/ddl.py CHANGED
@@ -203,7 +203,7 @@ def load_table_spec(
203
203
  raise ValueError('Spec must be .json, .yml, or .yaml')
204
204
 
205
205
  try:
206
- spec = File(spec_path).read()
206
+ spec = File.read_file(spec_path)
207
207
  except ImportError as e:
208
208
  if suffix in {'.yml', '.yaml'}:
209
209
  raise RuntimeError(
@@ -113,7 +113,7 @@ def load_database_url_from_config(
113
113
  ValueError
114
114
  If no connection string/URL/DSN is found for the specified entry.
115
115
  """
116
- cfg = File(Path(path)).read()
116
+ cfg = File.read_file(Path(path))
117
117
  if not isinstance(cfg, Mapping):
118
118
  raise TypeError('Database config must be a mapping')
119
119
 
@@ -136,25 +136,9 @@ def load_database_url_from_config(
136
136
  return url
137
137
 
138
138
 
139
- def make_engine(
140
- url: str | None = None,
141
- **engine_kwargs: Any,
142
- ) -> Engine:
143
- """
144
- Create a SQLAlchemy Engine, defaulting to env config if no URL given.
145
-
146
- Parameters
147
- ----------
148
- url : str | None, optional
149
- Database URL/DSN string. When omitted, ``DATABASE_URL`` is used.
150
- **engine_kwargs : Any
151
- Extra keyword arguments forwarded to ``create_engine``.
139
def make_engine(url: str | None = None, **engine_kwargs: Any) -> Engine:
    """
    Create a SQLAlchemy Engine, defaulting to env config if no URL given.

    Parameters
    ----------
    url : str | None, optional
        Database URL/DSN string. When omitted (or empty), ``DATABASE_URL``
        is used instead.
    **engine_kwargs : Any
        Extra keyword arguments forwarded to ``create_engine``.

    Returns
    -------
    Engine
        Configured SQLAlchemy engine instance.
    """

    # pool_pre_ping validates pooled connections before handing them out.
    resolved_url = url or DATABASE_URL
    return create_engine(resolved_url, pool_pre_ping=True, **engine_kwargs)
160
144
 
etlplus/database/orm.py CHANGED
@@ -201,14 +201,12 @@ def build_models(
201
201
  ) -> ModelRegistry:
202
202
  """
203
203
  Build SQLAlchemy ORM models from table specifications.
204
-
205
204
  Parameters
206
205
  ----------
207
206
  specs : list[TableSpec]
208
207
  List of table specifications.
209
208
  base : type[DeclarativeBase], optional
210
209
  Base class for the ORM models (default: :class:`Base`).
211
-
212
210
  Returns
213
211
  -------
214
212
  ModelRegistry
@@ -260,7 +260,7 @@ def load_table_specs(
260
260
  list[TableSpec]
261
261
  A list of TableSpec instances parsed from the YAML file.
262
262
  """
263
- data = File(Path(path)).read()
263
+ data = File.read_file(Path(path))
264
264
  if not data:
265
265
  return []
266
266
 
etlplus/enums.py CHANGED
@@ -8,6 +8,7 @@ from __future__ import annotations
8
8
 
9
9
  import enum
10
10
  import operator as _op
11
+ from pathlib import PurePath
11
12
  from statistics import fmean
12
13
  from typing import Self
13
14
 
@@ -22,9 +23,18 @@ __all__ = [
22
23
  # Enums
23
24
  'AggregateName',
24
25
  'CoercibleStrEnum',
26
+ 'CompressionFormat',
25
27
  'DataConnectorType',
28
+ 'FileFormat',
29
+ 'HttpMethod',
26
30
  'OperatorName',
27
31
  'PipelineStep',
32
+ # Functions
33
+ 'coerce_compression_format',
34
+ 'coerce_data_connector_type',
35
+ 'coerce_file_format',
36
+ 'coerce_http_method',
37
+ 'infer_file_format_and_compression',
28
38
  ]
29
39
 
30
40
 
@@ -168,6 +178,39 @@ class AggregateName(CoercibleStrEnum):
168
178
  return lambda xs, n: (fmean(xs) if xs else 0.0)
169
179
 
170
180
 
181
class CompressionFormat(CoercibleStrEnum):
    """Supported compression formats for data files."""

    # -- Constants -- #

    GZ = 'gz'
    ZIP = 'zip'

    # -- Class Methods -- #

    @classmethod
    def aliases(cls) -> StrStrMap:
        """
        Return a mapping of common aliases for each enum member.

        Returns
        -------
        StrStrMap
            A mapping of alias names to their corresponding enum member names.
        """
        extension_aliases = {
            '.gz': 'gz',
            '.gzip': 'gz',
            '.zip': 'zip',
        }
        mime_aliases = {
            'application/gzip': 'gz',
            'application/x-gzip': 'gz',
            'application/zip': 'zip',
            'application/x-zip-compressed': 'zip',
        }
        # Extensions first, then MIME types — same insertion order as before.
        return {**extension_aliases, **mime_aliases}
+ }
212
+
213
+
171
214
  class DataConnectorType(CoercibleStrEnum):
172
215
  """Supported data connector types."""
173
216
 
@@ -199,6 +242,119 @@ class DataConnectorType(CoercibleStrEnum):
199
242
  }
200
243
 
201
244
 
245
class FileFormat(CoercibleStrEnum):
    """
    Supported file formats for extraction.

    Notes
    -----
    ``GZ`` and ``ZIP`` also appear in :class:`CompressionFormat`; helpers in
    this module treat them as compression wrappers rather than data formats.
    """

    # -- Constants -- #

    AVRO = 'avro'
    CSV = 'csv'
    FEATHER = 'feather'
    GZ = 'gz'
    JSON = 'json'
    NDJSON = 'ndjson'
    ORC = 'orc'
    PARQUET = 'parquet'
    TSV = 'tsv'
    TXT = 'txt'
    XLS = 'xls'
    XLSX = 'xlsx'
    ZIP = 'zip'
    XML = 'xml'
    YAML = 'yaml'

    # -- Class Methods -- #

    @classmethod
    def aliases(cls) -> StrStrMap:
        """
        Return a mapping of common aliases for each enum member.

        Aliases cover shorthand names, dotted file extensions, and MIME
        types so that coercion accepts any of the three spellings.

        Returns
        -------
        StrStrMap
            A mapping of alias names to their corresponding enum member names.
        """
        return {
            # Common shorthand
            'parq': 'parquet',
            'yml': 'yaml',
            # File extensions
            '.avro': 'avro',
            '.csv': 'csv',
            '.feather': 'feather',
            '.gz': 'gz',
            '.json': 'json',
            '.jsonl': 'ndjson',
            '.ndjson': 'ndjson',
            '.orc': 'orc',
            '.parquet': 'parquet',
            '.pq': 'parquet',
            '.tsv': 'tsv',
            '.txt': 'txt',
            '.xls': 'xls',
            '.xlsx': 'xlsx',
            '.zip': 'zip',
            '.xml': 'xml',
            '.yaml': 'yaml',
            '.yml': 'yaml',
            # MIME types
            'application/avro': 'avro',
            'application/feather': 'feather',
            'application/gzip': 'gz',
            'application/json': 'json',
            'application/jsonlines': 'ndjson',
            'application/ndjson': 'ndjson',
            'application/orc': 'orc',
            'application/vnd.apache.arrow.file': 'feather',
            'application/vnd.apache.orc': 'orc',
            'application/vnd.ms-excel': 'xls',
            # Key split across lines purely for line length; it is the
            # standard .xlsx MIME type.
            (
                'application/vnd.openxmlformats-'
                'officedocument.spreadsheetml.sheet'
            ): 'xlsx',
            'application/x-avro': 'avro',
            'application/x-ndjson': 'ndjson',
            'application/x-parquet': 'parquet',
            'application/xml': 'xml',
            'application/zip': 'zip',
            'text/csv': 'csv',
            'text/plain': 'txt',
            'text/tab-separated-values': 'tsv',
        }
325
+
326
+
327
class HttpMethod(CoercibleStrEnum):
    """Supported HTTP verbs that accept JSON payloads."""

    # -- Constants -- #

    CONNECT = 'connect'
    DELETE = 'delete'
    GET = 'get'
    HEAD = 'head'
    OPTIONS = 'options'
    PATCH = 'patch'
    POST = 'post'
    PUT = 'put'
    TRACE = 'trace'

    # -- Getters -- #

    @property
    def allows_body(self) -> bool:
        """
        Whether the method typically allows a request body.

        Notes
        -----
        - RFCs do not strictly forbid bodies on some other methods (e.g.,
          ``DELETE``), but many servers/clients do not expect them. Only
          ``POST``, ``PUT``, and ``PATCH`` report True here.
        """
        cls = type(self)
        return self in (cls.POST, cls.PUT, cls.PATCH)
356
+
357
+
202
358
  class OperatorName(CoercibleStrEnum):
203
359
  """Supported comparison operators with helpers."""
204
360
 
@@ -298,6 +454,13 @@ class PipelineStep(CoercibleStrEnum):
298
454
  # SECTION: INTERNAL CONSTANTS ============================================== #
299
455
 
300
456
 
457
+ # Compression formats that are also file formats.
458
+ _COMPRESSION_FILE_FORMATS: set[FileFormat] = {
459
+ FileFormat.GZ,
460
+ FileFormat.ZIP,
461
+ }
462
+
463
+
301
464
  # Precomputed order index for PipelineStep; avoids recomputing on each access.
302
465
  _PIPELINE_ORDER_INDEX: dict[PipelineStep, int] = {
303
466
  PipelineStep.FILTER: 0,
@@ -306,3 +469,106 @@ _PIPELINE_ORDER_INDEX: dict[PipelineStep, int] = {
306
469
  PipelineStep.SORT: 3,
307
470
  PipelineStep.AGGREGATE: 4,
308
471
  }
472
+
473
+
474
+ # SECTION: FUNCTIONS ======================================================== #
475
+
476
+
477
def coerce_data_connector_type(
    connector: DataConnectorType | str,
) -> DataConnectorType:
    """
    Coerce *connector* into a :class:`DataConnectorType` member.

    Backward-compatibility shim; new code should call
    :meth:`DataConnectorType.coerce` directly.

    Parameters
    ----------
    connector : DataConnectorType | str
        Existing member, or a textual value/alias to normalize.

    Returns
    -------
    DataConnectorType
        The matching enum member.
    """
    return DataConnectorType.coerce(connector)
487
+
488
+
489
def coerce_file_format(
    file_format: FileFormat | str,
) -> FileFormat:
    """
    Coerce *file_format* into a :class:`FileFormat` member.

    Backward-compatibility shim; new code should call
    :meth:`FileFormat.coerce` directly.

    Parameters
    ----------
    file_format : FileFormat | str
        Existing member, or a textual value/alias to normalize.

    Returns
    -------
    FileFormat
        The matching enum member.
    """
    return FileFormat.coerce(file_format)
499
+
500
+
501
def coerce_compression_format(
    compression_format: CompressionFormat | str,
) -> CompressionFormat:
    """
    Coerce *compression_format* into a :class:`CompressionFormat` member.

    Backward-compatibility shim; new code should call
    :meth:`CompressionFormat.coerce` directly.

    Parameters
    ----------
    compression_format : CompressionFormat | str
        Existing member, or a textual value/alias to normalize.

    Returns
    -------
    CompressionFormat
        The matching enum member.
    """
    return CompressionFormat.coerce(compression_format)
511
+
512
+
513
def coerce_http_method(
    http_method: HttpMethod | str,
) -> HttpMethod:
    """
    Coerce *http_method* into an :class:`HttpMethod` member.

    Backward-compatibility shim; new code should call
    :meth:`HttpMethod.coerce` directly.

    Parameters
    ----------
    http_method : HttpMethod | str
        Existing member, or a textual value/alias to normalize.

    Returns
    -------
    HttpMethod
        The matching enum member.
    """
    return HttpMethod.coerce(http_method)
523
+
524
+
525
def infer_file_format_and_compression(
    value: object,
) -> tuple[FileFormat | None, CompressionFormat | None]:
    """
    Infer data format and compression from a filename, extension, or MIME type.

    Parameters
    ----------
    value : object
        A filename, extension, MIME type, or existing enum member.

    Returns
    -------
    tuple[FileFormat | None, CompressionFormat | None]
        The inferred data format and compression, if any.

    Notes
    -----
    - Filename-suffix matches take precedence over MIME-type matches.
    - ``FileFormat.GZ``/``FileFormat.ZIP`` inputs and results are normalized
      to a compression-only answer (``(None, <compression>)``).
    """
    # Fast paths: already-coerced enum members need no text parsing.
    if isinstance(value, FileFormat):
        if value in _COMPRESSION_FILE_FORMATS:
            # gz/zip are compression wrappers, not data formats.
            return None, CompressionFormat.coerce(value.value)
        return value, None
    if isinstance(value, CompressionFormat):
        return None, value

    text = str(value).strip()
    if not text:
        return None, None

    # MIME types may carry parameters (e.g. "; charset=utf-8"); drop them.
    normalized = text.casefold()
    mime = normalized.split(';', 1)[0].strip()

    # First attempt: interpret the whole value as a MIME type / bare alias.
    compression = CompressionFormat.try_coerce(mime)
    fmt = FileFormat.try_coerce(mime)

    # Second attempt: filename suffixes override the MIME-based guesses.
    suffixes = PurePath(text).suffixes
    if suffixes:
        normalized_suffixes = [suffix.casefold() for suffix in suffixes]
        # The final suffix may name a compression (e.g. "data.csv.gz").
        compression = (
            CompressionFormat.try_coerce(normalized_suffixes[-1])
            or compression
        )
        if compression is not None:
            # Strip the compression suffix so the next one names the format.
            normalized_suffixes = normalized_suffixes[:-1]
        if normalized_suffixes:
            fmt = FileFormat.try_coerce(normalized_suffixes[-1]) or fmt

    # A bare gz/zip result (e.g. ".gz" or a gzip MIME) is compression-only.
    if fmt in _COMPRESSION_FILE_FORMATS:
        compression = compression or CompressionFormat.coerce(fmt.value)
        fmt = None

    return fmt, compression