etlplus 0.12.12__py3-none-any.whl → 0.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
etlplus/config/jobs.py CHANGED
@@ -34,10 +34,7 @@ __all__ = [
34
34
  ]
35
35
 
36
36
 
37
- # SECTION: TYPE ALIASES ===================================================== #
38
-
39
-
40
- # SECTION: CLASSES ========================================================== #
37
+ # SECTION: DATA CLASSES ===================================================== #
41
38
 
42
39
 
43
40
  @dataclass(kw_only=True, slots=True)
@@ -100,6 +97,8 @@ class JobConfig:
100
97
  Unique job name.
101
98
  description : str | None
102
99
  Optional human-friendly description.
100
+ depends_on : list[str]
101
+ Optional job dependency list. Dependencies must refer to other jobs.
103
102
  extract : ExtractRef | None
104
103
  Extraction reference.
105
104
  validate : ValidationRef | None
@@ -114,6 +113,7 @@ class JobConfig:
114
113
 
115
114
  name: str
116
115
  description: str | None = None
116
+ depends_on: list[str] = field(default_factory=list)
117
117
  extract: ExtractRef | None = None
118
118
  validate: ValidationRef | None = None
119
119
  transform: TransformRef | None = None
@@ -149,9 +149,19 @@ class JobConfig:
149
149
  if description is not None and not isinstance(description, str):
150
150
  description = str(description)
151
151
 
152
+ depends_raw = data.get('depends_on')
153
+ depends_on: list[str] = []
154
+ if isinstance(depends_raw, str):
155
+ depends_on = [depends_raw]
156
+ elif isinstance(depends_raw, list):
157
+ for entry in depends_raw:
158
+ if isinstance(entry, str):
159
+ depends_on.append(entry)
160
+
152
161
  return cls(
153
162
  name=name,
154
163
  description=description,
164
+ depends_on=depends_on,
155
165
  extract=ExtractRef.from_obj(data.get('extract')),
156
166
  validate=ValidationRef.from_obj(data.get('validate')),
157
167
  transform=TransformRef.from_obj(data.get('transform')),
etlplus/dag.py ADDED
@@ -0,0 +1,103 @@
1
+ """
2
+ :mod:`etlplus.dag` module.
3
+
4
+ Lightweight directed acyclic graph (DAG) helpers for ordering jobs based on
5
+ ``depends_on``.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from collections import deque
11
+ from dataclasses import dataclass
12
+
13
+ from .config.jobs import JobConfig
14
+
15
+ # SECTION: EXPORTS ========================================================== #
16
+
17
+
18
+ __all__ = [
19
+ 'DagError',
20
+ 'topological_sort_jobs',
21
+ ]
22
+
23
+
24
+ # SECTION: ERRORS =========================================================== #
25
+
26
+
27
+ @dataclass(slots=True)
28
+ class DagError(ValueError):
29
+ """
30
+ Raised when the job dependency graph is invalid.
31
+
32
+ Attributes
33
+ ----------
34
+ message : str
35
+ Error message.
36
+ """
37
+
38
+ # -- Attributes -- #
39
+
40
+ message: str
41
+
42
+ # -- Magic Methods (Object Representation) -- #
43
+
44
+ def __str__(self) -> str:
45
+ return self.message
46
+
47
+
48
+ # SECTION: FUNCTIONS ======================================================== #
49
+
50
+
51
def topological_sort_jobs(
    jobs: list[JobConfig],
) -> list[JobConfig]:
    """
    Return jobs in topological order based on ``depends_on``.

    The ordering is computed with Kahn's algorithm. Jobs without pending
    dependencies are released in alphabetical order, so the result is
    deterministic for a given set of jobs.

    Parameters
    ----------
    jobs : list[JobConfig]
        List of job configurations to sort.

    Returns
    -------
    list[JobConfig]
        Jobs sorted so that every job appears after all of its dependencies.

    Raises
    ------
    DagError
        If two jobs share a name, if a dependency is missing or
        self-referential, or when a cycle is detected.
    """
    # Build the name index explicitly: a dict comprehension would silently
    # collapse duplicate names, which would later be misreported as a cycle.
    index: dict[str, JobConfig] = {}
    for job in jobs:
        if job.name in index:
            raise DagError(f'Duplicate job name "{job.name}"')
        index[job.name] = job

    # edges[dep] holds the jobs that must run after ``dep``.
    edges: dict[str, set[str]] = {name: set() for name in index}
    indegree: dict[str, int] = {name: 0 for name in index}

    for job in jobs:
        for dep in job.depends_on:
            if dep not in index:
                raise DagError(
                    f'Unknown dependency "{dep}" in job "{job.name}"',
                )
            if dep == job.name:
                raise DagError(f'Job "{job.name}" depends on itself')
            # A dependency listed twice must only count once toward the
            # job's indegree.
            if job.name not in edges[dep]:
                edges[dep].add(job.name)
                indegree[job.name] += 1

    queue = deque(sorted(name for name, deg in indegree.items() if deg == 0))
    ordered: list[str] = []

    while queue:
        name = queue.popleft()
        ordered.append(name)
        for child in sorted(edges[name]):
            indegree[child] -= 1
            if indegree[child] == 0:
                queue.append(child)

    # Any node never released still has an unmet dependency, i.e. it sits on
    # (or behind) a cycle.
    if len(ordered) != len(index):
        raise DagError('Dependency cycle detected')

    return [index[name] for name in ordered]
etlplus/file/enums.py CHANGED
@@ -123,7 +123,7 @@ class FileFormat(CoercibleStrEnum):
123
123
  RDS = 'rds' # R data file
124
124
  SAS7BDAT = 'sas7bdat' # SAS data file
125
125
  SAV = 'sav' # SPSS data file
126
- SYLK = 'sylk' # Symbolic Link (SYmbolic LinK)
126
+ SYLK = 'sylk' # Symbolic Link
127
127
  XPT = 'xpt' # SAS Transport file
128
128
  ZSAV = 'zsav' # Compressed SPSS data file
129
129
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: etlplus
3
- Version: 0.12.12
3
+ Version: 0.13.0
4
4
  Summary: A Swiss Army knife for simple ETL operations
5
5
  Home-page: https://github.com/Dagitali/ETLPlus
6
6
  Author: ETLPlus Team
@@ -79,8 +79,10 @@ package and command-line interface for data extraction, validation, transformati
79
79
  - [Binary Serialization and Interchange](#binary-serialization-and-interchange)
80
80
  - [Databases and Embedded Storage](#databases-and-embedded-storage)
81
81
  - [Spreadsheets](#spreadsheets)
82
- - [Data Archives](#data-archives)
82
+ - [Statistical / Scientific / Numeric Computing](#statistical--scientific--numeric-computing)
83
83
  - [Logs and Event Streams](#logs-and-event-streams)
84
+ - [Data Archives](#data-archives)
85
+ - [Templates](#templates)
84
86
  - [Usage](#usage)
85
87
  - [Command Line Interface](#command-line-interface)
86
88
  - [Argument Order and Required Options](#argument-order-and-required-options)
@@ -221,93 +223,122 @@ DDL can be rendered from table specs for migrations or schema checks.
221
223
 
222
224
  ### Files (`file`)
223
225
 
224
- File formats are grouped as in `FileFormat`. Support is marked as:
226
+ Recognized file formats are listed in the tables below. Support for reading from or writing to a recognized file format is marked as:
225
227
 
226
228
  - **Y**: implemented (may require optional dependencies)
227
229
  - **N**: stubbed or not yet implemented
228
230
 
229
231
  #### Stubbed / Placeholder
230
232
 
231
- | Format | Supported | Description |
232
- | --- | --- | --- |
233
+ | Format | Read | Write | Description |
234
+ | --- | --- | --- | --- |
233
235
  | `stub` | N | N | Placeholder format for tests and future connectors. |
234
236
 
235
237
  #### Tabular & Delimited Text
236
238
 
237
- | Format | Supported | Description |
238
- | --- | --- | --- |
239
- | `csv` | Y | Comma-Separated Values |
240
- | `fwf` | N | Fixed-Width Fields |
241
- | `dat` | N | Generic data file, often delimited or fixed-width |
242
- | `psv` | N | Pipe-Separated Values |
243
- | `tab` | N | Often synonymous with TSV |
244
- | `tsv` | Y | Tab-Separated Values |
245
- | `txt` | Y | Plain text, often delimited or fixed-width |
239
+ | Format | Read | Write | Description |
240
+ | --- | --- | --- | --- |
241
+ | `csv` | Y | Y | Comma-Separated Values |
242
+ | `dat` | N | N | Generic data file, often delimited or fixed-width |
243
+ | `fwf` | N | N | Fixed-Width Fields |
244
+ | `psv` | N | N | Pipe-Separated Values |
245
+ | `tab` | N | N | Often synonymous with TSV |
246
+ | `tsv` | Y | Y | Tab-Separated Values |
247
+ | `txt` | Y | Y | Plain text, often delimited or fixed-width |
246
248
 
247
249
  #### Semi-Structured Text
248
250
 
249
- | Format | Supported | Description |
250
- | --- | --- | --- |
251
- | `cfg` | N | Config-style key-value pairs |
252
- | `conf` | N | Config-style key-value pairs |
253
- | `ini` | N | Config-style key-value pairs |
254
- | `json` | Y | JavaScript Object Notation |
255
- | `ndjson` | Y | Newline-Delimited JSON |
256
- | `properties` | N | Java-style key-value pairs |
257
- | `toml` | N | Tom's Obvious Minimal Language |
258
- | `xml` | Y | Extensible Markup Language |
259
- | `yaml` | Y | YAML Ain't Markup Language |
251
+ | Format | Read | Write | Description |
252
+ | --- | --- | --- | --- |
253
+ | `cfg` | N | N | Config-style key-value pairs |
254
+ | `conf` | N | N | Config-style key-value pairs |
255
+ | `ini` | N | N | Config-style key-value pairs |
256
+ | `json` | Y | Y | JavaScript Object Notation |
257
+ | `ndjson` | Y | Y | Newline-Delimited JSON |
258
+ | `properties` | N | N | Java-style key-value pairs |
259
+ | `toml` | N | N | Tom's Obvious Minimal Language |
260
+ | `xml` | Y | Y | Extensible Markup Language |
261
+ | `yaml` | Y | Y | YAML Ain't Markup Language |
260
262
 
261
263
  #### Columnar / Analytics-Friendly
262
264
 
263
- | Format | Supported | Description |
264
- | --- | --- | --- |
265
- | `arrow` | N | Apache Arrow IPC |
266
- | `feather` | Y | Apache Arrow Feather |
267
- | `orc` | Y | Optimized Row Columnar; common in Hadoop |
268
- | `parquet` | Y | Apache Parquet; common in Big Data |
265
+ | Format | Read | Write | Description |
266
+ | --- | --- | --- | --- |
267
+ | `arrow` | N | N | Apache Arrow IPC |
268
+ | `feather` | Y | Y | Apache Arrow Feather |
269
+ | `orc` | Y | Y | Optimized Row Columnar; common in Hadoop |
270
+ | `parquet` | Y | Y | Apache Parquet; common in Big Data |
269
271
 
270
272
  #### Binary Serialization and Interchange
271
273
 
272
- | Format | Supported | Description |
273
- | --- | --- | --- |
274
- | `avro` | Y | Apache Avro |
275
- | `bson` | N | Binary JSON; common with MongoDB exports/dumps |
276
- | `cbor` | N | Concise Binary Object Representation |
277
- | `ion` | N | Amazon Ion |
278
- | `msgpack` | N | MessagePack |
279
- | `pb` | N | Protocol Buffers (Google Protobuf) |
280
- | `pbf` | N | Protocolbuffer Binary Format; often for GIS data |
281
- | `proto` | N | Protocol Buffers schema; often in .pb / .bin |
274
+ | Format | Read | Write | Description |
275
+ | --- | --- | --- | --- |
276
+ | `avro` | Y | Y | Apache Avro |
277
+ | `bson` | N | N | Binary JSON; common with MongoDB exports/dumps |
278
+ | `cbor` | N | N | Concise Binary Object Representation |
279
+ | `ion` | N | N | Amazon Ion |
280
+ | `msgpack` | N | N | MessagePack |
281
+ | `pb` | N | N | Protocol Buffers (Google Protobuf) |
282
+ | `pbf` | N | N | Protocolbuffer Binary Format; often for GIS data |
283
+ | `proto` | N | N | Protocol Buffers schema; often in .pb / .bin |
282
284
 
283
285
  #### Databases and Embedded Storage
284
286
 
285
- | Format | Supported | Description |
286
- | --- | --- | --- |
287
- | `accdb` | N | Microsoft Access database file (newer format) |
288
- | `duckdb` | N | DuckDB database file |
289
- | `mdb` | N | Microsoft Access database file (older format) |
290
- | `sqlite` | N | SQLite database file |
287
+ | Format | Read | Write | Description |
288
+ | --- | --- | --- | --- |
289
+ | `accdb` | N | N | Microsoft Access (newer format) |
290
+ | `duckdb` | N | N | DuckDB |
291
+ | `mdb` | N | N | Microsoft Access (older format) |
292
+ | `sqlite` | N | N | SQLite |
291
293
 
292
294
  #### Spreadsheets
293
295
 
296
+ | Format | Read | Write | Description |
297
+ | --- | --- | --- | --- |
298
+ | `numbers` | N | N | Apple Numbers |
299
+ | `ods` | N | N | OpenDocument |
300
+ | `wks` | N | N | Lotus 1-2-3 |
301
+ | `xls` | Y | Y | Microsoft Excel (BIFF) |
302
+ | `xlsm` | N | N | Microsoft Excel Macro-Enabled (Open XML) |
303
+ | `xlsx` | Y | Y | Microsoft Excel (Open XML) |
304
+
305
+ #### Statistical / Scientific / Numeric Computing
306
+
307
+ | Format | Read | Write | Description |
308
+ | --- | --- | --- | --- |
309
+ | `dta` | N | N | Stata |
310
+ | `hdf5` | N | N | Hierarchical Data Format |
311
+ | `mat` | N | N | MATLAB |
312
+ | `nc` | N | N | NetCDF |
313
+ | `rda` | N | N | RData workspace/object |
314
+ | `rds` | N | N | R data |
315
+ | `sas7bdat` | N | N | SAS data |
316
+ | `sav` | N | N | SPSS data |
317
+ | `sylk` | N | N | Symbolic Link |
318
+ | `xpt` | N | N | SAS Transport |
319
+ | `zsav` | N | N | Compressed SPSS data |
320
+
321
+ #### Logs and Event Streams
322
+
294
323
  | Format | Read | Write | Description |
295
324
  | --- | --- | --- | --- |
296
- | `xls` | Y | Microsoft Excel (BIFF); read-only |
297
- | `xlsx` | Y | Microsoft Excel (Open XML) |
325
+ | `log` | N | N | Generic log file |
298
326
 
299
327
  #### Data Archives
300
328
 
301
- | Format | Supported | Description |
302
- | --- | --- | --- |
303
- | `gz` | Y | Gzip-compressed file |
304
- | `zip` | Y | ZIP archive |
329
+ | Format | Read | Write | Description |
330
+ | --- | --- | --- | --- |
331
+ | `gz` | Y | Y | Gzip-compressed file |
332
+ | `zip` | Y | Y | ZIP archive |
305
333
 
306
- #### Logs and Event Streams
334
+ #### Templates
307
335
 
308
- | Format | Supported | Description |
309
- | --- | --- | --- |
310
- | `log` | N | Generic log file |
336
+ | Format | Read | Write | Description |
337
+ | --- | --- | --- | --- |
338
+ | `hbs` | N | N | Handlebars |
339
+ | `jinja2` | N | N | Jinja2 |
340
+ | `mustache` | N | N | Mustache |
341
+ | `vm` | N | N | Apache Velocity |
311
342
 
312
343
  ## Usage
313
344
 
@@ -2,6 +2,7 @@ etlplus/README.md,sha256=5jNes37UIy_THNmUr5HSAyS5mTCTa5tqRfEPnvsgV4s,1455
2
2
  etlplus/__init__.py,sha256=M2gScnyir6WOMAh_EuoQIiAzdcTls0_5hbd_Q6of8I0,1021
3
3
  etlplus/__main__.py,sha256=btoROneNiigyfBU7BSzPKZ1R9gzBMpxcpsbPwmuHwTM,479
4
4
  etlplus/__version__.py,sha256=1E0GMK_yUWCMQFKxXjTvyMwofi0qT2k4CDNiHWiymWE,327
5
+ etlplus/dag.py,sha256=4EYmBsJax3y4clHv10jjdp3GrBBD_WblvtxUb_JxGCQ,2464
5
6
  etlplus/enums.py,sha256=WyxpUEUPdYdXlueKDXGaSEo7o9OqCXyzjDOOPqmW8tw,8326
6
7
  etlplus/extract.py,sha256=LOyL8_KCOaIGemTxSnKbN_ttfLWUljqT4OQxANe7G3k,6089
7
8
  etlplus/load.py,sha256=aufl-2CpuI_J1hKBY1uFsoVf9Gfl9bKQjs233dYFf00,8631
@@ -43,7 +44,7 @@ etlplus/cli/types.py,sha256=tclhKVJXDqHzlTQBYKARfqMgDOcuBJ-Zej2pvFy96WM,652
43
44
  etlplus/config/README.md,sha256=ot6oFZxTz4x83mj1_FrQ13dO0z2QkRFDnkCkx7NPsSs,1636
44
45
  etlplus/config/__init__.py,sha256=VZWzOg7d2YR9NT6UwKTv44yf2FRUMjTHynkm1Dl5Qzo,1486
45
46
  etlplus/config/connector.py,sha256=0-TIwevHbKRHVmucvyGpPd-3tB1dKHB-dj0yJ6kq5eY,9809
46
- etlplus/config/jobs.py,sha256=hmzRCqt0OvCEZZR4ONKrd3lvSv0OmayjLc4yOBk3ug8,7399
47
+ etlplus/config/jobs.py,sha256=oa2rNwacy2b_1HP_iFDLarGnn812ZVP2k5cHt4eqBIg,7843
47
48
  etlplus/config/pipeline.py,sha256=m4Jh0ctFcKrIx6zR7LEC0sYY5wq0o8NqOruWPlz6qmA,9494
48
49
  etlplus/config/profile.py,sha256=Ss2zedQGjkaGSpvBLTD4SZaWViMJ7TJPLB8Q2_BTpPg,1898
49
50
  etlplus/config/types.py,sha256=a0epJ3z16HQ5bY3Ctf8s_cQPa3f0HHcwdOcjCP2xoG4,4954
@@ -71,7 +72,7 @@ etlplus/file/csv.py,sha256=6zXt7OKXm_6k8MrDyw8DdEwpQQrmrxG6myrDplF87_E,1744
71
72
  etlplus/file/dat.py,sha256=j-GpY49SmkZtDUzZK6CbrHY9k6N83pyGcMqVGgJZ9cs,1642
72
73
  etlplus/file/dta.py,sha256=cEprcahuYEncDYEBZiEoHyg-1jgBsr9eCHPLdI-naXM,1616
73
74
  etlplus/file/duckdb.py,sha256=hQ8PWcvYILpkgPEtWeqbT_0yhQpJN9bJh1OwQQCcRD4,1631
74
- etlplus/file/enums.py,sha256=O40_cHazyvLFTK1hSErK0a8EMYQy4TJPX4El7cXO3pc,11083
75
+ etlplus/file/enums.py,sha256=d3VGF1IbI2Vye3A9WLeMyo5dqlo2Q1TrUp9GaG7a62g,11067
75
76
  etlplus/file/feather.py,sha256=U19T5V_08ooK1STclE9nDvsnUFzu7nvQvi6J09iC-_0,2669
76
77
  etlplus/file/fwf.py,sha256=WLbvL94cyLCOYfhABJvoIQc1fljtz3yOuA7X4Fc4QGo,1589
77
78
  etlplus/file/gz.py,sha256=NKsvIV7TIWn8USbvuZmRH9hr6OrXh4TzTfDykHD41Kk,2631
@@ -125,9 +126,9 @@ etlplus/templates/view.sql.j2,sha256=Iy8DHfhq5yyvrUKDxqp_aHIEXY4Tm6j4wT7YDEFWAhk
125
126
  etlplus/validation/README.md,sha256=qusyiyJu2DsaK80jlwfXVZ0iDgeuTPOX2EL3a_fcFiw,1401
126
127
  etlplus/validation/__init__.py,sha256=Pe5Xg1_EA4uiNZGYu5WTF3j7odjmyxnAJ8rcioaplSQ,1254
127
128
  etlplus/validation/utils.py,sha256=Mtqg449VIke0ziy_wd2r6yrwJzQkA1iulZC87FzXMjo,10201
128
- etlplus-0.12.12.dist-info/licenses/LICENSE,sha256=MuNO63i6kWmgnV2pbP2SLqP54mk1BGmu7CmbtxMmT-U,1069
129
- etlplus-0.12.12.dist-info/METADATA,sha256=GsFkraquAsxEY-ldW-heU_9WKIMHXKANlc9FsKTRoN8,26886
130
- etlplus-0.12.12.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
131
- etlplus-0.12.12.dist-info/entry_points.txt,sha256=6w-2-jzuPa55spzK34h-UKh2JTEShh38adFRONNP9QE,45
132
- etlplus-0.12.12.dist-info/top_level.txt,sha256=aWWF-udn_sLGuHTM6W6MLh99ArS9ROkUWO8Mi8y1_2U,8
133
- etlplus-0.12.12.dist-info/RECORD,,
129
+ etlplus-0.13.0.dist-info/licenses/LICENSE,sha256=MuNO63i6kWmgnV2pbP2SLqP54mk1BGmu7CmbtxMmT-U,1069
130
+ etlplus-0.13.0.dist-info/METADATA,sha256=_L0jck50nGtiKn2XwWnpUwHd9ylP3grWBZhATo9ibLM,28104
131
+ etlplus-0.13.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
132
+ etlplus-0.13.0.dist-info/entry_points.txt,sha256=6w-2-jzuPa55spzK34h-UKh2JTEShh38adFRONNP9QE,45
133
+ etlplus-0.13.0.dist-info/top_level.txt,sha256=aWWF-udn_sLGuHTM6W6MLh99ArS9ROkUWO8Mi8y1_2U,8
134
+ etlplus-0.13.0.dist-info/RECORD,,