etlplus 0.11.11__py3-none-any.whl → 0.12.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
etlplus/file/parquet.py CHANGED
@@ -1,21 +1,81 @@
1
1
  """
2
2
  :mod:`etlplus.file.parquet` module.
3
3
 
4
- Stub helpers for PARQUET read/write.
4
+ Helpers for reading/writing Parquet files.
5
5
  """
6
6
 
7
7
  from __future__ import annotations
8
8
 
9
9
  from pathlib import Path
10
+ from typing import Any
11
+ from typing import cast
10
12
 
11
13
  from ..types import JSONData
14
+ from ..types import JSONDict
15
+ from ..types import JSONList
12
16
 
13
17
  # SECTION: EXPORTS ========================================================== #
14
18
 
15
19
 
16
- def read(path: Path) -> JSONData:
20
+ __all__ = [
21
+ 'read',
22
+ 'write',
23
+ ]
24
+
25
+
26
+ # SECTION: INTERNAL CONSTANTS =============================================== #
27
+
28
+
29
+ _PANDAS_CACHE: dict[str, Any] = {}
30
+
31
+
32
+ # SECTION: INTERNAL FUNCTIONS =============================================== #
33
+
34
+
35
+ def _get_pandas() -> Any:
36
+ """
37
+ Return the pandas module, importing it on first use.
38
+
39
+ Raises an informative ImportError if the optional dependency is missing.
40
+ """
41
+ mod = _PANDAS_CACHE.get('mod')
42
+ if mod is not None: # pragma: no cover - tiny branch
43
+ return mod
44
+ try:
45
+ _pd = __import__('pandas') # type: ignore[assignment]
46
+ except ImportError as e: # pragma: no cover
47
+ raise ImportError(
48
+ 'Parquet support requires optional dependency "pandas".\n'
49
+ 'Install with: pip install pandas',
50
+ ) from e
51
+ _PANDAS_CACHE['mod'] = _pd
52
+
53
+ return _pd
54
+
55
+
56
+ def _normalize_records(data: JSONData) -> JSONList:
17
57
  """
18
- Read PARQUET content from ``path``.
58
+ Normalize JSON payloads into a list of dictionaries.
59
+
60
+ Raises TypeError when payloads contain non-dict items.
61
+ """
62
+ if isinstance(data, list):
63
+ if not all(isinstance(item, dict) for item in data):
64
+ raise TypeError(
65
+ 'Parquet payloads must contain only objects (dicts)',
66
+ )
67
+ return cast(JSONList, data)
68
+ return [cast(JSONDict, data)]
69
+
70
+
71
+ # SECTION: FUNCTIONS ======================================================== #
72
+
73
+
74
+ def read(
75
+ path: Path,
76
+ ) -> JSONList:
77
+ """
78
+ Read Parquet content from ``path``.
19
79
 
20
80
  Parameters
21
81
  ----------
@@ -24,20 +84,32 @@ def read(path: Path) -> JSONData:
24
84
 
25
85
  Returns
26
86
  -------
27
- JSONData
28
- Parsed payload.
87
+ JSONList
88
+ The list of dictionaries read from the Parquet file.
29
89
 
30
90
  Raises
31
91
  ------
32
- NotImplementedError
33
- PARQUET :func:`read` is not implemented yet.
92
+ ImportError
93
+ If optional dependencies for Parquet support are missing.
34
94
  """
35
- raise NotImplementedError('PARQUET read is not implemented yet')
95
+ pandas = _get_pandas()
96
+ try:
97
+ frame = pandas.read_parquet(path)
98
+ except ImportError as e: # pragma: no cover
99
+ raise ImportError(
100
+ 'Parquet support requires optional dependency '
101
+ '"pyarrow" or "fastparquet".\n'
102
+ 'Install with: pip install pyarrow',
103
+ ) from e
104
+ return cast(JSONList, frame.to_dict(orient='records'))
36
105
 
37
106
 
38
- def write(path: Path, data: JSONData) -> int:
107
+ def write(
108
+ path: Path,
109
+ data: JSONData,
110
+ ) -> int:
39
111
  """
40
- Write ``data`` to PARQUET at ``path``.
112
+ Write ``data`` to Parquet at ``path`` and return record count.
41
113
 
42
114
  Parameters
43
115
  ----------
@@ -53,7 +125,22 @@ def write(path: Path, data: JSONData) -> int:
53
125
 
54
126
  Raises
55
127
  ------
56
- NotImplementedError
57
- PARQUET :func:`write` is not implemented yet.
128
+ ImportError
129
+ If optional dependencies for Parquet support are missing.
58
130
  """
59
- raise NotImplementedError('PARQUET write is not implemented yet')
131
+ records = _normalize_records(data)
132
+ if not records:
133
+ return 0
134
+
135
+ pandas = _get_pandas()
136
+ path.parent.mkdir(parents=True, exist_ok=True)
137
+ frame = pandas.DataFrame.from_records(records)
138
+ try:
139
+ frame.to_parquet(path, index=False)
140
+ except ImportError as e: # pragma: no cover
141
+ raise ImportError(
142
+ 'Parquet support requires optional dependency '
143
+ '"pyarrow" or "fastparquet".\n'
144
+ 'Install with: pip install pyarrow',
145
+ ) from e
146
+ return len(records)
etlplus/file/tsv.py CHANGED
@@ -1,19 +1,34 @@
1
1
  """
2
2
  :mod:`etlplus.file.tsv` module.
3
3
 
4
- Stub helpers for TSV read/write.
4
+ Helpers for reading/writing TSV files.
5
5
  """
6
6
 
7
7
  from __future__ import annotations
8
8
 
9
+ import csv
9
10
  from pathlib import Path
11
+ from typing import cast
10
12
 
11
13
  from ..types import JSONData
14
+ from ..types import JSONDict
15
+ from ..types import JSONList
12
16
 
13
17
  # SECTION: EXPORTS ========================================================== #
14
18
 
15
19
 
16
- def read(path: Path) -> JSONData:
20
+ __all__ = [
21
+ 'read',
22
+ 'write',
23
+ ]
24
+
25
+
26
+ # SECTION: FUNCTIONS ======================================================== #
27
+
28
+
29
+ def read(
30
+ path: Path,
31
+ ) -> JSONList:
17
32
  """
18
33
  Read TSV content from ``path``.
19
34
 
@@ -24,36 +39,53 @@ def read(path: Path) -> JSONData:
24
39
 
25
40
  Returns
26
41
  -------
27
- JSONData
28
- Parsed payload.
29
-
30
- Raises
31
- ------
32
- NotImplementedError
33
- TSV :func:`read` is not implemented yet.
42
+ JSONList
43
+ The list of dictionaries read from the TSV file.
34
44
  """
35
- raise NotImplementedError('TSV read is not implemented yet')
45
+ with path.open('r', encoding='utf-8', newline='') as handle:
46
+ reader: csv.DictReader[str] = csv.DictReader(handle, delimiter='\t')
47
+ rows: JSONList = []
48
+ for row in reader:
49
+ if not any(row.values()):
50
+ continue
51
+ rows.append(cast(JSONDict, dict(row)))
52
+ return rows
36
53
 
37
54
 
38
- def write(path: Path, data: JSONData) -> int:
55
+ def write(
56
+ path: Path,
57
+ data: JSONData,
58
+ ) -> int:
39
59
  """
40
- Write ``data`` to TSV at ``path``.
60
+ Write ``data`` to TSV at ``path`` and return record count.
41
61
 
42
62
  Parameters
43
63
  ----------
44
64
  path : Path
45
65
  Path to the TSV file on disk.
46
66
  data : JSONData
47
- Data to write.
67
+ Data to write as TSV. Should be a list of dictionaries or a
68
+ single dictionary.
48
69
 
49
70
  Returns
50
71
  -------
51
72
  int
52
- Number of records written.
53
-
54
- Raises
55
- ------
56
- NotImplementedError
57
- TSV :func:`write` is not implemented yet.
73
+ The number of rows written to the TSV file.
58
74
  """
59
- raise NotImplementedError('TSV write is not implemented yet')
75
+ rows: list[JSONDict]
76
+ if isinstance(data, list):
77
+ rows = [row for row in data if isinstance(row, dict)]
78
+ else:
79
+ rows = [data]
80
+
81
+ if not rows:
82
+ return 0
83
+
84
+ fieldnames = sorted({key for row in rows for key in row})
85
+ with path.open('w', encoding='utf-8', newline='') as handle:
86
+ writer = csv.DictWriter(handle, fieldnames=fieldnames, delimiter='\t')
87
+ writer.writeheader()
88
+ for row in rows:
89
+ writer.writerow({field: row.get(field) for field in fieldnames})
90
+
91
+ return len(rows)
etlplus/file/txt.py CHANGED
@@ -1,19 +1,34 @@
1
1
  """
2
2
  :mod:`etlplus.file.txt` module.
3
3
 
4
- Stub helpers for TXT read/write.
4
+ Helpers for reading/writing text files.
5
5
  """
6
6
 
7
7
  from __future__ import annotations
8
8
 
9
9
  from pathlib import Path
10
+ from typing import cast
10
11
 
11
12
  from ..types import JSONData
13
+ from ..types import JSONDict
14
+ from ..types import JSONList
15
+ from ..utils import count_records
12
16
 
13
17
  # SECTION: EXPORTS ========================================================== #
14
18
 
15
19
 
16
- def read(path: Path) -> JSONData:
20
+ __all__ = [
21
+ 'read',
22
+ 'write',
23
+ ]
24
+
25
+
26
+ # SECTION: FUNCTIONS ======================================================== #
27
+
28
+
29
+ def read(
30
+ path: Path,
31
+ ) -> JSONList:
17
32
  """
18
33
  Read TXT content from ``path``.
19
34
 
@@ -24,27 +39,32 @@ def read(path: Path) -> JSONData:
24
39
 
25
40
  Returns
26
41
  -------
27
- JSONData
28
- Parsed payload.
29
-
30
- Raises
31
- ------
32
- NotImplementedError
33
- TXT :func:`read` is not implemented yet.
42
+ JSONList
43
+ The list of dictionaries read from the TXT file.
34
44
  """
35
- raise NotImplementedError('TXT read is not implemented yet')
45
+ rows: JSONList = []
46
+ with path.open('r', encoding='utf-8') as handle:
47
+ for line in handle:
48
+ text = line.rstrip('\n')
49
+ if text == '':
50
+ continue
51
+ rows.append({'text': text})
52
+ return rows
36
53
 
37
54
 
38
- def write(path: Path, data: JSONData) -> int:
55
+ def write(
56
+ path: Path,
57
+ data: JSONData,
58
+ ) -> int:
39
59
  """
40
- Write ``data`` to TXT at ``path``.
60
+ Write ``data`` to TXT at ``path`` and return record count.
41
61
 
42
62
  Parameters
43
63
  ----------
44
64
  path : Path
45
65
  Path to the TXT file on disk.
46
66
  data : JSONData
47
- Data to write.
67
+ Data to write. Expects ``{'text': '...'}`` or a list of those.
48
68
 
49
69
  Returns
50
70
  -------
@@ -53,7 +73,27 @@ def write(path: Path, data: JSONData) -> int:
53
73
 
54
74
  Raises
55
75
  ------
56
- NotImplementedError
57
- TXT :func:`write` is not implemented yet.
76
+ TypeError
77
+ If any item in ``data`` is not a dictionary or if any dictionary
78
+ does not contain a ``'text'`` key.
58
79
  """
59
- raise NotImplementedError('TXT write is not implemented yet')
80
+ rows: JSONList
81
+ if isinstance(data, list):
82
+ if not all(isinstance(item, dict) for item in data):
83
+ raise TypeError('TXT payloads must contain only objects (dicts)')
84
+ rows = cast(JSONList, data)
85
+ else:
86
+ rows = [cast(JSONDict, data)]
87
+
88
+ if not rows:
89
+ return 0
90
+
91
+ path.parent.mkdir(parents=True, exist_ok=True)
92
+ with path.open('w', encoding='utf-8') as handle:
93
+ for row in rows:
94
+ if 'text' not in row:
95
+ raise TypeError('TXT payloads must include a "text" key')
96
+ handle.write(str(row['text']))
97
+ handle.write('\n')
98
+
99
+ return count_records(rows)
etlplus/file/xls.py CHANGED
@@ -1,19 +1,77 @@
1
1
  """
2
2
  :mod:`etlplus.file.xls` module.
3
3
 
4
- Stub helpers for XLS read/write.
4
+ Helpers for reading/writing Excel XLS files.
5
5
  """
6
6
 
7
7
  from __future__ import annotations
8
8
 
9
9
  from pathlib import Path
10
+ from typing import Any
11
+ from typing import cast
10
12
 
11
13
  from ..types import JSONData
14
+ from ..types import JSONDict
15
+ from ..types import JSONList
12
16
 
13
17
  # SECTION: EXPORTS ========================================================== #
14
18
 
15
19
 
16
- def read(path: Path) -> JSONData:
20
+ __all__ = [
21
+ 'read',
22
+ 'write',
23
+ ]
24
+
25
+
26
+ # SECTION: INTERNAL CONSTANTS =============================================== #
27
+
28
+
29
+ _PANDAS_CACHE: dict[str, Any] = {}
30
+
31
+
32
+ # SECTION: INTERNAL FUNCTIONS =============================================== #
33
+
34
+
35
+ def _get_pandas() -> Any:
36
+ """
37
+ Return the pandas module, importing it on first use.
38
+
39
+ Raises an informative ImportError if the optional dependency is missing.
40
+ """
41
+ mod = _PANDAS_CACHE.get('mod')
42
+ if mod is not None: # pragma: no cover - tiny branch
43
+ return mod
44
+ try:
45
+ _pd = __import__('pandas') # type: ignore[assignment]
46
+ except ImportError as e: # pragma: no cover
47
+ raise ImportError(
48
+ 'XLS support requires optional dependency "pandas".\n'
49
+ 'Install with: pip install pandas',
50
+ ) from e
51
+ _PANDAS_CACHE['mod'] = _pd
52
+
53
+ return _pd
54
+
55
+
56
+ def _normalize_records(data: JSONData) -> JSONList:
57
+ """
58
+ Normalize JSON payloads into a list of dictionaries.
59
+
60
+ Raises TypeError when payloads contain non-dict items.
61
+ """
62
+ if isinstance(data, list):
63
+ if not all(isinstance(item, dict) for item in data):
64
+ raise TypeError('XLS payloads must contain only objects (dicts)')
65
+ return cast(JSONList, data)
66
+ return [cast(JSONDict, data)]
67
+
68
+
69
+ # SECTION: FUNCTIONS ======================================================== #
70
+
71
+
72
+ def read(
73
+ path: Path,
74
+ ) -> JSONList:
17
75
  """
18
76
  Read XLS content from ``path``.
19
77
 
@@ -24,20 +82,35 @@ def read(path: Path) -> JSONData:
24
82
 
25
83
  Returns
26
84
  -------
27
- JSONData
28
- Parsed payload.
85
+ JSONList
86
+ The list of dictionaries read from the XLS file.
29
87
 
30
88
  Raises
31
89
  ------
32
- NotImplementedError
33
- XLS :func:`read` is not implemented yet.
90
+ ImportError
91
+ If the optional dependency "xlrd" is not installed.
34
92
  """
35
- raise NotImplementedError('XLS read is not implemented yet')
93
+ pandas = _get_pandas()
94
+ try:
95
+ frame = pandas.read_excel(path, engine='xlrd')
96
+ except ImportError as e: # pragma: no cover
97
+ raise ImportError(
98
+ 'XLS support requires optional dependency "xlrd".\n'
99
+ 'Install with: pip install xlrd',
100
+ ) from e
101
+ return cast(JSONList, frame.to_dict(orient='records'))
36
102
 
37
103
 
38
- def write(path: Path, data: JSONData) -> int:
104
+ def write(
105
+ path: Path,
106
+ data: JSONData,
107
+ ) -> int:
39
108
  """
40
- Write ``data`` to XLS at ``path``.
109
+ Write ``data`` to XLS at ``path`` and return record count.
110
+
111
+ Notes
112
+ -----
113
+ XLS writing is not supported by pandas 2.x. Use XLSX for writes.
41
114
 
42
115
  Parameters
43
116
  ----------
@@ -53,7 +126,7 @@ def write(path: Path, data: JSONData) -> int:
53
126
 
54
127
  Raises
55
128
  ------
56
- NotImplementedError
57
- XLS :func:`write` is not implemented yet.
129
+ RuntimeError
130
+ Always raised; XLS writing is not supported. Use XLSX instead.
58
131
  """
59
- raise NotImplementedError('XLS write is not implemented yet')
132
+ raise RuntimeError('XLS write is not supported; use XLSX instead')
etlplus/file/xlsx.py CHANGED
@@ -1,19 +1,77 @@
1
1
  """
2
2
  :mod:`etlplus.file.xlsx` module.
3
3
 
4
- Stub helpers for XLSX read/write.
4
+ Helpers for reading/writing Excel XLSX files.
5
5
  """
6
6
 
7
7
  from __future__ import annotations
8
8
 
9
9
  from pathlib import Path
10
+ from typing import Any
11
+ from typing import cast
10
12
 
11
13
  from ..types import JSONData
14
+ from ..types import JSONDict
15
+ from ..types import JSONList
12
16
 
13
17
  # SECTION: EXPORTS ========================================================== #
14
18
 
15
19
 
16
- def read(path: Path) -> JSONData:
20
+ __all__ = [
21
+ 'read',
22
+ 'write',
23
+ ]
24
+
25
+
26
+ # SECTION: INTERNAL CONSTANTS =============================================== #
27
+
28
+
29
+ _PANDAS_CACHE: dict[str, Any] = {}
30
+
31
+
32
+ # SECTION: INTERNAL FUNCTIONS =============================================== #
33
+
34
+
35
+ def _get_pandas() -> Any:
36
+ """
37
+ Return the pandas module, importing it on first use.
38
+
39
+ Raises an informative ImportError if the optional dependency is missing.
40
+ """
41
+ mod = _PANDAS_CACHE.get('mod')
42
+ if mod is not None: # pragma: no cover - tiny branch
43
+ return mod
44
+ try:
45
+ _pd = __import__('pandas') # type: ignore[assignment]
46
+ except ImportError as e: # pragma: no cover
47
+ raise ImportError(
48
+ 'XLSX support requires optional dependency "pandas".\n'
49
+ 'Install with: pip install pandas',
50
+ ) from e
51
+ _PANDAS_CACHE['mod'] = _pd
52
+
53
+ return _pd
54
+
55
+
56
+ def _normalize_records(data: JSONData) -> JSONList:
57
+ """
58
+ Normalize JSON payloads into a list of dictionaries.
59
+
60
+ Raises TypeError when payloads contain non-dict items.
61
+ """
62
+ if isinstance(data, list):
63
+ if not all(isinstance(item, dict) for item in data):
64
+ raise TypeError('XLSX payloads must contain only objects (dicts)')
65
+ return cast(JSONList, data)
66
+ return [cast(JSONDict, data)]
67
+
68
+
69
+ # SECTION: FUNCTIONS ======================================================== #
70
+
71
+
72
+ def read(
73
+ path: Path,
74
+ ) -> JSONList:
17
75
  """
18
76
  Read XLSX content from ``path``.
19
77
 
@@ -24,20 +82,31 @@ def read(path: Path) -> JSONData:
24
82
 
25
83
  Returns
26
84
  -------
27
- JSONData
28
- Parsed payload.
85
+ JSONList
86
+ The list of dictionaries read from the XLSX file.
29
87
 
30
88
  Raises
31
89
  ------
32
- NotImplementedError
33
- XLSX :func:`read` is not implemented yet.
90
+ ImportError
91
+ If optional dependencies for XLSX support are missing.
34
92
  """
35
- raise NotImplementedError('XLSX read is not implemented yet')
93
+ pandas = _get_pandas()
94
+ try:
95
+ frame = pandas.read_excel(path)
96
+ except ImportError as e: # pragma: no cover
97
+ raise ImportError(
98
+ 'XLSX support requires optional dependency "openpyxl".\n'
99
+ 'Install with: pip install openpyxl',
100
+ ) from e
101
+ return cast(JSONList, frame.to_dict(orient='records'))
36
102
 
37
103
 
38
- def write(path: Path, data: JSONData) -> int:
104
+ def write(
105
+ path: Path,
106
+ data: JSONData,
107
+ ) -> int:
39
108
  """
40
- Write ``data`` to XLSX at ``path``.
109
+ Write ``data`` to XLSX at ``path`` and return record count.
41
110
 
42
111
  Parameters
43
112
  ----------
@@ -53,7 +122,21 @@ def write(path: Path, data: JSONData) -> int:
53
122
 
54
123
  Raises
55
124
  ------
56
- NotImplementedError
57
- XLSX :func:`write` is not implemented yet.
125
+ ImportError
126
+ If optional dependencies for XLSX support are missing.
58
127
  """
59
- raise NotImplementedError('XLSX write is not implemented yet')
128
+ records = _normalize_records(data)
129
+ if not records:
130
+ return 0
131
+
132
+ pandas = _get_pandas()
133
+ path.parent.mkdir(parents=True, exist_ok=True)
134
+ frame = pandas.DataFrame.from_records(records)
135
+ try:
136
+ frame.to_excel(path, index=False)
137
+ except ImportError as e: # pragma: no cover
138
+ raise ImportError(
139
+ 'XLSX support requires optional dependency "openpyxl".\n'
140
+ 'Install with: pip install openpyxl',
141
+ ) from e
142
+ return len(records)
etlplus/file/xml.py CHANGED
@@ -1,7 +1,7 @@
1
1
  """
2
2
  :mod:`etlplus.file.xml` module.
3
3
 
4
- XML read/write helpers.
4
+ Helpers for reading/writing XML files.
5
5
  """
6
6
 
7
7
  from __future__ import annotations
@@ -14,6 +14,15 @@ from ..types import JSONData
14
14
  from ..types import JSONDict
15
15
  from ..utils import count_records
16
16
 
17
+ # SECTION: EXPORTS ========================================================== #
18
+
19
+
20
+ __all__ = [
21
+ 'read',
22
+ 'write',
23
+ ]
24
+
25
+
17
26
  # SECTION: CONSTANTS ======================================================== #
18
27
 
19
28
 
@@ -117,7 +126,7 @@ def read(
117
126
  path: Path,
118
127
  ) -> JSONDict:
119
128
  """
120
- Parse XML document at ``path`` into a nested dictionary.
129
+ Read XML content from ``path``.
121
130
 
122
131
  Parameters
123
132
  ----------
@@ -137,7 +146,7 @@ def read(
137
146
 
138
147
  def write(path: Path, data: JSONData, *, root_tag: str) -> int:
139
148
  """
140
- Write ``data`` as XML to ``path`` and return record count.
149
+ Write ``data`` to XML at ``path`` and return record count.
141
150
 
142
151
  Parameters
143
152
  ----------