manualforge 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- manualforge/__init__.py +3 -0
- manualforge/__main__.py +25 -0
- manualforge/config.py +73 -0
- manualforge/hooks.py +425 -0
- manualforge/io/__init__.py +7 -0
- manualforge/io/polars_excel_dataset.py +144 -0
- manualforge/pipeline_registry.py +17 -0
- manualforge/pipelines/__init__.py +0 -0
- manualforge/pipelines/data_processong_pl/__init__.py +10 -0
- manualforge/pipelines/data_processong_pl/nodes.py +1282 -0
- manualforge/pipelines/data_processong_pl/pipeline.py +120 -0
- manualforge/pipelines/data_processong_pl/rulecsv2typ.py +333 -0
- manualforge/pipelines/data_processong_pl/standardize_fields.py +709 -0
- manualforge/settings.py +53 -0
- manualforge-0.1.1.dist-info/METADATA +236 -0
- manualforge-0.1.1.dist-info/RECORD +19 -0
- manualforge-0.1.1.dist-info/WHEEL +5 -0
- manualforge-0.1.1.dist-info/entry_points.txt +2 -0
- manualforge-0.1.1.dist-info/top_level.txt +1 -0
manualforge/__init__.py
ADDED
manualforge/__main__.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
"""manualforge file for ensuring the package is executable
|
|
2
|
+
as `manualforge` and `python -m manualforge`
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import sys
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from kedro.framework.cli.utils import find_run_command
|
|
10
|
+
from kedro.framework.project import configure_project
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def main(*args, **kwargs) -> Any:
    """Configure the Kedro project for this package and invoke its run command.

    The package name is taken from the directory that contains this file.
    All positional and keyword arguments are forwarded to the run command;
    ``standalone_mode`` is always overwritten based on whether the interpreter
    is interactive.

    Returns:
        Whatever the resolved run command returns.
    """
    pkg = Path(__file__).parent.name
    configure_project(pkg)

    # ``sys.ps1`` is only defined in an interactive interpreter session, so
    # standalone mode is switched off there and on everywhere else.
    kwargs["standalone_mode"] = not hasattr(sys, "ps1")

    run_command = find_run_command(pkg)
    return run_command(*args, **kwargs)


if __name__ == "__main__":
    main()
|
manualforge/config.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
"""ManualForge configuration helpers.
|
|
2
|
+
|
|
3
|
+
Utilities for safely reading project configuration and providing sensible
|
|
4
|
+
defaults, so node functions stay clean when operating in config-driven mode.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import logging
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
# Sentinel for "not set" to distinguish from explicit None.
|
|
15
|
+
_UNSET = object()
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def get_datasource(config: dict, source_id: str) -> dict:
    """Return the datasource sub-config for *source_id*.

    Args:
        config: Project configuration mapping.
        source_id: Key to look up under ``config["datasources"]``.

    Returns:
        The value stored under ``config["datasources"][source_id]``.

    Raises:
        KeyError: If *source_id* is not present, including when the
            ``datasources`` section is missing or empty.
    """
    # A YAML key with no value loads as None; treat it like a missing section
    # so the caller gets the documented KeyError rather than a TypeError.
    sources = config.get("datasources") or {}
    if source_id not in sources:
        raise KeyError(f"Datasource '{source_id}' not found in config.datasources")
    return sources[source_id]
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def get_standardization(config: dict) -> dict:
    """Return the ``standardization`` config section.

    Returns:
        The section stored under ``config["standardization"]``, or an empty
        dict when the key is missing or its value is empty/None.
    """
    # ``or {}`` also covers a YAML key that is present but has no value (None).
    return config.get("standardization") or {}
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def get_standardization_fields(config: dict) -> list[dict]:
    """Return the list of field-standardization definitions.

    Reads the ``fields`` entry of the section returned by
    ``get_standardization``.

    Returns:
        The configured list, or an empty list when the section or its
        ``fields`` entry is missing or empty/None.
    """
    # Guard both levels: either the section or its ``fields`` key may be
    # present in YAML with no value, which loads as None.
    section = get_standardization(config) or {}
    return section.get("fields") or []
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def get_summary(config: dict, summary_id: str) -> dict:
    """Return the summary sub-config for *summary_id*.

    Args:
        config: Project configuration mapping.
        summary_id: Key to look up under ``config["summaries"]``.

    Returns:
        The value stored under ``config["summaries"][summary_id]``.

    Raises:
        KeyError: If *summary_id* is not present, including when the
            ``summaries`` section is missing or empty.
    """
    # A YAML key with no value loads as None; treat it like a missing section
    # so the caller gets the documented KeyError rather than a TypeError.
    summaries = config.get("summaries") or {}
    if summary_id not in summaries:
        raise KeyError(f"Summary '{summary_id}' not found in config.summaries")
    return summaries[summary_id]
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def get_sort_order(config: dict, order_name: str) -> list[str]:
    """Return a named sort-order list.

    Args:
        config: Project configuration mapping.
        order_name: Key to look up under ``config["sort_orders"]``.

    Returns:
        The list stored under that name, or an empty list when the
        ``sort_orders`` section or the named entry is missing or empty/None.
    """
    # Both the section and the entry may be None when declared in YAML
    # without a value; fall back to empty containers in either case.
    orders = config.get("sort_orders") or {}
    return orders.get(order_name) or []
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _resolve_sort_ref(config: dict, ref: str | list) -> list[str]:
|
|
50
|
+
"""Resolve a sort_by value which is either a sort-order name or an inline list."""
|
|
51
|
+
if isinstance(ref, list):
|
|
52
|
+
return ref
|
|
53
|
+
if isinstance(ref, str):
|
|
54
|
+
return get_sort_order(config, ref)
|
|
55
|
+
return []
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def get_sort_list(config: dict, sort_by: dict) -> dict[str, list[str]]:
    """Resolve a ``sort_by`` mapping into ``{column: [values]}``.

    Args:
        config: Project configuration mapping, used to look up named orders.
        sort_by: Mapping of column name to either a sort-order name or an
            inline list. ``None`` or an empty mapping is accepted.

    Returns:
        A new dict with one entry per column in *sort_by*; empty when
        *sort_by* is None or empty.
    """
    # A section that omits ``sort_by`` typically hands None to this function;
    # treat that as "no custom ordering" instead of failing on ``.items()``.
    if not sort_by:
        return {}
    return {col: _resolve_sort_ref(config, ref) for col, ref in sort_by.items()}
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def get_report(config: dict, report_id: str) -> dict:
    """Return the report sub-config for *report_id*.

    Args:
        config: Project configuration mapping.
        report_id: Key to look up under ``config["reports"]``.

    Returns:
        The value stored under ``config["reports"][report_id]``.

    Raises:
        KeyError: If *report_id* is not present, including when the
            ``reports`` section is missing or empty.
    """
    # A YAML key with no value loads as None; treat it like a missing section
    # so the caller gets the documented KeyError rather than a TypeError.
    reports = config.get("reports") or {}
    if report_id not in reports:
        raise KeyError(f"Report '{report_id}' not found in config.reports")
    return reports[report_id]
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def get_project(config: dict) -> dict:
    """Return the ``project`` metadata section.

    Returns:
        The section stored under ``config["project"]``, or an empty dict when
        the key is missing or its value is empty/None.
    """
    # ``or {}`` also covers a YAML key that is present but has no value (None).
    return config.get("project") or {}
|
manualforge/hooks.py
ADDED
|
@@ -0,0 +1,425 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Custom hooks for manualforge project.
|
|
3
|
+
Execute shell commands before the first node and after the last node.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import logging
|
|
7
|
+
import os
|
|
8
|
+
import subprocess
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
from kedro.framework.hooks import hook_impl
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class PipelineHooks:
    """Kedro hooks that run configured shell commands around pipelines and nodes.

    Commands are read from the ``hooks`` key of ``context.params`` when the
    Kedro context is created. Recognised sub-keys:

    * ``before_pipeline`` / ``after_pipeline``: a list of command entries.
    * ``before_nodes`` / ``after_nodes``: a mapping of node name to a list of
      command entries.

    Each command entry is a mapping with a ``cmd`` key (the shell command
    line) and an optional ``cwd`` key (working directory for that command;
    the process's current directory is used when it is omitted).

    NOTE(review): the original author's notes say the configuration lives in
    ``conf/base/hooks.yml`` and is loaded as part of parameters — confirm
    against the project's config loader settings.
    """

    # Upper bound, in seconds, on the run time of a single hook command.
    COMMAND_TIMEOUT_SECONDS = 300

    def __init__(self):
        self.before_commands: list[dict[str, Any]] = []
        self.after_commands: list[dict[str, Any]] = []
        self.before_node_commands: dict[str, list[dict[str, Any]]] = {}
        self.after_node_commands: dict[str, list[dict[str, Any]]] = {}
        self._config_loaded = False

    @staticmethod
    def _command_entries(raw: Any, section: str) -> list[dict[str, Any]]:
        """Return the mapping entries of *raw*, warning about anything dropped."""
        if raw is None:
            return []
        if not isinstance(raw, list):
            logger.warning(f"Ignoring hooks.{section}: expected a list, got {type(raw).__name__}")
            return []

        entries = []
        for item in raw:
            if isinstance(item, dict):
                entries.append(item)
            else:
                logger.warning(f"Ignoring non-mapping entry in hooks.{section}: {item!r}")
        return entries

    @classmethod
    def _node_command_entries(
        cls, raw: Any, section: str, label: str
    ) -> dict[str, list[dict[str, Any]]]:
        """Return ``{node_name: entries}`` for every node with at least one valid entry."""
        if raw is None:
            return {}
        if not isinstance(raw, dict):
            logger.warning(f"Ignoring hooks.{section}: expected a mapping, got {type(raw).__name__}")
            return {}

        per_node = {}
        for node_name, cmds in raw.items():
            entries = cls._command_entries(cmds, f"{section}.{node_name}")
            if entries:
                per_node[node_name] = entries
                logger.info(f"Loaded {len(entries)} {label} commands for '{node_name}'")
        return per_node

    def _run_commands(
        self,
        commands: list[dict[str, Any]],
        phase: str,
        node_name: str | None = None,
    ) -> None:
        """Run *commands* in order, stopping at the first failure.

        Args:
            commands: Command entries (mappings with ``cmd`` and optional ``cwd``).
            phase: Label used in log and error messages, e.g. ``"Before-pipeline"``.
            node_name: Name of the node the commands belong to, if any.

        Raises:
            RuntimeError: If a command exits non-zero or exceeds the timeout.
            KeyError: If an entry has no ``cmd`` key.
        """
        scope = f" for '{node_name}'" if node_name is not None else ""
        total = len(commands)
        timeout = self.COMMAND_TIMEOUT_SECONDS

        logger.info(f"Executing {total} {phase.lower()} commands{scope}")
        logger.info(f"Current working directory: {os.getcwd()}")

        for index, entry in enumerate(commands, 1):
            logger.info(f"Running {phase.lower()} command {index}/{total}{scope}: {entry}")
            try:
                # shell=True is intentional: entries are shell command lines
                # taken from the project's own configuration.
                result = subprocess.run(
                    entry["cmd"],
                    shell=True,
                    check=True,
                    # ``cwd`` is optional; None makes the child inherit the
                    # current working directory.
                    cwd=entry.get("cwd"),
                    capture_output=True,
                    text=True,
                    timeout=timeout,
                )
            except subprocess.CalledProcessError as e:
                logger.error(
                    f"{phase} command failed with exit code {e.returncode}{scope}: {entry}"
                )
                logger.error(
                    f"Error output: {e.stderr[:500] if e.stderr else 'No error output'}"
                )
                # Surface the failure to Kedro so the run is marked as failed.
                raise RuntimeError(f"{phase} hook command failed{scope}: {entry}") from e
            except subprocess.TimeoutExpired as e:
                logger.error(
                    f"{phase} command timed out after {timeout} seconds{scope}: {entry}"
                )
                raise RuntimeError(f"{phase} hook command timed out{scope}: {entry}") from e
            except Exception as e:
                logger.error(f"Unexpected error executing {phase.lower()} command{scope}: {entry}")
                logger.error(f"Error: {e}")
                raise

            logger.info(f"{phase} command {index}/{total}{scope} succeeded")

            # Log output only when the command actually produced some.
            if result.stdout and result.stdout.strip():
                logger.info(f"Command output{scope}: {result.stdout.strip()}")
            if result.stderr and result.stderr.strip():
                logger.warning(f"Command stderr{scope}: {result.stderr.strip()}")

    @hook_impl
    def after_context_created(self, context) -> None:
        """Load hook commands from ``context.params["hooks"]``.

        Any exception raised while reading the configuration is logged and
        leaves the hooks disabled (``_config_loaded`` is False) instead of
        propagating.

        Args:
            context: Kedro context object; only its ``params`` mapping is read.
        """
        try:
            hooks_config = context.params.get("hooks") or {}

            # Build everything first and assign afterwards, so a failure
            # half-way through never leaves a partially updated configuration
            # and a repeated call never keeps stale entries.
            before_cmds = self._command_entries(
                hooks_config.get("before_pipeline"), "before_pipeline"
            )
            after_cmds = self._command_entries(
                hooks_config.get("after_pipeline"), "after_pipeline"
            )
            before_nodes = self._node_command_entries(
                hooks_config.get("before_nodes"), "before_nodes", "before-node"
            )
            after_nodes = self._node_command_entries(
                hooks_config.get("after_nodes"), "after_nodes", "after-node"
            )

            self.before_commands = before_cmds
            self.after_commands = after_cmds
            self.before_node_commands = before_nodes
            self.after_node_commands = after_nodes
            self._config_loaded = True

            logger.info(f"Loaded {len(self.before_commands)} before-pipeline commands")
            logger.info(f"Loaded {len(self.after_commands)} after-pipeline commands")

            if self.before_commands:
                logger.debug(f"Before-pipeline commands: {self.before_commands}")
            if self.after_commands:
                logger.debug(f"After-pipeline commands: {self.after_commands}")
            if self.before_node_commands:
                logger.debug(
                    f"Before-node commands configured for nodes: {list(self.before_node_commands.keys())}"
                )
            if self.after_node_commands:
                logger.debug(
                    f"After-node commands configured for nodes: {list(self.after_node_commands.keys())}"
                )

        except Exception as e:
            logger.error(f"Failed to load hook configuration: {e}")
            self._config_loaded = False

    @hook_impl
    def before_pipeline_run(self, run_params: dict[str, Any]) -> None:
        """Run the ``before_pipeline`` commands.

        Args:
            run_params: Dictionary of pipeline run parameters (not used here).

        Raises:
            RuntimeError: If a command fails or times out.
        """
        if not self._config_loaded:
            logger.warning(
                "Hook configuration not loaded, skipping before-pipeline commands"
            )
            return

        if not self.before_commands:
            logger.info("No before-pipeline commands configured")
            return

        self._run_commands(self.before_commands, "Before-pipeline")

    @hook_impl
    def after_pipeline_run(self, run_params: dict[str, Any]) -> None:
        """Run the ``after_pipeline`` commands.

        Args:
            run_params: Dictionary of pipeline run parameters (not used here).

        Raises:
            RuntimeError: If a command fails or times out.
        """
        if not self._config_loaded:
            logger.warning(
                "Hook configuration not loaded, skipping after-pipeline commands"
            )
            return

        if not self.after_commands:
            logger.info("No after-pipeline commands configured")
            return

        self._run_commands(self.after_commands, "After-pipeline")

    @hook_impl
    def before_node_run(self, node, catalog, inputs, is_async):
        """Run the commands configured under ``before_nodes`` for this node.

        Args:
            node: The node about to be executed; only ``node.name`` is read.
            catalog: Data catalog (not used here).
            inputs: Inputs to the node (not used here).
            is_async: Whether node execution is async (not used here).

        Raises:
            RuntimeError: If a command fails or times out.
        """
        if not self._config_loaded:
            logger.warning(
                "Hook configuration not loaded, skipping before-node commands"
            )
            return

        node_name = node.name
        commands = self.before_node_commands.get(node_name)
        if not commands:
            return

        self._run_commands(commands, "Before-node", node_name=node_name)

    @hook_impl
    def after_node_run(self, node, catalog, inputs, outputs, is_async):
        """Run the commands configured under ``after_nodes`` for this node.

        Args:
            node: The node that was executed; only ``node.name`` is read.
            catalog: Data catalog (not used here).
            inputs: Inputs to the node (not used here).
            outputs: Outputs from the node (not used here).
            is_async: Whether node execution was async (not used here).

        Raises:
            RuntimeError: If a command fails or times out.
        """
        if not self._config_loaded:
            logger.warning(
                "Hook configuration not loaded, skipping after-node commands"
            )
            return

        node_name = node.name
        commands = self.after_node_commands.get(node_name)
        if not commands:
            return

        self._run_commands(commands, "After-node", node_name=node_name)

    @hook_impl
    def on_pipeline_error(self, error: Exception, run_params: dict[str, Any]) -> None:
        """Log the error that made the pipeline fail.

        Args:
            error: Exception that caused the pipeline to fail.
            run_params: Dictionary of pipeline run parameters (not used here).
        """
        logger.error(f"Pipeline failed with error: {error}")

        # Error-handling commands (for example cleanup that must run even on
        # failure) could be added here if needed.
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
"""
|
|
2
|
+
自定义Polars Excel数据集,支持使用polars读取Excel文件。
|
|
3
|
+
支持读取单个工作表或多个工作表。
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any, Union
|
|
8
|
+
|
|
9
|
+
import polars as pl
|
|
10
|
+
from kedro.io import AbstractDataset
|
|
11
|
+
from kedro.io.core import DatasetError
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class PolarsExcelDataset(
    AbstractDataset[
        Union[pl.DataFrame, dict[str, pl.DataFrame]],
        Union[pl.DataFrame, dict[str, pl.DataFrame]],
    ]
):
    """Kedro dataset that loads and saves Excel files with polars.

    Loading delegates to ``polars.read_excel`` with the configured
    ``load_args`` and returns whatever that call returns: a single DataFrame,
    or a dict of sheet name to DataFrame.

    NOTE(review): per the polars documentation a dict is returned when
    ``sheet_id=0`` or when several sheet ids/names are requested; passing only
    ``sheet_name=None`` loads the first sheet. Confirm against the installed
    polars version.

    Example:
    ::

        >>> # Read a single worksheet
        >>> dataset = PolarsExcelDataset(
        >>>     filepath="/path/to/file.xlsx",
        >>>     load_args={"has_header": True, "sheet_name": "Sheet1"}
        >>> )
        >>> dataframe = dataset.load()  # a single DataFrame
        >>>
        >>> # Read every worksheet
        >>> dataset = PolarsExcelDataset(
        >>>     filepath="/path/to/file.xlsx",
        >>>     load_args={"has_header": True, "sheet_id": 0}
        >>> )
        >>> sheets_dict = dataset.load()  # dict[str, DataFrame]

    """

    def __init__(
        self,
        filepath: str,
        load_args: dict[str, Any] | None = None,
        save_args: dict[str, Any] | None = None,
    ) -> None:
        """Create a new PolarsExcelDataset.

        Args:
            filepath: Path of the Excel file.
            load_args: Options forwarded to ``polars.read_excel()``, e.g.
                sheet_id, sheet_name, engine, has_header, read_options,
                raise_if_empty, infer_schema_length, schema_overrides.
                ``engine`` defaults to ``"calamine"`` when not given.
            save_args: Options forwarded to ``polars.DataFrame.write_excel()``.
        """
        self._filepath = Path(filepath)
        # Copy both mappings so that adding the default engine below never
        # mutates a dict owned by the caller.
        self._load_args = dict(load_args) if load_args is not None else {}
        self._save_args = dict(save_args) if save_args is not None else {}

        # Default to the calamine (fastexcel) engine unless one was requested.
        self._load_args.setdefault("engine", "calamine")

    def _describe(self) -> dict[str, Any]:
        """Return the dataset's configuration."""
        return {
            "filepath": str(self._filepath),
            "load_args": self._load_args,
            "save_args": self._save_args,
        }

    def _load(self) -> pl.DataFrame | dict[str, pl.DataFrame]:
        """Load data from the Excel file.

        Returns:
            The result of ``polars.read_excel`` for the configured
            ``load_args``: a DataFrame, or a dict of sheet name to DataFrame.

        Raises:
            DatasetError: If reading fails for any reason.
        """
        try:
            return pl.read_excel(self._filepath, **self._load_args)
        except Exception as exc:
            raise DatasetError(f"从 {self._filepath} 加载Excel文件失败。") from exc

    @staticmethod
    def _single_frame(data: Any) -> pl.DataFrame:
        """Validate *data* and return the one DataFrame that can be written."""
        if isinstance(data, pl.DataFrame):
            return data

        if not isinstance(data, dict):
            raise DatasetError(
                f"不支持的数据类型:{type(data)}。必须是DataFrame或Dict[str, DataFrame]。"
            )

        for sheet_name, df in data.items():
            if not isinstance(df, pl.DataFrame):
                raise DatasetError(
                    f"字典值必须是DataFrame,但 '{sheet_name}' 的类型是 {type(df)}"
                )

        # Writing several worksheets into one file is not implemented; reject
        # the request before touching the file system.
        if len(data) > 1:
            raise DatasetError(
                "当前版本不支持保存多个工作表。请提供一个DataFrame。"
            )
        if not data:
            raise DatasetError("无法保存空字典:至少需要提供一个DataFrame。")

        # NOTE(review): the dict key is not used as the worksheet name; the
        # sheet is named by write_excel / save_args exactly as for a bare
        # DataFrame.
        return next(iter(data.values()))

    def _save(self, data: pl.DataFrame | dict[str, pl.DataFrame]) -> None:
        """Save data to the Excel file.

        Args:
            data: A polars DataFrame, or a dict holding exactly one
                sheet name -> DataFrame entry.

        Raises:
            DatasetError: If *data* has an unsupported type or shape (raised
                before anything is written), or if writing fails.
        """
        # Validate first so that rejected input never creates directories or
        # leaves a partially written workbook behind, and so that the specific
        # validation message is not replaced by the generic write failure.
        frame = self._single_frame(data)

        try:
            # Make sure the target directory exists.
            self._filepath.parent.mkdir(parents=True, exist_ok=True)
            frame.write_excel(self._filepath, **self._save_args)
        except Exception as exc:
            raise DatasetError(f"保存到 {self._filepath} 失败。") from exc

    def _exists(self) -> bool:
        """Return True when the target file exists."""
        return self._filepath.exists()
|