hexdag 0.5.0.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hexdag/__init__.py +116 -0
- hexdag/__main__.py +30 -0
- hexdag/adapters/executors/__init__.py +5 -0
- hexdag/adapters/executors/local_executor.py +316 -0
- hexdag/builtin/__init__.py +6 -0
- hexdag/builtin/adapters/__init__.py +51 -0
- hexdag/builtin/adapters/anthropic/__init__.py +5 -0
- hexdag/builtin/adapters/anthropic/anthropic_adapter.py +151 -0
- hexdag/builtin/adapters/database/__init__.py +6 -0
- hexdag/builtin/adapters/database/csv/csv_adapter.py +249 -0
- hexdag/builtin/adapters/database/pgvector/__init__.py +5 -0
- hexdag/builtin/adapters/database/pgvector/pgvector_adapter.py +478 -0
- hexdag/builtin/adapters/database/sqlalchemy/sqlalchemy_adapter.py +252 -0
- hexdag/builtin/adapters/database/sqlite/__init__.py +5 -0
- hexdag/builtin/adapters/database/sqlite/sqlite_adapter.py +410 -0
- hexdag/builtin/adapters/local/README.md +59 -0
- hexdag/builtin/adapters/local/__init__.py +7 -0
- hexdag/builtin/adapters/local/local_observer_manager.py +696 -0
- hexdag/builtin/adapters/memory/__init__.py +47 -0
- hexdag/builtin/adapters/memory/file_memory_adapter.py +297 -0
- hexdag/builtin/adapters/memory/in_memory_memory.py +216 -0
- hexdag/builtin/adapters/memory/schemas.py +57 -0
- hexdag/builtin/adapters/memory/session_memory.py +178 -0
- hexdag/builtin/adapters/memory/sqlite_memory_adapter.py +215 -0
- hexdag/builtin/adapters/memory/state_memory.py +280 -0
- hexdag/builtin/adapters/mock/README.md +89 -0
- hexdag/builtin/adapters/mock/__init__.py +15 -0
- hexdag/builtin/adapters/mock/hexdag.toml +50 -0
- hexdag/builtin/adapters/mock/mock_database.py +225 -0
- hexdag/builtin/adapters/mock/mock_embedding.py +223 -0
- hexdag/builtin/adapters/mock/mock_llm.py +177 -0
- hexdag/builtin/adapters/mock/mock_tool_adapter.py +192 -0
- hexdag/builtin/adapters/mock/mock_tool_router.py +232 -0
- hexdag/builtin/adapters/openai/__init__.py +5 -0
- hexdag/builtin/adapters/openai/openai_adapter.py +634 -0
- hexdag/builtin/adapters/secret/__init__.py +7 -0
- hexdag/builtin/adapters/secret/local_secret_adapter.py +248 -0
- hexdag/builtin/adapters/unified_tool_router.py +280 -0
- hexdag/builtin/macros/__init__.py +17 -0
- hexdag/builtin/macros/conversation_agent.py +390 -0
- hexdag/builtin/macros/llm_macro.py +151 -0
- hexdag/builtin/macros/reasoning_agent.py +423 -0
- hexdag/builtin/macros/tool_macro.py +380 -0
- hexdag/builtin/nodes/__init__.py +38 -0
- hexdag/builtin/nodes/_discovery.py +123 -0
- hexdag/builtin/nodes/agent_node.py +696 -0
- hexdag/builtin/nodes/base_node_factory.py +242 -0
- hexdag/builtin/nodes/composite_node.py +926 -0
- hexdag/builtin/nodes/data_node.py +201 -0
- hexdag/builtin/nodes/expression_node.py +487 -0
- hexdag/builtin/nodes/function_node.py +454 -0
- hexdag/builtin/nodes/llm_node.py +491 -0
- hexdag/builtin/nodes/loop_node.py +920 -0
- hexdag/builtin/nodes/mapped_input.py +518 -0
- hexdag/builtin/nodes/port_call_node.py +269 -0
- hexdag/builtin/nodes/tool_call_node.py +195 -0
- hexdag/builtin/nodes/tool_utils.py +390 -0
- hexdag/builtin/prompts/__init__.py +68 -0
- hexdag/builtin/prompts/base.py +422 -0
- hexdag/builtin/prompts/chat_prompts.py +303 -0
- hexdag/builtin/prompts/error_correction_prompts.py +320 -0
- hexdag/builtin/prompts/tool_prompts.py +160 -0
- hexdag/builtin/tools/builtin_tools.py +84 -0
- hexdag/builtin/tools/database_tools.py +164 -0
- hexdag/cli/__init__.py +17 -0
- hexdag/cli/__main__.py +7 -0
- hexdag/cli/commands/__init__.py +27 -0
- hexdag/cli/commands/build_cmd.py +812 -0
- hexdag/cli/commands/create_cmd.py +208 -0
- hexdag/cli/commands/docs_cmd.py +293 -0
- hexdag/cli/commands/generate_types_cmd.py +252 -0
- hexdag/cli/commands/init_cmd.py +188 -0
- hexdag/cli/commands/pipeline_cmd.py +494 -0
- hexdag/cli/commands/plugin_dev_cmd.py +529 -0
- hexdag/cli/commands/plugins_cmd.py +441 -0
- hexdag/cli/commands/studio_cmd.py +101 -0
- hexdag/cli/commands/validate_cmd.py +221 -0
- hexdag/cli/main.py +84 -0
- hexdag/core/__init__.py +83 -0
- hexdag/core/config/__init__.py +20 -0
- hexdag/core/config/loader.py +479 -0
- hexdag/core/config/models.py +150 -0
- hexdag/core/configurable.py +294 -0
- hexdag/core/context/__init__.py +37 -0
- hexdag/core/context/execution_context.py +378 -0
- hexdag/core/docs/__init__.py +26 -0
- hexdag/core/docs/extractors.py +678 -0
- hexdag/core/docs/generators.py +890 -0
- hexdag/core/docs/models.py +120 -0
- hexdag/core/domain/__init__.py +10 -0
- hexdag/core/domain/dag.py +1225 -0
- hexdag/core/exceptions.py +234 -0
- hexdag/core/expression_parser.py +569 -0
- hexdag/core/logging.py +449 -0
- hexdag/core/models/__init__.py +17 -0
- hexdag/core/models/base.py +138 -0
- hexdag/core/orchestration/__init__.py +46 -0
- hexdag/core/orchestration/body_executor.py +481 -0
- hexdag/core/orchestration/components/__init__.py +97 -0
- hexdag/core/orchestration/components/adapter_lifecycle_manager.py +113 -0
- hexdag/core/orchestration/components/checkpoint_manager.py +134 -0
- hexdag/core/orchestration/components/execution_coordinator.py +360 -0
- hexdag/core/orchestration/components/health_check_manager.py +176 -0
- hexdag/core/orchestration/components/input_mapper.py +143 -0
- hexdag/core/orchestration/components/lifecycle_manager.py +583 -0
- hexdag/core/orchestration/components/node_executor.py +377 -0
- hexdag/core/orchestration/components/secret_manager.py +202 -0
- hexdag/core/orchestration/components/wave_executor.py +158 -0
- hexdag/core/orchestration/constants.py +17 -0
- hexdag/core/orchestration/events/README.md +312 -0
- hexdag/core/orchestration/events/__init__.py +104 -0
- hexdag/core/orchestration/events/batching.py +330 -0
- hexdag/core/orchestration/events/decorators.py +139 -0
- hexdag/core/orchestration/events/events.py +573 -0
- hexdag/core/orchestration/events/observers/__init__.py +30 -0
- hexdag/core/orchestration/events/observers/core_observers.py +690 -0
- hexdag/core/orchestration/events/observers/models.py +111 -0
- hexdag/core/orchestration/events/taxonomy.py +269 -0
- hexdag/core/orchestration/hook_context.py +237 -0
- hexdag/core/orchestration/hooks.py +437 -0
- hexdag/core/orchestration/models.py +418 -0
- hexdag/core/orchestration/orchestrator.py +910 -0
- hexdag/core/orchestration/orchestrator_factory.py +275 -0
- hexdag/core/orchestration/port_wrappers.py +327 -0
- hexdag/core/orchestration/prompt/__init__.py +32 -0
- hexdag/core/orchestration/prompt/template.py +332 -0
- hexdag/core/pipeline_builder/__init__.py +21 -0
- hexdag/core/pipeline_builder/component_instantiator.py +386 -0
- hexdag/core/pipeline_builder/include_tag.py +265 -0
- hexdag/core/pipeline_builder/pipeline_config.py +133 -0
- hexdag/core/pipeline_builder/py_tag.py +223 -0
- hexdag/core/pipeline_builder/tag_discovery.py +268 -0
- hexdag/core/pipeline_builder/yaml_builder.py +1196 -0
- hexdag/core/pipeline_builder/yaml_validator.py +569 -0
- hexdag/core/ports/__init__.py +65 -0
- hexdag/core/ports/api_call.py +133 -0
- hexdag/core/ports/database.py +489 -0
- hexdag/core/ports/embedding.py +215 -0
- hexdag/core/ports/executor.py +237 -0
- hexdag/core/ports/file_storage.py +117 -0
- hexdag/core/ports/healthcheck.py +87 -0
- hexdag/core/ports/llm.py +551 -0
- hexdag/core/ports/memory.py +70 -0
- hexdag/core/ports/observer_manager.py +130 -0
- hexdag/core/ports/secret.py +145 -0
- hexdag/core/ports/tool_router.py +94 -0
- hexdag/core/ports_builder.py +623 -0
- hexdag/core/protocols.py +273 -0
- hexdag/core/resolver.py +304 -0
- hexdag/core/schema/__init__.py +9 -0
- hexdag/core/schema/generator.py +742 -0
- hexdag/core/secrets.py +242 -0
- hexdag/core/types.py +413 -0
- hexdag/core/utils/async_warnings.py +206 -0
- hexdag/core/utils/schema_conversion.py +78 -0
- hexdag/core/utils/sql_validation.py +86 -0
- hexdag/core/validation/secure_json.py +148 -0
- hexdag/core/yaml_macro.py +517 -0
- hexdag/mcp_server.py +3120 -0
- hexdag/studio/__init__.py +10 -0
- hexdag/studio/build_ui.py +92 -0
- hexdag/studio/server/__init__.py +1 -0
- hexdag/studio/server/main.py +100 -0
- hexdag/studio/server/routes/__init__.py +9 -0
- hexdag/studio/server/routes/execute.py +208 -0
- hexdag/studio/server/routes/export.py +558 -0
- hexdag/studio/server/routes/files.py +207 -0
- hexdag/studio/server/routes/plugins.py +419 -0
- hexdag/studio/server/routes/validate.py +220 -0
- hexdag/studio/ui/index.html +13 -0
- hexdag/studio/ui/package-lock.json +2992 -0
- hexdag/studio/ui/package.json +31 -0
- hexdag/studio/ui/postcss.config.js +6 -0
- hexdag/studio/ui/public/hexdag.svg +5 -0
- hexdag/studio/ui/src/App.tsx +251 -0
- hexdag/studio/ui/src/components/Canvas.tsx +408 -0
- hexdag/studio/ui/src/components/ContextMenu.tsx +187 -0
- hexdag/studio/ui/src/components/FileBrowser.tsx +123 -0
- hexdag/studio/ui/src/components/Header.tsx +181 -0
- hexdag/studio/ui/src/components/HexdagNode.tsx +193 -0
- hexdag/studio/ui/src/components/NodeInspector.tsx +512 -0
- hexdag/studio/ui/src/components/NodePalette.tsx +262 -0
- hexdag/studio/ui/src/components/NodePortsSection.tsx +403 -0
- hexdag/studio/ui/src/components/PluginManager.tsx +347 -0
- hexdag/studio/ui/src/components/PortsEditor.tsx +481 -0
- hexdag/studio/ui/src/components/PythonEditor.tsx +195 -0
- hexdag/studio/ui/src/components/ValidationPanel.tsx +105 -0
- hexdag/studio/ui/src/components/YamlEditor.tsx +196 -0
- hexdag/studio/ui/src/components/index.ts +8 -0
- hexdag/studio/ui/src/index.css +92 -0
- hexdag/studio/ui/src/main.tsx +10 -0
- hexdag/studio/ui/src/types/index.ts +123 -0
- hexdag/studio/ui/src/vite-env.d.ts +1 -0
- hexdag/studio/ui/tailwind.config.js +29 -0
- hexdag/studio/ui/tsconfig.json +37 -0
- hexdag/studio/ui/tsconfig.node.json +13 -0
- hexdag/studio/ui/vite.config.ts +35 -0
- hexdag/visualization/__init__.py +69 -0
- hexdag/visualization/dag_visualizer.py +1020 -0
- hexdag-0.5.0.dev1.dist-info/METADATA +369 -0
- hexdag-0.5.0.dev1.dist-info/RECORD +261 -0
- hexdag-0.5.0.dev1.dist-info/WHEEL +4 -0
- hexdag-0.5.0.dev1.dist-info/entry_points.txt +4 -0
- hexdag-0.5.0.dev1.dist-info/licenses/LICENSE +190 -0
- hexdag_plugins/.gitignore +43 -0
- hexdag_plugins/README.md +73 -0
- hexdag_plugins/__init__.py +1 -0
- hexdag_plugins/azure/LICENSE +21 -0
- hexdag_plugins/azure/README.md +414 -0
- hexdag_plugins/azure/__init__.py +21 -0
- hexdag_plugins/azure/azure_blob_adapter.py +450 -0
- hexdag_plugins/azure/azure_cosmos_adapter.py +383 -0
- hexdag_plugins/azure/azure_keyvault_adapter.py +314 -0
- hexdag_plugins/azure/azure_openai_adapter.py +415 -0
- hexdag_plugins/azure/pyproject.toml +107 -0
- hexdag_plugins/azure/tests/__init__.py +1 -0
- hexdag_plugins/azure/tests/test_azure_blob_adapter.py +350 -0
- hexdag_plugins/azure/tests/test_azure_cosmos_adapter.py +323 -0
- hexdag_plugins/azure/tests/test_azure_keyvault_adapter.py +330 -0
- hexdag_plugins/azure/tests/test_azure_openai_adapter.py +329 -0
- hexdag_plugins/hexdag_etl/README.md +168 -0
- hexdag_plugins/hexdag_etl/__init__.py +53 -0
- hexdag_plugins/hexdag_etl/examples/01_simple_pandas_transform.py +270 -0
- hexdag_plugins/hexdag_etl/examples/02_simple_pandas_only.py +149 -0
- hexdag_plugins/hexdag_etl/examples/03_file_io_pipeline.py +109 -0
- hexdag_plugins/hexdag_etl/examples/test_pandas_transform.py +84 -0
- hexdag_plugins/hexdag_etl/hexdag.toml +25 -0
- hexdag_plugins/hexdag_etl/hexdag_etl/__init__.py +48 -0
- hexdag_plugins/hexdag_etl/hexdag_etl/nodes/__init__.py +13 -0
- hexdag_plugins/hexdag_etl/hexdag_etl/nodes/api_extract.py +230 -0
- hexdag_plugins/hexdag_etl/hexdag_etl/nodes/base_node_factory.py +181 -0
- hexdag_plugins/hexdag_etl/hexdag_etl/nodes/file_io.py +415 -0
- hexdag_plugins/hexdag_etl/hexdag_etl/nodes/outlook.py +492 -0
- hexdag_plugins/hexdag_etl/hexdag_etl/nodes/pandas_transform.py +563 -0
- hexdag_plugins/hexdag_etl/hexdag_etl/nodes/sql_extract_load.py +112 -0
- hexdag_plugins/hexdag_etl/pyproject.toml +82 -0
- hexdag_plugins/hexdag_etl/test_transform.py +54 -0
- hexdag_plugins/hexdag_etl/tests/test_plugin_integration.py +62 -0
- hexdag_plugins/mysql_adapter/LICENSE +21 -0
- hexdag_plugins/mysql_adapter/README.md +224 -0
- hexdag_plugins/mysql_adapter/__init__.py +6 -0
- hexdag_plugins/mysql_adapter/mysql_adapter.py +408 -0
- hexdag_plugins/mysql_adapter/pyproject.toml +93 -0
- hexdag_plugins/mysql_adapter/tests/test_mysql_adapter.py +259 -0
- hexdag_plugins/storage/README.md +184 -0
- hexdag_plugins/storage/__init__.py +19 -0
- hexdag_plugins/storage/file/__init__.py +5 -0
- hexdag_plugins/storage/file/local.py +325 -0
- hexdag_plugins/storage/ports/__init__.py +5 -0
- hexdag_plugins/storage/ports/vector_store.py +236 -0
- hexdag_plugins/storage/sql/__init__.py +7 -0
- hexdag_plugins/storage/sql/base.py +187 -0
- hexdag_plugins/storage/sql/mysql.py +27 -0
- hexdag_plugins/storage/sql/postgresql.py +27 -0
- hexdag_plugins/storage/tests/__init__.py +1 -0
- hexdag_plugins/storage/tests/test_local_file_storage.py +161 -0
- hexdag_plugins/storage/tests/test_sql_adapters.py +212 -0
- hexdag_plugins/storage/vector/__init__.py +7 -0
- hexdag_plugins/storage/vector/chromadb.py +223 -0
- hexdag_plugins/storage/vector/in_memory.py +285 -0
- hexdag_plugins/storage/vector/pgvector.py +502 -0
|
@@ -0,0 +1,415 @@
|
|
|
1
|
+
"""File I/O nodes for reading and writing data files.
|
|
2
|
+
|
|
3
|
+
These nodes provide file-based input and output for ETL pipelines,
|
|
4
|
+
supporting CSV, Parquet, JSON, and Excel formats.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
import pandas as pd
|
|
11
|
+
from hexdag.builtin.nodes.base_node_factory import BaseNodeFactory
|
|
12
|
+
from hexdag.core.domain.dag import NodeSpec
|
|
13
|
+
from hexdag.core.registry import node
|
|
14
|
+
from hexdag.core.registry.models import NodeSubtype
|
|
15
|
+
from pydantic import BaseModel
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class FileReaderOutput(BaseModel):
    """Output model for FileReaderNode."""

    # The loaded payload. Typed Any because read_file returns a live pandas
    # DataFrame for downstream nodes, while a serialized dict form is also
    # possible per the original inline note ("DataFrame as dict for serialization").
    data: Any  # DataFrame as dict for serialization
    # Number of rows read (len(df)).
    rows: int
    # Column names of the loaded DataFrame (df.columns.tolist()).
    columns: list[str]
    # Absolute path of the file that was read (str(path.absolute())).
    file_path: str
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class FileWriterOutput(BaseModel):
    """Output model for FileWriterNode."""

    # Absolute path of the file that was written.
    file_path: str
    # Number of rows written (len(df)).
    rows: int
    # Format actually used for the write ('csv', 'parquet', 'json', ...).
    format: str
    # Always True on a successful write; failures raise instead of
    # returning success=False.
    success: bool
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@node(name="file_reader_node", subtype=NodeSubtype.FUNCTION, namespace="etl")
class FileReaderNode(BaseNodeFactory):
    """Factory for nodes that load a data file into a pandas DataFrame.

    Supported formats (auto-detected from the file extension when ``format``
    is omitted): CSV, Parquet, JSON / JSON-lines, Excel, Feather and pickle.

    Examples
    --------
    YAML pipeline::

        - kind: etl:file_reader_node
          metadata:
            name: load_customers
          spec:
            file_path: data/customers.csv
            format: csv
            options:
              sep: ","
              encoding: utf-8
          dependencies: []

        - kind: etl:file_reader_node
          metadata:
            name: load_transactions
          spec:
            file_path: data/transactions.parquet
            format: parquet
          dependencies: []
    """

    def __call__(
        self,
        name: str,
        file_path: str,
        format: str | None = None,
        options: dict[str, Any] | None = None,
        deps: list[str] | None = None,
        **kwargs: Any,
    ) -> NodeSpec:
        """Build a NodeSpec that reads ``file_path`` when executed.

        Parameters
        ----------
        name : str
            Node name.
        file_path : str
            Path to the input file (relative to workspace or absolute).
        format : str, optional
            One of 'csv', 'parquet', 'json', 'jsonl', 'excel', 'feather',
            'pickle'; detected from the extension when omitted.
        options : dict, optional
            Keyword arguments forwarded to the pandas read function.
        deps : list[str], optional
            Dependency node names.
        **kwargs : Any
            Extra parameters stored on the node.

        Returns
        -------
        NodeSpec
            Node specification ready for execution.
        """
        resolved_format = format if format is not None else self._detect_format(file_path)

        reader_fn = self._create_reader_function(name, file_path, resolved_format, options or {})

        # Input is optional: the reader ignores upstream data entirely.
        in_model = self.create_pydantic_model(f"{name}Input", {"input_data": dict | None})

        return NodeSpec(
            name=name,
            fn=reader_fn,
            in_model=in_model,
            out_model=FileReaderOutput,
            deps=frozenset(deps or []),
            params={
                "file_path": file_path,
                "format": resolved_format,
                "options": options,
                **kwargs,
            },
        )

    def _detect_format(self, file_path: str) -> str:
        """Map a file extension to a reader format, raising on unknown ones."""
        suffix = Path(file_path).suffix.lower()

        format_map = {
            ".csv": "csv",
            ".parquet": "parquet",
            ".pq": "parquet",
            ".json": "json",
            ".jsonl": "jsonl",
            ".xlsx": "excel",
            ".xls": "excel",
            ".feather": "feather",
            ".pickle": "pickle",
            ".pkl": "pickle",
        }

        if suffix in format_map:
            return format_map[suffix]
        raise ValueError(f"Unknown file format for extension '{suffix}'. Supported: {list(format_map.keys())}")

    def _create_reader_function(
        self,
        name: str,
        file_path: str,
        format: str,
        options: dict[str, Any],
    ) -> Any:
        """Build the async callable that performs the actual read."""

        async def read_file(input_data: Any = None) -> dict[str, Any]:
            """Read the configured file into a DataFrame and summarize it."""
            path = Path(file_path)

            if not path.exists():
                raise FileNotFoundError(f"File not found: {path}")

            # Dispatch table replaces an if/elif chain; 'jsonl' is read_json
            # with lines=True.
            readers = {
                "csv": pd.read_csv,
                "parquet": pd.read_parquet,
                "json": pd.read_json,
                "jsonl": lambda p, **opts: pd.read_json(p, lines=True, **opts),
                "excel": pd.read_excel,
                "feather": pd.read_feather,
                "pickle": pd.read_pickle,
            }
            reader = readers.get(format)
            if reader is None:
                raise ValueError(f"Unsupported format: {format}")
            df = reader(path, **options)

            return {
                "data": df,  # Keep as DataFrame for downstream nodes
                "rows": len(df),
                "columns": df.columns.tolist(),
                "file_path": str(path.absolute()),
            }

        read_file.__name__ = f"file_reader_{name}"
        read_file.__doc__ = f"Read file: {file_path}"

        return read_file
|
211
|
+
|
|
212
|
+
|
|
213
|
+
@node(name="file_writer_node", subtype=NodeSubtype.FUNCTION, namespace="etl")
class FileWriterNode(BaseNodeFactory):
    """Node for writing DataFrames to files.

    Supports multiple file formats:
    - CSV (.csv)
    - Parquet (.parquet)
    - JSON (.json, .jsonl)
    - Excel (.xlsx)

    Examples
    --------
    YAML pipeline::

        - kind: etl:file_writer_node
          metadata:
            name: save_results
          spec:
            file_path: output/results.parquet
            format: parquet
            options:
              compression: snappy
          dependencies:
            - transform_data

        - kind: etl:file_writer_node
          metadata:
            name: export_csv
          spec:
            file_path: output/report.csv
            format: csv
            options:
              index: false
          dependencies:
            - transform_data
    """

    def __call__(
        self,
        name: str,
        file_path: str,
        format: str | None = None,
        input_key: str = "data",
        options: dict[str, Any] | None = None,
        create_dirs: bool = True,
        deps: list[str] | None = None,
        **kwargs: Any,
    ) -> NodeSpec:
        """Create a file writer node specification.

        Parameters
        ----------
        name : str
            Node name
        file_path : str
            Path for the output file
        format : str, optional
            File format: 'csv', 'parquet', 'json', 'excel'
            Auto-detected from extension if not specified
        input_key : str
            Key in input data containing the DataFrame (default: 'data')
        options : dict, optional
            Additional options passed to pandas write function
        create_dirs : bool
            Create parent directories if they don't exist (default: True)
        deps : list[str], optional
            Dependency node names
        **kwargs : Any
            Additional node parameters

        Returns
        -------
        NodeSpec
            Node specification ready for execution
        """
        # Auto-detect format from file extension if not specified
        if format is None:
            format = self._detect_format(file_path)

        # Create wrapped function
        wrapped_fn = self._create_writer_function(name, file_path, format, input_key, options or {}, create_dirs)

        # Define schemas
        input_schema = {"input_data": dict}
        output_model = FileWriterOutput

        input_model = self.create_pydantic_model(f"{name}Input", input_schema)

        # Store parameters
        node_params = {
            "file_path": file_path,
            "format": format,
            "input_key": input_key,
            "options": options,
            "create_dirs": create_dirs,
            **kwargs,
        }

        return NodeSpec(
            name=name,
            fn=wrapped_fn,
            in_model=input_model,
            out_model=output_model,
            deps=frozenset(deps or []),
            params=node_params,
        )

    def _detect_format(self, file_path: str) -> str:
        """Detect file format from extension.

        Raises
        ------
        ValueError
            If the extension is not a supported output format.
        """
        path = Path(file_path)
        ext = path.suffix.lower()

        # NOTE(review): unlike FileReaderNode, '.xls' is absent here —
        # presumably intentional since pandas dropped legacy .xls writing;
        # confirm before unifying the two maps.
        format_map = {
            ".csv": "csv",
            ".parquet": "parquet",
            ".pq": "parquet",
            ".json": "json",
            ".jsonl": "jsonl",
            ".xlsx": "excel",
            ".feather": "feather",
            ".pickle": "pickle",
            ".pkl": "pickle",
        }

        if ext not in format_map:
            raise ValueError(f"Unknown file format for extension '{ext}'. Supported: {list(format_map.keys())}")

        return format_map[ext]

    def _create_writer_function(
        self,
        name: str,
        file_path: str,
        format: str,
        input_key: str,
        options: dict[str, Any],
        create_dirs: bool,
    ) -> Any:
        """Create the file writing function."""

        async def write_file(input_data: Any) -> dict[str, Any]:
            """Write DataFrame to file.

            Locates a DataFrame in ``input_data`` (under ``input_key``, or by
            scanning the dict values as a fallback), converts if needed, then
            writes it in the configured format.
            """
            # Extract DataFrame from input
            if isinstance(input_data, dict):
                df = input_data.get(input_key)
                if df is None:
                    # Fallback: scan values for a DataFrame or a nested
                    # {"data": ...} payload (the shape FileReaderNode emits).
                    # Keys are not needed here, so iterate values only.
                    for value in input_data.values():
                        if isinstance(value, pd.DataFrame):
                            df = value
                            break
                        elif isinstance(value, dict) and "data" in value:
                            df = value["data"]
                            break
            elif isinstance(input_data, pd.DataFrame):
                df = input_data
            else:
                df = input_data

            if df is None:
                raise ValueError(f"No DataFrame found in input. Expected key: '{input_key}'")

            if not isinstance(df, pd.DataFrame):
                try:
                    df = pd.DataFrame(df)
                except Exception as e:
                    # Chain the original failure so the root cause is visible.
                    raise ValueError(f"Could not convert input to DataFrame: {e}") from e

            # Resolve file path and create directories
            path = Path(file_path)

            if create_dirs:
                path.parent.mkdir(parents=True, exist_ok=True)

            # Write based on format
            if format == "csv":
                df.to_csv(path, **options)
            elif format == "parquet":
                df.to_parquet(path, **options)
            elif format == "json":
                df.to_json(path, **options)
            elif format == "jsonl":
                df.to_json(path, orient="records", lines=True, **options)
            elif format == "excel":
                df.to_excel(path, **options)
            elif format == "feather":
                df.to_feather(path, **options)
            elif format == "pickle":
                df.to_pickle(path, **options)
            else:
                raise ValueError(f"Unsupported format: {format}")

            return {
                "file_path": str(path.absolute()),
                "rows": len(df),
                "format": format,
                "success": True,
            }

        write_file.__name__ = f"file_writer_{name}"
        write_file.__doc__ = f"Write file: {file_path}"

        return write_file