hexdag 0.5.0.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (261) hide show
  1. hexdag/__init__.py +116 -0
  2. hexdag/__main__.py +30 -0
  3. hexdag/adapters/executors/__init__.py +5 -0
  4. hexdag/adapters/executors/local_executor.py +316 -0
  5. hexdag/builtin/__init__.py +6 -0
  6. hexdag/builtin/adapters/__init__.py +51 -0
  7. hexdag/builtin/adapters/anthropic/__init__.py +5 -0
  8. hexdag/builtin/adapters/anthropic/anthropic_adapter.py +151 -0
  9. hexdag/builtin/adapters/database/__init__.py +6 -0
  10. hexdag/builtin/adapters/database/csv/csv_adapter.py +249 -0
  11. hexdag/builtin/adapters/database/pgvector/__init__.py +5 -0
  12. hexdag/builtin/adapters/database/pgvector/pgvector_adapter.py +478 -0
  13. hexdag/builtin/adapters/database/sqlalchemy/sqlalchemy_adapter.py +252 -0
  14. hexdag/builtin/adapters/database/sqlite/__init__.py +5 -0
  15. hexdag/builtin/adapters/database/sqlite/sqlite_adapter.py +410 -0
  16. hexdag/builtin/adapters/local/README.md +59 -0
  17. hexdag/builtin/adapters/local/__init__.py +7 -0
  18. hexdag/builtin/adapters/local/local_observer_manager.py +696 -0
  19. hexdag/builtin/adapters/memory/__init__.py +47 -0
  20. hexdag/builtin/adapters/memory/file_memory_adapter.py +297 -0
  21. hexdag/builtin/adapters/memory/in_memory_memory.py +216 -0
  22. hexdag/builtin/adapters/memory/schemas.py +57 -0
  23. hexdag/builtin/adapters/memory/session_memory.py +178 -0
  24. hexdag/builtin/adapters/memory/sqlite_memory_adapter.py +215 -0
  25. hexdag/builtin/adapters/memory/state_memory.py +280 -0
  26. hexdag/builtin/adapters/mock/README.md +89 -0
  27. hexdag/builtin/adapters/mock/__init__.py +15 -0
  28. hexdag/builtin/adapters/mock/hexdag.toml +50 -0
  29. hexdag/builtin/adapters/mock/mock_database.py +225 -0
  30. hexdag/builtin/adapters/mock/mock_embedding.py +223 -0
  31. hexdag/builtin/adapters/mock/mock_llm.py +177 -0
  32. hexdag/builtin/adapters/mock/mock_tool_adapter.py +192 -0
  33. hexdag/builtin/adapters/mock/mock_tool_router.py +232 -0
  34. hexdag/builtin/adapters/openai/__init__.py +5 -0
  35. hexdag/builtin/adapters/openai/openai_adapter.py +634 -0
  36. hexdag/builtin/adapters/secret/__init__.py +7 -0
  37. hexdag/builtin/adapters/secret/local_secret_adapter.py +248 -0
  38. hexdag/builtin/adapters/unified_tool_router.py +280 -0
  39. hexdag/builtin/macros/__init__.py +17 -0
  40. hexdag/builtin/macros/conversation_agent.py +390 -0
  41. hexdag/builtin/macros/llm_macro.py +151 -0
  42. hexdag/builtin/macros/reasoning_agent.py +423 -0
  43. hexdag/builtin/macros/tool_macro.py +380 -0
  44. hexdag/builtin/nodes/__init__.py +38 -0
  45. hexdag/builtin/nodes/_discovery.py +123 -0
  46. hexdag/builtin/nodes/agent_node.py +696 -0
  47. hexdag/builtin/nodes/base_node_factory.py +242 -0
  48. hexdag/builtin/nodes/composite_node.py +926 -0
  49. hexdag/builtin/nodes/data_node.py +201 -0
  50. hexdag/builtin/nodes/expression_node.py +487 -0
  51. hexdag/builtin/nodes/function_node.py +454 -0
  52. hexdag/builtin/nodes/llm_node.py +491 -0
  53. hexdag/builtin/nodes/loop_node.py +920 -0
  54. hexdag/builtin/nodes/mapped_input.py +518 -0
  55. hexdag/builtin/nodes/port_call_node.py +269 -0
  56. hexdag/builtin/nodes/tool_call_node.py +195 -0
  57. hexdag/builtin/nodes/tool_utils.py +390 -0
  58. hexdag/builtin/prompts/__init__.py +68 -0
  59. hexdag/builtin/prompts/base.py +422 -0
  60. hexdag/builtin/prompts/chat_prompts.py +303 -0
  61. hexdag/builtin/prompts/error_correction_prompts.py +320 -0
  62. hexdag/builtin/prompts/tool_prompts.py +160 -0
  63. hexdag/builtin/tools/builtin_tools.py +84 -0
  64. hexdag/builtin/tools/database_tools.py +164 -0
  65. hexdag/cli/__init__.py +17 -0
  66. hexdag/cli/__main__.py +7 -0
  67. hexdag/cli/commands/__init__.py +27 -0
  68. hexdag/cli/commands/build_cmd.py +812 -0
  69. hexdag/cli/commands/create_cmd.py +208 -0
  70. hexdag/cli/commands/docs_cmd.py +293 -0
  71. hexdag/cli/commands/generate_types_cmd.py +252 -0
  72. hexdag/cli/commands/init_cmd.py +188 -0
  73. hexdag/cli/commands/pipeline_cmd.py +494 -0
  74. hexdag/cli/commands/plugin_dev_cmd.py +529 -0
  75. hexdag/cli/commands/plugins_cmd.py +441 -0
  76. hexdag/cli/commands/studio_cmd.py +101 -0
  77. hexdag/cli/commands/validate_cmd.py +221 -0
  78. hexdag/cli/main.py +84 -0
  79. hexdag/core/__init__.py +83 -0
  80. hexdag/core/config/__init__.py +20 -0
  81. hexdag/core/config/loader.py +479 -0
  82. hexdag/core/config/models.py +150 -0
  83. hexdag/core/configurable.py +294 -0
  84. hexdag/core/context/__init__.py +37 -0
  85. hexdag/core/context/execution_context.py +378 -0
  86. hexdag/core/docs/__init__.py +26 -0
  87. hexdag/core/docs/extractors.py +678 -0
  88. hexdag/core/docs/generators.py +890 -0
  89. hexdag/core/docs/models.py +120 -0
  90. hexdag/core/domain/__init__.py +10 -0
  91. hexdag/core/domain/dag.py +1225 -0
  92. hexdag/core/exceptions.py +234 -0
  93. hexdag/core/expression_parser.py +569 -0
  94. hexdag/core/logging.py +449 -0
  95. hexdag/core/models/__init__.py +17 -0
  96. hexdag/core/models/base.py +138 -0
  97. hexdag/core/orchestration/__init__.py +46 -0
  98. hexdag/core/orchestration/body_executor.py +481 -0
  99. hexdag/core/orchestration/components/__init__.py +97 -0
  100. hexdag/core/orchestration/components/adapter_lifecycle_manager.py +113 -0
  101. hexdag/core/orchestration/components/checkpoint_manager.py +134 -0
  102. hexdag/core/orchestration/components/execution_coordinator.py +360 -0
  103. hexdag/core/orchestration/components/health_check_manager.py +176 -0
  104. hexdag/core/orchestration/components/input_mapper.py +143 -0
  105. hexdag/core/orchestration/components/lifecycle_manager.py +583 -0
  106. hexdag/core/orchestration/components/node_executor.py +377 -0
  107. hexdag/core/orchestration/components/secret_manager.py +202 -0
  108. hexdag/core/orchestration/components/wave_executor.py +158 -0
  109. hexdag/core/orchestration/constants.py +17 -0
  110. hexdag/core/orchestration/events/README.md +312 -0
  111. hexdag/core/orchestration/events/__init__.py +104 -0
  112. hexdag/core/orchestration/events/batching.py +330 -0
  113. hexdag/core/orchestration/events/decorators.py +139 -0
  114. hexdag/core/orchestration/events/events.py +573 -0
  115. hexdag/core/orchestration/events/observers/__init__.py +30 -0
  116. hexdag/core/orchestration/events/observers/core_observers.py +690 -0
  117. hexdag/core/orchestration/events/observers/models.py +111 -0
  118. hexdag/core/orchestration/events/taxonomy.py +269 -0
  119. hexdag/core/orchestration/hook_context.py +237 -0
  120. hexdag/core/orchestration/hooks.py +437 -0
  121. hexdag/core/orchestration/models.py +418 -0
  122. hexdag/core/orchestration/orchestrator.py +910 -0
  123. hexdag/core/orchestration/orchestrator_factory.py +275 -0
  124. hexdag/core/orchestration/port_wrappers.py +327 -0
  125. hexdag/core/orchestration/prompt/__init__.py +32 -0
  126. hexdag/core/orchestration/prompt/template.py +332 -0
  127. hexdag/core/pipeline_builder/__init__.py +21 -0
  128. hexdag/core/pipeline_builder/component_instantiator.py +386 -0
  129. hexdag/core/pipeline_builder/include_tag.py +265 -0
  130. hexdag/core/pipeline_builder/pipeline_config.py +133 -0
  131. hexdag/core/pipeline_builder/py_tag.py +223 -0
  132. hexdag/core/pipeline_builder/tag_discovery.py +268 -0
  133. hexdag/core/pipeline_builder/yaml_builder.py +1196 -0
  134. hexdag/core/pipeline_builder/yaml_validator.py +569 -0
  135. hexdag/core/ports/__init__.py +65 -0
  136. hexdag/core/ports/api_call.py +133 -0
  137. hexdag/core/ports/database.py +489 -0
  138. hexdag/core/ports/embedding.py +215 -0
  139. hexdag/core/ports/executor.py +237 -0
  140. hexdag/core/ports/file_storage.py +117 -0
  141. hexdag/core/ports/healthcheck.py +87 -0
  142. hexdag/core/ports/llm.py +551 -0
  143. hexdag/core/ports/memory.py +70 -0
  144. hexdag/core/ports/observer_manager.py +130 -0
  145. hexdag/core/ports/secret.py +145 -0
  146. hexdag/core/ports/tool_router.py +94 -0
  147. hexdag/core/ports_builder.py +623 -0
  148. hexdag/core/protocols.py +273 -0
  149. hexdag/core/resolver.py +304 -0
  150. hexdag/core/schema/__init__.py +9 -0
  151. hexdag/core/schema/generator.py +742 -0
  152. hexdag/core/secrets.py +242 -0
  153. hexdag/core/types.py +413 -0
  154. hexdag/core/utils/async_warnings.py +206 -0
  155. hexdag/core/utils/schema_conversion.py +78 -0
  156. hexdag/core/utils/sql_validation.py +86 -0
  157. hexdag/core/validation/secure_json.py +148 -0
  158. hexdag/core/yaml_macro.py +517 -0
  159. hexdag/mcp_server.py +3120 -0
  160. hexdag/studio/__init__.py +10 -0
  161. hexdag/studio/build_ui.py +92 -0
  162. hexdag/studio/server/__init__.py +1 -0
  163. hexdag/studio/server/main.py +100 -0
  164. hexdag/studio/server/routes/__init__.py +9 -0
  165. hexdag/studio/server/routes/execute.py +208 -0
  166. hexdag/studio/server/routes/export.py +558 -0
  167. hexdag/studio/server/routes/files.py +207 -0
  168. hexdag/studio/server/routes/plugins.py +419 -0
  169. hexdag/studio/server/routes/validate.py +220 -0
  170. hexdag/studio/ui/index.html +13 -0
  171. hexdag/studio/ui/package-lock.json +2992 -0
  172. hexdag/studio/ui/package.json +31 -0
  173. hexdag/studio/ui/postcss.config.js +6 -0
  174. hexdag/studio/ui/public/hexdag.svg +5 -0
  175. hexdag/studio/ui/src/App.tsx +251 -0
  176. hexdag/studio/ui/src/components/Canvas.tsx +408 -0
  177. hexdag/studio/ui/src/components/ContextMenu.tsx +187 -0
  178. hexdag/studio/ui/src/components/FileBrowser.tsx +123 -0
  179. hexdag/studio/ui/src/components/Header.tsx +181 -0
  180. hexdag/studio/ui/src/components/HexdagNode.tsx +193 -0
  181. hexdag/studio/ui/src/components/NodeInspector.tsx +512 -0
  182. hexdag/studio/ui/src/components/NodePalette.tsx +262 -0
  183. hexdag/studio/ui/src/components/NodePortsSection.tsx +403 -0
  184. hexdag/studio/ui/src/components/PluginManager.tsx +347 -0
  185. hexdag/studio/ui/src/components/PortsEditor.tsx +481 -0
  186. hexdag/studio/ui/src/components/PythonEditor.tsx +195 -0
  187. hexdag/studio/ui/src/components/ValidationPanel.tsx +105 -0
  188. hexdag/studio/ui/src/components/YamlEditor.tsx +196 -0
  189. hexdag/studio/ui/src/components/index.ts +8 -0
  190. hexdag/studio/ui/src/index.css +92 -0
  191. hexdag/studio/ui/src/main.tsx +10 -0
  192. hexdag/studio/ui/src/types/index.ts +123 -0
  193. hexdag/studio/ui/src/vite-env.d.ts +1 -0
  194. hexdag/studio/ui/tailwind.config.js +29 -0
  195. hexdag/studio/ui/tsconfig.json +37 -0
  196. hexdag/studio/ui/tsconfig.node.json +13 -0
  197. hexdag/studio/ui/vite.config.ts +35 -0
  198. hexdag/visualization/__init__.py +69 -0
  199. hexdag/visualization/dag_visualizer.py +1020 -0
  200. hexdag-0.5.0.dev1.dist-info/METADATA +369 -0
  201. hexdag-0.5.0.dev1.dist-info/RECORD +261 -0
  202. hexdag-0.5.0.dev1.dist-info/WHEEL +4 -0
  203. hexdag-0.5.0.dev1.dist-info/entry_points.txt +4 -0
  204. hexdag-0.5.0.dev1.dist-info/licenses/LICENSE +190 -0
  205. hexdag_plugins/.gitignore +43 -0
  206. hexdag_plugins/README.md +73 -0
  207. hexdag_plugins/__init__.py +1 -0
  208. hexdag_plugins/azure/LICENSE +21 -0
  209. hexdag_plugins/azure/README.md +414 -0
  210. hexdag_plugins/azure/__init__.py +21 -0
  211. hexdag_plugins/azure/azure_blob_adapter.py +450 -0
  212. hexdag_plugins/azure/azure_cosmos_adapter.py +383 -0
  213. hexdag_plugins/azure/azure_keyvault_adapter.py +314 -0
  214. hexdag_plugins/azure/azure_openai_adapter.py +415 -0
  215. hexdag_plugins/azure/pyproject.toml +107 -0
  216. hexdag_plugins/azure/tests/__init__.py +1 -0
  217. hexdag_plugins/azure/tests/test_azure_blob_adapter.py +350 -0
  218. hexdag_plugins/azure/tests/test_azure_cosmos_adapter.py +323 -0
  219. hexdag_plugins/azure/tests/test_azure_keyvault_adapter.py +330 -0
  220. hexdag_plugins/azure/tests/test_azure_openai_adapter.py +329 -0
  221. hexdag_plugins/hexdag_etl/README.md +168 -0
  222. hexdag_plugins/hexdag_etl/__init__.py +53 -0
  223. hexdag_plugins/hexdag_etl/examples/01_simple_pandas_transform.py +270 -0
  224. hexdag_plugins/hexdag_etl/examples/02_simple_pandas_only.py +149 -0
  225. hexdag_plugins/hexdag_etl/examples/03_file_io_pipeline.py +109 -0
  226. hexdag_plugins/hexdag_etl/examples/test_pandas_transform.py +84 -0
  227. hexdag_plugins/hexdag_etl/hexdag.toml +25 -0
  228. hexdag_plugins/hexdag_etl/hexdag_etl/__init__.py +48 -0
  229. hexdag_plugins/hexdag_etl/hexdag_etl/nodes/__init__.py +13 -0
  230. hexdag_plugins/hexdag_etl/hexdag_etl/nodes/api_extract.py +230 -0
  231. hexdag_plugins/hexdag_etl/hexdag_etl/nodes/base_node_factory.py +181 -0
  232. hexdag_plugins/hexdag_etl/hexdag_etl/nodes/file_io.py +415 -0
  233. hexdag_plugins/hexdag_etl/hexdag_etl/nodes/outlook.py +492 -0
  234. hexdag_plugins/hexdag_etl/hexdag_etl/nodes/pandas_transform.py +563 -0
  235. hexdag_plugins/hexdag_etl/hexdag_etl/nodes/sql_extract_load.py +112 -0
  236. hexdag_plugins/hexdag_etl/pyproject.toml +82 -0
  237. hexdag_plugins/hexdag_etl/test_transform.py +54 -0
  238. hexdag_plugins/hexdag_etl/tests/test_plugin_integration.py +62 -0
  239. hexdag_plugins/mysql_adapter/LICENSE +21 -0
  240. hexdag_plugins/mysql_adapter/README.md +224 -0
  241. hexdag_plugins/mysql_adapter/__init__.py +6 -0
  242. hexdag_plugins/mysql_adapter/mysql_adapter.py +408 -0
  243. hexdag_plugins/mysql_adapter/pyproject.toml +93 -0
  244. hexdag_plugins/mysql_adapter/tests/test_mysql_adapter.py +259 -0
  245. hexdag_plugins/storage/README.md +184 -0
  246. hexdag_plugins/storage/__init__.py +19 -0
  247. hexdag_plugins/storage/file/__init__.py +5 -0
  248. hexdag_plugins/storage/file/local.py +325 -0
  249. hexdag_plugins/storage/ports/__init__.py +5 -0
  250. hexdag_plugins/storage/ports/vector_store.py +236 -0
  251. hexdag_plugins/storage/sql/__init__.py +7 -0
  252. hexdag_plugins/storage/sql/base.py +187 -0
  253. hexdag_plugins/storage/sql/mysql.py +27 -0
  254. hexdag_plugins/storage/sql/postgresql.py +27 -0
  255. hexdag_plugins/storage/tests/__init__.py +1 -0
  256. hexdag_plugins/storage/tests/test_local_file_storage.py +161 -0
  257. hexdag_plugins/storage/tests/test_sql_adapters.py +212 -0
  258. hexdag_plugins/storage/vector/__init__.py +7 -0
  259. hexdag_plugins/storage/vector/chromadb.py +223 -0
  260. hexdag_plugins/storage/vector/in_memory.py +285 -0
  261. hexdag_plugins/storage/vector/pgvector.py +502 -0
@@ -0,0 +1,415 @@
1
+ """File I/O nodes for reading and writing data files.
2
+
3
+ These nodes provide file-based input and output for ETL pipelines,
4
+ supporting CSV, Parquet, JSON, and Excel formats.
5
+ """
6
+
7
+ from pathlib import Path
8
+ from typing import Any
9
+
10
+ import pandas as pd
11
+ from hexdag.builtin.nodes.base_node_factory import BaseNodeFactory
12
+ from hexdag.core.domain.dag import NodeSpec
13
+ from hexdag.core.registry import node
14
+ from hexdag.core.registry.models import NodeSubtype
15
+ from pydantic import BaseModel
16
+
17
+
18
class FileReaderOutput(BaseModel):
    """Output model for FileReaderNode.

    The field names mirror the keys of the dict returned by the reader
    function that ``FileReaderNode._create_reader_function`` builds.
    """

    # Loaded table. The reader in this file puts the pandas DataFrame itself
    # under this key (it is not converted to a dict), hence the ``Any`` type.
    data: Any
    # Number of rows in the loaded table (``len(df)`` in the reader).
    rows: int
    # Column labels of the loaded table (``df.columns.tolist()`` in the reader).
    columns: list[str]
    # Absolute path of the file that was read, as a string.
    file_path: str
25
+
26
+
27
class FileWriterOutput(BaseModel):
    """Output model for FileWriterNode.

    The field names mirror the keys of the dict returned by the writer
    function that ``FileWriterNode._create_writer_function`` builds.
    """

    # Absolute path of the file that was written, as a string.
    file_path: str
    # Number of rows in the DataFrame that was written (``len(df)``).
    rows: int
    # Format name used for the write, e.g. "csv" or "parquet".
    format: str
    # The writer in this file sets this to True whenever it returns normally;
    # failures surface as exceptions instead.
    success: bool
34
+
35
+
36
@node(name="file_reader_node", subtype=NodeSubtype.FUNCTION, namespace="etl")
class FileReaderNode(BaseNodeFactory):
    """Node for reading data files into DataFrames.

    Supports multiple file formats:
    - CSV (.csv)
    - Parquet (.parquet, .pq)
    - JSON (.json, .jsonl)
    - Excel (.xlsx, .xls)
    - Feather (.feather)
    - Pickle (.pickle, .pkl) -- only for trusted files, see the note in
      ``_create_reader_function``

    Examples
    --------
    YAML pipeline::

        - kind: etl:file_reader_node
          metadata:
            name: load_customers
          spec:
            file_path: data/customers.csv
            format: csv
            options:
              sep: ","
              encoding: utf-8
          dependencies: []

        - kind: etl:file_reader_node
          metadata:
            name: load_transactions
          spec:
            file_path: data/transactions.parquet
            format: parquet
          dependencies: []

        - kind: etl:file_reader_node
          metadata:
            name: load_products
          spec:
            file_path: data/products.json
            format: json
            options:
              orient: records
          dependencies: []
    """

    def __call__(
        self,
        name: str,
        file_path: str,
        format: str | None = None,
        options: dict[str, Any] | None = None,
        deps: list[str] | None = None,
        **kwargs: Any,
    ) -> NodeSpec:
        """Create a file reader node specification.

        Parameters
        ----------
        name : str
            Node name
        file_path : str
            Path to the input file (relative to workspace or absolute)
        format : str, optional
            File format: 'csv', 'parquet', 'json', 'jsonl', 'excel',
            'feather' or 'pickle'.
            Auto-detected from extension if not specified
        options : dict, optional
            Additional options passed to pandas read function
        deps : list[str], optional
            Dependency node names
        **kwargs : Any
            Additional node parameters

        Returns
        -------
        NodeSpec
            Node specification ready for execution

        Raises
        ------
        ValueError
            If ``format`` is omitted and the file extension is not recognised.
        """
        # Auto-detect format from file extension if not specified
        if format is None:
            format = self._detect_format(file_path)

        # Create wrapped function
        wrapped_fn = self._create_reader_function(name, file_path, format, options or {})

        # Define schemas
        input_schema = {"input_data": dict | None}
        output_model = FileReaderOutput

        input_model = self.create_pydantic_model(f"{name}Input", input_schema)

        # Store parameters
        node_params = {
            "file_path": file_path,
            "format": format,
            "options": options,
            **kwargs,
        }

        return NodeSpec(
            name=name,
            fn=wrapped_fn,
            in_model=input_model,
            out_model=output_model,
            deps=frozenset(deps or []),
            params=node_params,
        )

    def _detect_format(self, file_path: str) -> str:
        """Detect file format from extension.

        The lookup is case-insensitive on the file suffix.

        Raises
        ------
        ValueError
            If the suffix is not one of the known extensions.
        """
        path = Path(file_path)
        ext = path.suffix.lower()

        format_map = {
            ".csv": "csv",
            ".parquet": "parquet",
            ".pq": "parquet",
            ".json": "json",
            ".jsonl": "jsonl",
            ".xlsx": "excel",
            ".xls": "excel",
            ".feather": "feather",
            ".pickle": "pickle",
            ".pkl": "pickle",
        }

        if ext not in format_map:
            raise ValueError(f"Unknown file format for extension '{ext}'. Supported: {list(format_map.keys())}")

        return format_map[ext]

    def _create_reader_function(
        self,
        name: str,
        file_path: str,
        format: str,
        options: dict[str, Any],
    ) -> Any:
        """Create the file reading function.

        Returns an async callable that loads ``file_path`` with the pandas
        reader matching ``format`` and returns a dict with the keys
        ``data``, ``rows``, ``columns`` and ``file_path``.
        """

        async def read_file(input_data: Any = None) -> dict[str, Any]:
            """Read data file into DataFrame."""
            # Resolve file path
            path = Path(file_path)

            if not path.exists():
                raise FileNotFoundError(f"File not found: {path}")

            # Read based on format
            if format == "csv":
                df = pd.read_csv(path, **options)
            elif format == "parquet":
                df = pd.read_parquet(path, **options)
            elif format == "json":
                df = pd.read_json(path, **options)
            elif format == "jsonl":
                # ``lines=True`` is only a default: merging it into the
                # options dict (instead of passing it as a separate keyword)
                # avoids "got multiple values for keyword argument 'lines'"
                # when the pipeline author also sets ``lines`` in options.
                df = pd.read_json(path, **{"lines": True, **options})
            elif format == "excel":
                df = pd.read_excel(path, **options)
            elif format == "feather":
                df = pd.read_feather(path, **options)
            elif format == "pickle":
                # SECURITY: unpickling can execute arbitrary code. Only use
                # this format for files that come from a trusted source.
                df = pd.read_pickle(path, **options)
            else:
                raise ValueError(f"Unsupported format: {format}")

            return {
                "data": df,  # Keep as DataFrame for downstream nodes
                "rows": len(df),
                "columns": df.columns.tolist(),
                "file_path": str(path.absolute()),
            }

        read_file.__name__ = f"file_reader_{name}"
        read_file.__doc__ = f"Read file: {file_path}"

        return read_file
211
+
212
+
213
@node(name="file_writer_node", subtype=NodeSubtype.FUNCTION, namespace="etl")
class FileWriterNode(BaseNodeFactory):
    """Node for writing DataFrames to files.

    Supports multiple file formats:
    - CSV (.csv)
    - Parquet (.parquet, .pq)
    - JSON (.json, .jsonl)
    - Excel (.xlsx)
    - Feather (.feather)
    - Pickle (.pickle, .pkl)

    Examples
    --------
    YAML pipeline::

        - kind: etl:file_writer_node
          metadata:
            name: save_results
          spec:
            file_path: output/results.parquet
            format: parquet
            options:
              compression: snappy
          dependencies:
            - transform_data

        - kind: etl:file_writer_node
          metadata:
            name: export_csv
          spec:
            file_path: output/report.csv
            format: csv
            options:
              index: false
          dependencies:
            - transform_data
    """

    def __call__(
        self,
        name: str,
        file_path: str,
        format: str | None = None,
        input_key: str = "data",
        options: dict[str, Any] | None = None,
        create_dirs: bool = True,
        deps: list[str] | None = None,
        **kwargs: Any,
    ) -> NodeSpec:
        """Create a file writer node specification.

        Parameters
        ----------
        name : str
            Node name
        file_path : str
            Path for the output file
        format : str, optional
            File format: 'csv', 'parquet', 'json', 'jsonl', 'excel',
            'feather' or 'pickle'.
            Auto-detected from extension if not specified
        input_key : str
            Key in input data containing the DataFrame (default: 'data')
        options : dict, optional
            Additional options passed to pandas write function
        create_dirs : bool
            Create parent directories if they don't exist (default: True)
        deps : list[str], optional
            Dependency node names
        **kwargs : Any
            Additional node parameters

        Returns
        -------
        NodeSpec
            Node specification ready for execution

        Raises
        ------
        ValueError
            If ``format`` is omitted and the file extension is not recognised.
        """
        # Auto-detect format from file extension if not specified
        if format is None:
            format = self._detect_format(file_path)

        # Create wrapped function
        wrapped_fn = self._create_writer_function(name, file_path, format, input_key, options or {}, create_dirs)

        # Define schemas
        input_schema = {"input_data": dict}
        output_model = FileWriterOutput

        input_model = self.create_pydantic_model(f"{name}Input", input_schema)

        # Store parameters
        node_params = {
            "file_path": file_path,
            "format": format,
            "input_key": input_key,
            "options": options,
            "create_dirs": create_dirs,
            **kwargs,
        }

        return NodeSpec(
            name=name,
            fn=wrapped_fn,
            in_model=input_model,
            out_model=output_model,
            deps=frozenset(deps or []),
            params=node_params,
        )

    def _detect_format(self, file_path: str) -> str:
        """Detect file format from extension.

        The lookup is case-insensitive on the file suffix. Unlike the reader,
        ``.xls`` is not in the map.

        Raises
        ------
        ValueError
            If the suffix is not one of the known extensions.
        """
        path = Path(file_path)
        ext = path.suffix.lower()

        format_map = {
            ".csv": "csv",
            ".parquet": "parquet",
            ".pq": "parquet",
            ".json": "json",
            ".jsonl": "jsonl",
            ".xlsx": "excel",
            ".feather": "feather",
            ".pickle": "pickle",
            ".pkl": "pickle",
        }

        if ext not in format_map:
            raise ValueError(f"Unknown file format for extension '{ext}'. Supported: {list(format_map.keys())}")

        return format_map[ext]

    def _create_writer_function(
        self,
        name: str,
        file_path: str,
        format: str,
        input_key: str,
        options: dict[str, Any],
        create_dirs: bool,
    ) -> Any:
        """Create the file writing function.

        Returns an async callable that locates a DataFrame in its input,
        writes it to ``file_path`` with the pandas writer matching ``format``
        and returns a dict with the keys ``file_path``, ``rows``, ``format``
        and ``success``.
        """

        async def write_file(input_data: Any) -> dict[str, Any]:
            """Write DataFrame to file."""
            # Extract DataFrame from input. Anything that is not a dict is
            # used as-is and converted below if necessary.
            df = input_data
            if isinstance(input_data, dict):
                df = input_data.get(input_key)
                if df is None:
                    # Fall back to the first value that either is a DataFrame
                    # or is a dict carrying a "data" entry (the shape this
                    # module's reader node returns).
                    for value in input_data.values():
                        if isinstance(value, pd.DataFrame):
                            df = value
                            break
                        if isinstance(value, dict) and "data" in value:
                            df = value["data"]
                            break

            if df is None:
                raise ValueError(f"No DataFrame found in input. Expected key: '{input_key}'")

            if not isinstance(df, pd.DataFrame):
                try:
                    df = pd.DataFrame(df)
                except Exception as e:
                    # Chain explicitly so the pandas error stays attached as
                    # the cause of the ValueError callers see.
                    raise ValueError(f"Could not convert input to DataFrame: {e}") from e

            # Resolve file path and create directories
            path = Path(file_path)

            if create_dirs:
                path.parent.mkdir(parents=True, exist_ok=True)

            # Write based on format
            if format == "csv":
                df.to_csv(path, **options)
            elif format == "parquet":
                df.to_parquet(path, **options)
            elif format == "json":
                df.to_json(path, **options)
            elif format == "jsonl":
                # ``orient``/``lines`` are only defaults: merging them into
                # the options dict (instead of passing separate keywords)
                # avoids "got multiple values for keyword argument" when the
                # pipeline author also sets either of them in options.
                df.to_json(path, **{"orient": "records", "lines": True, **options})
            elif format == "excel":
                df.to_excel(path, **options)
            elif format == "feather":
                df.to_feather(path, **options)
            elif format == "pickle":
                df.to_pickle(path, **options)
            else:
                raise ValueError(f"Unsupported format: {format}")

            return {
                "file_path": str(path.absolute()),
                "rows": len(df),
                "format": format,
                "success": True,
            }

        write_file.__name__ = f"file_writer_{name}"
        write_file.__doc__ = f"Write file: {file_path}"

        return write_file