hexdag-0.5.0.dev1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (261)
  1. hexdag/__init__.py +116 -0
  2. hexdag/__main__.py +30 -0
  3. hexdag/adapters/executors/__init__.py +5 -0
  4. hexdag/adapters/executors/local_executor.py +316 -0
  5. hexdag/builtin/__init__.py +6 -0
  6. hexdag/builtin/adapters/__init__.py +51 -0
  7. hexdag/builtin/adapters/anthropic/__init__.py +5 -0
  8. hexdag/builtin/adapters/anthropic/anthropic_adapter.py +151 -0
  9. hexdag/builtin/adapters/database/__init__.py +6 -0
  10. hexdag/builtin/adapters/database/csv/csv_adapter.py +249 -0
  11. hexdag/builtin/adapters/database/pgvector/__init__.py +5 -0
  12. hexdag/builtin/adapters/database/pgvector/pgvector_adapter.py +478 -0
  13. hexdag/builtin/adapters/database/sqlalchemy/sqlalchemy_adapter.py +252 -0
  14. hexdag/builtin/adapters/database/sqlite/__init__.py +5 -0
  15. hexdag/builtin/adapters/database/sqlite/sqlite_adapter.py +410 -0
  16. hexdag/builtin/adapters/local/README.md +59 -0
  17. hexdag/builtin/adapters/local/__init__.py +7 -0
  18. hexdag/builtin/adapters/local/local_observer_manager.py +696 -0
  19. hexdag/builtin/adapters/memory/__init__.py +47 -0
  20. hexdag/builtin/adapters/memory/file_memory_adapter.py +297 -0
  21. hexdag/builtin/adapters/memory/in_memory_memory.py +216 -0
  22. hexdag/builtin/adapters/memory/schemas.py +57 -0
  23. hexdag/builtin/adapters/memory/session_memory.py +178 -0
  24. hexdag/builtin/adapters/memory/sqlite_memory_adapter.py +215 -0
  25. hexdag/builtin/adapters/memory/state_memory.py +280 -0
  26. hexdag/builtin/adapters/mock/README.md +89 -0
  27. hexdag/builtin/adapters/mock/__init__.py +15 -0
  28. hexdag/builtin/adapters/mock/hexdag.toml +50 -0
  29. hexdag/builtin/adapters/mock/mock_database.py +225 -0
  30. hexdag/builtin/adapters/mock/mock_embedding.py +223 -0
  31. hexdag/builtin/adapters/mock/mock_llm.py +177 -0
  32. hexdag/builtin/adapters/mock/mock_tool_adapter.py +192 -0
  33. hexdag/builtin/adapters/mock/mock_tool_router.py +232 -0
  34. hexdag/builtin/adapters/openai/__init__.py +5 -0
  35. hexdag/builtin/adapters/openai/openai_adapter.py +634 -0
  36. hexdag/builtin/adapters/secret/__init__.py +7 -0
  37. hexdag/builtin/adapters/secret/local_secret_adapter.py +248 -0
  38. hexdag/builtin/adapters/unified_tool_router.py +280 -0
  39. hexdag/builtin/macros/__init__.py +17 -0
  40. hexdag/builtin/macros/conversation_agent.py +390 -0
  41. hexdag/builtin/macros/llm_macro.py +151 -0
  42. hexdag/builtin/macros/reasoning_agent.py +423 -0
  43. hexdag/builtin/macros/tool_macro.py +380 -0
  44. hexdag/builtin/nodes/__init__.py +38 -0
  45. hexdag/builtin/nodes/_discovery.py +123 -0
  46. hexdag/builtin/nodes/agent_node.py +696 -0
  47. hexdag/builtin/nodes/base_node_factory.py +242 -0
  48. hexdag/builtin/nodes/composite_node.py +926 -0
  49. hexdag/builtin/nodes/data_node.py +201 -0
  50. hexdag/builtin/nodes/expression_node.py +487 -0
  51. hexdag/builtin/nodes/function_node.py +454 -0
  52. hexdag/builtin/nodes/llm_node.py +491 -0
  53. hexdag/builtin/nodes/loop_node.py +920 -0
  54. hexdag/builtin/nodes/mapped_input.py +518 -0
  55. hexdag/builtin/nodes/port_call_node.py +269 -0
  56. hexdag/builtin/nodes/tool_call_node.py +195 -0
  57. hexdag/builtin/nodes/tool_utils.py +390 -0
  58. hexdag/builtin/prompts/__init__.py +68 -0
  59. hexdag/builtin/prompts/base.py +422 -0
  60. hexdag/builtin/prompts/chat_prompts.py +303 -0
  61. hexdag/builtin/prompts/error_correction_prompts.py +320 -0
  62. hexdag/builtin/prompts/tool_prompts.py +160 -0
  63. hexdag/builtin/tools/builtin_tools.py +84 -0
  64. hexdag/builtin/tools/database_tools.py +164 -0
  65. hexdag/cli/__init__.py +17 -0
  66. hexdag/cli/__main__.py +7 -0
  67. hexdag/cli/commands/__init__.py +27 -0
  68. hexdag/cli/commands/build_cmd.py +812 -0
  69. hexdag/cli/commands/create_cmd.py +208 -0
  70. hexdag/cli/commands/docs_cmd.py +293 -0
  71. hexdag/cli/commands/generate_types_cmd.py +252 -0
  72. hexdag/cli/commands/init_cmd.py +188 -0
  73. hexdag/cli/commands/pipeline_cmd.py +494 -0
  74. hexdag/cli/commands/plugin_dev_cmd.py +529 -0
  75. hexdag/cli/commands/plugins_cmd.py +441 -0
  76. hexdag/cli/commands/studio_cmd.py +101 -0
  77. hexdag/cli/commands/validate_cmd.py +221 -0
  78. hexdag/cli/main.py +84 -0
  79. hexdag/core/__init__.py +83 -0
  80. hexdag/core/config/__init__.py +20 -0
  81. hexdag/core/config/loader.py +479 -0
  82. hexdag/core/config/models.py +150 -0
  83. hexdag/core/configurable.py +294 -0
  84. hexdag/core/context/__init__.py +37 -0
  85. hexdag/core/context/execution_context.py +378 -0
  86. hexdag/core/docs/__init__.py +26 -0
  87. hexdag/core/docs/extractors.py +678 -0
  88. hexdag/core/docs/generators.py +890 -0
  89. hexdag/core/docs/models.py +120 -0
  90. hexdag/core/domain/__init__.py +10 -0
  91. hexdag/core/domain/dag.py +1225 -0
  92. hexdag/core/exceptions.py +234 -0
  93. hexdag/core/expression_parser.py +569 -0
  94. hexdag/core/logging.py +449 -0
  95. hexdag/core/models/__init__.py +17 -0
  96. hexdag/core/models/base.py +138 -0
  97. hexdag/core/orchestration/__init__.py +46 -0
  98. hexdag/core/orchestration/body_executor.py +481 -0
  99. hexdag/core/orchestration/components/__init__.py +97 -0
  100. hexdag/core/orchestration/components/adapter_lifecycle_manager.py +113 -0
  101. hexdag/core/orchestration/components/checkpoint_manager.py +134 -0
  102. hexdag/core/orchestration/components/execution_coordinator.py +360 -0
  103. hexdag/core/orchestration/components/health_check_manager.py +176 -0
  104. hexdag/core/orchestration/components/input_mapper.py +143 -0
  105. hexdag/core/orchestration/components/lifecycle_manager.py +583 -0
  106. hexdag/core/orchestration/components/node_executor.py +377 -0
  107. hexdag/core/orchestration/components/secret_manager.py +202 -0
  108. hexdag/core/orchestration/components/wave_executor.py +158 -0
  109. hexdag/core/orchestration/constants.py +17 -0
  110. hexdag/core/orchestration/events/README.md +312 -0
  111. hexdag/core/orchestration/events/__init__.py +104 -0
  112. hexdag/core/orchestration/events/batching.py +330 -0
  113. hexdag/core/orchestration/events/decorators.py +139 -0
  114. hexdag/core/orchestration/events/events.py +573 -0
  115. hexdag/core/orchestration/events/observers/__init__.py +30 -0
  116. hexdag/core/orchestration/events/observers/core_observers.py +690 -0
  117. hexdag/core/orchestration/events/observers/models.py +111 -0
  118. hexdag/core/orchestration/events/taxonomy.py +269 -0
  119. hexdag/core/orchestration/hook_context.py +237 -0
  120. hexdag/core/orchestration/hooks.py +437 -0
  121. hexdag/core/orchestration/models.py +418 -0
  122. hexdag/core/orchestration/orchestrator.py +910 -0
  123. hexdag/core/orchestration/orchestrator_factory.py +275 -0
  124. hexdag/core/orchestration/port_wrappers.py +327 -0
  125. hexdag/core/orchestration/prompt/__init__.py +32 -0
  126. hexdag/core/orchestration/prompt/template.py +332 -0
  127. hexdag/core/pipeline_builder/__init__.py +21 -0
  128. hexdag/core/pipeline_builder/component_instantiator.py +386 -0
  129. hexdag/core/pipeline_builder/include_tag.py +265 -0
  130. hexdag/core/pipeline_builder/pipeline_config.py +133 -0
  131. hexdag/core/pipeline_builder/py_tag.py +223 -0
  132. hexdag/core/pipeline_builder/tag_discovery.py +268 -0
  133. hexdag/core/pipeline_builder/yaml_builder.py +1196 -0
  134. hexdag/core/pipeline_builder/yaml_validator.py +569 -0
  135. hexdag/core/ports/__init__.py +65 -0
  136. hexdag/core/ports/api_call.py +133 -0
  137. hexdag/core/ports/database.py +489 -0
  138. hexdag/core/ports/embedding.py +215 -0
  139. hexdag/core/ports/executor.py +237 -0
  140. hexdag/core/ports/file_storage.py +117 -0
  141. hexdag/core/ports/healthcheck.py +87 -0
  142. hexdag/core/ports/llm.py +551 -0
  143. hexdag/core/ports/memory.py +70 -0
  144. hexdag/core/ports/observer_manager.py +130 -0
  145. hexdag/core/ports/secret.py +145 -0
  146. hexdag/core/ports/tool_router.py +94 -0
  147. hexdag/core/ports_builder.py +623 -0
  148. hexdag/core/protocols.py +273 -0
  149. hexdag/core/resolver.py +304 -0
  150. hexdag/core/schema/__init__.py +9 -0
  151. hexdag/core/schema/generator.py +742 -0
  152. hexdag/core/secrets.py +242 -0
  153. hexdag/core/types.py +413 -0
  154. hexdag/core/utils/async_warnings.py +206 -0
  155. hexdag/core/utils/schema_conversion.py +78 -0
  156. hexdag/core/utils/sql_validation.py +86 -0
  157. hexdag/core/validation/secure_json.py +148 -0
  158. hexdag/core/yaml_macro.py +517 -0
  159. hexdag/mcp_server.py +3120 -0
  160. hexdag/studio/__init__.py +10 -0
  161. hexdag/studio/build_ui.py +92 -0
  162. hexdag/studio/server/__init__.py +1 -0
  163. hexdag/studio/server/main.py +100 -0
  164. hexdag/studio/server/routes/__init__.py +9 -0
  165. hexdag/studio/server/routes/execute.py +208 -0
  166. hexdag/studio/server/routes/export.py +558 -0
  167. hexdag/studio/server/routes/files.py +207 -0
  168. hexdag/studio/server/routes/plugins.py +419 -0
  169. hexdag/studio/server/routes/validate.py +220 -0
  170. hexdag/studio/ui/index.html +13 -0
  171. hexdag/studio/ui/package-lock.json +2992 -0
  172. hexdag/studio/ui/package.json +31 -0
  173. hexdag/studio/ui/postcss.config.js +6 -0
  174. hexdag/studio/ui/public/hexdag.svg +5 -0
  175. hexdag/studio/ui/src/App.tsx +251 -0
  176. hexdag/studio/ui/src/components/Canvas.tsx +408 -0
  177. hexdag/studio/ui/src/components/ContextMenu.tsx +187 -0
  178. hexdag/studio/ui/src/components/FileBrowser.tsx +123 -0
  179. hexdag/studio/ui/src/components/Header.tsx +181 -0
  180. hexdag/studio/ui/src/components/HexdagNode.tsx +193 -0
  181. hexdag/studio/ui/src/components/NodeInspector.tsx +512 -0
  182. hexdag/studio/ui/src/components/NodePalette.tsx +262 -0
  183. hexdag/studio/ui/src/components/NodePortsSection.tsx +403 -0
  184. hexdag/studio/ui/src/components/PluginManager.tsx +347 -0
  185. hexdag/studio/ui/src/components/PortsEditor.tsx +481 -0
  186. hexdag/studio/ui/src/components/PythonEditor.tsx +195 -0
  187. hexdag/studio/ui/src/components/ValidationPanel.tsx +105 -0
  188. hexdag/studio/ui/src/components/YamlEditor.tsx +196 -0
  189. hexdag/studio/ui/src/components/index.ts +8 -0
  190. hexdag/studio/ui/src/index.css +92 -0
  191. hexdag/studio/ui/src/main.tsx +10 -0
  192. hexdag/studio/ui/src/types/index.ts +123 -0
  193. hexdag/studio/ui/src/vite-env.d.ts +1 -0
  194. hexdag/studio/ui/tailwind.config.js +29 -0
  195. hexdag/studio/ui/tsconfig.json +37 -0
  196. hexdag/studio/ui/tsconfig.node.json +13 -0
  197. hexdag/studio/ui/vite.config.ts +35 -0
  198. hexdag/visualization/__init__.py +69 -0
  199. hexdag/visualization/dag_visualizer.py +1020 -0
  200. hexdag-0.5.0.dev1.dist-info/METADATA +369 -0
  201. hexdag-0.5.0.dev1.dist-info/RECORD +261 -0
  202. hexdag-0.5.0.dev1.dist-info/WHEEL +4 -0
  203. hexdag-0.5.0.dev1.dist-info/entry_points.txt +4 -0
  204. hexdag-0.5.0.dev1.dist-info/licenses/LICENSE +190 -0
  205. hexdag_plugins/.gitignore +43 -0
  206. hexdag_plugins/README.md +73 -0
  207. hexdag_plugins/__init__.py +1 -0
  208. hexdag_plugins/azure/LICENSE +21 -0
  209. hexdag_plugins/azure/README.md +414 -0
  210. hexdag_plugins/azure/__init__.py +21 -0
  211. hexdag_plugins/azure/azure_blob_adapter.py +450 -0
  212. hexdag_plugins/azure/azure_cosmos_adapter.py +383 -0
  213. hexdag_plugins/azure/azure_keyvault_adapter.py +314 -0
  214. hexdag_plugins/azure/azure_openai_adapter.py +415 -0
  215. hexdag_plugins/azure/pyproject.toml +107 -0
  216. hexdag_plugins/azure/tests/__init__.py +1 -0
  217. hexdag_plugins/azure/tests/test_azure_blob_adapter.py +350 -0
  218. hexdag_plugins/azure/tests/test_azure_cosmos_adapter.py +323 -0
  219. hexdag_plugins/azure/tests/test_azure_keyvault_adapter.py +330 -0
  220. hexdag_plugins/azure/tests/test_azure_openai_adapter.py +329 -0
  221. hexdag_plugins/hexdag_etl/README.md +168 -0
  222. hexdag_plugins/hexdag_etl/__init__.py +53 -0
  223. hexdag_plugins/hexdag_etl/examples/01_simple_pandas_transform.py +270 -0
  224. hexdag_plugins/hexdag_etl/examples/02_simple_pandas_only.py +149 -0
  225. hexdag_plugins/hexdag_etl/examples/03_file_io_pipeline.py +109 -0
  226. hexdag_plugins/hexdag_etl/examples/test_pandas_transform.py +84 -0
  227. hexdag_plugins/hexdag_etl/hexdag.toml +25 -0
  228. hexdag_plugins/hexdag_etl/hexdag_etl/__init__.py +48 -0
  229. hexdag_plugins/hexdag_etl/hexdag_etl/nodes/__init__.py +13 -0
  230. hexdag_plugins/hexdag_etl/hexdag_etl/nodes/api_extract.py +230 -0
  231. hexdag_plugins/hexdag_etl/hexdag_etl/nodes/base_node_factory.py +181 -0
  232. hexdag_plugins/hexdag_etl/hexdag_etl/nodes/file_io.py +415 -0
  233. hexdag_plugins/hexdag_etl/hexdag_etl/nodes/outlook.py +492 -0
  234. hexdag_plugins/hexdag_etl/hexdag_etl/nodes/pandas_transform.py +563 -0
  235. hexdag_plugins/hexdag_etl/hexdag_etl/nodes/sql_extract_load.py +112 -0
  236. hexdag_plugins/hexdag_etl/pyproject.toml +82 -0
  237. hexdag_plugins/hexdag_etl/test_transform.py +54 -0
  238. hexdag_plugins/hexdag_etl/tests/test_plugin_integration.py +62 -0
  239. hexdag_plugins/mysql_adapter/LICENSE +21 -0
  240. hexdag_plugins/mysql_adapter/README.md +224 -0
  241. hexdag_plugins/mysql_adapter/__init__.py +6 -0
  242. hexdag_plugins/mysql_adapter/mysql_adapter.py +408 -0
  243. hexdag_plugins/mysql_adapter/pyproject.toml +93 -0
  244. hexdag_plugins/mysql_adapter/tests/test_mysql_adapter.py +259 -0
  245. hexdag_plugins/storage/README.md +184 -0
  246. hexdag_plugins/storage/__init__.py +19 -0
  247. hexdag_plugins/storage/file/__init__.py +5 -0
  248. hexdag_plugins/storage/file/local.py +325 -0
  249. hexdag_plugins/storage/ports/__init__.py +5 -0
  250. hexdag_plugins/storage/ports/vector_store.py +236 -0
  251. hexdag_plugins/storage/sql/__init__.py +7 -0
  252. hexdag_plugins/storage/sql/base.py +187 -0
  253. hexdag_plugins/storage/sql/mysql.py +27 -0
  254. hexdag_plugins/storage/sql/postgresql.py +27 -0
  255. hexdag_plugins/storage/tests/__init__.py +1 -0
  256. hexdag_plugins/storage/tests/test_local_file_storage.py +161 -0
  257. hexdag_plugins/storage/tests/test_sql_adapters.py +212 -0
  258. hexdag_plugins/storage/vector/__init__.py +7 -0
  259. hexdag_plugins/storage/vector/chromadb.py +223 -0
  260. hexdag_plugins/storage/vector/in_memory.py +285 -0
  261. hexdag_plugins/storage/vector/pgvector.py +502 -0
hexdag_plugins/hexdag_etl/hexdag_etl/nodes/pandas_transform.py
@@ -0,0 +1,563 @@
+ """Pandas transform node with multi-operation support for ETL pipelines."""
+
+ import asyncio
+ import importlib
+ from collections.abc import Callable
+ from dataclasses import asdict
+ from typing import Any
+
+ import pandas as pd
+ from hexdag.builtin.nodes.base_node_factory import BaseNodeFactory
+ from hexdag.core.domain.dag import NodeSpec
+ from hexdag.core.registry import node
+ from hexdag.core.registry.models import NodeSubtype
+ from pydantic import BaseModel
+
+
+ class PandasOperation(BaseModel):
+     """Single pandas operation configuration."""
+
+     type: str = "transform"
+     """Operation type: 'transform', 'map', 'filter', 'assign'"""
+
+     method: str | None = None
+     """Pandas method path (e.g., 'pandas.DataFrame.groupby', 'pandas.merge')"""
+
+     args: list[Any] | None = None
+     """Positional arguments for the operation"""
+
+     kwargs: dict[str, Any] | None = None
+     """Keyword arguments for the operation"""
+
+     columns: dict[str, str] | None = None
+     """Column mappings (for 'map' or 'rename' operations)"""
+
+     condition: str | None = None
+     """Filter condition expression (for 'filter' operations)"""
+
+
+ @node(name="pandas_transform_node", subtype=NodeSubtype.FUNCTION, namespace="etl")
+ class PandasTransformNode(BaseNodeFactory):
+     """Node factory for multi-operation pandas transforms.
+
+     Executes a sequence of pandas operations on DataFrames, supporting:
+     - Chained transformations
+     - Multiple input DataFrames
+     - Artifact storage integration
+     - Complex data cleaning and enrichment
+
+     Examples
+     --------
+     YAML pipeline::
+
+         - kind: pandas_transform_node
+           metadata:
+             name: clean_and_aggregate
+           spec:
+             input_artifacts:
+               - slot: raw_customers
+                 key: customers_v1
+               - slot: raw_transactions
+                 key: transactions_v1
+             operations:
+               # Operation 1: Join DataFrames
+               - type: transform
+                 method: pandas.merge
+                 args:
+                   - "{{ input_artifacts[0] }}"
+                   - "{{ input_artifacts[1] }}"
+                 kwargs:
+                   on: customer_id
+                   how: left
+
+               # Operation 2: Drop missing values
+               - type: transform
+                 method: pandas.DataFrame.dropna
+                 kwargs:
+                   subset: [customer_id, amount]
+
+               # Operation 3: Calculate new column
+               - type: transform
+                 method: pandas.DataFrame.assign
+                 kwargs:
+                   revenue_tier: |
+                     lambda df: pd.cut(
+                         df['amount'],
+                         bins=[0, 100, 500, float('inf')],
+                         labels=['Low', 'Medium', 'High']
+                     )
+
+               # Operation 4: Rename columns
+               - type: map
+                 columns:
+                   transaction_id: txn_id
+                   customer_id: cust_id
+                   amount: total_amount
+
+               # Operation 5: Filter rows
+               - type: filter
+                 condition: "{{ df['amount'] > 0 }}"
+
+               # Operation 6: Group and aggregate
+               - type: transform
+                 method: pandas.DataFrame.groupby
+                 args:
+                   - customer_id
+                 kwargs:
+                   as_index: false
+
+               # Operation 7: Calculate aggregations
+               - type: transform
+                 method: pandas.DataFrame.agg
+                 kwargs:
+                   amount: ['count', 'sum', 'mean']
+                   customer_id: 'count'
+
+             output_artifact:
+               slot: enriched_customers
+               key: enriched_v1
+               format: parquet
+               compression: snappy
+     """
+
+     def __call__(
+         self,
+         name: str,
+         operations: list[dict[str, Any]],
+         input_artifacts: list[dict[str, Any]] | None = None,
+         output_artifact: dict[str, Any] | None = None,
+         deps: list[str] | None = None,
+         **kwargs: Any,
+     ) -> NodeSpec:
+         """Create a pandas transform node specification.
+
+         Parameters
+         ----------
+         name : str
+             Node name
+         operations : list[dict]
+             List of pandas operation configurations
+         input_artifacts : list[dict], optional
+             Artifact references for input DataFrames
+         output_artifact : dict, optional
+             Artifact configuration for output DataFrame
+         deps : list[str], optional
+             Dependency node names
+         **kwargs : Any
+             Additional node parameters
+
+         Returns
+         -------
+         NodeSpec
+             Node specification ready for execution
+         """
+         # Convert operation dicts to Pydantic models for validation
+         operation_models = [PandasOperation(**op) for op in operations]
+
+         # Create wrapped function
+         wrapped_fn = self._create_transform_function(name, operation_models, input_artifacts, output_artifact)
+
+         # Define input schema (the same whether or not input artifacts are used)
+         input_schema = {"input_data": dict, "**ports": dict}
+
+         # Define output schema
+         output_schema = {"output": dict}
+
+         input_model = self.create_pydantic_model(f"{name}Input", input_schema)
+         output_model = self.create_pydantic_model(f"{name}Output", output_schema)
+
+         # Store parameters
+         node_params = {
+             "operations": operations,
+             "input_artifacts": input_artifacts,
+             "output_artifact": output_artifact,
+             **kwargs,
+         }
+
+         return NodeSpec(
+             name=name,
+             fn=wrapped_fn,
+             in_model=input_model,
+             out_model=output_model,
+             deps=frozenset(deps or []),
+             params=node_params,
+         )
+
+     def _create_transform_function(
+         self,
+         name: str,
+         operations: list[PandasOperation],
+         input_artifacts: list[dict[str, Any]] | None,
+         output_artifact: dict[str, Any] | None,
+     ) -> Callable[..., dict[str, Any]]:
+         """Create the wrapped transformation function.
+
+         Parameters
+         ----------
+         name : str
+             Node name
+         operations : list[PandasOperation]
+             Operations to execute
+         input_artifacts : list[dict], optional
+             Input artifact references
+         output_artifact : dict, optional
+             Output artifact configuration
+
+         Returns
+         -------
+         Callable
+             Async function that executes the transformation
+         """
+
+         async def wrapped_fn(input_data: Any, **ports: Any) -> dict[str, Any]:
+             """Execute pandas transformation operations."""
+             # Initialize result DataFrame
+             df = None
+
+             # Load input artifacts if specified
+             if input_artifacts:
+                 artifact_store = ports.get("artifact_store")
+                 if not artifact_store:
+                     raise ValueError("artifact_store port required when using input_artifacts")
+
+                 loaded_dfs = []
+                 for artifact_ref in input_artifacts:
+                     slot = artifact_ref.get("slot")
+                     key = artifact_ref.get("key")
+                     format = artifact_ref.get("format")
+
+                     if not slot or not key:
+                         raise ValueError(f"Invalid artifact reference: {artifact_ref}")
+
+                     # Load from artifact store
+                     df_loaded = await artifact_store.read(name=slot, key=key, format=format)
+                     loaded_dfs.append(df_loaded)
+
+                 # Start with first DataFrame if available
+                 if loaded_dfs:
+                     df = loaded_dfs[0]
+             else:
+                 # Use input_data directly
+                 if isinstance(input_data, dict) and "data" in input_data:
+                     df = input_data["data"]
+                 else:
+                     df = input_data
+
+             if df is None:
+                 raise ValueError("No input DataFrame available")
+
+             if not isinstance(df, pd.DataFrame):
+                 # Try to convert to DataFrame
+                 try:
+                     df = pd.DataFrame(df)
+                 except Exception as e:
+                     raise ValueError(f"Could not convert input to DataFrame: {e}") from e
+
+             # Execute operations sequentially
+             for op in operations:
+                 df = await self._execute_operation(df, op, loaded_dfs if input_artifacts else [df])
+
+             # Store output artifact if specified
+             result = {"output": df}
+
+             if output_artifact:
+                 artifact_store = ports.get("artifact_store")
+                 if not artifact_store:
+                     raise ValueError("artifact_store port required when using output_artifact")
+
+                 slot = output_artifact.get("slot")
+                 key = output_artifact.get("key", f"{name}_{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}")
+                 format = output_artifact.get("format", "pickle")
+                 compression = output_artifact.get("compression")
+                 metadata = output_artifact.get("metadata")
+
+                 if not slot:
+                     raise ValueError("output_artifact must specify 'slot' name")
+
+                 # Write to artifact store
+                 artifact_info = await artifact_store.write(
+                     name=slot,
+                     key=key,
+                     data=df,
+                     format=format,
+                     compression=compression,
+                     metadata=metadata,
+                 )
+
+                 result["artifact_info"] = asdict(artifact_info)
+                 result["records"] = len(df)
+
+             return result
+
+         # Preserve function metadata
+         wrapped_fn.__name__ = f"pandas_transform_{name}"
+         wrapped_fn.__doc__ = f"Multi-operation pandas transform: {name}"
+
+         return wrapped_fn
+
+     async def _execute_operation(
+         self, df: pd.DataFrame, op: PandasOperation, input_dfs: list[pd.DataFrame]
+     ) -> pd.DataFrame:
+         """Execute a single pandas operation.
+
+         Parameters
+         ----------
+         df : pd.DataFrame
+             Current DataFrame
+         op : PandasOperation
+             Operation to execute
+         input_dfs : list[pd.DataFrame]
+             Available input DataFrames
+
+         Returns
+         -------
+         pd.DataFrame
+             Transformed DataFrame
+         """
+         op_type = op.type or "transform"
+
+         if op_type == "transform":
+             return await self._execute_transform(df, op, input_dfs)
+
+         elif op_type == "map":
+             return await self._execute_map(df, op)
+
+         elif op_type == "filter":
+             return await self._execute_filter(df, op)
+
+         elif op_type == "assign":
+             return await self._execute_assign(df, op)
+
+         else:
+             raise ValueError(f"Unknown operation type: {op_type}")
+
+     async def _execute_transform(
+         self, df: pd.DataFrame, op: PandasOperation, input_dfs: list[pd.DataFrame]
+     ) -> pd.DataFrame:
+         """Execute a transform operation (calls a pandas method).
+
+         Parameters
+         ----------
+         df : pd.DataFrame
+             Current DataFrame
+         op : PandasOperation
+             Operation configuration
+         input_dfs : list[pd.DataFrame]
+             Available input DataFrames
+
+         Returns
+         -------
+         pd.DataFrame
+             Transformed DataFrame
+         """
+         if not op.method:
+             raise ValueError("Transform operation requires 'method' parameter")
+
+         # Resolve method
+         method = self._resolve_method(op.method)
+
+         # Prepare arguments (resolve template expressions)
+         args = []
+         if op.args:
+             for arg in op.args:
+                 args.append(self._resolve_arg(arg, df, input_dfs))
+
+         # Prepare keyword arguments
+         kwargs = {}
+         if op.kwargs:
+             for k, v in op.kwargs.items():
+                 kwargs[k] = self._resolve_arg(v, df, input_dfs)
+
+         # Execute method (handle both sync and async)
+         if asyncio.iscoroutinefunction(method):
+             result = await method(df, *args, **kwargs)
+         else:
+             result = method(df, *args, **kwargs)
+
+         return result
+
+     async def _execute_map(self, df: pd.DataFrame, op: PandasOperation) -> pd.DataFrame:
+         """Execute a map operation (column rename/mapping).
+
+         Parameters
+         ----------
+         df : pd.DataFrame
+             Current DataFrame
+         op : PandasOperation
+             Operation configuration with columns mapping
+
+         Returns
+         -------
+         pd.DataFrame
+             DataFrame with renamed columns
+         """
+         if not op.columns:
+             return df
+
+         return df.rename(columns=op.columns)
+
+     async def _execute_filter(self, df: pd.DataFrame, op: PandasOperation) -> pd.DataFrame:
+         """Execute a filter operation.
+
+         Parameters
+         ----------
+         df : pd.DataFrame
+             Current DataFrame
+         op : PandasOperation
+             Operation configuration with filter condition
+
+         Returns
+         -------
+         pd.DataFrame
+             Filtered DataFrame
+         """
+         if not op.condition:
+             return df
+
+         # Evaluate condition
+         # Note: This is a simplified implementation - production code should validate
+         # the condition for security
+         condition_result = self._resolve_arg(op.condition, df, [df])
+
+         return df[condition_result]
+
+     async def _execute_assign(self, df: pd.DataFrame, op: PandasOperation) -> pd.DataFrame:
+         """Execute an assign operation (add new columns).
+
+         Parameters
+         ----------
+         df : pd.DataFrame
+             Current DataFrame
+         op : PandasOperation
+             Operation configuration
+
+         Returns
+         -------
+         pd.DataFrame
+             DataFrame with new columns
+         """
+         if not op.kwargs:
+             return df
+
+         # Prepare new column assignments
+         new_cols = {}
+         for col_name, col_expr in op.kwargs.items():
+             new_cols[col_name] = self._resolve_arg(col_expr, df, [df])
+
+         return df.assign(**new_cols)
+
+     def _resolve_method(self, method_path: str) -> Callable:
+         """Resolve a method from a path string or return callable directly.
+
+         Parameters
+         ----------
+         method_path : str
+             Path like "pandas.DataFrame.groupby" or "pandas.merge"
+
+         Returns
+         -------
+         Callable
+             The resolved method
+         """
+         # Already callable
+         if callable(method_path):
+             return method_path
+
+         # Parse module path
+         if "." not in method_path:
+             raise ValueError(f"Method path must contain '.', got: {method_path}")
+
+         try:
+             # Handle pandas class paths like pandas.DataFrame.sort_values
+             if method_path.startswith("pandas."):
+                 parts = method_path.split(".")
+                 if len(parts) >= 3 and parts[1] == "DataFrame":
+                     # It's a DataFrame method: pandas.DataFrame.method_name
+                     method_name = parts[2]
+                     return getattr(pd.DataFrame, method_name)
+                 else:
+                     # It's a module-level function like pandas.merge
+                     module_path = ".".join(parts[:-1])
+                     attr_path = parts[-1]
+                     module = importlib.import_module(module_path)
+                     method = getattr(module, attr_path)
+
+                     if not callable(method):
+                         raise ValueError(f"'{method_path}' is not callable")
+
+                     return method
+             else:
+                 # Standard module attribute resolution
+                 module_path, attr_path = method_path.rsplit(".", 1)
+                 module = importlib.import_module(module_path)
+                 method = getattr(module, attr_path)
+
+                 if not callable(method):
+                     raise ValueError(f"'{method_path}' is not callable")
+
+                 return method
+         except Exception as e:
+             raise ValueError(f"Could not resolve method '{method_path}': {e}") from e
+
+     def _resolve_arg(self, arg: Any, df: pd.DataFrame, input_dfs: list[pd.DataFrame]) -> Any:
+         """Resolve an argument value (handles templates and expressions).
+
+         Parameters
+         ----------
+         arg : Any
+             Argument value or template expression
+         df : pd.DataFrame
+             Current DataFrame for context
+         input_dfs : list[pd.DataFrame]
+             All input DataFrames
+
+         Returns
+         -------
+         Any
+             Resolved argument value
+         """
+         # If it's a string template expression
+         if isinstance(arg, str) and "{{" in arg and "}}" in arg:
+             # Parse template expression
+             import re
+
+             pattern = r"\{\{\s*(.+?)\s*\}\}"
+             match = re.search(pattern, arg)
+
+             if match:
+                 expr = match.group(1)
+
+                 # Handle special variables
+                 if expr == "df":
+                     return df
+                 elif expr.startswith("input_artifacts["):
+                     # Extract index
+                     idx_match = re.search(r"input_artifacts\[(\d+)\]", expr)
+                     if idx_match:
+                         idx = int(idx_match.group(1))
+                         if 0 <= idx < len(input_dfs):
+                             return input_dfs[idx]
+                         else:
+                             raise IndexError(f"input_artifacts[{idx}] out of range")
+
+                 # Try to evaluate as Python expression
+                 try:
+                     # Restricted evaluation (no builtins) - still not a real security boundary
+                     scope = {"df": df, "input_artifacts": input_dfs, "pd": pd}
+                     return eval(expr, {"__builtins__": {}}, scope)
+                 except Exception:
+                     # Return as-is if evaluation fails
+                     return arg
+
+         # If it's a dict with lambda expression
+         if isinstance(arg, dict):
+             resolved = {}
+             for k, v in arg.items():
+                 resolved[k] = self._resolve_arg(v, df, input_dfs)
+             return resolved
+
+         # Return as-is
+         return arg
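
For local experimentation, the factory above can also be driven directly from Python instead of through a YAML pipeline. The sketch below is illustrative only: the `__call__` signature and operation fields come from the code in this diff, while the import path, direct instantiation of the factory, the sample data, and calling the returned spec's `fn` coroutine outside the orchestrator are assumptions::

    import asyncio

    import pandas as pd

    # Assumed import path for the plugin package shown in this diff.
    from hexdag_etl.nodes.pandas_transform import PandasTransformNode

    factory = PandasTransformNode()
    spec = factory(
        name="clean_orders",
        operations=[
            {"type": "transform", "method": "pandas.DataFrame.dropna",
             "kwargs": {"subset": ["order_id"]}},
            {"type": "map", "columns": {"order_id": "id"}},
            {"type": "filter", "condition": "{{ df['amount'] > 0 }}"},
        ],
    )

    df = pd.DataFrame({"order_id": [1, 2, None], "amount": [10.0, -5.0, 3.0]})

    # NodeSpec stores the wrapped coroutine as `fn`; calling it directly here
    # bypasses the orchestrator and its ports and is purely for illustration.
    result = asyncio.run(spec.fn(input_data={"data": df}))
    print(result["output"])  # rows with a non-null order_id and amount > 0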
hexdag_plugins/hexdag_etl/hexdag_etl/nodes/sql_extract_load.py
@@ -0,0 +1,112 @@
+ """SQL extraction and loading nodes for database operations."""
+
+ from typing import Any
+
+ from hexdag.core.domain.dag import NodeSpec
+ from hexdag.core.registry import node
+ from hexdag.core.registry.models import NodeSubtype
+
+ from .base_node_factory import BaseNodeFactory
+
+
+ @node(name="sql_extract", subtype=NodeSubtype.TOOL, namespace="etl")
+ class SQLExtractNode(BaseNodeFactory):
+     """Extract data from SQL databases.
+
+     Placeholder implementation - to be completed with full SQLAlchemy integration.
+     """
+
+     def __call__(
+         self, name: str, query: str, database: str | None = None, deps: list[str] | None = None, **kwargs: Any
+     ) -> NodeSpec:
+         """Create SQL extract node.
+
+         Parameters
+         ----------
+         name : str
+             Node name
+         query : str
+             SQL query to execute
+         database : str, optional
+             Database connection reference
+         deps : list, optional
+             Dependencies
+         **kwargs : Any
+             Additional parameters
+
+         Returns
+         -------
+         NodeSpec
+             Node specification
+         """
+
+         async def wrapped_fn(input_data: dict, **ports: dict) -> dict:
+             """Placeholder implementation."""
+             return {"output": [], "metadata": {"query": query, "database": database, "status": "placeholder"}}
+
+         wrapped_fn.__name__ = f"sql_extract_{name}"
+
+         return self.create_node_with_mapping(
+             name=name,
+             wrapped_fn=wrapped_fn,
+             input_schema={"input_data": dict, "**ports": dict},
+             output_schema={"output": dict, "metadata": dict},
+             deps=deps or [],
+             **kwargs,
+         )
+
+
+ @node(name="sql_load", subtype=NodeSubtype.TOOL, namespace="etl")
+ class SQLLoadNode(BaseNodeFactory):
+     """Load data into SQL databases.
+
+     Placeholder implementation - to be completed with SQLAlchemy integration.
+     """
+
+     def __call__(
+         self,
+         name: str,
+         table: str,
+         mode: str = "append",
+         database: str | None = None,
+         deps: list[str] | None = None,
+         **kwargs: Any,
+     ) -> NodeSpec:
+         """Create SQL load node.
+
+         Parameters
+         ----------
+         name : str
+             Node name
+         table : str
+             Target table name
+         mode : str
+             Load mode: "append", "replace", "truncate_insert", "merge"
+         database : str, optional
+             Database connection reference
+         deps : list, optional
+             Dependencies
+         **kwargs : Any
+             Additional parameters
+
+         Returns
+         -------
+         NodeSpec
+             Node specification
+         """
+
+         async def wrapped_fn(input_data: dict, **ports: dict) -> dict:
+             """Placeholder implementation."""
+             row_count = len(input_data.get("output", [])) if isinstance(input_data, dict) else 0
+             return {"status": "loaded", "table": table, "rows": row_count}
+
+         wrapped_fn.__name__ = f"sql_load_{name}"
+
+         return self.create_node_with_mapping(
+             name=name,
+             wrapped_fn=wrapped_fn,
+             input_schema={"input_data": dict, "**ports": dict},
+             output_schema={"status": dict, "table": dict, "rows": dict},
+             deps=deps or [],
+             **kwargs,
+         )
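
Both placeholders register under the `etl` namespace via their `@node` decorators, alongside `pandas_transform_node`, so they can already be wired into a pipeline while the SQLAlchemy-backed implementations are pending. A minimal sketch of that wiring, assuming the same plugin import path as above and that `create_node_with_mapping` returns a NodeSpec wrapping the placeholder coroutine::

    # Assumed import path; both factories currently return canned placeholder results.
    from hexdag_etl.nodes.sql_extract_load import SQLExtractNode, SQLLoadNode

    extract_spec = SQLExtractNode()(
        name="extract_orders",
        query="SELECT id, amount FROM orders",
        database="analytics",
    )
    load_spec = SQLLoadNode()(
        name="load_orders",
        table="clean_orders",
        mode="append",
        deps=["extract_orders"],
    )

    # Until the SQLAlchemy integration lands, sql_extract yields an empty result
    # set plus query metadata, and sql_load reports zero rows loaded.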