hexdag 0.5.0.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (261) hide show
  1. hexdag/__init__.py +116 -0
  2. hexdag/__main__.py +30 -0
  3. hexdag/adapters/executors/__init__.py +5 -0
  4. hexdag/adapters/executors/local_executor.py +316 -0
  5. hexdag/builtin/__init__.py +6 -0
  6. hexdag/builtin/adapters/__init__.py +51 -0
  7. hexdag/builtin/adapters/anthropic/__init__.py +5 -0
  8. hexdag/builtin/adapters/anthropic/anthropic_adapter.py +151 -0
  9. hexdag/builtin/adapters/database/__init__.py +6 -0
  10. hexdag/builtin/adapters/database/csv/csv_adapter.py +249 -0
  11. hexdag/builtin/adapters/database/pgvector/__init__.py +5 -0
  12. hexdag/builtin/adapters/database/pgvector/pgvector_adapter.py +478 -0
  13. hexdag/builtin/adapters/database/sqlalchemy/sqlalchemy_adapter.py +252 -0
  14. hexdag/builtin/adapters/database/sqlite/__init__.py +5 -0
  15. hexdag/builtin/adapters/database/sqlite/sqlite_adapter.py +410 -0
  16. hexdag/builtin/adapters/local/README.md +59 -0
  17. hexdag/builtin/adapters/local/__init__.py +7 -0
  18. hexdag/builtin/adapters/local/local_observer_manager.py +696 -0
  19. hexdag/builtin/adapters/memory/__init__.py +47 -0
  20. hexdag/builtin/adapters/memory/file_memory_adapter.py +297 -0
  21. hexdag/builtin/adapters/memory/in_memory_memory.py +216 -0
  22. hexdag/builtin/adapters/memory/schemas.py +57 -0
  23. hexdag/builtin/adapters/memory/session_memory.py +178 -0
  24. hexdag/builtin/adapters/memory/sqlite_memory_adapter.py +215 -0
  25. hexdag/builtin/adapters/memory/state_memory.py +280 -0
  26. hexdag/builtin/adapters/mock/README.md +89 -0
  27. hexdag/builtin/adapters/mock/__init__.py +15 -0
  28. hexdag/builtin/adapters/mock/hexdag.toml +50 -0
  29. hexdag/builtin/adapters/mock/mock_database.py +225 -0
  30. hexdag/builtin/adapters/mock/mock_embedding.py +223 -0
  31. hexdag/builtin/adapters/mock/mock_llm.py +177 -0
  32. hexdag/builtin/adapters/mock/mock_tool_adapter.py +192 -0
  33. hexdag/builtin/adapters/mock/mock_tool_router.py +232 -0
  34. hexdag/builtin/adapters/openai/__init__.py +5 -0
  35. hexdag/builtin/adapters/openai/openai_adapter.py +634 -0
  36. hexdag/builtin/adapters/secret/__init__.py +7 -0
  37. hexdag/builtin/adapters/secret/local_secret_adapter.py +248 -0
  38. hexdag/builtin/adapters/unified_tool_router.py +280 -0
  39. hexdag/builtin/macros/__init__.py +17 -0
  40. hexdag/builtin/macros/conversation_agent.py +390 -0
  41. hexdag/builtin/macros/llm_macro.py +151 -0
  42. hexdag/builtin/macros/reasoning_agent.py +423 -0
  43. hexdag/builtin/macros/tool_macro.py +380 -0
  44. hexdag/builtin/nodes/__init__.py +38 -0
  45. hexdag/builtin/nodes/_discovery.py +123 -0
  46. hexdag/builtin/nodes/agent_node.py +696 -0
  47. hexdag/builtin/nodes/base_node_factory.py +242 -0
  48. hexdag/builtin/nodes/composite_node.py +926 -0
  49. hexdag/builtin/nodes/data_node.py +201 -0
  50. hexdag/builtin/nodes/expression_node.py +487 -0
  51. hexdag/builtin/nodes/function_node.py +454 -0
  52. hexdag/builtin/nodes/llm_node.py +491 -0
  53. hexdag/builtin/nodes/loop_node.py +920 -0
  54. hexdag/builtin/nodes/mapped_input.py +518 -0
  55. hexdag/builtin/nodes/port_call_node.py +269 -0
  56. hexdag/builtin/nodes/tool_call_node.py +195 -0
  57. hexdag/builtin/nodes/tool_utils.py +390 -0
  58. hexdag/builtin/prompts/__init__.py +68 -0
  59. hexdag/builtin/prompts/base.py +422 -0
  60. hexdag/builtin/prompts/chat_prompts.py +303 -0
  61. hexdag/builtin/prompts/error_correction_prompts.py +320 -0
  62. hexdag/builtin/prompts/tool_prompts.py +160 -0
  63. hexdag/builtin/tools/builtin_tools.py +84 -0
  64. hexdag/builtin/tools/database_tools.py +164 -0
  65. hexdag/cli/__init__.py +17 -0
  66. hexdag/cli/__main__.py +7 -0
  67. hexdag/cli/commands/__init__.py +27 -0
  68. hexdag/cli/commands/build_cmd.py +812 -0
  69. hexdag/cli/commands/create_cmd.py +208 -0
  70. hexdag/cli/commands/docs_cmd.py +293 -0
  71. hexdag/cli/commands/generate_types_cmd.py +252 -0
  72. hexdag/cli/commands/init_cmd.py +188 -0
  73. hexdag/cli/commands/pipeline_cmd.py +494 -0
  74. hexdag/cli/commands/plugin_dev_cmd.py +529 -0
  75. hexdag/cli/commands/plugins_cmd.py +441 -0
  76. hexdag/cli/commands/studio_cmd.py +101 -0
  77. hexdag/cli/commands/validate_cmd.py +221 -0
  78. hexdag/cli/main.py +84 -0
  79. hexdag/core/__init__.py +83 -0
  80. hexdag/core/config/__init__.py +20 -0
  81. hexdag/core/config/loader.py +479 -0
  82. hexdag/core/config/models.py +150 -0
  83. hexdag/core/configurable.py +294 -0
  84. hexdag/core/context/__init__.py +37 -0
  85. hexdag/core/context/execution_context.py +378 -0
  86. hexdag/core/docs/__init__.py +26 -0
  87. hexdag/core/docs/extractors.py +678 -0
  88. hexdag/core/docs/generators.py +890 -0
  89. hexdag/core/docs/models.py +120 -0
  90. hexdag/core/domain/__init__.py +10 -0
  91. hexdag/core/domain/dag.py +1225 -0
  92. hexdag/core/exceptions.py +234 -0
  93. hexdag/core/expression_parser.py +569 -0
  94. hexdag/core/logging.py +449 -0
  95. hexdag/core/models/__init__.py +17 -0
  96. hexdag/core/models/base.py +138 -0
  97. hexdag/core/orchestration/__init__.py +46 -0
  98. hexdag/core/orchestration/body_executor.py +481 -0
  99. hexdag/core/orchestration/components/__init__.py +97 -0
  100. hexdag/core/orchestration/components/adapter_lifecycle_manager.py +113 -0
  101. hexdag/core/orchestration/components/checkpoint_manager.py +134 -0
  102. hexdag/core/orchestration/components/execution_coordinator.py +360 -0
  103. hexdag/core/orchestration/components/health_check_manager.py +176 -0
  104. hexdag/core/orchestration/components/input_mapper.py +143 -0
  105. hexdag/core/orchestration/components/lifecycle_manager.py +583 -0
  106. hexdag/core/orchestration/components/node_executor.py +377 -0
  107. hexdag/core/orchestration/components/secret_manager.py +202 -0
  108. hexdag/core/orchestration/components/wave_executor.py +158 -0
  109. hexdag/core/orchestration/constants.py +17 -0
  110. hexdag/core/orchestration/events/README.md +312 -0
  111. hexdag/core/orchestration/events/__init__.py +104 -0
  112. hexdag/core/orchestration/events/batching.py +330 -0
  113. hexdag/core/orchestration/events/decorators.py +139 -0
  114. hexdag/core/orchestration/events/events.py +573 -0
  115. hexdag/core/orchestration/events/observers/__init__.py +30 -0
  116. hexdag/core/orchestration/events/observers/core_observers.py +690 -0
  117. hexdag/core/orchestration/events/observers/models.py +111 -0
  118. hexdag/core/orchestration/events/taxonomy.py +269 -0
  119. hexdag/core/orchestration/hook_context.py +237 -0
  120. hexdag/core/orchestration/hooks.py +437 -0
  121. hexdag/core/orchestration/models.py +418 -0
  122. hexdag/core/orchestration/orchestrator.py +910 -0
  123. hexdag/core/orchestration/orchestrator_factory.py +275 -0
  124. hexdag/core/orchestration/port_wrappers.py +327 -0
  125. hexdag/core/orchestration/prompt/__init__.py +32 -0
  126. hexdag/core/orchestration/prompt/template.py +332 -0
  127. hexdag/core/pipeline_builder/__init__.py +21 -0
  128. hexdag/core/pipeline_builder/component_instantiator.py +386 -0
  129. hexdag/core/pipeline_builder/include_tag.py +265 -0
  130. hexdag/core/pipeline_builder/pipeline_config.py +133 -0
  131. hexdag/core/pipeline_builder/py_tag.py +223 -0
  132. hexdag/core/pipeline_builder/tag_discovery.py +268 -0
  133. hexdag/core/pipeline_builder/yaml_builder.py +1196 -0
  134. hexdag/core/pipeline_builder/yaml_validator.py +569 -0
  135. hexdag/core/ports/__init__.py +65 -0
  136. hexdag/core/ports/api_call.py +133 -0
  137. hexdag/core/ports/database.py +489 -0
  138. hexdag/core/ports/embedding.py +215 -0
  139. hexdag/core/ports/executor.py +237 -0
  140. hexdag/core/ports/file_storage.py +117 -0
  141. hexdag/core/ports/healthcheck.py +87 -0
  142. hexdag/core/ports/llm.py +551 -0
  143. hexdag/core/ports/memory.py +70 -0
  144. hexdag/core/ports/observer_manager.py +130 -0
  145. hexdag/core/ports/secret.py +145 -0
  146. hexdag/core/ports/tool_router.py +94 -0
  147. hexdag/core/ports_builder.py +623 -0
  148. hexdag/core/protocols.py +273 -0
  149. hexdag/core/resolver.py +304 -0
  150. hexdag/core/schema/__init__.py +9 -0
  151. hexdag/core/schema/generator.py +742 -0
  152. hexdag/core/secrets.py +242 -0
  153. hexdag/core/types.py +413 -0
  154. hexdag/core/utils/async_warnings.py +206 -0
  155. hexdag/core/utils/schema_conversion.py +78 -0
  156. hexdag/core/utils/sql_validation.py +86 -0
  157. hexdag/core/validation/secure_json.py +148 -0
  158. hexdag/core/yaml_macro.py +517 -0
  159. hexdag/mcp_server.py +3120 -0
  160. hexdag/studio/__init__.py +10 -0
  161. hexdag/studio/build_ui.py +92 -0
  162. hexdag/studio/server/__init__.py +1 -0
  163. hexdag/studio/server/main.py +100 -0
  164. hexdag/studio/server/routes/__init__.py +9 -0
  165. hexdag/studio/server/routes/execute.py +208 -0
  166. hexdag/studio/server/routes/export.py +558 -0
  167. hexdag/studio/server/routes/files.py +207 -0
  168. hexdag/studio/server/routes/plugins.py +419 -0
  169. hexdag/studio/server/routes/validate.py +220 -0
  170. hexdag/studio/ui/index.html +13 -0
  171. hexdag/studio/ui/package-lock.json +2992 -0
  172. hexdag/studio/ui/package.json +31 -0
  173. hexdag/studio/ui/postcss.config.js +6 -0
  174. hexdag/studio/ui/public/hexdag.svg +5 -0
  175. hexdag/studio/ui/src/App.tsx +251 -0
  176. hexdag/studio/ui/src/components/Canvas.tsx +408 -0
  177. hexdag/studio/ui/src/components/ContextMenu.tsx +187 -0
  178. hexdag/studio/ui/src/components/FileBrowser.tsx +123 -0
  179. hexdag/studio/ui/src/components/Header.tsx +181 -0
  180. hexdag/studio/ui/src/components/HexdagNode.tsx +193 -0
  181. hexdag/studio/ui/src/components/NodeInspector.tsx +512 -0
  182. hexdag/studio/ui/src/components/NodePalette.tsx +262 -0
  183. hexdag/studio/ui/src/components/NodePortsSection.tsx +403 -0
  184. hexdag/studio/ui/src/components/PluginManager.tsx +347 -0
  185. hexdag/studio/ui/src/components/PortsEditor.tsx +481 -0
  186. hexdag/studio/ui/src/components/PythonEditor.tsx +195 -0
  187. hexdag/studio/ui/src/components/ValidationPanel.tsx +105 -0
  188. hexdag/studio/ui/src/components/YamlEditor.tsx +196 -0
  189. hexdag/studio/ui/src/components/index.ts +8 -0
  190. hexdag/studio/ui/src/index.css +92 -0
  191. hexdag/studio/ui/src/main.tsx +10 -0
  192. hexdag/studio/ui/src/types/index.ts +123 -0
  193. hexdag/studio/ui/src/vite-env.d.ts +1 -0
  194. hexdag/studio/ui/tailwind.config.js +29 -0
  195. hexdag/studio/ui/tsconfig.json +37 -0
  196. hexdag/studio/ui/tsconfig.node.json +13 -0
  197. hexdag/studio/ui/vite.config.ts +35 -0
  198. hexdag/visualization/__init__.py +69 -0
  199. hexdag/visualization/dag_visualizer.py +1020 -0
  200. hexdag-0.5.0.dev1.dist-info/METADATA +369 -0
  201. hexdag-0.5.0.dev1.dist-info/RECORD +261 -0
  202. hexdag-0.5.0.dev1.dist-info/WHEEL +4 -0
  203. hexdag-0.5.0.dev1.dist-info/entry_points.txt +4 -0
  204. hexdag-0.5.0.dev1.dist-info/licenses/LICENSE +190 -0
  205. hexdag_plugins/.gitignore +43 -0
  206. hexdag_plugins/README.md +73 -0
  207. hexdag_plugins/__init__.py +1 -0
  208. hexdag_plugins/azure/LICENSE +21 -0
  209. hexdag_plugins/azure/README.md +414 -0
  210. hexdag_plugins/azure/__init__.py +21 -0
  211. hexdag_plugins/azure/azure_blob_adapter.py +450 -0
  212. hexdag_plugins/azure/azure_cosmos_adapter.py +383 -0
  213. hexdag_plugins/azure/azure_keyvault_adapter.py +314 -0
  214. hexdag_plugins/azure/azure_openai_adapter.py +415 -0
  215. hexdag_plugins/azure/pyproject.toml +107 -0
  216. hexdag_plugins/azure/tests/__init__.py +1 -0
  217. hexdag_plugins/azure/tests/test_azure_blob_adapter.py +350 -0
  218. hexdag_plugins/azure/tests/test_azure_cosmos_adapter.py +323 -0
  219. hexdag_plugins/azure/tests/test_azure_keyvault_adapter.py +330 -0
  220. hexdag_plugins/azure/tests/test_azure_openai_adapter.py +329 -0
  221. hexdag_plugins/hexdag_etl/README.md +168 -0
  222. hexdag_plugins/hexdag_etl/__init__.py +53 -0
  223. hexdag_plugins/hexdag_etl/examples/01_simple_pandas_transform.py +270 -0
  224. hexdag_plugins/hexdag_etl/examples/02_simple_pandas_only.py +149 -0
  225. hexdag_plugins/hexdag_etl/examples/03_file_io_pipeline.py +109 -0
  226. hexdag_plugins/hexdag_etl/examples/test_pandas_transform.py +84 -0
  227. hexdag_plugins/hexdag_etl/hexdag.toml +25 -0
  228. hexdag_plugins/hexdag_etl/hexdag_etl/__init__.py +48 -0
  229. hexdag_plugins/hexdag_etl/hexdag_etl/nodes/__init__.py +13 -0
  230. hexdag_plugins/hexdag_etl/hexdag_etl/nodes/api_extract.py +230 -0
  231. hexdag_plugins/hexdag_etl/hexdag_etl/nodes/base_node_factory.py +181 -0
  232. hexdag_plugins/hexdag_etl/hexdag_etl/nodes/file_io.py +415 -0
  233. hexdag_plugins/hexdag_etl/hexdag_etl/nodes/outlook.py +492 -0
  234. hexdag_plugins/hexdag_etl/hexdag_etl/nodes/pandas_transform.py +563 -0
  235. hexdag_plugins/hexdag_etl/hexdag_etl/nodes/sql_extract_load.py +112 -0
  236. hexdag_plugins/hexdag_etl/pyproject.toml +82 -0
  237. hexdag_plugins/hexdag_etl/test_transform.py +54 -0
  238. hexdag_plugins/hexdag_etl/tests/test_plugin_integration.py +62 -0
  239. hexdag_plugins/mysql_adapter/LICENSE +21 -0
  240. hexdag_plugins/mysql_adapter/README.md +224 -0
  241. hexdag_plugins/mysql_adapter/__init__.py +6 -0
  242. hexdag_plugins/mysql_adapter/mysql_adapter.py +408 -0
  243. hexdag_plugins/mysql_adapter/pyproject.toml +93 -0
  244. hexdag_plugins/mysql_adapter/tests/test_mysql_adapter.py +259 -0
  245. hexdag_plugins/storage/README.md +184 -0
  246. hexdag_plugins/storage/__init__.py +19 -0
  247. hexdag_plugins/storage/file/__init__.py +5 -0
  248. hexdag_plugins/storage/file/local.py +325 -0
  249. hexdag_plugins/storage/ports/__init__.py +5 -0
  250. hexdag_plugins/storage/ports/vector_store.py +236 -0
  251. hexdag_plugins/storage/sql/__init__.py +7 -0
  252. hexdag_plugins/storage/sql/base.py +187 -0
  253. hexdag_plugins/storage/sql/mysql.py +27 -0
  254. hexdag_plugins/storage/sql/postgresql.py +27 -0
  255. hexdag_plugins/storage/tests/__init__.py +1 -0
  256. hexdag_plugins/storage/tests/test_local_file_storage.py +161 -0
  257. hexdag_plugins/storage/tests/test_sql_adapters.py +212 -0
  258. hexdag_plugins/storage/vector/__init__.py +7 -0
  259. hexdag_plugins/storage/vector/chromadb.py +223 -0
  260. hexdag_plugins/storage/vector/in_memory.py +285 -0
  261. hexdag_plugins/storage/vector/pgvector.py +502 -0
@@ -0,0 +1,168 @@
1
+ # hexDAG ETL Plugin
2
+
3
+ ETL (Extract, Transform, Load) infrastructure for hexDAG pipelines.
4
+
5
+ ## Features
6
+
7
+ - **Artifact Storage**: Named storage slots for intermediate data between pipeline nodes
8
+ - **Pandas Transform**: Multi-operation DataFrame transformations with chaining support
9
+ - **API Extract**: REST API extraction with pagination, authentication, and rate limiting
10
+ - **SQL Operations**: Database extraction and loading (placeholder implementations)
11
+
12
+ ## Installation
13
+
14
+ ```bash
15
+ cd hexdag_plugins/hexdag_etl
16
+ pip install -e .
17
+ ```
18
+
19
+ ## Quick Start
20
+
21
+ ### 1. Artifact Storage
22
+
23
+ Store intermediate data between pipeline nodes:
24
+
25
+ ```python
26
+ from hexdag.core.registry import registry
27
+
28
+ # Get artifact store adapter
29
+ artifact_store = registry.get("local", namespace="etl")(
30
+ base_path="/tmp/etl_artifacts"
31
+ )
32
+
33
+ # Write artifact
34
+ await artifact_store.write(
35
+ name="raw_customers",
36
+ key="customers_2024_01_15",
37
+ data=df
38
+ )
39
+
40
+ # Read artifact
41
+ df = await artifact_store.read(
42
+ name="raw_customers",
43
+ key="customers_2024_01_15"
44
+ )
45
+ ```
46
+
47
+ ### 2. Pandas Transform
48
+
49
+ Chain multiple pandas operations:
50
+
51
+ ```yaml
52
+ - kind: etl:pandas_transform
53
+ metadata:
54
+ name: enrich_data
55
+ spec:
56
+ input_artifacts:
57
+ - slot: raw_customers
58
+ key: customers_v1
59
+ - slot: raw_transactions
60
+ key: transactions_v1
61
+ operations:
62
+ # Join DataFrames
63
+ - type: transform
64
+ method: pandas.merge
65
+ args:
66
+ - "{{input_artifacts[0]}}"
67
+ - "{{input_artifacts[1]}}"
68
+ kwargs:
69
+ on: customer_id
70
+ how: left
71
+
72
+ # Rename columns
73
+ - type: map
74
+ columns:
75
+ transaction_id: txn_id
76
+ amount: total_amount
77
+
78
+ # Add calculated column
79
+ - type: transform
80
+ method: pandas.DataFrame.assign
81
+ kwargs:
82
+ tier: |
83
+ lambda df: pd.cut(df['total_amount'], bins=[0, 100, 500, float('inf')])
84
+
85
+ output_artifact:
86
+ slot: enriched_data
87
+ key: enriched_v1
88
+ ```
89
+
90
+ ### 3. API Extract
91
+
92
+ Extract data from REST APIs:
93
+
94
+ ```yaml
95
+ - kind: etl:api_extract
96
+ metadata:
97
+ name: fetch_customers
98
+ spec:
99
+ endpoint: https://api.example.com/v1/customers
100
+ method: GET
101
+ params:
102
+ limit: 100
103
+ status: active
104
+ pagination:
105
+ type: cursor
106
+ cursor_param: after
107
+ cursor_path: meta.next_cursor
108
+ auth:
109
+ type: bearer
110
+ token: ${API_TOKEN}
111
+ output_artifact:
112
+ slot: raw_customers
113
+ key: customers_api
114
+ ```
115
+
116
+ ## Architecture
117
+
118
+ ```
119
+ hexdag-etl/
120
+ ├── hexdag_etl/
121
+ │ ├── adapters/
122
+ │ │ └── artifact.py # LocalArtifactAdapter
123
+ │ ├── nodes/
124
+ │ │ ├── pandas_transform.py # PandasTransformNode
125
+ │ │ ├── api_extract.py # APIExtractNode
126
+ │ │ └── sql_extract_load.py # SQLExtractNode, SQLLoadNode
127
+ │ └── ports/
128
+ │ └── artifact_storage.py # ArtifactStorePort
129
+ ├── examples/
130
+ ├── tests/
131
+ └── pyproject.toml
132
+ ```
133
+
134
+ ## Components
135
+
136
+ ### Adapters
137
+
138
+ - **LocalArtifactAdapter**: File-based artifact storage with compression and metadata
139
+
140
+ ### Nodes
141
+
142
+ - **PandasTransformNode**: Multi-operation DataFrame transformations
143
+ - **APIExtractNode**: REST API extraction with pagination
144
+ - **SQLExtractNode**: Database extraction (placeholder)
145
+ - **SQLLoadNode**: Database loading (placeholder)
146
+
147
+ ### Ports
148
+
149
+ - **ArtifactStorePort**: Interface for artifact storage adapters
150
+
151
+ ## Examples
152
+
153
+ Run the example pipeline:
154
+
155
+ ```bash
156
+ cd examples
157
+ python 01_simple_pandas_transform.py
158
+ ```
159
+
160
+ ## Testing
161
+
162
+ ```bash
163
+ pytest tests/
164
+ ```
165
+
166
+ ## License
167
+
168
+ MIT
@@ -0,0 +1,53 @@
1
"""hexdag-etl: ETL infrastructure for hexDAG pipelines.

Provides file I/O and multi-operation pandas transform nodes for data transformation pipelines.

This plugin extends hexDAG with ETL capabilities:
- FileReaderNode: Read CSV, Parquet, JSON, Excel files
- FileWriterNode: Write data to various file formats
- PandasTransformNode: Chain pandas operations

OutlookReaderNode and OutlookSenderNode are re-exported from this package as well.

Example Pipeline:
    - kind: etl:file_reader_node
      metadata:
        name: load_data
      spec:
        file_path: data/input.csv
        format: csv

    - kind: etl:pandas_transform_node
      metadata:
        name: transform
      spec:
        operations:
          - type: filter
            condition: "{{ df['value'] > 0 }}"
        dependencies: [load_data]

    - kind: etl:file_writer_node
      metadata:
        name: save_results
      spec:
        file_path: output/results.parquet
        format: parquet
        dependencies: [transform]
"""

# Re-export from the inner module (using relative import to inner hexdag_etl package)
from .hexdag_etl import (
    FileReaderNode,
    FileWriterNode,
    OutlookReaderNode,
    OutlookSenderNode,
    PandasTransformNode,
    __version__,
)

# Public API of the outer package: exactly the names re-exported above.
__all__ = [
    "__version__",
    "FileReaderNode",
    "FileWriterNode",
    "OutlookReaderNode",
    "OutlookSenderNode",
    "PandasTransformNode",
]
@@ -0,0 +1,270 @@
1
+ """
2
+ Example: Simple Pandas Transform Pipeline
3
+
4
+ This example demonstrates:
5
+ 1. Reading CSV files
6
+ 2. Applying multi-operation pandas transforms
7
+ 3. Using artifact storage between nodes
8
+ 4. Simple data cleaning and aggregation
9
+ """
10
+
11
+ import asyncio
12
+ import sys
13
+ from pathlib import Path
14
+
15
+ import pandas as pd
16
+
17
+ # Add hexdag and plugin to path
18
+ sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent)) # hexdag root
19
+ sys.path.insert(0, str(Path(__file__).parent.parent)) # plugin root
20
+
21
+ # Import plugin to register components
22
+ from hexdag.builtin.adapters.memory.in_memory_memory import InMemoryMemory
23
+ from hexdag.core.orchestration.orchestrator import Orchestrator
24
+ from hexdag.core.pipeline_builder.yaml_builder import YamlPipelineBuilder
25
+
26
+ import hexdag_etl # noqa: F401 - import to register plugin components
27
+
28
+
29
def create_sample_data(data_dir: "str | Path" = "/tmp/hexdag_demo") -> "tuple[str, str]":
    """Create sample customer and transaction CSV files.

    Args:
        data_dir: Directory the CSV files are written to. Defaults to
            ``/tmp/hexdag_demo`` (the location the example always used).
            Missing parent directories are created.

    Returns:
        A ``(customers_csv_path, transactions_csv_path)`` tuple of strings.
    """
    data_dir = Path(data_dir)
    # parents=True so a nested or not-yet-existing target directory works too.
    data_dir.mkdir(parents=True, exist_ok=True)

    # Sample customers
    customers_df = pd.DataFrame(
        {
            "customer_id": ["C001", "C002", "C003", "C004", "C005"],
            "name": ["Alice", "Bob", "Carol", "David", "Emma"],
            "email": [
                "alice@example.com",
                "bob@example.com",
                "carol@example.com",
                "david@example.com",
                "emma@example.com",
            ],
            "country": ["USA", "UK", "USA", "Canada", "Australia"],
            "signup_date": pd.to_datetime(["2024-01-15", "2024-02-20", "2024-03-10", "2024-01-25", "2024-04-05"]),
        }
    )

    # Sample transactions
    transactions_df = pd.DataFrame(
        {
            "transaction_id": ["T001", "T002", "T003", "T004", "T005", "T006", "T007", "T008"],
            "customer_id": ["C001", "C001", "C002", "C003", "C004", "C005", "C001", "C003"],
            "product": ["Widget A", "Gadget B", "Widget A", "Book C", "Gadget B", "Widget A", "Book D", "Gadget C"],
            "category": [
                "Electronics",
                "Electronics",
                "Electronics",
                "Books",
                "Electronics",
                "Electronics",
                "Books",
                "Electronics",
            ],
            "amount": [150.00, 299.99, 150.00, 29.99, 299.99, 150.00, 19.99, 499.99],
            "date": pd.to_datetime(
                [
                    "2024-02-01",
                    "2024-03-15",
                    "2024-04-10",
                    "2024-02-20",
                    "2024-03-01",
                    "2024-04-15",
                    "2024-04-20",
                    "2024-05-01",
                ]
            ),
        }
    )

    # Save to CSV (index=False keeps the files free of the pandas row index)
    customers_file = data_dir / "customers.csv"
    transactions_file = data_dir / "transactions.csv"

    customers_df.to_csv(customers_file, index=False)
    transactions_df.to_csv(transactions_file, index=False)

    return str(customers_file), str(transactions_file)
91
+
92
+
93
def main():
    """Run the simple pandas transform example.

    Creates the sample CSV files, builds the pipeline from an inline YAML
    manifest, executes it and, when an ``artifact_store`` port is available,
    prints the final enriched artifact. Any failure is reported together with
    a traceback instead of propagating to the caller.
    """
    print("=" * 80)
    print("hexDAG ETL Example: Simple Pandas Transform")
    print("=" * 80)

    # Create sample data
    customers_file, transactions_file = create_sample_data()
    print("\n✓ Created sample data:")
    print(f" - Customers: {customers_file}")
    print(f" - Transactions: {transactions_file}")

    # Define pipeline YAML.
    # NOTE: this is a regular (non-raw) Python string, so the newline escape
    # in the display_results banner is written as ``\\n``; a real line break
    # inside a YAML double-quoted scalar would be folded into a space.
    pipeline_yaml = """
apiVersion: hexdag/v1
kind: Pipeline
metadata:
  name: simple-etl-demo
  description: Demonstrate pandas multi-operation transforms
spec:
  artifact_slots:
    - name: raw_customers
      type: dataframe

    - name: raw_transactions
      type: dataframe

    - name: enriched_customer_data
      type: dataframe

  ports:
    artifact_store:
      adapter: etl:local
      config:
        base_path: "/tmp/hexdag/artifacts"
        compress: true

  nodes:
    # Load customers CSV
    - kind: function_node
      metadata:
        name: load_customers
      spec:
        fn: pandas.read_csv
        input_schema:
          filepath_or_buffer: str
        output_schema:
          output: dataframe
        output_artifact:
          slot: raw_customers
          key: customers_v1

    # Load transactions CSV
    - kind: function_node
      metadata:
        name: load_transactions
      spec:
        fn: pandas.read_csv
        input_schema:
          filepath_or_buffer: str
        output_schema:
          output: dataframe
        kwargs:
          parse_dates: ["date"]
        output_artifact:
          slot: raw_transactions
          key: transactions_v1

    # Transform: Join and enrich data
    - kind: user:pandas_transform_node
      metadata:
        name: enrich_customer_data
      spec:
        input_artifacts:
          - slot: raw_customers
            key: customers_v1
          - slot: raw_transactions
            key: transactions_v1
        operations:
          # Join customers with transaction summary
          - type: transform
            method: pandas.merge
            args:
              - "{{input_artifacts[0]}}"
              # Literal block: quote characters here would become part of the
              # expression text, so the expression is left unquoted.
              - |
                {{input_artifacts[1]}}
                .groupby('customer_id')
                .agg({
                    'transaction_id': 'count',
                    'amount': ['sum', 'mean']
                })
                .reset_index()
            kwargs:
              on: customer_id
              how: left
              suffixes: ["", "_txn"]

          # Rename columns
          - type: map
            columns:
              transaction_id: transaction_count
              ('amount', 'sum'): total_spend
              ('amount', 'mean'): avg_spend

          # Fill missing values
          - type: transform
            method: pandas.DataFrame.fillna
            kwargs:
              value:
                transaction_count: 0
                total_spend: 0.0
                avg_spend: 0.0

        output_artifact:
          slot: enriched_customer_data
          key: enriched_v1
        dependencies: [load_customers, load_transactions]

    # Display results
    - kind: function_node
      metadata:
        name: display_results
      spec:
        fn: "builtins.print"
        args:
          - "\\n=== ETL Pipeline Complete ==="
          - "Customers loaded: {{output.load_customers.output.shape}}"
          - "Transactions loaded: {{output.load_transactions.output.shape}}"
          - "Enriched data: {{output.enrich_customer_data.records}} records"
        input_schema:
          output: dict
        dependencies: [enrich_customer_data]
"""

    print("\n✓ Building pipeline from YAML...")
    builder = YamlPipelineBuilder()
    graph, _config = builder.build_from_yaml_string(pipeline_yaml)

    # Public node mapping, as in the sibling 02_simple_pandas_only.py example,
    # rather than the private ``graph._graph`` attribute.
    print(f"✓ Pipeline built with {len(graph.nodes)} nodes")

    # Prepare inputs
    inputs = {
        "load_customers": {"filepath_or_buffer": customers_file},
        "load_transactions": {"filepath_or_buffer": transactions_file},
    }

    # Execute pipeline.
    # Orchestrator construction and run() call follow the same form as the
    # sibling 02_simple_pandas_only.py example so both examples use one API.
    print("\n✓ Executing pipeline...")
    orchestrator = Orchestrator(ports={"memory": InMemoryMemory()})

    async def _run_and_fetch():
        """Run the pipeline and read the final artifact on a single event loop."""
        await orchestrator.run(graph, initial_input=inputs)
        artifact_store = orchestrator.ports.get("artifact_store")
        if not artifact_store:
            return None
        return await artifact_store.read("enriched_customer_data", "enriched_v1")

    try:
        # One asyncio.run() so the adapter is not used across two event loops.
        enriched_data = asyncio.run(_run_and_fetch())

        print("\n" + "=" * 80)
        print("✓ Pipeline execution completed successfully!")
        print("=" * 80)

        # Show final artifact
        if enriched_data is not None:
            print("\n📊 Final Enriched Data:")
            print(enriched_data.head())
            print(f"\nShape: {enriched_data.shape}")
            print(f"Columns: {list(enriched_data.columns)}")

    except Exception as e:
        # Top-level boundary of an example script: report and show the traceback.
        print(f"\n❌ Pipeline execution failed: {e}")
        import traceback

        traceback.print_exc()


if __name__ == "__main__":
    main()
@@ -0,0 +1,149 @@
1
+ """
2
+ Simple Pandas Transform Example
3
+
4
+ This example demonstrates the PandasTransformNode without artifact storage
5
+ """
6
+
7
+ import asyncio
8
+ import sys
9
+ from pathlib import Path
10
+
11
+ import pandas as pd
12
+
13
+ # Add hexdag and plugin to path
14
+ sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent)) # hexdag root
15
+ sys.path.insert(0, str(Path(__file__).parent.parent)) # plugin root
16
+
17
+ # Import plugin to register components
18
+ from hexdag.builtin.adapters.memory.in_memory_memory import InMemoryMemory
19
+ from hexdag.core.orchestration.orchestrator import Orchestrator
20
+ from hexdag.core.pipeline_builder.yaml_builder import YamlPipelineBuilder
21
+
22
+ import hexdag_etl # noqa: F401
23
+
24
+
25
def create_sample_data():
    """Build and return the small in-memory customer DataFrame used by the demo."""
    columns = {
        "customer_id": ["C001", "C002", "C003", "C004", "C005"],
        "name": ["Alice", "Bob", "Carol", "David", "Emma"],
        "amount": [150.0, 299.99, 150.0, 29.99, 299.99],
        "category": ["A", "B", "A", "C", "B"],
    }
    # Column order of the frame follows the insertion order of the dict above.
    return pd.DataFrame(columns)
36
+
37
+
38
def main():
    """Run the example.

    Builds a three-node pipeline (create DataFrame -> pandas transform ->
    print) from an inline YAML manifest, executes it with an in-memory
    memory port and prints the transformed DataFrame. Any failure is reported
    together with a traceback instead of propagating to the caller.
    """
    print("=" * 80)
    print("hexDAG ETL Example: Simple Pandas Transform")
    print("=" * 80)

    # Create sample data
    df = create_sample_data()
    print("\n✓ Created sample DataFrame:")
    print(df)

    # Define pipeline YAML
    pipeline_yaml = """
apiVersion: hexdag/v1
kind: Pipeline
metadata:
  name: pandas-transform-demo
  description: Demonstrate pandas multi-operation transforms
spec:
  nodes:
    # Create DataFrame
    - kind: function_node
      metadata:
        name: create_data
      spec:
        fn: "pandas.DataFrame"
        input_schema:
          data: dict
        output_schema:
          output: dataframe
        dependencies: []

    # Transform with pandas operations
    - kind: user:pandas_transform_node
      metadata:
        name: transform_data
      spec:
        operations:
          # Sort by amount
          - type: transform
            method: pandas.DataFrame.sort_values
            kwargs:
              by: amount
              ascending: false

          # Add a new column
          - type: transform
            method: pandas.DataFrame.assign
            kwargs:
              amount_doubled: "{{ lambda df: df['amount'] * 2 }}"

          # Get top 3 rows
          - type: transform
            method: pandas.DataFrame.head
            args:
              - 3
        dependencies: [create_data]

    # Display results
    - kind: function_node
      metadata:
        name: display_results
      spec:
        fn: "builtins.print"
        input_schema:
          output: dict
        dependencies: [transform_data]
"""

    print("\n✓ Building pipeline from YAML...")
    builder = YamlPipelineBuilder()
    graph, _config = builder.build_from_yaml_string(pipeline_yaml)

    print(f"✓ Pipeline built with {len(graph.nodes)} nodes")

    # Prepare inputs.
    # Derived from the DataFrame printed above so the data shown and the data
    # fed to the pipeline cannot drift apart; orient="list" yields the same
    # {column: [values, ...]} mapping the example previously spelled out.
    inputs = {"create_data": {"data": df.to_dict(orient="list")}}

    # Execute pipeline
    print("\n✓ Executing pipeline...")
    orchestrator = Orchestrator(ports={"memory": InMemoryMemory()})

    try:
        result = asyncio.run(orchestrator.run(graph, initial_input=inputs))

        print("\n" + "=" * 80)
        print("✓ Pipeline execution completed successfully!")
        print("=" * 80)

        # Show transformed output
        transformed_df = result["transform_data"]["output"]
        print("\n📊 Transformed Data:")
        print(transformed_df)

    except Exception as e:
        # Top-level boundary of an example script: report and show the traceback.
        print(f"\n❌ Pipeline execution failed: {e}")
        import traceback

        traceback.print_exc()


if __name__ == "__main__":
    main()