hexdag 0.5.0.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hexdag/__init__.py +116 -0
- hexdag/__main__.py +30 -0
- hexdag/adapters/executors/__init__.py +5 -0
- hexdag/adapters/executors/local_executor.py +316 -0
- hexdag/builtin/__init__.py +6 -0
- hexdag/builtin/adapters/__init__.py +51 -0
- hexdag/builtin/adapters/anthropic/__init__.py +5 -0
- hexdag/builtin/adapters/anthropic/anthropic_adapter.py +151 -0
- hexdag/builtin/adapters/database/__init__.py +6 -0
- hexdag/builtin/adapters/database/csv/csv_adapter.py +249 -0
- hexdag/builtin/adapters/database/pgvector/__init__.py +5 -0
- hexdag/builtin/adapters/database/pgvector/pgvector_adapter.py +478 -0
- hexdag/builtin/adapters/database/sqlalchemy/sqlalchemy_adapter.py +252 -0
- hexdag/builtin/adapters/database/sqlite/__init__.py +5 -0
- hexdag/builtin/adapters/database/sqlite/sqlite_adapter.py +410 -0
- hexdag/builtin/adapters/local/README.md +59 -0
- hexdag/builtin/adapters/local/__init__.py +7 -0
- hexdag/builtin/adapters/local/local_observer_manager.py +696 -0
- hexdag/builtin/adapters/memory/__init__.py +47 -0
- hexdag/builtin/adapters/memory/file_memory_adapter.py +297 -0
- hexdag/builtin/adapters/memory/in_memory_memory.py +216 -0
- hexdag/builtin/adapters/memory/schemas.py +57 -0
- hexdag/builtin/adapters/memory/session_memory.py +178 -0
- hexdag/builtin/adapters/memory/sqlite_memory_adapter.py +215 -0
- hexdag/builtin/adapters/memory/state_memory.py +280 -0
- hexdag/builtin/adapters/mock/README.md +89 -0
- hexdag/builtin/adapters/mock/__init__.py +15 -0
- hexdag/builtin/adapters/mock/hexdag.toml +50 -0
- hexdag/builtin/adapters/mock/mock_database.py +225 -0
- hexdag/builtin/adapters/mock/mock_embedding.py +223 -0
- hexdag/builtin/adapters/mock/mock_llm.py +177 -0
- hexdag/builtin/adapters/mock/mock_tool_adapter.py +192 -0
- hexdag/builtin/adapters/mock/mock_tool_router.py +232 -0
- hexdag/builtin/adapters/openai/__init__.py +5 -0
- hexdag/builtin/adapters/openai/openai_adapter.py +634 -0
- hexdag/builtin/adapters/secret/__init__.py +7 -0
- hexdag/builtin/adapters/secret/local_secret_adapter.py +248 -0
- hexdag/builtin/adapters/unified_tool_router.py +280 -0
- hexdag/builtin/macros/__init__.py +17 -0
- hexdag/builtin/macros/conversation_agent.py +390 -0
- hexdag/builtin/macros/llm_macro.py +151 -0
- hexdag/builtin/macros/reasoning_agent.py +423 -0
- hexdag/builtin/macros/tool_macro.py +380 -0
- hexdag/builtin/nodes/__init__.py +38 -0
- hexdag/builtin/nodes/_discovery.py +123 -0
- hexdag/builtin/nodes/agent_node.py +696 -0
- hexdag/builtin/nodes/base_node_factory.py +242 -0
- hexdag/builtin/nodes/composite_node.py +926 -0
- hexdag/builtin/nodes/data_node.py +201 -0
- hexdag/builtin/nodes/expression_node.py +487 -0
- hexdag/builtin/nodes/function_node.py +454 -0
- hexdag/builtin/nodes/llm_node.py +491 -0
- hexdag/builtin/nodes/loop_node.py +920 -0
- hexdag/builtin/nodes/mapped_input.py +518 -0
- hexdag/builtin/nodes/port_call_node.py +269 -0
- hexdag/builtin/nodes/tool_call_node.py +195 -0
- hexdag/builtin/nodes/tool_utils.py +390 -0
- hexdag/builtin/prompts/__init__.py +68 -0
- hexdag/builtin/prompts/base.py +422 -0
- hexdag/builtin/prompts/chat_prompts.py +303 -0
- hexdag/builtin/prompts/error_correction_prompts.py +320 -0
- hexdag/builtin/prompts/tool_prompts.py +160 -0
- hexdag/builtin/tools/builtin_tools.py +84 -0
- hexdag/builtin/tools/database_tools.py +164 -0
- hexdag/cli/__init__.py +17 -0
- hexdag/cli/__main__.py +7 -0
- hexdag/cli/commands/__init__.py +27 -0
- hexdag/cli/commands/build_cmd.py +812 -0
- hexdag/cli/commands/create_cmd.py +208 -0
- hexdag/cli/commands/docs_cmd.py +293 -0
- hexdag/cli/commands/generate_types_cmd.py +252 -0
- hexdag/cli/commands/init_cmd.py +188 -0
- hexdag/cli/commands/pipeline_cmd.py +494 -0
- hexdag/cli/commands/plugin_dev_cmd.py +529 -0
- hexdag/cli/commands/plugins_cmd.py +441 -0
- hexdag/cli/commands/studio_cmd.py +101 -0
- hexdag/cli/commands/validate_cmd.py +221 -0
- hexdag/cli/main.py +84 -0
- hexdag/core/__init__.py +83 -0
- hexdag/core/config/__init__.py +20 -0
- hexdag/core/config/loader.py +479 -0
- hexdag/core/config/models.py +150 -0
- hexdag/core/configurable.py +294 -0
- hexdag/core/context/__init__.py +37 -0
- hexdag/core/context/execution_context.py +378 -0
- hexdag/core/docs/__init__.py +26 -0
- hexdag/core/docs/extractors.py +678 -0
- hexdag/core/docs/generators.py +890 -0
- hexdag/core/docs/models.py +120 -0
- hexdag/core/domain/__init__.py +10 -0
- hexdag/core/domain/dag.py +1225 -0
- hexdag/core/exceptions.py +234 -0
- hexdag/core/expression_parser.py +569 -0
- hexdag/core/logging.py +449 -0
- hexdag/core/models/__init__.py +17 -0
- hexdag/core/models/base.py +138 -0
- hexdag/core/orchestration/__init__.py +46 -0
- hexdag/core/orchestration/body_executor.py +481 -0
- hexdag/core/orchestration/components/__init__.py +97 -0
- hexdag/core/orchestration/components/adapter_lifecycle_manager.py +113 -0
- hexdag/core/orchestration/components/checkpoint_manager.py +134 -0
- hexdag/core/orchestration/components/execution_coordinator.py +360 -0
- hexdag/core/orchestration/components/health_check_manager.py +176 -0
- hexdag/core/orchestration/components/input_mapper.py +143 -0
- hexdag/core/orchestration/components/lifecycle_manager.py +583 -0
- hexdag/core/orchestration/components/node_executor.py +377 -0
- hexdag/core/orchestration/components/secret_manager.py +202 -0
- hexdag/core/orchestration/components/wave_executor.py +158 -0
- hexdag/core/orchestration/constants.py +17 -0
- hexdag/core/orchestration/events/README.md +312 -0
- hexdag/core/orchestration/events/__init__.py +104 -0
- hexdag/core/orchestration/events/batching.py +330 -0
- hexdag/core/orchestration/events/decorators.py +139 -0
- hexdag/core/orchestration/events/events.py +573 -0
- hexdag/core/orchestration/events/observers/__init__.py +30 -0
- hexdag/core/orchestration/events/observers/core_observers.py +690 -0
- hexdag/core/orchestration/events/observers/models.py +111 -0
- hexdag/core/orchestration/events/taxonomy.py +269 -0
- hexdag/core/orchestration/hook_context.py +237 -0
- hexdag/core/orchestration/hooks.py +437 -0
- hexdag/core/orchestration/models.py +418 -0
- hexdag/core/orchestration/orchestrator.py +910 -0
- hexdag/core/orchestration/orchestrator_factory.py +275 -0
- hexdag/core/orchestration/port_wrappers.py +327 -0
- hexdag/core/orchestration/prompt/__init__.py +32 -0
- hexdag/core/orchestration/prompt/template.py +332 -0
- hexdag/core/pipeline_builder/__init__.py +21 -0
- hexdag/core/pipeline_builder/component_instantiator.py +386 -0
- hexdag/core/pipeline_builder/include_tag.py +265 -0
- hexdag/core/pipeline_builder/pipeline_config.py +133 -0
- hexdag/core/pipeline_builder/py_tag.py +223 -0
- hexdag/core/pipeline_builder/tag_discovery.py +268 -0
- hexdag/core/pipeline_builder/yaml_builder.py +1196 -0
- hexdag/core/pipeline_builder/yaml_validator.py +569 -0
- hexdag/core/ports/__init__.py +65 -0
- hexdag/core/ports/api_call.py +133 -0
- hexdag/core/ports/database.py +489 -0
- hexdag/core/ports/embedding.py +215 -0
- hexdag/core/ports/executor.py +237 -0
- hexdag/core/ports/file_storage.py +117 -0
- hexdag/core/ports/healthcheck.py +87 -0
- hexdag/core/ports/llm.py +551 -0
- hexdag/core/ports/memory.py +70 -0
- hexdag/core/ports/observer_manager.py +130 -0
- hexdag/core/ports/secret.py +145 -0
- hexdag/core/ports/tool_router.py +94 -0
- hexdag/core/ports_builder.py +623 -0
- hexdag/core/protocols.py +273 -0
- hexdag/core/resolver.py +304 -0
- hexdag/core/schema/__init__.py +9 -0
- hexdag/core/schema/generator.py +742 -0
- hexdag/core/secrets.py +242 -0
- hexdag/core/types.py +413 -0
- hexdag/core/utils/async_warnings.py +206 -0
- hexdag/core/utils/schema_conversion.py +78 -0
- hexdag/core/utils/sql_validation.py +86 -0
- hexdag/core/validation/secure_json.py +148 -0
- hexdag/core/yaml_macro.py +517 -0
- hexdag/mcp_server.py +3120 -0
- hexdag/studio/__init__.py +10 -0
- hexdag/studio/build_ui.py +92 -0
- hexdag/studio/server/__init__.py +1 -0
- hexdag/studio/server/main.py +100 -0
- hexdag/studio/server/routes/__init__.py +9 -0
- hexdag/studio/server/routes/execute.py +208 -0
- hexdag/studio/server/routes/export.py +558 -0
- hexdag/studio/server/routes/files.py +207 -0
- hexdag/studio/server/routes/plugins.py +419 -0
- hexdag/studio/server/routes/validate.py +220 -0
- hexdag/studio/ui/index.html +13 -0
- hexdag/studio/ui/package-lock.json +2992 -0
- hexdag/studio/ui/package.json +31 -0
- hexdag/studio/ui/postcss.config.js +6 -0
- hexdag/studio/ui/public/hexdag.svg +5 -0
- hexdag/studio/ui/src/App.tsx +251 -0
- hexdag/studio/ui/src/components/Canvas.tsx +408 -0
- hexdag/studio/ui/src/components/ContextMenu.tsx +187 -0
- hexdag/studio/ui/src/components/FileBrowser.tsx +123 -0
- hexdag/studio/ui/src/components/Header.tsx +181 -0
- hexdag/studio/ui/src/components/HexdagNode.tsx +193 -0
- hexdag/studio/ui/src/components/NodeInspector.tsx +512 -0
- hexdag/studio/ui/src/components/NodePalette.tsx +262 -0
- hexdag/studio/ui/src/components/NodePortsSection.tsx +403 -0
- hexdag/studio/ui/src/components/PluginManager.tsx +347 -0
- hexdag/studio/ui/src/components/PortsEditor.tsx +481 -0
- hexdag/studio/ui/src/components/PythonEditor.tsx +195 -0
- hexdag/studio/ui/src/components/ValidationPanel.tsx +105 -0
- hexdag/studio/ui/src/components/YamlEditor.tsx +196 -0
- hexdag/studio/ui/src/components/index.ts +8 -0
- hexdag/studio/ui/src/index.css +92 -0
- hexdag/studio/ui/src/main.tsx +10 -0
- hexdag/studio/ui/src/types/index.ts +123 -0
- hexdag/studio/ui/src/vite-env.d.ts +1 -0
- hexdag/studio/ui/tailwind.config.js +29 -0
- hexdag/studio/ui/tsconfig.json +37 -0
- hexdag/studio/ui/tsconfig.node.json +13 -0
- hexdag/studio/ui/vite.config.ts +35 -0
- hexdag/visualization/__init__.py +69 -0
- hexdag/visualization/dag_visualizer.py +1020 -0
- hexdag-0.5.0.dev1.dist-info/METADATA +369 -0
- hexdag-0.5.0.dev1.dist-info/RECORD +261 -0
- hexdag-0.5.0.dev1.dist-info/WHEEL +4 -0
- hexdag-0.5.0.dev1.dist-info/entry_points.txt +4 -0
- hexdag-0.5.0.dev1.dist-info/licenses/LICENSE +190 -0
- hexdag_plugins/.gitignore +43 -0
- hexdag_plugins/README.md +73 -0
- hexdag_plugins/__init__.py +1 -0
- hexdag_plugins/azure/LICENSE +21 -0
- hexdag_plugins/azure/README.md +414 -0
- hexdag_plugins/azure/__init__.py +21 -0
- hexdag_plugins/azure/azure_blob_adapter.py +450 -0
- hexdag_plugins/azure/azure_cosmos_adapter.py +383 -0
- hexdag_plugins/azure/azure_keyvault_adapter.py +314 -0
- hexdag_plugins/azure/azure_openai_adapter.py +415 -0
- hexdag_plugins/azure/pyproject.toml +107 -0
- hexdag_plugins/azure/tests/__init__.py +1 -0
- hexdag_plugins/azure/tests/test_azure_blob_adapter.py +350 -0
- hexdag_plugins/azure/tests/test_azure_cosmos_adapter.py +323 -0
- hexdag_plugins/azure/tests/test_azure_keyvault_adapter.py +330 -0
- hexdag_plugins/azure/tests/test_azure_openai_adapter.py +329 -0
- hexdag_plugins/hexdag_etl/README.md +168 -0
- hexdag_plugins/hexdag_etl/__init__.py +53 -0
- hexdag_plugins/hexdag_etl/examples/01_simple_pandas_transform.py +270 -0
- hexdag_plugins/hexdag_etl/examples/02_simple_pandas_only.py +149 -0
- hexdag_plugins/hexdag_etl/examples/03_file_io_pipeline.py +109 -0
- hexdag_plugins/hexdag_etl/examples/test_pandas_transform.py +84 -0
- hexdag_plugins/hexdag_etl/hexdag.toml +25 -0
- hexdag_plugins/hexdag_etl/hexdag_etl/__init__.py +48 -0
- hexdag_plugins/hexdag_etl/hexdag_etl/nodes/__init__.py +13 -0
- hexdag_plugins/hexdag_etl/hexdag_etl/nodes/api_extract.py +230 -0
- hexdag_plugins/hexdag_etl/hexdag_etl/nodes/base_node_factory.py +181 -0
- hexdag_plugins/hexdag_etl/hexdag_etl/nodes/file_io.py +415 -0
- hexdag_plugins/hexdag_etl/hexdag_etl/nodes/outlook.py +492 -0
- hexdag_plugins/hexdag_etl/hexdag_etl/nodes/pandas_transform.py +563 -0
- hexdag_plugins/hexdag_etl/hexdag_etl/nodes/sql_extract_load.py +112 -0
- hexdag_plugins/hexdag_etl/pyproject.toml +82 -0
- hexdag_plugins/hexdag_etl/test_transform.py +54 -0
- hexdag_plugins/hexdag_etl/tests/test_plugin_integration.py +62 -0
- hexdag_plugins/mysql_adapter/LICENSE +21 -0
- hexdag_plugins/mysql_adapter/README.md +224 -0
- hexdag_plugins/mysql_adapter/__init__.py +6 -0
- hexdag_plugins/mysql_adapter/mysql_adapter.py +408 -0
- hexdag_plugins/mysql_adapter/pyproject.toml +93 -0
- hexdag_plugins/mysql_adapter/tests/test_mysql_adapter.py +259 -0
- hexdag_plugins/storage/README.md +184 -0
- hexdag_plugins/storage/__init__.py +19 -0
- hexdag_plugins/storage/file/__init__.py +5 -0
- hexdag_plugins/storage/file/local.py +325 -0
- hexdag_plugins/storage/ports/__init__.py +5 -0
- hexdag_plugins/storage/ports/vector_store.py +236 -0
- hexdag_plugins/storage/sql/__init__.py +7 -0
- hexdag_plugins/storage/sql/base.py +187 -0
- hexdag_plugins/storage/sql/mysql.py +27 -0
- hexdag_plugins/storage/sql/postgresql.py +27 -0
- hexdag_plugins/storage/tests/__init__.py +1 -0
- hexdag_plugins/storage/tests/test_local_file_storage.py +161 -0
- hexdag_plugins/storage/tests/test_sql_adapters.py +212 -0
- hexdag_plugins/storage/vector/__init__.py +7 -0
- hexdag_plugins/storage/vector/chromadb.py +223 -0
- hexdag_plugins/storage/vector/in_memory.py +285 -0
- hexdag_plugins/storage/vector/pgvector.py +502 -0
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
# hexDAG ETL Plugin
|
|
2
|
+
|
|
3
|
+
ETL (Extract, Transform, Load) infrastructure for hexDAG pipelines.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- **Artifact Storage**: Named storage slots for intermediate data between pipeline nodes
|
|
8
|
+
- **Pandas Transform**: Multi-operation DataFrame transformations with chaining support
|
|
9
|
+
- **API Extract**: REST API extraction with pagination, authentication, and rate limiting
|
|
10
|
+
- **SQL Operations**: Database extraction and loading (placeholder implementations)
|
|
11
|
+
|
|
12
|
+
## Installation
|
|
13
|
+
|
|
14
|
+
```bash
|
|
15
|
+
cd hexdag_plugins/hexdag_etl
|
|
16
|
+
pip install -e .
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
## Quick Start
|
|
20
|
+
|
|
21
|
+
### 1. Artifact Storage
|
|
22
|
+
|
|
23
|
+
Store intermediate data between pipeline nodes:
|
|
24
|
+
|
|
25
|
+
```python
|
|
26
|
+
from hexdag.core.registry import registry
|
|
27
|
+
|
|
28
|
+
# Get artifact store adapter
|
|
29
|
+
artifact_store = registry.get("local", namespace="etl")(
|
|
30
|
+
base_path="/tmp/etl_artifacts"
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
# Write artifact
|
|
34
|
+
await artifact_store.write(
|
|
35
|
+
name="raw_customers",
|
|
36
|
+
key="customers_2024_01_15",
|
|
37
|
+
data=df
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
# Read artifact
|
|
41
|
+
df = await artifact_store.read(
|
|
42
|
+
name="raw_customers",
|
|
43
|
+
key="customers_2024_01_15"
|
|
44
|
+
)
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
### 2. Pandas Transform
|
|
48
|
+
|
|
49
|
+
Chain multiple pandas operations:
|
|
50
|
+
|
|
51
|
+
```yaml
|
|
52
|
+
- kind: etl:pandas_transform_node
|
|
53
|
+
metadata:
|
|
54
|
+
name: enrich_data
|
|
55
|
+
spec:
|
|
56
|
+
input_artifacts:
|
|
57
|
+
- slot: raw_customers
|
|
58
|
+
key: customers_v1
|
|
59
|
+
- slot: raw_transactions
|
|
60
|
+
key: transactions_v1
|
|
61
|
+
operations:
|
|
62
|
+
# Join DataFrames
|
|
63
|
+
- type: transform
|
|
64
|
+
method: pandas.merge
|
|
65
|
+
args:
|
|
66
|
+
- "{{input_artifacts[0]}}"
|
|
67
|
+
- "{{input_artifacts[1]}}"
|
|
68
|
+
kwargs:
|
|
69
|
+
on: customer_id
|
|
70
|
+
how: left
|
|
71
|
+
|
|
72
|
+
# Rename columns
|
|
73
|
+
- type: map
|
|
74
|
+
columns:
|
|
75
|
+
transaction_id: txn_id
|
|
76
|
+
amount: total_amount
|
|
77
|
+
|
|
78
|
+
# Add calculated column
|
|
79
|
+
- type: transform
|
|
80
|
+
method: pandas.DataFrame.assign
|
|
81
|
+
kwargs:
|
|
82
|
+
tier: |
|
|
83
|
+
lambda df: pd.cut(df['total_amount'], bins=[0, 100, 500, float('inf')])
|
|
84
|
+
|
|
85
|
+
output_artifact:
|
|
86
|
+
slot: enriched_data
|
|
87
|
+
key: enriched_v1
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
### 3. API Extract
|
|
91
|
+
|
|
92
|
+
Extract data from REST APIs:
|
|
93
|
+
|
|
94
|
+
```yaml
|
|
95
|
+
- kind: etl:api_extract
|
|
96
|
+
metadata:
|
|
97
|
+
name: fetch_customers
|
|
98
|
+
spec:
|
|
99
|
+
endpoint: https://api.example.com/v1/customers
|
|
100
|
+
method: GET
|
|
101
|
+
params:
|
|
102
|
+
limit: 100
|
|
103
|
+
status: active
|
|
104
|
+
pagination:
|
|
105
|
+
type: cursor
|
|
106
|
+
cursor_param: after
|
|
107
|
+
cursor_path: meta.next_cursor
|
|
108
|
+
auth:
|
|
109
|
+
type: bearer
|
|
110
|
+
token: ${API_TOKEN}
|
|
111
|
+
output_artifact:
|
|
112
|
+
slot: raw_customers
|
|
113
|
+
key: customers_api
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
## Architecture
|
|
117
|
+
|
|
118
|
+
```
|
|
119
|
+
hexdag-etl/
|
|
120
|
+
├── hexdag_etl/
|
|
121
|
+
│ ├── adapters/
|
|
122
|
+
│ │ └── artifact.py # LocalArtifactAdapter
|
|
123
|
+
│ ├── nodes/
|
|
124
|
+
│ │ ├── pandas_transform.py # PandasTransformNode
|
|
125
|
+
│ │ ├── api_extract.py # APIExtractNode
|
|
126
|
+
│ │ └── sql_extract_load.py # SQLExtractNode, SQLLoadNode
|
|
127
|
+
│ └── ports/
|
|
128
|
+
│ └── artifact_storage.py # ArtifactStorePort
|
|
129
|
+
├── examples/
|
|
130
|
+
├── tests/
|
|
131
|
+
└── pyproject.toml
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
## Components
|
|
135
|
+
|
|
136
|
+
### Adapters
|
|
137
|
+
|
|
138
|
+
- **LocalArtifactAdapter**: File-based artifact storage with compression and metadata
|
|
139
|
+
|
|
140
|
+
### Nodes
|
|
141
|
+
|
|
142
|
+
- **PandasTransformNode**: Multi-operation DataFrame transformations
|
|
143
|
+
- **APIExtractNode**: REST API extraction with pagination
|
|
144
|
+
- **SQLExtractNode**: Database extraction (placeholder)
|
|
145
|
+
- **SQLLoadNode**: Database loading (placeholder)
|
|
146
|
+
|
|
147
|
+
### Ports
|
|
148
|
+
|
|
149
|
+
- **ArtifactStorePort**: Interface for artifact storage adapters
|
|
150
|
+
|
|
151
|
+
## Examples
|
|
152
|
+
|
|
153
|
+
Run the example pipeline:
|
|
154
|
+
|
|
155
|
+
```bash
|
|
156
|
+
cd examples
|
|
157
|
+
python 01_simple_pandas_transform.py
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
## Testing
|
|
161
|
+
|
|
162
|
+
```bash
|
|
163
|
+
pytest tests/
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
## License
|
|
167
|
+
|
|
168
|
+
MIT
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
"""hexdag-etl: ETL infrastructure for hexDAG pipelines.
|
|
2
|
+
|
|
3
|
+
Provides file I/O and multi-operation pandas transform nodes for data transformation pipelines.
|
|
4
|
+
|
|
5
|
+
This plugin extends hexDAG with ETL capabilities:
|
|
6
|
+
- FileReaderNode: Read CSV, Parquet, JSON, Excel files
|
|
7
|
+
- FileWriterNode: Write data to various file formats
|
|
8
|
+
- PandasTransformNode: Chain pandas operations
|
|
9
|
+
|
|
10
|
+
Example Pipeline:
|
|
11
|
+
- kind: etl:file_reader_node
|
|
12
|
+
metadata:
|
|
13
|
+
name: load_data
|
|
14
|
+
spec:
|
|
15
|
+
file_path: data/input.csv
|
|
16
|
+
format: csv
|
|
17
|
+
|
|
18
|
+
- kind: etl:pandas_transform_node
|
|
19
|
+
metadata:
|
|
20
|
+
name: transform
|
|
21
|
+
spec:
|
|
22
|
+
operations:
|
|
23
|
+
- type: filter
|
|
24
|
+
condition: "{{ df['value'] > 0 }}"
|
|
25
|
+
dependencies: [load_data]
|
|
26
|
+
|
|
27
|
+
- kind: etl:file_writer_node
|
|
28
|
+
metadata:
|
|
29
|
+
name: save_results
|
|
30
|
+
spec:
|
|
31
|
+
file_path: output/results.parquet
|
|
32
|
+
format: parquet
|
|
33
|
+
dependencies: [transform]
|
|
34
|
+
"""
|
|
35
|
+
|
|
36
|
+
# Re-export from the inner module (using relative import to inner hexdag_etl package)
|
|
37
|
+
from .hexdag_etl import (
|
|
38
|
+
FileReaderNode,
|
|
39
|
+
FileWriterNode,
|
|
40
|
+
OutlookReaderNode,
|
|
41
|
+
OutlookSenderNode,
|
|
42
|
+
PandasTransformNode,
|
|
43
|
+
__version__,
|
|
44
|
+
)
|
|
45
|
+
|
|
46
|
+
__all__ = [
|
|
47
|
+
"__version__",
|
|
48
|
+
"FileReaderNode",
|
|
49
|
+
"FileWriterNode",
|
|
50
|
+
"OutlookReaderNode",
|
|
51
|
+
"OutlookSenderNode",
|
|
52
|
+
"PandasTransformNode",
|
|
53
|
+
]
|
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Example: Simple Pandas Transform Pipeline
|
|
3
|
+
|
|
4
|
+
This example demonstrates:
|
|
5
|
+
1. Reading CSV files
|
|
6
|
+
2. Applying multi-operation pandas transforms
|
|
7
|
+
3. Using artifact storage between nodes
|
|
8
|
+
4. Simple data cleaning and aggregation
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import asyncio
|
|
12
|
+
import sys
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
|
|
15
|
+
import pandas as pd
|
|
16
|
+
|
|
17
|
+
# Add hexdag and plugin to path
|
|
18
|
+
sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent)) # hexdag root
|
|
19
|
+
sys.path.insert(0, str(Path(__file__).parent.parent)) # plugin root
|
|
20
|
+
|
|
21
|
+
# Import hexDAG core pieces first; the plugin import further below is what registers the ETL components
|
|
22
|
+
from hexdag.builtin.adapters.memory.in_memory_memory import InMemoryMemory
|
|
23
|
+
from hexdag.core.orchestration.orchestrator import Orchestrator
|
|
24
|
+
from hexdag.core.pipeline_builder.yaml_builder import YamlPipelineBuilder
|
|
25
|
+
|
|
26
|
+
import hexdag_etl # noqa: F401 - import to register plugin components
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def create_sample_data():
|
|
30
|
+
"""Create sample customer and transaction CSV files."""
|
|
31
|
+
data_dir = Path("/tmp/hexdag_demo")
|
|
32
|
+
data_dir.mkdir(exist_ok=True)
|
|
33
|
+
|
|
34
|
+
# Sample customers
|
|
35
|
+
customers_df = pd.DataFrame(
|
|
36
|
+
{
|
|
37
|
+
"customer_id": ["C001", "C002", "C003", "C004", "C005"],
|
|
38
|
+
"name": ["Alice", "Bob", "Carol", "David", "Emma"],
|
|
39
|
+
"email": [
|
|
40
|
+
"alice@example.com",
|
|
41
|
+
"bob@example.com",
|
|
42
|
+
"carol@example.com",
|
|
43
|
+
"david@example.com",
|
|
44
|
+
"emma@example.com",
|
|
45
|
+
],
|
|
46
|
+
"country": ["USA", "UK", "USA", "Canada", "Australia"],
|
|
47
|
+
"signup_date": pd.to_datetime(["2024-01-15", "2024-02-20", "2024-03-10", "2024-01-25", "2024-04-05"]),
|
|
48
|
+
}
|
|
49
|
+
)
|
|
50
|
+
|
|
51
|
+
# Sample transactions
|
|
52
|
+
transactions_df = pd.DataFrame(
|
|
53
|
+
{
|
|
54
|
+
"transaction_id": ["T001", "T002", "T003", "T004", "T005", "T006", "T007", "T008"],
|
|
55
|
+
"customer_id": ["C001", "C001", "C002", "C003", "C004", "C005", "C001", "C003"],
|
|
56
|
+
"product": ["Widget A", "Gadget B", "Widget A", "Book C", "Gadget B", "Widget A", "Book D", "Gadget C"],
|
|
57
|
+
"category": [
|
|
58
|
+
"Electronics",
|
|
59
|
+
"Electronics",
|
|
60
|
+
"Electronics",
|
|
61
|
+
"Books",
|
|
62
|
+
"Electronics",
|
|
63
|
+
"Electronics",
|
|
64
|
+
"Books",
|
|
65
|
+
"Electronics",
|
|
66
|
+
],
|
|
67
|
+
"amount": [150.00, 299.99, 150.00, 29.99, 299.99, 150.00, 19.99, 499.99],
|
|
68
|
+
"date": pd.to_datetime(
|
|
69
|
+
[
|
|
70
|
+
"2024-02-01",
|
|
71
|
+
"2024-03-15",
|
|
72
|
+
"2024-04-10",
|
|
73
|
+
"2024-02-20",
|
|
74
|
+
"2024-03-01",
|
|
75
|
+
"2024-04-15",
|
|
76
|
+
"2024-04-20",
|
|
77
|
+
"2024-05-01",
|
|
78
|
+
]
|
|
79
|
+
),
|
|
80
|
+
}
|
|
81
|
+
)
|
|
82
|
+
|
|
83
|
+
# Save to CSV
|
|
84
|
+
customers_file = data_dir / "customers.csv"
|
|
85
|
+
transactions_file = data_dir / "transactions.csv"
|
|
86
|
+
|
|
87
|
+
customers_df.to_csv(customers_file, index=False)
|
|
88
|
+
transactions_df.to_csv(transactions_file, index=False)
|
|
89
|
+
|
|
90
|
+
return str(customers_file), str(transactions_file)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def main():
|
|
94
|
+
"""Run the simple pandas transform example."""
|
|
95
|
+
print("=" * 80)
|
|
96
|
+
print("hexDAG ETL Example: Simple Pandas Transform")
|
|
97
|
+
print("=" * 80)
|
|
98
|
+
|
|
99
|
+
# Create sample data
|
|
100
|
+
customers_file, transactions_file = create_sample_data()
|
|
101
|
+
print("\n✓ Created sample data:")
|
|
102
|
+
print(f" - Customers: {customers_file}")
|
|
103
|
+
print(f" - Transactions: {transactions_file}")
|
|
104
|
+
|
|
105
|
+
# Define pipeline YAML
|
|
106
|
+
pipeline_yaml = """
|
|
107
|
+
apiVersion: hexdag/v1
|
|
108
|
+
kind: Pipeline
|
|
109
|
+
metadata:
|
|
110
|
+
name: simple-etl-demo
|
|
111
|
+
description: Demonstrate pandas multi-operation transforms
|
|
112
|
+
spec:
|
|
113
|
+
artifact_slots:
|
|
114
|
+
- name: raw_customers
|
|
115
|
+
type: dataframe
|
|
116
|
+
|
|
117
|
+
- name: raw_transactions
|
|
118
|
+
type: dataframe
|
|
119
|
+
|
|
120
|
+
- name: enriched_customer_data
|
|
121
|
+
type: dataframe
|
|
122
|
+
|
|
123
|
+
ports:
|
|
124
|
+
artifact_store:
|
|
125
|
+
adapter: etl:local
|
|
126
|
+
config:
|
|
127
|
+
base_path: "/tmp/hexdag/artifacts"
|
|
128
|
+
compress: true
|
|
129
|
+
|
|
130
|
+
nodes:
|
|
131
|
+
# Load customers CSV
|
|
132
|
+
- kind: function_node
|
|
133
|
+
metadata:
|
|
134
|
+
name: load_customers
|
|
135
|
+
spec:
|
|
136
|
+
fn: pandas.read_csv
|
|
137
|
+
input_schema:
|
|
138
|
+
filepath_or_buffer: str
|
|
139
|
+
output_schema:
|
|
140
|
+
output: dataframe
|
|
141
|
+
output_artifact:
|
|
142
|
+
slot: raw_customers
|
|
143
|
+
key: customers_v1
|
|
144
|
+
|
|
145
|
+
# Load transactions CSV
|
|
146
|
+
- kind: function_node
|
|
147
|
+
metadata:
|
|
148
|
+
name: load_transactions
|
|
149
|
+
spec:
|
|
150
|
+
fn: pandas.read_csv
|
|
151
|
+
input_schema:
|
|
152
|
+
filepath_or_buffer: str
|
|
153
|
+
output_schema:
|
|
154
|
+
output: dataframe
|
|
155
|
+
kwargs:
|
|
156
|
+
parse_dates: ["date"]
|
|
157
|
+
output_artifact:
|
|
158
|
+
slot: raw_transactions
|
|
159
|
+
key: transactions_v1
|
|
160
|
+
|
|
161
|
+
# Transform: Join and enrich data
|
|
162
|
+
- kind: user:pandas_transform_node
|
|
163
|
+
metadata:
|
|
164
|
+
name: enrich_customer_data
|
|
165
|
+
spec:
|
|
166
|
+
input_artifacts:
|
|
167
|
+
- slot: raw_customers
|
|
168
|
+
key: customers_v1
|
|
169
|
+
- slot: raw_transactions
|
|
170
|
+
key: transactions_v1
|
|
171
|
+
operations:
|
|
172
|
+
# Join customers with transaction summary
|
|
173
|
+
- type: transform
|
|
174
|
+
method: pandas.merge
|
|
175
|
+
args:
|
|
176
|
+
- "{{input_artifacts[0]}}"
|
|
177
|
+
- |
|
|
178
|
+
"{{input_artifacts[1]}}
|
|
179
|
+
.groupby('customer_id')"
|
|
180
|
+
.agg({
|
|
181
|
+
'transaction_id': 'count',
|
|
182
|
+
'amount': ['sum', 'mean']
|
|
183
|
+
})
|
|
184
|
+
.reset_index()
|
|
185
|
+
kwargs:
|
|
186
|
+
on: customer_id
|
|
187
|
+
how: left
|
|
188
|
+
suffixes: ["", "_txn"]
|
|
189
|
+
|
|
190
|
+
# Rename columns
|
|
191
|
+
- type: map
|
|
192
|
+
columns:
|
|
193
|
+
transaction_id: transaction_count
|
|
194
|
+
('amount', 'sum'): total_spend
|
|
195
|
+
('amount', 'mean'): avg_spend
|
|
196
|
+
|
|
197
|
+
# Fill missing values
|
|
198
|
+
- type: transform
|
|
199
|
+
method: pandas.DataFrame.fillna
|
|
200
|
+
kwargs:
|
|
201
|
+
value:
|
|
202
|
+
transaction_count: 0
|
|
203
|
+
total_spend: 0.0
|
|
204
|
+
avg_spend: 0.0
|
|
205
|
+
|
|
206
|
+
output_artifact:
|
|
207
|
+
slot: enriched_customer_data
|
|
208
|
+
key: enriched_v1
|
|
209
|
+
dependencies: [load_customers, load_transactions]
|
|
210
|
+
|
|
211
|
+
# Display results
|
|
212
|
+
- kind: function_node
|
|
213
|
+
metadata:
|
|
214
|
+
name: display_results
|
|
215
|
+
spec:
|
|
216
|
+
fn: "builtins.print"
|
|
217
|
+
args:
|
|
218
|
+
- "\n=== ETL Pipeline Complete ==="
|
|
219
|
+
- "Customers loaded: {{output.load_customers.output.shape}}"
|
|
220
|
+
- "Transactions loaded: {{output.load_transactions.output.shape}}"
|
|
221
|
+
- "Enriched data: {{output.enrich_customer_data.records}} records"
|
|
222
|
+
input_schema:
|
|
223
|
+
output: dict
|
|
224
|
+
dependencies: [enrich_customer_data]
|
|
225
|
+
"""
|
|
226
|
+
|
|
227
|
+
print("\n✓ Building pipeline from YAML...")
|
|
228
|
+
builder = YamlPipelineBuilder()
|
|
229
|
+
graph, config = builder.build_from_yaml_string(pipeline_yaml)
|
|
230
|
+
|
|
231
|
+
print(f"✓ Pipeline built with {len(graph._graph.nodes())} nodes")
|
|
232
|
+
|
|
233
|
+
# Prepare inputs
|
|
234
|
+
inputs = {
|
|
235
|
+
"load_customers": {"filepath_or_buffer": customers_file},
|
|
236
|
+
"load_transactions": {"filepath_or_buffer": transactions_file},
|
|
237
|
+
}
|
|
238
|
+
|
|
239
|
+
# Execute pipeline
|
|
240
|
+
print("\n✓ Executing pipeline...")
|
|
241
|
+
orchestrator = Orchestrator(
|
|
242
|
+
memory=InMemoryMemory(),
|
|
243
|
+
file_storage=None, # Using artifact storage instead
|
|
244
|
+
)
|
|
245
|
+
|
|
246
|
+
try:
|
|
247
|
+
asyncio.run(orchestrator.run_async(graph, inputs=inputs))
|
|
248
|
+
|
|
249
|
+
print("\n" + "=" * 80)
|
|
250
|
+
print("✓ Pipeline execution completed successfully!")
|
|
251
|
+
print("=" * 80)
|
|
252
|
+
|
|
253
|
+
# Show final artifact
|
|
254
|
+
artifact_store = orchestrator.ports.get("artifact_store")
|
|
255
|
+
if artifact_store:
|
|
256
|
+
enriched_data = asyncio.run(artifact_store.read("enriched_customer_data", "enriched_v1"))
|
|
257
|
+
print("\n📊 Final Enriched Data:")
|
|
258
|
+
print(enriched_data.head())
|
|
259
|
+
print(f"\nShape: {enriched_data.shape}")
|
|
260
|
+
print(f"Columns: {list(enriched_data.columns)}")
|
|
261
|
+
|
|
262
|
+
except Exception as e:
|
|
263
|
+
print(f"\n❌ Pipeline execution failed: {e}")
|
|
264
|
+
import traceback
|
|
265
|
+
|
|
266
|
+
traceback.print_exc()
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
if __name__ == "__main__":
|
|
270
|
+
main()
|
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Simple Pandas Transform Example
|
|
3
|
+
|
|
4
|
+
This example demonstrates the PandasTransformNode without artifact storage
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import asyncio
|
|
8
|
+
import sys
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
import pandas as pd
|
|
12
|
+
|
|
13
|
+
# Add hexdag and plugin to path
|
|
14
|
+
sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent)) # hexdag root
|
|
15
|
+
sys.path.insert(0, str(Path(__file__).parent.parent)) # plugin root
|
|
16
|
+
|
|
17
|
+
# Import hexDAG core pieces first; the plugin import further below is what registers the ETL components
|
|
18
|
+
from hexdag.builtin.adapters.memory.in_memory_memory import InMemoryMemory
|
|
19
|
+
from hexdag.core.orchestration.orchestrator import Orchestrator
|
|
20
|
+
from hexdag.core.pipeline_builder.yaml_builder import YamlPipelineBuilder
|
|
21
|
+
|
|
22
|
+
import hexdag_etl # noqa: F401
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def create_sample_data():
|
|
26
|
+
"""Create a simple DataFrame."""
|
|
27
|
+
df = pd.DataFrame(
|
|
28
|
+
{
|
|
29
|
+
"customer_id": ["C001", "C002", "C003", "C004", "C005"],
|
|
30
|
+
"name": ["Alice", "Bob", "Carol", "David", "Emma"],
|
|
31
|
+
"amount": [150.0, 299.99, 150.0, 29.99, 299.99],
|
|
32
|
+
"category": ["A", "B", "A", "C", "B"],
|
|
33
|
+
}
|
|
34
|
+
)
|
|
35
|
+
return df
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def main():
|
|
39
|
+
"""Run the example."""
|
|
40
|
+
print("=" * 80)
|
|
41
|
+
print("hexDAG ETL Example: Simple Pandas Transform")
|
|
42
|
+
print("=" * 80)
|
|
43
|
+
|
|
44
|
+
# Create sample data
|
|
45
|
+
df = create_sample_data()
|
|
46
|
+
print("\n✓ Created sample DataFrame:")
|
|
47
|
+
print(df)
|
|
48
|
+
|
|
49
|
+
# Define pipeline YAML
|
|
50
|
+
pipeline_yaml = """
|
|
51
|
+
apiVersion: hexdag/v1
|
|
52
|
+
kind: Pipeline
|
|
53
|
+
metadata:
|
|
54
|
+
name: pandas-transform-demo
|
|
55
|
+
description: Demonstrate pandas multi-operation transforms
|
|
56
|
+
spec:
|
|
57
|
+
nodes:
|
|
58
|
+
# Create DataFrame
|
|
59
|
+
- kind: function_node
|
|
60
|
+
metadata:
|
|
61
|
+
name: create_data
|
|
62
|
+
spec:
|
|
63
|
+
fn: "pandas.DataFrame"
|
|
64
|
+
input_schema:
|
|
65
|
+
data: dict
|
|
66
|
+
output_schema:
|
|
67
|
+
output: dataframe
|
|
68
|
+
dependencies: []
|
|
69
|
+
|
|
70
|
+
# Transform with pandas operations
|
|
71
|
+
- kind: user:pandas_transform_node
|
|
72
|
+
metadata:
|
|
73
|
+
name: transform_data
|
|
74
|
+
spec:
|
|
75
|
+
operations:
|
|
76
|
+
# Sort by amount
|
|
77
|
+
- type: transform
|
|
78
|
+
method: pandas.DataFrame.sort_values
|
|
79
|
+
kwargs:
|
|
80
|
+
by: amount
|
|
81
|
+
ascending: false
|
|
82
|
+
|
|
83
|
+
# Add a new column
|
|
84
|
+
- type: transform
|
|
85
|
+
method: pandas.DataFrame.assign
|
|
86
|
+
kwargs:
|
|
87
|
+
amount_doubled: "{{ lambda df: df['amount'] * 2 }}"
|
|
88
|
+
|
|
89
|
+
# Get top 3 rows
|
|
90
|
+
- type: transform
|
|
91
|
+
method: pandas.DataFrame.head
|
|
92
|
+
args:
|
|
93
|
+
- 3
|
|
94
|
+
dependencies: [create_data]
|
|
95
|
+
|
|
96
|
+
# Display results
|
|
97
|
+
- kind: function_node
|
|
98
|
+
metadata:
|
|
99
|
+
name: display_results
|
|
100
|
+
spec:
|
|
101
|
+
fn: "builtins.print"
|
|
102
|
+
input_schema:
|
|
103
|
+
output: dict
|
|
104
|
+
dependencies: [transform_data]
|
|
105
|
+
"""
|
|
106
|
+
|
|
107
|
+
print("\n✓ Building pipeline from YAML...")
|
|
108
|
+
builder = YamlPipelineBuilder()
|
|
109
|
+
graph, config = builder.build_from_yaml_string(pipeline_yaml)
|
|
110
|
+
|
|
111
|
+
print(f"✓ Pipeline built with {len(graph.nodes)} nodes")
|
|
112
|
+
|
|
113
|
+
# Prepare inputs
|
|
114
|
+
inputs = {
|
|
115
|
+
"create_data": {
|
|
116
|
+
"data": {
|
|
117
|
+
"customer_id": ["C001", "C002", "C003", "C004", "C005"],
|
|
118
|
+
"name": ["Alice", "Bob", "Carol", "David", "Emma"],
|
|
119
|
+
"amount": [150.0, 299.99, 150.0, 29.99, 299.99],
|
|
120
|
+
"category": ["A", "B", "A", "C", "B"],
|
|
121
|
+
}
|
|
122
|
+
}
|
|
123
|
+
}
|
|
124
|
+
|
|
125
|
+
# Execute pipeline
|
|
126
|
+
print("\n✓ Executing pipeline...")
|
|
127
|
+
orchestrator = Orchestrator(ports={"memory": InMemoryMemory()})
|
|
128
|
+
|
|
129
|
+
try:
|
|
130
|
+
result = asyncio.run(orchestrator.run(graph, initial_input=inputs))
|
|
131
|
+
|
|
132
|
+
print("\n" + "=" * 80)
|
|
133
|
+
print("✓ Pipeline execution completed successfully!")
|
|
134
|
+
print("=" * 80)
|
|
135
|
+
|
|
136
|
+
# Show transformed output
|
|
137
|
+
transformed_df = result["transform_data"]["output"]
|
|
138
|
+
print("\n📊 Transformed Data:")
|
|
139
|
+
print(transformed_df)
|
|
140
|
+
|
|
141
|
+
except Exception as e:
|
|
142
|
+
print(f"\n❌ Pipeline execution failed: {e}")
|
|
143
|
+
import traceback
|
|
144
|
+
|
|
145
|
+
traceback.print_exc()
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
if __name__ == "__main__":
|
|
149
|
+
main()
|