machinaos 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (288) hide show
  1. package/.env.template +71 -0
  2. package/LICENSE +21 -0
  3. package/README.md +87 -0
  4. package/bin/cli.js +159 -0
  5. package/client/.dockerignore +45 -0
  6. package/client/Dockerfile +68 -0
  7. package/client/eslint.config.js +29 -0
  8. package/client/index.html +13 -0
  9. package/client/nginx.conf +66 -0
  10. package/client/package.json +48 -0
  11. package/client/src/App.tsx +27 -0
  12. package/client/src/Dashboard.tsx +1173 -0
  13. package/client/src/ParameterPanel.tsx +301 -0
  14. package/client/src/components/AIAgentNode.tsx +321 -0
  15. package/client/src/components/APIKeyValidator.tsx +118 -0
  16. package/client/src/components/ClaudeChatModelNode.tsx +18 -0
  17. package/client/src/components/ConditionalEdge.tsx +189 -0
  18. package/client/src/components/CredentialsModal.tsx +306 -0
  19. package/client/src/components/EdgeConditionEditor.tsx +443 -0
  20. package/client/src/components/GeminiChatModelNode.tsx +18 -0
  21. package/client/src/components/GenericNode.tsx +357 -0
  22. package/client/src/components/LocationParameterPanel.tsx +154 -0
  23. package/client/src/components/ModelNode.tsx +286 -0
  24. package/client/src/components/OpenAIChatModelNode.tsx +18 -0
  25. package/client/src/components/OutputPanel.tsx +471 -0
  26. package/client/src/components/ParameterRenderer.tsx +1874 -0
  27. package/client/src/components/SkillEditorModal.tsx +417 -0
  28. package/client/src/components/SquareNode.tsx +797 -0
  29. package/client/src/components/StartNode.tsx +250 -0
  30. package/client/src/components/ToolkitNode.tsx +365 -0
  31. package/client/src/components/TriggerNode.tsx +463 -0
  32. package/client/src/components/auth/LoginPage.tsx +247 -0
  33. package/client/src/components/auth/ProtectedRoute.tsx +59 -0
  34. package/client/src/components/base/BaseChatModelNode.tsx +271 -0
  35. package/client/src/components/icons/AIProviderIcons.tsx +50 -0
  36. package/client/src/components/maps/GoogleMapsPicker.tsx +137 -0
  37. package/client/src/components/maps/MapsPreviewPanel.tsx +110 -0
  38. package/client/src/components/maps/index.ts +26 -0
  39. package/client/src/components/parameterPanel/InputSection.tsx +1094 -0
  40. package/client/src/components/parameterPanel/LocationPanelLayout.tsx +65 -0
  41. package/client/src/components/parameterPanel/MapsSection.tsx +92 -0
  42. package/client/src/components/parameterPanel/MiddleSection.tsx +571 -0
  43. package/client/src/components/parameterPanel/OutputSection.tsx +81 -0
  44. package/client/src/components/parameterPanel/ParameterPanelLayout.tsx +82 -0
  45. package/client/src/components/parameterPanel/ToolSchemaEditor.tsx +436 -0
  46. package/client/src/components/parameterPanel/index.ts +42 -0
  47. package/client/src/components/shared/DataPanel.tsx +142 -0
  48. package/client/src/components/shared/JSONTreeRenderer.tsx +106 -0
  49. package/client/src/components/ui/AIResultModal.tsx +204 -0
  50. package/client/src/components/ui/AndroidSettingsPanel.tsx +401 -0
  51. package/client/src/components/ui/CodeEditor.tsx +81 -0
  52. package/client/src/components/ui/CollapsibleSection.tsx +88 -0
  53. package/client/src/components/ui/ComponentItem.tsx +154 -0
  54. package/client/src/components/ui/ComponentPalette.tsx +321 -0
  55. package/client/src/components/ui/ConsolePanel.tsx +1074 -0
  56. package/client/src/components/ui/ErrorBoundary.tsx +196 -0
  57. package/client/src/components/ui/InputNodesPanel.tsx +204 -0
  58. package/client/src/components/ui/MapSelector.tsx +314 -0
  59. package/client/src/components/ui/Modal.tsx +149 -0
  60. package/client/src/components/ui/NodeContextMenu.tsx +192 -0
  61. package/client/src/components/ui/NodeOutputPanel.tsx +1150 -0
  62. package/client/src/components/ui/OutputDisplayPanel.tsx +381 -0
  63. package/client/src/components/ui/SettingsPanel.tsx +243 -0
  64. package/client/src/components/ui/TopToolbar.tsx +736 -0
  65. package/client/src/components/ui/WhatsAppSettingsPanel.tsx +345 -0
  66. package/client/src/components/ui/WorkflowSidebar.tsx +294 -0
  67. package/client/src/config/antdTheme.ts +186 -0
  68. package/client/src/config/api.ts +54 -0
  69. package/client/src/contexts/AuthContext.tsx +221 -0
  70. package/client/src/contexts/ThemeContext.tsx +42 -0
  71. package/client/src/contexts/WebSocketContext.tsx +1971 -0
  72. package/client/src/factories/baseChatModelFactory.ts +256 -0
  73. package/client/src/hooks/useAndroidOperations.ts +164 -0
  74. package/client/src/hooks/useApiKeyValidation.ts +107 -0
  75. package/client/src/hooks/useApiKeys.ts +238 -0
  76. package/client/src/hooks/useAppTheme.ts +17 -0
  77. package/client/src/hooks/useComponentPalette.ts +51 -0
  78. package/client/src/hooks/useCopyPaste.ts +155 -0
  79. package/client/src/hooks/useDragAndDrop.ts +124 -0
  80. package/client/src/hooks/useDragVariable.ts +88 -0
  81. package/client/src/hooks/useExecution.ts +313 -0
  82. package/client/src/hooks/useParameterPanel.ts +176 -0
  83. package/client/src/hooks/useReactFlowNodes.ts +189 -0
  84. package/client/src/hooks/useToolSchema.ts +209 -0
  85. package/client/src/hooks/useWhatsApp.ts +196 -0
  86. package/client/src/hooks/useWorkflowManagement.ts +46 -0
  87. package/client/src/index.css +315 -0
  88. package/client/src/main.tsx +19 -0
  89. package/client/src/nodeDefinitions/aiAgentNodes.ts +336 -0
  90. package/client/src/nodeDefinitions/aiModelNodes.ts +340 -0
  91. package/client/src/nodeDefinitions/androidDeviceNodes.ts +140 -0
  92. package/client/src/nodeDefinitions/androidServiceNodes.ts +383 -0
  93. package/client/src/nodeDefinitions/chatNodes.ts +135 -0
  94. package/client/src/nodeDefinitions/codeNodes.ts +54 -0
  95. package/client/src/nodeDefinitions/documentNodes.ts +379 -0
  96. package/client/src/nodeDefinitions/index.ts +15 -0
  97. package/client/src/nodeDefinitions/locationNodes.ts +463 -0
  98. package/client/src/nodeDefinitions/schedulerNodes.ts +220 -0
  99. package/client/src/nodeDefinitions/skillNodes.ts +211 -0
  100. package/client/src/nodeDefinitions/toolNodes.ts +198 -0
  101. package/client/src/nodeDefinitions/utilityNodes.ts +284 -0
  102. package/client/src/nodeDefinitions/whatsappNodes.ts +865 -0
  103. package/client/src/nodeDefinitions/workflowNodes.ts +41 -0
  104. package/client/src/nodeDefinitions.ts +104 -0
  105. package/client/src/schemas/workflowSchema.ts +264 -0
  106. package/client/src/services/dynamicParameterService.ts +96 -0
  107. package/client/src/services/execution/aiAgentExecutionService.ts +35 -0
  108. package/client/src/services/executionService.ts +232 -0
  109. package/client/src/services/workflowApi.ts +91 -0
  110. package/client/src/store/useAppStore.ts +582 -0
  111. package/client/src/styles/theme.ts +508 -0
  112. package/client/src/styles/zIndex.ts +17 -0
  113. package/client/src/types/ComponentTypes.ts +39 -0
  114. package/client/src/types/EdgeCondition.ts +231 -0
  115. package/client/src/types/INodeProperties.ts +288 -0
  116. package/client/src/types/NodeTypes.ts +28 -0
  117. package/client/src/utils/formatters.ts +33 -0
  118. package/client/src/utils/googleMapsLoader.ts +140 -0
  119. package/client/src/utils/locationUtils.ts +85 -0
  120. package/client/src/utils/nodeUtils.ts +31 -0
  121. package/client/src/utils/workflow.ts +30 -0
  122. package/client/src/utils/workflowExport.ts +120 -0
  123. package/client/src/vite-env.d.ts +12 -0
  124. package/client/tailwind.config.js +60 -0
  125. package/client/tsconfig.json +25 -0
  126. package/client/tsconfig.node.json +11 -0
  127. package/client/vite.config.js +35 -0
  128. package/docker-compose.prod.yml +107 -0
  129. package/docker-compose.yml +104 -0
  130. package/docs-MachinaOs/README.md +85 -0
  131. package/docs-MachinaOs/deployment/docker.mdx +228 -0
  132. package/docs-MachinaOs/deployment/production.mdx +345 -0
  133. package/docs-MachinaOs/docs.json +75 -0
  134. package/docs-MachinaOs/faq.mdx +309 -0
  135. package/docs-MachinaOs/favicon.svg +5 -0
  136. package/docs-MachinaOs/installation.mdx +160 -0
  137. package/docs-MachinaOs/introduction.mdx +114 -0
  138. package/docs-MachinaOs/logo/dark.svg +6 -0
  139. package/docs-MachinaOs/logo/light.svg +6 -0
  140. package/docs-MachinaOs/nodes/ai-agent.mdx +216 -0
  141. package/docs-MachinaOs/nodes/ai-models.mdx +240 -0
  142. package/docs-MachinaOs/nodes/android.mdx +411 -0
  143. package/docs-MachinaOs/nodes/overview.mdx +181 -0
  144. package/docs-MachinaOs/nodes/schedulers.mdx +316 -0
  145. package/docs-MachinaOs/nodes/webhooks.mdx +330 -0
  146. package/docs-MachinaOs/nodes/whatsapp.mdx +305 -0
  147. package/docs-MachinaOs/quickstart.mdx +119 -0
  148. package/docs-MachinaOs/tutorials/ai-agent-workflow.mdx +177 -0
  149. package/docs-MachinaOs/tutorials/android-automation.mdx +242 -0
  150. package/docs-MachinaOs/tutorials/first-workflow.mdx +134 -0
  151. package/docs-MachinaOs/tutorials/whatsapp-automation.mdx +185 -0
  152. package/nul +0 -0
  153. package/package.json +70 -0
  154. package/scripts/build.js +158 -0
  155. package/scripts/check-ports.ps1 +33 -0
  156. package/scripts/clean.js +40 -0
  157. package/scripts/docker.js +93 -0
  158. package/scripts/kill-port.ps1 +154 -0
  159. package/scripts/start.js +210 -0
  160. package/scripts/stop.js +325 -0
  161. package/server/.dockerignore +44 -0
  162. package/server/Dockerfile +45 -0
  163. package/server/constants.py +249 -0
  164. package/server/core/__init__.py +1 -0
  165. package/server/core/cache.py +461 -0
  166. package/server/core/config.py +128 -0
  167. package/server/core/container.py +99 -0
  168. package/server/core/database.py +1211 -0
  169. package/server/core/logging.py +314 -0
  170. package/server/main.py +289 -0
  171. package/server/middleware/__init__.py +5 -0
  172. package/server/middleware/auth.py +89 -0
  173. package/server/models/__init__.py +1 -0
  174. package/server/models/auth.py +52 -0
  175. package/server/models/cache.py +24 -0
  176. package/server/models/database.py +211 -0
  177. package/server/models/nodes.py +455 -0
  178. package/server/package.json +9 -0
  179. package/server/pyproject.toml +72 -0
  180. package/server/requirements.txt +83 -0
  181. package/server/routers/__init__.py +1 -0
  182. package/server/routers/android.py +294 -0
  183. package/server/routers/auth.py +203 -0
  184. package/server/routers/database.py +151 -0
  185. package/server/routers/maps.py +142 -0
  186. package/server/routers/nodejs_compat.py +289 -0
  187. package/server/routers/webhook.py +90 -0
  188. package/server/routers/websocket.py +2127 -0
  189. package/server/routers/whatsapp.py +761 -0
  190. package/server/routers/workflow.py +200 -0
  191. package/server/services/__init__.py +1 -0
  192. package/server/services/ai.py +2415 -0
  193. package/server/services/android/__init__.py +27 -0
  194. package/server/services/android/broadcaster.py +114 -0
  195. package/server/services/android/client.py +608 -0
  196. package/server/services/android/manager.py +78 -0
  197. package/server/services/android/protocol.py +165 -0
  198. package/server/services/android_service.py +588 -0
  199. package/server/services/auth.py +131 -0
  200. package/server/services/chat_client.py +160 -0
  201. package/server/services/deployment/__init__.py +12 -0
  202. package/server/services/deployment/manager.py +706 -0
  203. package/server/services/deployment/state.py +47 -0
  204. package/server/services/deployment/triggers.py +275 -0
  205. package/server/services/event_waiter.py +785 -0
  206. package/server/services/execution/__init__.py +77 -0
  207. package/server/services/execution/cache.py +769 -0
  208. package/server/services/execution/conditions.py +373 -0
  209. package/server/services/execution/dlq.py +132 -0
  210. package/server/services/execution/executor.py +1351 -0
  211. package/server/services/execution/models.py +531 -0
  212. package/server/services/execution/recovery.py +235 -0
  213. package/server/services/handlers/__init__.py +126 -0
  214. package/server/services/handlers/ai.py +355 -0
  215. package/server/services/handlers/android.py +260 -0
  216. package/server/services/handlers/code.py +278 -0
  217. package/server/services/handlers/document.py +598 -0
  218. package/server/services/handlers/http.py +193 -0
  219. package/server/services/handlers/polyglot.py +105 -0
  220. package/server/services/handlers/tools.py +845 -0
  221. package/server/services/handlers/triggers.py +107 -0
  222. package/server/services/handlers/utility.py +822 -0
  223. package/server/services/handlers/whatsapp.py +476 -0
  224. package/server/services/maps.py +289 -0
  225. package/server/services/memory_store.py +103 -0
  226. package/server/services/node_executor.py +375 -0
  227. package/server/services/parameter_resolver.py +218 -0
  228. package/server/services/polyglot_client.py +169 -0
  229. package/server/services/scheduler.py +155 -0
  230. package/server/services/skill_loader.py +417 -0
  231. package/server/services/status_broadcaster.py +826 -0
  232. package/server/services/temporal/__init__.py +23 -0
  233. package/server/services/temporal/activities.py +344 -0
  234. package/server/services/temporal/client.py +76 -0
  235. package/server/services/temporal/executor.py +147 -0
  236. package/server/services/temporal/worker.py +251 -0
  237. package/server/services/temporal/workflow.py +355 -0
  238. package/server/services/temporal/ws_client.py +236 -0
  239. package/server/services/text.py +111 -0
  240. package/server/services/user_auth.py +172 -0
  241. package/server/services/websocket_client.py +29 -0
  242. package/server/services/workflow.py +597 -0
  243. package/server/skills/android-skill/SKILL.md +82 -0
  244. package/server/skills/assistant-personality/SKILL.md +45 -0
  245. package/server/skills/code-skill/SKILL.md +140 -0
  246. package/server/skills/http-skill/SKILL.md +161 -0
  247. package/server/skills/maps-skill/SKILL.md +170 -0
  248. package/server/skills/memory-skill/SKILL.md +154 -0
  249. package/server/skills/scheduler-skill/SKILL.md +84 -0
  250. package/server/skills/whatsapp-skill/SKILL.md +283 -0
  251. package/server/uv.lock +2916 -0
  252. package/server/whatsapp-rpc/.dockerignore +30 -0
  253. package/server/whatsapp-rpc/Dockerfile +44 -0
  254. package/server/whatsapp-rpc/Dockerfile.web +17 -0
  255. package/server/whatsapp-rpc/README.md +139 -0
  256. package/server/whatsapp-rpc/cli.js +95 -0
  257. package/server/whatsapp-rpc/configs/config.yaml +7 -0
  258. package/server/whatsapp-rpc/docker-compose.yml +35 -0
  259. package/server/whatsapp-rpc/docs/API.md +410 -0
  260. package/server/whatsapp-rpc/go.mod +67 -0
  261. package/server/whatsapp-rpc/go.sum +203 -0
  262. package/server/whatsapp-rpc/package.json +30 -0
  263. package/server/whatsapp-rpc/schema.json +1294 -0
  264. package/server/whatsapp-rpc/scripts/clean.cjs +66 -0
  265. package/server/whatsapp-rpc/scripts/cli.js +162 -0
  266. package/server/whatsapp-rpc/src/go/cmd/server/main.go +91 -0
  267. package/server/whatsapp-rpc/src/go/config/config.go +49 -0
  268. package/server/whatsapp-rpc/src/go/rpc/rpc.go +446 -0
  269. package/server/whatsapp-rpc/src/go/rpc/server.go +112 -0
  270. package/server/whatsapp-rpc/src/go/whatsapp/history.go +166 -0
  271. package/server/whatsapp-rpc/src/go/whatsapp/messages.go +390 -0
  272. package/server/whatsapp-rpc/src/go/whatsapp/service.go +2130 -0
  273. package/server/whatsapp-rpc/src/go/whatsapp/types.go +261 -0
  274. package/server/whatsapp-rpc/src/python/pyproject.toml +15 -0
  275. package/server/whatsapp-rpc/src/python/whatsapp_rpc/__init__.py +4 -0
  276. package/server/whatsapp-rpc/src/python/whatsapp_rpc/client.py +427 -0
  277. package/server/whatsapp-rpc/web/app.py +609 -0
  278. package/server/whatsapp-rpc/web/requirements.txt +6 -0
  279. package/server/whatsapp-rpc/web/rpc_client.py +427 -0
  280. package/server/whatsapp-rpc/web/static/openapi.yaml +59 -0
  281. package/server/whatsapp-rpc/web/templates/base.html +150 -0
  282. package/server/whatsapp-rpc/web/templates/contacts.html +240 -0
  283. package/server/whatsapp-rpc/web/templates/dashboard.html +320 -0
  284. package/server/whatsapp-rpc/web/templates/groups.html +328 -0
  285. package/server/whatsapp-rpc/web/templates/messages.html +465 -0
  286. package/server/whatsapp-rpc/web/templates/messaging.html +681 -0
  287. package/server/whatsapp-rpc/web/templates/send.html +259 -0
  288. package/server/whatsapp-rpc/web/templates/settings.html +459 -0
@@ -0,0 +1,598 @@
1
+ """Document processing handlers for MachinaOs workflow nodes.
2
+
3
+ Each handler is an independent async function that can be executed by
4
+ the Temporal worker system for distributed processing.
5
+
6
+ Handlers:
7
+ - handle_http_scraper: Scrape links from web pages with pagination
8
+ - handle_file_downloader: Download files in parallel
9
+ - handle_document_parser: Parse documents (PyPDF, Marker, Unstructured, BeautifulSoup)
10
+ - handle_text_chunker: Split text into chunks (LangChain)
11
+ - handle_embedding_generator: Generate embeddings (HF, OpenAI, Ollama)
12
+ - handle_vector_store: Store/query vectors (ChromaDB, Qdrant, Pinecone)
13
+ """
14
+
15
+ import asyncio
16
+ import json
17
+ import time
18
+ from datetime import datetime, timedelta
19
+ from pathlib import Path
20
+ from typing import Dict, Any
21
+ from urllib.parse import urljoin, urlparse, unquote
22
+
23
+ from core.logging import get_logger
24
+
25
+ logger = get_logger(__name__)
26
+
27
+
28
+ # =============================================================================
29
+ # HTTP Scraper
30
+ # =============================================================================
31
+
32
async def handle_http_scraper(
    node_id: str,
    node_type: str,
    parameters: Dict[str, Any],
    context: Dict[str, Any]
) -> Dict[str, Any]:
    """Scrape links from web pages with date/page pagination support.

    Keys read from ``parameters``:
        url: Page URL (required). May contain the date placeholder or ``{page}``.
        iterationMode: ``'date'``, ``'page'``; any other value fetches ``url`` once.
        linkSelector: CSS selector for the link elements (default: PDF links).
        headers: Request headers, either a JSON string or an already-decoded dict.
        startDate / endDate: Inclusive ``YYYY-MM-DD`` bounds for date mode.
        datePlaceholder: Token in ``url`` replaced by each date (default ``{date}``).
        startPage / endPage: Inclusive page range for page mode (defaults 1..10).

    ``context`` is accepted for handler-signature compatibility and is not used.

    Returns:
        A result envelope. On success ``result`` holds ``items`` (one dict per
        matched link with ``url``, ``text``, ``source_url`` plus the iteration
        metadata), ``item_count`` and ``errors`` (per-URL fetch failures, which
        do not fail the node). On failure the envelope carries ``error`` instead.
    """
    start_time = time.time()

    try:
        # Imported inside the try so that a missing optional dependency is
        # reported through the failure envelope instead of escaping the handler.
        import httpx
        from bs4 import BeautifulSoup

        url = parameters.get('url', '')
        iteration_mode = parameters.get('iterationMode', 'single')
        link_selector = parameters.get('linkSelector', 'a[href$=".pdf"]')
        raw_headers = parameters.get('headers', '{}')

        if not url:
            raise ValueError("URL is required")

        # Headers may arrive as a JSON string or as an already-decoded mapping.
        if isinstance(raw_headers, dict):
            headers = raw_headers
        else:
            headers = json.loads(raw_headers) if raw_headers else {}

        items, errors = [], []
        urls_to_fetch = []

        if iteration_mode == 'date':
            start_date = parameters.get('startDate', '')
            end_date = parameters.get('endDate', '')
            # An empty placeholder would make str.replace insert the date
            # between every character of the URL, so fall back to the default.
            placeholder = parameters.get('datePlaceholder') or '{date}'
            if not start_date or not end_date:
                raise ValueError("Start/end dates required for date mode")
            current = datetime.strptime(start_date, "%Y-%m-%d")
            end = datetime.strptime(end_date, "%Y-%m-%d")
            while current <= end:
                urls_to_fetch.append((
                    url.replace(placeholder, current.strftime("%Y-%m-%d")),
                    {'date': current.isoformat()}
                ))
                current += timedelta(days=1)
        elif iteration_mode == 'page':
            start_page = int(parameters.get('startPage', 1))
            end_page = int(parameters.get('endPage', 10))
            for page in range(start_page, end_page + 1):
                urls_to_fetch.append((url.replace('{page}', str(page)), {'page': page}))
        else:
            urls_to_fetch.append((url, {}))

        logger.info("[httpScraper] Starting", node_id=node_id, urls=len(urls_to_fetch))

        async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
            for fetch_url, meta in urls_to_fetch:
                try:
                    response = await client.get(fetch_url, headers=headers)
                    response.raise_for_status()
                    soup = BeautifulSoup(response.text, 'html.parser')
                    for el in soup.select(link_selector):
                        href = el.get('href', '')
                        if href:
                            items.append({
                                # Resolve relative links against the page they came from.
                                'url': urljoin(fetch_url, href),
                                'text': el.get_text(strip=True),
                                'source_url': fetch_url,
                                **meta
                            })
                except Exception as e:
                    # Best effort: one bad page is recorded and the rest continue.
                    errors.append(f"{fetch_url}: {str(e)}")

        return {
            "success": True,
            "node_id": node_id,
            "node_type": node_type,
            "result": {"items": items, "item_count": len(items), "errors": errors},
            "execution_time": time.time() - start_time,
            "timestamp": datetime.now().isoformat()
        }
    except Exception as e:
        logger.error("[httpScraper] Failed", node_id=node_id, error=str(e))
        return {
            "success": False, "node_id": node_id, "node_type": node_type,
            "error": str(e), "execution_time": time.time() - start_time,
            "timestamp": datetime.now().isoformat()
        }
114
+
115
+
116
+ # =============================================================================
117
+ # File Downloader
118
+ # =============================================================================
119
+
120
async def handle_file_downloader(
    node_id: str,
    node_type: str,
    parameters: Dict[str, Any],
    context: Dict[str, Any]
) -> Dict[str, Any]:
    """Download files from URLs in parallel using semaphore for concurrency.

    Keys read from ``parameters``:
        items: URLs to fetch; each entry is a dict with a ``url`` key or a
            value that is converted with ``str``.
        outputDir: Destination directory (default ``./data/downloads``).
        maxWorkers: Maximum simultaneous downloads (default 8, minimum 1).
        skipExisting: Skip a URL whose target file already exists (default True).
        timeout: Per-request timeout in seconds (default 60).

    ``context`` is accepted for handler-signature compatibility and is not used.

    Returns:
        A result envelope. On success ``result`` holds the ``downloaded``,
        ``skipped`` and ``failed`` counts, ``files`` (the downloaded entries)
        and, when there was anything to download, ``output_dir``. Individual
        download failures are counted and do not fail the node.
    """
    start_time = time.time()

    try:
        items = parameters.get('items', [])
        output_dir = Path(parameters.get('outputDir', './data/downloads'))
        # Semaphore(0) would block every download forever and a negative value
        # raises, so always allow at least one worker.
        max_workers = max(1, int(parameters.get('maxWorkers', 8)))
        skip_existing = parameters.get('skipExisting', True)
        timeout = float(parameters.get('timeout', 60))

        if not items:
            return {
                "success": True, "node_id": node_id, "node_type": node_type,
                "result": {"downloaded": 0, "skipped": 0, "failed": 0, "files": []},
                "execution_time": time.time() - start_time,
                "timestamp": datetime.now().isoformat()
            }

        # Imported inside the try so that a missing dependency is reported
        # through the failure envelope instead of escaping the handler.
        import httpx

        output_dir.mkdir(parents=True, exist_ok=True)
        downloaded, skipped, failed = [], [], []
        semaphore = asyncio.Semaphore(max_workers)

        def safe_filename(url):
            # Percent-decode BEFORE taking the last path segment: decoding
            # afterwards lets "..%2F..%2Fname" turn into "../../name" and
            # write outside output_dir.
            decoded = unquote(urlparse(url).path).replace('\\', '/')
            name = decoded.rstrip('/').rsplit('/', 1)[-1]
            return name if name not in ('', '.', '..') else 'download'

        async def download_file(client, item):
            async with semaphore:
                url = item.get('url', '') if isinstance(item, dict) else str(item)
                if not url:
                    return {'status': 'failed', 'error': 'Empty URL'}
                filename = safe_filename(url)
                file_path = output_dir / filename
                if skip_existing and file_path.exists():
                    return {'status': 'skipped', 'path': str(file_path), 'url': url}
                try:
                    response = await client.get(url)
                    response.raise_for_status()
                    # Write in a worker thread so disk I/O does not stall the event loop.
                    await asyncio.to_thread(file_path.write_bytes, response.content)
                    return {'status': 'downloaded', 'path': str(file_path), 'url': url,
                            'size': len(response.content), 'filename': filename}
                except Exception as e:
                    return {'status': 'failed', 'url': url, 'error': str(e)}

        logger.info("[fileDownloader] Starting", node_id=node_id, items=len(items))
        # One shared client reuses connections across all downloads.
        async with httpx.AsyncClient(timeout=timeout, follow_redirects=True) as client:
            results = await asyncio.gather(
                *(download_file(client, i) for i in items), return_exceptions=True
            )

        for r in results:
            if isinstance(r, Exception):
                failed.append({'error': str(r)})
            elif r.get('status') == 'downloaded':
                downloaded.append(r)
            elif r.get('status') == 'skipped':
                skipped.append(r)
            else:
                failed.append(r)

        return {
            "success": True, "node_id": node_id, "node_type": node_type,
            "result": {"downloaded": len(downloaded), "skipped": len(skipped),
                       "failed": len(failed), "files": downloaded, "output_dir": str(output_dir)},
            "execution_time": time.time() - start_time,
            "timestamp": datetime.now().isoformat()
        }
    except Exception as e:
        logger.error("[fileDownloader] Failed", node_id=node_id, error=str(e))
        return {
            "success": False, "node_id": node_id, "node_type": node_type,
            "error": str(e), "execution_time": time.time() - start_time,
            "timestamp": datetime.now().isoformat()
        }
195
+
196
+
197
+ # =============================================================================
198
+ # Document Parser
199
+ # =============================================================================
200
+
201
def _parse_file_sync(path: Path, parser: str) -> str:
    """Synchronous file parsing - runs in thread pool.

    Args:
        path: File to parse.
        parser: One of ``'pypdf'``, ``'marker'``, ``'unstructured'`` or
            ``'beautifulsoup'``. Each backend is imported lazily so only the
            selected one has to be installed.

    Returns:
        The extracted text of the document.

    Raises:
        ValueError: If ``parser`` is not one of the supported names.
    """
    if parser == 'pypdf':
        from pypdf import PdfReader
        # extract_text() may yield nothing for a page; keep an empty slot for it.
        return "\n\n".join(p.extract_text() or '' for p in PdfReader(str(path)).pages)
    elif parser == 'marker':
        from marker.converters.pdf import PdfConverter
        from marker.models import create_model_dict
        converter = PdfConverter(artifact_dict=create_model_dict())
        result = converter(str(path))
        return result.markdown if hasattr(result, 'markdown') else str(result)
    elif parser == 'unstructured':
        from unstructured.partition.auto import partition
        return "\n\n".join(str(el) for el in partition(str(path)))
    elif parser == 'beautifulsoup':
        from bs4 import BeautifulSoup
        # Hand BeautifulSoup the raw bytes so it can detect the document's own
        # encoding; decoding with the platform locale and errors='ignore'
        # silently drops or mangles characters in non-locale-encoded files.
        soup = BeautifulSoup(path.read_bytes(), 'html.parser')
        # Script and style bodies are not document text.
        for s in soup(["script", "style"]):
            s.decompose()
        return soup.get_text(separator='\n')
    raise ValueError(f"Unknown parser: {parser}")
222
+
223
+
224
async def handle_document_parser(
    node_id: str,
    node_type: str,
    parameters: Dict[str, Any],
    context: Dict[str, Any]
) -> Dict[str, Any]:
    """Parse documents to text using configurable parsers.

    Files come from ``parameters['files']`` (dicts with a ``path`` key, or
    values converted with ``str``) and, when ``parameters['inputDir']`` exists,
    from globbing it with ``parameters['filePattern']`` (default ``*.pdf``).
    Each file is parsed with ``parameters['parser']`` (default ``pypdf``) in a
    worker thread. ``context`` is not used.

    Returns:
        A result envelope. On success ``result`` holds ``documents``,
        ``parsed_count`` and ``failed`` (per-file errors, which do not fail
        the node). On failure the envelope carries ``error`` instead.
    """
    start_time = time.time()

    def respond(ok, key, value):
        # Shared envelope; ``key`` is "result" on success and "error" on failure.
        return {
            "success": ok, "node_id": node_id, "node_type": node_type,
            key: value,
            "execution_time": time.time() - start_time,
            "timestamp": datetime.now().isoformat()
        }

    try:
        listed = parameters.get('files', [])
        folder = parameters.get('inputDir', '')
        parser = parameters.get('parser', 'pypdf')
        pattern = parameters.get('filePattern', '*.pdf')

        # Explicitly listed files first, then anything matched in the folder.
        raw_paths = (
            entry.get('path', '') if isinstance(entry, dict) else str(entry)
            for entry in listed
        )
        targets = [Path(raw) for raw in raw_paths if raw]
        if folder and Path(folder).exists():
            targets += Path(folder).glob(pattern)

        if not targets:
            return respond(True, "result", {"documents": [], "parsed_count": 0, "failed": []})

        logger.info("[documentParser] Starting", node_id=node_id, files=len(targets), parser=parser)
        parsed, broken = [], []

        for target in targets:
            try:
                text = await asyncio.to_thread(_parse_file_sync, target, parser)
                parsed.append({
                    'source': str(target),
                    'filename': target.name,
                    'content': text,
                    'length': len(text),
                    'parser': parser,
                })
            except Exception as exc:
                # A single unreadable file is reported without aborting the batch.
                broken.append({'file': str(target), 'error': str(exc)})

        return respond(
            True, "result",
            {"documents": parsed, "parsed_count": len(parsed), "failed": broken},
        )
    except Exception as exc:
        logger.error("[documentParser] Failed", node_id=node_id, error=str(exc))
        return respond(False, "error", str(exc))
279
+
280
+
281
+ # =============================================================================
282
+ # Text Chunker
283
+ # =============================================================================
284
+
285
async def handle_text_chunker(
    node_id: str,
    node_type: str,
    parameters: Dict[str, Any],
    context: Dict[str, Any]
) -> Dict[str, Any]:
    """Split text into overlapping chunks using LangChain splitters.

    Reads ``documents`` (dicts with ``content``/``source``, or values converted
    with ``str``), ``chunkSize`` (default 1024), ``chunkOverlap`` (default 200)
    and ``strategy`` from ``parameters``. The ``'markdown'`` strategy uses the
    Markdown splitter; every other value uses the recursive character
    splitter. ``context`` is not used.

    Returns:
        A result envelope. On success ``result`` holds ``chunks`` (dicts with
        ``source``, ``chunk_index``, ``content``, ``length``) and
        ``chunk_count``. On failure the envelope carries ``error`` instead.
    """
    start_time = time.time()

    def respond(ok, key, value):
        # Shared envelope; ``key`` is "result" on success and "error" on failure.
        return {
            "success": ok, "node_id": node_id, "node_type": node_type,
            key: value,
            "execution_time": time.time() - start_time,
            "timestamp": datetime.now().isoformat()
        }

    try:
        documents = parameters.get('documents', [])
        size = int(parameters.get('chunkSize', 1024))
        overlap = int(parameters.get('chunkOverlap', 200))
        strategy = parameters.get('strategy', 'recursive')

        if not documents:
            return respond(True, "result", {"chunks": [], "chunk_count": 0})

        from langchain_text_splitters import RecursiveCharacterTextSplitter, MarkdownTextSplitter

        splitter_cls = MarkdownTextSplitter if strategy == 'markdown' else RecursiveCharacterTextSplitter
        splitter = splitter_cls(chunk_size=size, chunk_overlap=overlap)

        logger.info("[textChunker] Starting", node_id=node_id, docs=len(documents))
        pieces = []

        for doc in documents:
            if isinstance(doc, dict):
                text, origin = doc.get('content', ''), doc.get('source', 'input')
            else:
                text, origin = str(doc), 'input'
            if not text:
                # Nothing to split for an empty document.
                continue
            pieces.extend(
                {'source': origin, 'chunk_index': idx, 'content': part, 'length': len(part)}
                for idx, part in enumerate(splitter.split_text(text))
            )

        return respond(True, "result", {"chunks": pieces, "chunk_count": len(pieces)})
    except Exception as exc:
        logger.error("[textChunker] Failed", node_id=node_id, error=str(exc))
        return respond(False, "error", str(exc))
340
+
341
+
342
+ # =============================================================================
343
+ # Embedding Generator
344
+ # =============================================================================
345
+
346
async def handle_embedding_generator(
    node_id: str,
    node_type: str,
    parameters: Dict[str, Any],
    context: Dict[str, Any]
) -> Dict[str, Any]:
    """Generate embeddings using HuggingFace, OpenAI, or Ollama.

    Reads ``chunks`` (dicts with ``content``, or values converted with
    ``str``), ``provider`` (default ``huggingface``), ``model`` (default
    ``BAAI/bge-small-en-v1.5``) and ``apiKey`` (passed to the OpenAI embedder
    only) from ``parameters``. The embedding call runs in a worker thread.
    ``context`` is not used.

    Returns:
        A result envelope. On success ``result`` holds ``embeddings``,
        ``embedding_count``, ``dimensions`` and the input ``chunks``, plus
        ``provider`` and ``model`` when any embedding was computed. On failure
        (including an unknown provider) the envelope carries ``error`` instead.
    """
    start_time = time.time()

    def respond(ok, key, value):
        # Shared envelope; ``key`` is "result" on success and "error" on failure.
        return {
            "success": ok, "node_id": node_id, "node_type": node_type,
            key: value,
            "execution_time": time.time() - start_time,
            "timestamp": datetime.now().isoformat()
        }

    try:
        chunks = parameters.get('chunks', [])
        provider = parameters.get('provider', 'huggingface')
        model = parameters.get('model', 'BAAI/bge-small-en-v1.5')
        api_key = parameters.get('apiKey', '')

        if not chunks:
            return respond(
                True, "result",
                {"embeddings": [], "embedding_count": 0, "dimensions": 0, "chunks": []},
            )

        texts = []
        for chunk in chunks:
            texts.append(chunk.get('content', '') if isinstance(chunk, dict) else str(chunk))
        logger.info("[embeddingGenerator] Starting", node_id=node_id, texts=len(texts), provider=provider)

        # Reject unsupported providers first; each backend is imported lazily
        # so only the selected integration has to be installed.
        if provider not in ('huggingface', 'openai', 'ollama'):
            raise ValueError(f"Unknown provider: {provider}")
        if provider == 'huggingface':
            from langchain_huggingface import HuggingFaceEmbeddings
            embedder = HuggingFaceEmbeddings(model_name=model)
        elif provider == 'openai':
            from langchain_openai import OpenAIEmbeddings
            embedder = OpenAIEmbeddings(model=model, api_key=api_key)
        else:
            from langchain_ollama import OllamaEmbeddings
            embedder = OllamaEmbeddings(model=model)

        vectors = await asyncio.to_thread(embedder.embed_documents, texts)
        width = len(vectors[0]) if vectors else 0

        return respond(
            True, "result",
            {"embeddings": vectors, "embedding_count": len(vectors),
             "dimensions": width, "chunks": chunks, "provider": provider, "model": model},
        )
    except Exception as exc:
        logger.error("[embeddingGenerator] Failed", node_id=node_id, error=str(exc))
        return respond(False, "error", str(exc))
401
+
402
+
403
+ # =============================================================================
404
+ # Vector Store
405
+ # =============================================================================
406
+
407
async def handle_vector_store(
    node_id: str,
    node_type: str,
    parameters: Dict[str, Any],
    context: Dict[str, Any]
) -> Dict[str, Any]:
    """Store, query, or delete vectors using ChromaDB, Qdrant, or Pinecone.

    Args:
        node_id: Identifier echoed back in the response.
        node_type: Node type echoed back in the response.
        parameters: Reads ``operation`` (default ``'store'``), ``backend``
            (default ``'chroma'``) and ``collectionName`` (default
            ``'documents'``); the whole dict is forwarded to the backend helper.
        context: Not used by this handler.

    Returns:
        A response dict with ``success``, ``node_id``, ``node_type``,
        ``execution_time`` and ``timestamp``. On success ``result`` is the
        backend helper's result plus ``backend`` and ``collection_name``. On
        failure (including an unknown operation or backend) ``error`` holds
        the exception message; exceptions are not propagated.
    """
    start_time = time.time()

    try:
        operation = parameters.get('operation', 'store')
        backend = parameters.get('backend', 'chroma')
        collection_name = parameters.get('collectionName', 'documents')

        logger.info("[vectorStore] Starting", node_id=node_id, op=operation, backend=backend)

        # The backend helpers return an empty dict for operations they do not
        # recognise, which would be reported as a success. Reject them here,
        # before any backend client is created.
        if operation not in ('store', 'query', 'delete'):
            raise ValueError(f"Unknown operation: {operation}")

        if backend == 'chroma':
            result = await _chroma_op(operation, parameters, collection_name)
        elif backend == 'qdrant':
            result = await _qdrant_op(operation, parameters, collection_name)
        elif backend == 'pinecone':
            result = await _pinecone_op(operation, parameters, collection_name)
        else:
            raise ValueError(f"Unknown backend: {backend}")

        result['backend'] = backend
        result['collection_name'] = collection_name

        return {
            "success": True, "node_id": node_id, "node_type": node_type,
            "result": result, "execution_time": time.time() - start_time,
            "timestamp": datetime.now().isoformat()
        }
    except Exception as e:
        # Handler boundary: report the failure in the response envelope rather than raising.
        logger.error("[vectorStore] Failed", node_id=node_id, error=str(e))
        return {
            "success": False, "node_id": node_id, "node_type": node_type,
            "error": str(e), "execution_time": time.time() - start_time,
            "timestamp": datetime.now().isoformat()
        }
447
+
448
+
449
async def _chroma_op(operation: str, params: Dict, collection: str) -> Dict:
    """Run a store, query, or delete operation against a ChromaDB collection.

    A persistent client is opened at ``params['persistDir']`` (default
    ``'./data/vectors'``) and the collection is fetched or created.

    Args:
        operation: ``'store'``, ``'query'`` or ``'delete'``.
        params: ``store`` reads ``embeddings`` and ``chunks``; ``query`` reads
            ``queryEmbedding`` and ``topK`` (default 5); ``delete`` reads ``ids``.
        collection: Name of the collection.

    Returns:
        ``store``: ``{"stored_count", "collection_count"}``.
        ``query``: ``{"matches": [...]}`` where each match has ``id``,
        ``document``, ``metadata`` and ``distance``.
        ``delete``: ``{"deleted": True, "count": <number of ids>}``.
        Any other operation: ``{}``.
    """
    import chromadb
    import uuid

    persist_dir = params.get('persistDir', './data/vectors')
    client = chromadb.PersistentClient(path=persist_dir)
    coll = client.get_or_create_collection(name=collection)

    if operation == 'store':
        embeddings = params.get('embeddings', [])
        chunks = params.get('chunks') or []
        if not embeddings:
            return {"stored_count": 0, "collection_count": coll.count()}

        # Build exactly one id / document / metadata entry per embedding so the
        # lists passed to add() always have equal length, whether there are
        # fewer or more chunks than embeddings.
        ids = []
        docs = []
        metas = []
        for i, _ in enumerate(embeddings):
            ids.append(str(uuid.uuid4()))
            if i >= len(chunks):
                # No chunk for this embedding: store placeholders.
                docs.append('')
                metas.append({'source': 'unknown', 'chunk_index': i})
                continue
            c = chunks[i]
            if isinstance(c, dict):
                docs.append(c.get('content', ''))
                metas.append({'source': c.get('source', 'unknown'),
                              'chunk_index': c.get('chunk_index', i)})
            else:
                docs.append(str(c))
                metas.append({'source': 'input', 'chunk_index': i})

        await asyncio.to_thread(coll.add, ids=ids, embeddings=embeddings, documents=docs, metadatas=metas)
        return {"stored_count": len(embeddings), "collection_count": coll.count()}

    elif operation == 'query':
        query_emb = params.get('queryEmbedding', [])
        top_k = int(params.get('topK', 5))
        if not query_emb:
            return {"matches": []}
        # Run the blocking query in a worker thread, like add() and delete().
        results = await asyncio.to_thread(coll.query, query_embeddings=[query_emb], n_results=top_k)
        matches = []
        # Results are nested one list per query embedding; only one was sent.
        if results['ids'] and results['ids'][0]:
            for i, match_id in enumerate(results['ids'][0]):
                matches.append({
                    'id': match_id,
                    'document': results['documents'][0][i] if results['documents'] else '',
                    'metadata': results['metadatas'][0][i] if results['metadatas'] else {},
                    'distance': results['distances'][0][i] if results.get('distances') else None
                })
        return {"matches": matches}

    elif operation == 'delete':
        ids = params.get('ids', [])
        if ids:
            await asyncio.to_thread(coll.delete, ids=ids)
        return {"deleted": True, "count": len(ids)}

    return {}
499
+
500
+
501
async def _qdrant_op(operation: str, params: Dict, collection: str) -> Dict:
    """Run a store, query, or delete operation against a Qdrant collection.

    Connects to ``params['qdrantUrl']`` (default ``'http://localhost:6333'``).

    Args:
        operation: ``'store'``, ``'query'`` or ``'delete'``.
        params: ``store`` reads ``embeddings`` and ``chunks``; ``query`` reads
            ``queryEmbedding`` and ``topK`` (default 5); ``delete`` reads ``ids``.
        collection: Name of the collection. On ``store`` it is created with
            cosine distance, sized from the first embedding, if it is not
            already listed by the server.

    Returns:
        ``store``: ``{"stored_count": <number of embeddings>}``.
        ``query``: ``{"matches": [...]}`` where each match has ``id``,
        ``document``, ``metadata`` and ``score``.
        ``delete``: ``{"deleted": True, "count": <number of ids>}``.
        Any other operation: ``{}``.
    """
    from qdrant_client import QdrantClient
    from qdrant_client.models import VectorParams, Distance, PointStruct
    import uuid

    url = params.get('qdrantUrl', 'http://localhost:6333')
    client = QdrantClient(url=url)

    if operation == 'store':
        embeddings = params.get('embeddings', [])
        chunks = params.get('chunks', [])
        if not embeddings:
            return {"stored_count": 0}
        vec_size = len(embeddings[0])
        # Client calls are blocking, so they run in worker threads like the
        # upsert/search/delete calls below.
        existing = await asyncio.to_thread(client.get_collections)
        if collection not in {c.name for c in existing.collections}:
            await asyncio.to_thread(
                client.create_collection, collection,
                vectors_config=VectorParams(size=vec_size, distance=Distance.COSINE))
        points = []
        for i, emb in enumerate(embeddings):
            # Embeddings without a matching chunk are stored with an empty payload.
            payload = {}
            if i < len(chunks):
                c = chunks[i]
                if isinstance(c, dict):
                    payload = {'content': c.get('content', ''), 'source': c.get('source', 'unknown'),
                               'chunk_index': c.get('chunk_index', i)}
                else:
                    payload = {'content': str(c), 'source': 'input', 'chunk_index': i}
            points.append(PointStruct(id=str(uuid.uuid4()), vector=emb, payload=payload))
        await asyncio.to_thread(client.upsert, collection_name=collection, points=points)
        return {"stored_count": len(embeddings)}

    elif operation == 'query':
        query_emb = params.get('queryEmbedding', [])
        top_k = int(params.get('topK', 5))
        if not query_emb:
            return {"matches": []}
        results = await asyncio.to_thread(client.search, collection_name=collection,
                                          query_vector=query_emb, limit=top_k)
        matches = []
        for r in results:
            # Guard against a missing payload so one bare point cannot fail the whole query.
            payload = r.payload or {}
            matches.append({'id': str(r.id), 'document': payload.get('content', ''),
                            'metadata': payload, 'score': r.score})
        return {"matches": matches}

    elif operation == 'delete':
        ids = params.get('ids', [])
        if ids:
            await asyncio.to_thread(client.delete, collection_name=collection, points_selector=ids)
        return {"deleted": True, "count": len(ids)}

    return {}
550
+
551
+
552
async def _pinecone_op(operation: str, params: Dict, collection: str) -> Dict:
    """Run a store, query, or delete operation against a Pinecone index.

    Args:
        operation: ``'store'``, ``'query'`` or ``'delete'``.
        params: Must contain a non-empty ``pineconeApiKey``. ``store`` reads
            ``embeddings`` and ``chunks``; ``query`` reads ``queryEmbedding``
            and ``topK`` (default 5); ``delete`` reads ``ids``.
        collection: Name of the Pinecone index to operate on.

    Returns:
        ``store``: ``{"stored_count": <number of embeddings>}``.
        ``query``: ``{"matches": [...]}`` where each match has ``id``,
        ``document``, ``metadata`` and ``score``.
        ``delete``: ``{"deleted": True, "count": <number of ids>}``.
        Any other operation: ``{}``.

    Raises:
        ValueError: If no Pinecone API key is supplied.
    """
    from pinecone import Pinecone
    import uuid

    key = params.get('pineconeApiKey', '')
    if not key:
        raise ValueError("Pinecone API key required")

    index = Pinecone(api_key=key).Index(collection)

    if operation == 'store':
        vectors_in = params.get('embeddings', [])
        chunks = params.get('chunks', [])
        if not vectors_in:
            return {"stored_count": 0}

        def _metadata_for(pos):
            # Embeddings without a matching chunk carry empty metadata.
            if pos >= len(chunks):
                return {}
            chunk = chunks[pos]
            if not isinstance(chunk, dict):
                return {'content': str(chunk), 'source': 'input', 'chunk_index': pos}
            return {
                'content': chunk.get('content', ''),
                'source': chunk.get('source', 'unknown'),
                'chunk_index': chunk.get('chunk_index', pos),
            }

        records = []
        for pos, vec in enumerate(vectors_in):
            metadata = _metadata_for(pos)
            records.append({'id': str(uuid.uuid4()), 'values': vec, 'metadata': metadata})

        await asyncio.to_thread(index.upsert, vectors=records)
        return {"stored_count": len(vectors_in)}

    if operation == 'query':
        query_vec = params.get('queryEmbedding', [])
        limit = int(params.get('topK', 5))
        if not query_vec:
            return {"matches": []}

        response = await asyncio.to_thread(index.query, vector=query_vec, top_k=limit, include_metadata=True)

        hits = []
        for m in response.matches:
            # A match may come back without metadata; fall back to empty values.
            document = m.metadata.get('content', '') if m.metadata else ''
            hits.append({'id': m.id, 'document': document,
                         'metadata': m.metadata or {}, 'score': m.score})
        return {"matches": hits}

    if operation == 'delete':
        target_ids = params.get('ids', [])
        if target_ids:
            await asyncio.to_thread(index.delete, ids=target_ids)
        return {"deleted": True, "count": len(target_ids)}

    return {}