kailash 0.1.1__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. kailash/nodes/__init__.py +2 -1
  2. kailash/nodes/ai/__init__.py +26 -0
  3. kailash/nodes/ai/ai_providers.py +1272 -0
  4. kailash/nodes/ai/embedding_generator.py +853 -0
  5. kailash/nodes/ai/llm_agent.py +1166 -0
  6. kailash/nodes/api/auth.py +3 -3
  7. kailash/nodes/api/graphql.py +2 -2
  8. kailash/nodes/api/http.py +391 -44
  9. kailash/nodes/api/rate_limiting.py +2 -2
  10. kailash/nodes/api/rest.py +464 -56
  11. kailash/nodes/base.py +71 -12
  12. kailash/nodes/code/python.py +2 -1
  13. kailash/nodes/data/__init__.py +7 -0
  14. kailash/nodes/data/readers.py +28 -26
  15. kailash/nodes/data/retrieval.py +178 -0
  16. kailash/nodes/data/sharepoint_graph.py +7 -7
  17. kailash/nodes/data/sources.py +65 -0
  18. kailash/nodes/data/sql.py +4 -2
  19. kailash/nodes/data/writers.py +6 -3
  20. kailash/nodes/logic/operations.py +2 -1
  21. kailash/nodes/mcp/__init__.py +11 -0
  22. kailash/nodes/mcp/client.py +558 -0
  23. kailash/nodes/mcp/resource.py +682 -0
  24. kailash/nodes/mcp/server.py +571 -0
  25. kailash/nodes/transform/__init__.py +16 -1
  26. kailash/nodes/transform/chunkers.py +78 -0
  27. kailash/nodes/transform/formatters.py +96 -0
  28. kailash/runtime/docker.py +6 -6
  29. kailash/sdk_exceptions.py +24 -10
  30. kailash/tracking/metrics_collector.py +2 -1
  31. kailash/utils/templates.py +6 -6
  32. {kailash-0.1.1.dist-info → kailash-0.1.2.dist-info}/METADATA +344 -46
  33. {kailash-0.1.1.dist-info → kailash-0.1.2.dist-info}/RECORD +37 -26
  34. {kailash-0.1.1.dist-info → kailash-0.1.2.dist-info}/WHEEL +0 -0
  35. {kailash-0.1.1.dist-info → kailash-0.1.2.dist-info}/entry_points.txt +0 -0
  36. {kailash-0.1.1.dist-info → kailash-0.1.2.dist-info}/licenses/LICENSE +0 -0
  37. {kailash-0.1.1.dist-info → kailash-0.1.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,571 @@
1
+ """MCP Server node for hosting Model Context Protocol resources and tools."""
2
+
3
+ import json
4
+ from typing import Any, Dict, List
5
+
6
+ from kailash.nodes.base import Node, NodeParameter, register_node
7
+
8
+
9
+ @register_node()
10
+ class MCPServer(Node):
11
+ """
12
+ Server node for hosting Model Context Protocol (MCP) resources and tools.
13
+
14
+ Design Purpose and Philosophy:
15
+ The MCPServer node allows workflows to expose their data and functionality as
16
+ standardized MCP resources and tools. This enables other AI applications and
17
+ agents to discover and interact with workflow capabilities through the MCP protocol.
18
+
19
+ Upstream Dependencies:
20
+ - Resource data to expose (files, databases, APIs)
21
+ - Tool implementations to register with the server
22
+ - Prompt templates to make available to clients
23
+ - Server configuration and authentication settings
24
+
25
+ Downstream Consumers:
26
+ - MCP clients that connect to discover resources
27
+ - AI applications that need workflow context
28
+ - Other Kailash workflows acting as MCP clients
29
+ - External tools and services supporting MCP
30
+
31
+ Usage Patterns:
32
+ 1. Start MCP server with specified resources and tools
33
+ 2. Register dynamic resources that update in real-time
34
+ 3. Expose workflow capabilities as callable tools
35
+ 4. Provide prompt templates for standardized interactions
36
+ 5. Handle client connections and protocol compliance
37
+
38
+ Implementation Details:
39
+ - Uses the FastMCP framework for rapid server development
40
+ - Supports stdio, SSE, and HTTP transports automatically
41
+ - Implements proper resource discovery and metadata
42
+ - Provides authentication and access control mechanisms
43
+ - Handles concurrent client connections efficiently
44
+
45
+ Error Handling:
46
+ - ServerStartupError: When server fails to initialize
47
+ - ResourceRegistrationError: When resources cannot be registered
48
+ - ToolExecutionError: When tool calls fail during execution
49
+ - ClientConnectionError: When client connections are rejected
50
+ - ProtocolViolationError: When clients violate MCP protocol
51
+
52
+ Side Effects:
53
+ - Starts a network server process listening on specified ports
54
+ - Registers resources and tools in the MCP protocol registry
55
+ - May modify external systems when tools are executed
56
+ - Logs server events and client interactions
57
+
58
+ Examples:
59
+ ```python
60
+ # Start a basic MCP server with resources
61
+ server = MCPServer()
62
+ result = server.run(
63
+ server_config={
64
+ "name": "workflow-server",
65
+ "transport": "stdio"
66
+ },
67
+ resources=[
68
+ {
69
+ "uri": "workflow://current/status",
70
+ "name": "Workflow Status",
71
+ "content": "Running workflow with 5 active nodes"
72
+ }
73
+ ],
74
+ tools=[
75
+ {
76
+ "name": "execute_node",
77
+ "description": "Execute a specific workflow node",
78
+ "parameters": {
79
+ "node_id": {"type": "string", "required": True}
80
+ }
81
+ }
82
+ ]
83
+ )
84
+
85
+ # Register dynamic resources
86
+ server_with_dynamic = MCPServer()
87
+ result = server_with_dynamic.run(
88
+ server_config={
89
+ "name": "data-server",
90
+ "transport": "http",
91
+ "port": 8080
92
+ },
93
+ resource_providers={
94
+ "database://tables/*": "list_database_tables",
95
+ "file://workspace/*": "list_workspace_files"
96
+ }
97
+ )
98
+ ```
99
+ """
100
+
101
def get_parameters(self) -> Dict[str, NodeParameter]:
    """Declare the inputs accepted by the MCP server node.

    All eight parameters are optional. The specification table is rebuilt on
    every call, so the ``{}`` and ``[]`` defaults are fresh objects each time.

    Returns:
        Mapping of parameter name to its ``NodeParameter`` definition, in
        declaration order.
    """
    # (name, type, default, description)
    specs = [
        (
            "server_config",
            dict,
            {},
            "MCP server configuration (name, transport, port, etc.)",
        ),
        (
            "resources",
            list,
            [],
            "Static resources to expose (list of resource objects)",
        ),
        (
            "tools",
            list,
            [],
            "Tools to register with the server (list of tool definitions)",
        ),
        (
            "prompts",
            list,
            [],
            "Prompt templates to make available (list of prompt objects)",
        ),
        (
            "resource_providers",
            dict,
            {},
            "Dynamic resource providers (URI pattern -> provider function)",
        ),
        (
            "authentication",
            dict,
            {},
            "Authentication configuration (type, credentials, etc.)",
        ),
        (
            "auto_start",
            bool,
            True,
            "Whether to automatically start the server",
        ),
        (
            "max_connections",
            int,
            10,
            "Maximum number of concurrent client connections",
        ),
    ]
    return {
        key: NodeParameter(
            name=key,
            type=kind,
            required=False,
            default=fallback,
            description=text,
        )
        for key, kind, fallback, text in specs
    }
160
+
161
+ def run(self, **kwargs) -> Dict[str, Any]:
162
+ server_config = kwargs["server_config"]
163
+ resources = kwargs.get("resources", [])
164
+ tools = kwargs.get("tools", [])
165
+ prompts = kwargs.get("prompts", [])
166
+ resource_providers = kwargs.get("resource_providers", {})
167
+ authentication = kwargs.get("authentication", {})
168
+ auto_start = kwargs.get("auto_start", True)
169
+ max_connections = kwargs.get("max_connections", 10)
170
+
171
+ try:
172
+ # Import MCP SDK (graceful fallback if not installed)
173
+ try:
174
+ from mcp.server import Server
175
+ from mcp.server.fastmcp import FastMCP
176
+ from mcp.types import Prompt, Resource, Tool
177
+
178
+ mcp_available = True
179
+ except ImportError:
180
+ mcp_available = False
181
+
182
+ if not mcp_available:
183
+ # Provide mock functionality when MCP SDK is not available
184
+ return self._mock_mcp_server(
185
+ server_config,
186
+ resources,
187
+ tools,
188
+ prompts,
189
+ resource_providers,
190
+ authentication,
191
+ auto_start,
192
+ max_connections,
193
+ )
194
+
195
+ # Extract server configuration
196
+ server_name = server_config.get("name", "kailash-server")
197
+ transport_type = server_config.get("transport", "stdio")
198
+ port = server_config.get("port", 8080)
199
+ host = server_config.get("host", "localhost")
200
+
201
+ # For now, provide mock implementation as we need proper MCP server setup
202
+ return self._mock_fastmcp_server(
203
+ server_name,
204
+ transport_type,
205
+ host,
206
+ port,
207
+ resources,
208
+ tools,
209
+ prompts,
210
+ resource_providers,
211
+ authentication,
212
+ auto_start,
213
+ max_connections,
214
+ )
215
+
216
+ except Exception as e:
217
+ return {
218
+ "success": False,
219
+ "error": str(e),
220
+ "error_type": type(e).__name__,
221
+ "server_config": server_config,
222
+ }
223
+
224
+ def _mock_mcp_server(
225
+ self,
226
+ server_config: dict,
227
+ resources: List[dict],
228
+ tools: List[dict],
229
+ prompts: List[dict],
230
+ resource_providers: dict,
231
+ authentication: dict,
232
+ auto_start: bool,
233
+ max_connections: int,
234
+ ) -> Dict[str, Any]:
235
+ """Mock MCP server when SDK is not available."""
236
+ server_name = server_config.get("name", "mock-server")
237
+ transport = server_config.get("transport", "stdio")
238
+
239
+ # Validate resources
240
+ validated_resources = []
241
+ for resource in resources:
242
+ if not isinstance(resource, dict):
243
+ continue
244
+
245
+ uri = resource.get("uri")
246
+ name = resource.get("name", uri)
247
+ description = resource.get("description", f"Resource: {name}")
248
+
249
+ if uri:
250
+ validated_resources.append(
251
+ {
252
+ "uri": uri,
253
+ "name": name,
254
+ "description": description,
255
+ "mimeType": resource.get("mimeType", "text/plain"),
256
+ "content": resource.get("content"),
257
+ }
258
+ )
259
+
260
+ # Validate tools
261
+ validated_tools = []
262
+ for tool in tools:
263
+ if not isinstance(tool, dict):
264
+ continue
265
+
266
+ name = tool.get("name")
267
+ description = tool.get("description", f"Tool: {name}")
268
+
269
+ if name:
270
+ validated_tools.append(
271
+ {
272
+ "name": name,
273
+ "description": description,
274
+ "inputSchema": tool.get("parameters", {}),
275
+ "handler": tool.get("handler", f"mock_handler_{name}"),
276
+ }
277
+ )
278
+
279
+ # Validate prompts
280
+ validated_prompts = []
281
+ for prompt in prompts:
282
+ if not isinstance(prompt, dict):
283
+ continue
284
+
285
+ name = prompt.get("name")
286
+ description = prompt.get("description", f"Prompt: {name}")
287
+
288
+ if name:
289
+ validated_prompts.append(
290
+ {
291
+ "name": name,
292
+ "description": description,
293
+ "arguments": prompt.get("arguments", []),
294
+ "template": prompt.get("template", f"Mock template for {name}"),
295
+ }
296
+ )
297
+
298
+ # Mock server status
299
+ server_status = {
300
+ "name": server_name,
301
+ "transport": transport,
302
+ "status": "running" if auto_start else "configured",
303
+ "pid": 12345, # Mock process ID
304
+ "started_at": "2025-06-01T12:00:00Z",
305
+ "uptime": "0:00:05",
306
+ "connections": {"active": 0, "total": 0, "max": max_connections},
307
+ "capabilities": {
308
+ "resources": True,
309
+ "tools": True,
310
+ "prompts": True,
311
+ "logging": True,
312
+ },
313
+ }
314
+
315
+ if transport == "http":
316
+ host = server_config.get("host", "localhost")
317
+ port = server_config.get("port", 8080)
318
+ server_status.update(
319
+ {
320
+ "host": host,
321
+ "port": port,
322
+ "url": f"http://{host}:{port}",
323
+ "endpoints": {
324
+ "sse": f"http://{host}:{port}/sse",
325
+ "resources": f"http://{host}:{port}/resources",
326
+ "tools": f"http://{host}:{port}/tools",
327
+ "prompts": f"http://{host}:{port}/prompts",
328
+ },
329
+ }
330
+ )
331
+
332
+ return {
333
+ "success": True,
334
+ "server": server_status,
335
+ "resources": {
336
+ "registered": validated_resources,
337
+ "count": len(validated_resources),
338
+ "providers": (
339
+ list(resource_providers.keys()) if resource_providers else []
340
+ ),
341
+ },
342
+ "tools": {"registered": validated_tools, "count": len(validated_tools)},
343
+ "prompts": {
344
+ "registered": validated_prompts,
345
+ "count": len(validated_prompts),
346
+ },
347
+ "authentication": {
348
+ "enabled": bool(authentication),
349
+ "type": authentication.get("type", "none"),
350
+ },
351
+ "mock": True,
352
+ "message": f"Mock MCP server '{server_name}' configured successfully",
353
+ }
354
+
355
+ def _mock_fastmcp_server(
356
+ self,
357
+ server_name: str,
358
+ transport_type: str,
359
+ host: str,
360
+ port: int,
361
+ resources: List[dict],
362
+ tools: List[dict],
363
+ prompts: List[dict],
364
+ resource_providers: dict,
365
+ authentication: dict,
366
+ auto_start: bool,
367
+ max_connections: int,
368
+ ) -> Dict[str, Any]:
369
+ """Mock FastMCP server implementation."""
370
+
371
+ # Create mock FastMCP server configuration
372
+ server_code = f"""
373
+ # Mock FastMCP server code for {server_name}
374
+ from mcp.server.fastmcp import FastMCP
375
+
376
+ # Create server instance
377
+ mcp = FastMCP("{server_name}")
378
+
379
+ # Register resources
380
+ {self._generate_resource_code(resources)}
381
+
382
+ # Register tools
383
+ {self._generate_tool_code(tools)}
384
+
385
+ # Register prompts
386
+ {self._generate_prompt_code(prompts)}
387
+
388
+ # Dynamic resource providers
389
+ {self._generate_provider_code(resource_providers)}
390
+
391
+ if __name__ == "__main__":
392
+ mcp.run()
393
+ """
394
+
395
+ # Mock server startup
396
+ startup_info = {
397
+ "server_name": server_name,
398
+ "transport": transport_type,
399
+ "generated_code": server_code,
400
+ "status": "ready" if auto_start else "configured",
401
+ "resources_count": len(resources),
402
+ "tools_count": len(tools),
403
+ "prompts_count": len(prompts),
404
+ "providers_count": len(resource_providers),
405
+ }
406
+
407
+ if transport_type == "http":
408
+ startup_info.update(
409
+ {
410
+ "host": host,
411
+ "port": port,
412
+ "url": f"http://{host}:{port}",
413
+ "sse_endpoint": f"http://{host}:{port}/sse",
414
+ }
415
+ )
416
+
417
+ return {
418
+ "success": True,
419
+ "server": startup_info,
420
+ "code": server_code,
421
+ "mock": True,
422
+ "next_steps": [
423
+ "Save the generated code to a Python file",
424
+ "Install MCP dependencies: pip install 'mcp[cli]'",
425
+ "Run the server: python server_file.py",
426
+ "Connect clients using the specified transport",
427
+ ],
428
+ }
429
+
430
+ def _generate_resource_code(self, resources: List[dict]) -> str:
431
+ """Generate Python code for resource registration."""
432
+ if not resources:
433
+ return "# No static resources defined"
434
+
435
+ code_lines = []
436
+ for resource in resources:
437
+ uri = resource.get("uri", "")
438
+ content = resource.get("content", "")
439
+ name = resource.get("name", uri)
440
+
441
+ # Escape strings for Python code
442
+ content_escaped = json.dumps(content) if content else '""'
443
+
444
+ code_lines.append(f'@mcp.resource("{uri}")')
445
+ code_lines.append(f"def get_{self._sanitize_name(uri)}():")
446
+ code_lines.append(f' """Resource: {name}"""')
447
+ code_lines.append(f" return {content_escaped}")
448
+ code_lines.append("")
449
+
450
+ return "\n".join(code_lines)
451
+
452
+ def _generate_tool_code(self, tools: List[dict]) -> str:
453
+ """Generate Python code for tool registration."""
454
+ if not tools:
455
+ return "# No tools defined"
456
+
457
+ code_lines = []
458
+ for tool in tools:
459
+ name = tool.get("name", "")
460
+ description = tool.get("description", "")
461
+ parameters = tool.get("parameters", {})
462
+
463
+ # Generate function parameters from schema
464
+ param_list = []
465
+
466
+ # Handle OpenAPI schema format
467
+ if isinstance(parameters, dict) and "properties" in parameters:
468
+ properties = parameters.get("properties", {})
469
+ required = parameters.get("required", [])
470
+
471
+ for param_name, param_info in properties.items():
472
+ param_type = (
473
+ param_info.get("type", "str")
474
+ if isinstance(param_info, dict)
475
+ else "str"
476
+ )
477
+ if param_name in required:
478
+ param_list.append(f"{param_name}: {param_type}")
479
+ else:
480
+ param_list.append(f"{param_name}: {param_type} = None")
481
+ # Handle simple parameter format
482
+ elif isinstance(parameters, dict):
483
+ for param_name, param_info in parameters.items():
484
+ if isinstance(param_info, dict):
485
+ param_type = param_info.get("type", "str")
486
+ if param_info.get("required", False):
487
+ param_list.append(f"{param_name}: {param_type}")
488
+ else:
489
+ param_list.append(f"{param_name}: {param_type} = None")
490
+ else:
491
+ param_list.append(f"{param_name}: str = None")
492
+
493
+ param_str = ", ".join(param_list) if param_list else ""
494
+
495
+ code_lines.append("@mcp.tool()")
496
+ code_lines.append(f"def {name}({param_str}):")
497
+ code_lines.append(f' """{description}"""')
498
+ code_lines.append(" # Mock tool implementation")
499
+ code_lines.append(
500
+ f' return {{"tool": "{name}", "status": "executed", "parameters": locals()}}'
501
+ )
502
+ code_lines.append("")
503
+
504
+ return "\n".join(code_lines)
505
+
506
+ def _generate_prompt_code(self, prompts: List[dict]) -> str:
507
+ """Generate Python code for prompt registration."""
508
+ if not prompts:
509
+ return "# No prompts defined"
510
+
511
+ code_lines = []
512
+ for prompt in prompts:
513
+ name = prompt.get("name", "")
514
+ template = prompt.get("template", "")
515
+ arguments = prompt.get("arguments", [])
516
+
517
+ # Generate function parameters from arguments
518
+ param_list = []
519
+ for arg in arguments:
520
+ if isinstance(arg, dict):
521
+ arg_name = arg.get("name", "")
522
+ if arg.get("required", False):
523
+ param_list.append(f"{arg_name}: str")
524
+ else:
525
+ param_list.append(f"{arg_name}: str = ''")
526
+
527
+ param_str = ", ".join(param_list) if param_list else ""
528
+
529
+ code_lines.append(f'@mcp.prompt("{name}")')
530
+ code_lines.append(f"def {name}_prompt({param_str}):")
531
+ code_lines.append(f' """Prompt: {name}"""')
532
+ if template:
533
+ template_escaped = json.dumps(template)
534
+ code_lines.append(f" template = {template_escaped}")
535
+ code_lines.append(" return template.format(**locals())")
536
+ else:
537
+ code_lines.append(
538
+ f' return f"Mock prompt: {name} with args: {{locals()}}"'
539
+ )
540
+ code_lines.append("")
541
+
542
+ return "\n".join(code_lines)
543
+
544
+ def _generate_provider_code(self, providers: dict) -> str:
545
+ """Generate Python code for dynamic resource providers."""
546
+ if not providers:
547
+ return "# No dynamic resource providers defined"
548
+
549
+ code_lines = []
550
+ for pattern, provider_func in providers.items():
551
+ sanitized_pattern = self._sanitize_name(pattern)
552
+
553
+ code_lines.append(f'@mcp.resource("{pattern}")')
554
+ code_lines.append(f"def dynamic_{sanitized_pattern}(**kwargs):")
555
+ code_lines.append(f' """Dynamic resource provider for {pattern}"""')
556
+ code_lines.append(" # Mock dynamic resource implementation")
557
+ code_lines.append(' return f"Dynamic content for {kwargs}"')
558
+ code_lines.append("")
559
+
560
+ return "\n".join(code_lines)
561
+
562
+ def _sanitize_name(self, name: str) -> str:
563
+ """Sanitize a name for use as Python identifier."""
564
+ import re
565
+
566
+ # Replace non-alphanumeric characters with underscores
567
+ sanitized = re.sub(r"[^a-zA-Z0-9_]", "_", name)
568
+ # Ensure it starts with a letter or underscore
569
+ if sanitized and sanitized[0].isdigit():
570
+ sanitized = f"r_{sanitized}"
571
+ return sanitized or "unnamed"
@@ -1,5 +1,20 @@
1
1
  """Transform processing nodes for the Kailash SDK."""
2
2
 
3
+ from kailash.nodes.transform.chunkers import HierarchicalChunkerNode
4
+ from kailash.nodes.transform.formatters import (
5
+ ChunkTextExtractorNode,
6
+ ContextFormatterNode,
7
+ QueryTextWrapperNode,
8
+ )
3
9
  from kailash.nodes.transform.processors import DataTransformer, Filter, Map, Sort
4
10
 
5
- __all__ = ["Filter", "Map", "Sort", "DataTransformer"]
11
+ __all__ = [
12
+ "Filter",
13
+ "Map",
14
+ "Sort",
15
+ "DataTransformer",
16
+ "HierarchicalChunkerNode",
17
+ "ChunkTextExtractorNode",
18
+ "QueryTextWrapperNode",
19
+ "ContextFormatterNode",
20
+ ]
@@ -0,0 +1,78 @@
1
+ """Document chunking nodes for splitting text into manageable pieces."""
2
+
3
+ from typing import Any, Dict
4
+
5
+ from kailash.nodes.base import Node, NodeParameter, register_node
6
+
7
+
8
+ @register_node()
9
+ class HierarchicalChunkerNode(Node):
10
+ """Splits documents into hierarchical chunks for better retrieval."""
11
+
12
def get_parameters(self) -> Dict[str, NodeParameter]:
    """Declare the chunker's inputs.

    Returns:
        Definitions for ``documents`` (no default), ``chunk_size`` (default
        200) and ``overlap`` (default 50). All three are optional.
    """
    params: Dict[str, NodeParameter] = {}

    params["documents"] = NodeParameter(
        name="documents",
        type=list,
        required=False,
        description="List of documents to chunk",
    )
    params["chunk_size"] = NodeParameter(
        name="chunk_size",
        type=int,
        required=False,
        default=200,
        description="Target size for text chunks",
    )
    params["overlap"] = NodeParameter(
        name="overlap",
        type=int,
        required=False,
        default=50,
        description="Overlap between chunks",
    )

    return params
35
+
36
def run(self, **kwargs) -> Dict[str, Any]:
    """Split each document's text into sentence-aligned chunks.

    The text is split on ``". "`` and the pieces are packed greedily: a piece
    joins the current chunk while the chunk's length plus the piece's length
    stays below ``chunk_size``; otherwise a new chunk is started. A single
    piece longer than ``chunk_size`` becomes its own chunk.

    Args:
        **kwargs: ``documents`` is a list of dicts with ``content``, ``id``
            and ``title`` keys (``None`` or omitted means no documents).
            ``chunk_size`` is the target chunk length in characters
            (default 200). ``overlap`` is accepted but not applied.

    Returns:
        ``{"chunks": [...]}`` where each chunk dict carries ``chunk_id``,
        ``document_id``, ``document_title``, ``chunk_index``, ``content`` and
        ``hierarchy_level``.

    Raises:
        KeyError: If a document lacks ``content``, ``id`` or ``title``.
    """
    import logging

    documents = kwargs.get("documents") or []
    chunk_size = kwargs.get("chunk_size", 200)
    if chunk_size is None:
        chunk_size = 200

    # Diagnostics go through logging instead of being printed to stdout.
    logging.getLogger(__name__).debug(
        "Chunker received %d documents", len(documents)
    )

    all_chunks = []

    for doc in documents:
        content = doc["content"]
        doc_id = doc["id"]
        title = doc["title"]

        # Simple sentence-aware chunking
        sentences = content.split(". ")
        last_index = len(sentences) - 1
        chunks = []
        current_chunk = ""

        for index, sentence in enumerate(sentences):
            if not sentence.strip():
                # Blank pieces (empty content, trailing ". ") carry no text.
                continue

            # split() removed the ". " that followed every piece except the
            # last one, so only those get it back; the last piece keeps
            # whatever ending the source text had.
            restored = sentence + (". " if index < last_index else " ")

            if len(current_chunk) + len(sentence) < chunk_size:
                current_chunk += restored
            else:
                if current_chunk:
                    chunks.append(current_chunk.strip())
                current_chunk = restored

        if current_chunk.strip():
            chunks.append(current_chunk.strip())

        # Create hierarchical chunk structure
        for i, chunk in enumerate(chunks):
            all_chunks.append(
                {
                    "chunk_id": f"{doc_id}_chunk_{i}",
                    "document_id": doc_id,
                    "document_title": title,
                    "chunk_index": i,
                    "content": chunk,
                    "hierarchy_level": "paragraph",
                }
            )

    return {"chunks": all_chunks}