kailash 0.3.2__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (151)
  1. kailash/__init__.py +33 -1
  2. kailash/access_control/__init__.py +129 -0
  3. kailash/access_control/managers.py +461 -0
  4. kailash/access_control/rule_evaluators.py +467 -0
  5. kailash/access_control_abac.py +825 -0
  6. kailash/config/__init__.py +27 -0
  7. kailash/config/database_config.py +359 -0
  8. kailash/database/__init__.py +28 -0
  9. kailash/database/execution_pipeline.py +499 -0
  10. kailash/middleware/__init__.py +306 -0
  11. kailash/middleware/auth/__init__.py +33 -0
  12. kailash/middleware/auth/access_control.py +436 -0
  13. kailash/middleware/auth/auth_manager.py +422 -0
  14. kailash/middleware/auth/jwt_auth.py +477 -0
  15. kailash/middleware/auth/kailash_jwt_auth.py +616 -0
  16. kailash/middleware/communication/__init__.py +37 -0
  17. kailash/middleware/communication/ai_chat.py +989 -0
  18. kailash/middleware/communication/api_gateway.py +802 -0
  19. kailash/middleware/communication/events.py +470 -0
  20. kailash/middleware/communication/realtime.py +710 -0
  21. kailash/middleware/core/__init__.py +21 -0
  22. kailash/middleware/core/agent_ui.py +890 -0
  23. kailash/middleware/core/schema.py +643 -0
  24. kailash/middleware/core/workflows.py +396 -0
  25. kailash/middleware/database/__init__.py +63 -0
  26. kailash/middleware/database/base.py +113 -0
  27. kailash/middleware/database/base_models.py +525 -0
  28. kailash/middleware/database/enums.py +106 -0
  29. kailash/middleware/database/migrations.py +12 -0
  30. kailash/{api/database.py → middleware/database/models.py} +183 -291
  31. kailash/middleware/database/repositories.py +685 -0
  32. kailash/middleware/database/session_manager.py +19 -0
  33. kailash/middleware/mcp/__init__.py +38 -0
  34. kailash/middleware/mcp/client_integration.py +585 -0
  35. kailash/middleware/mcp/enhanced_server.py +576 -0
  36. kailash/nodes/__init__.py +27 -3
  37. kailash/nodes/admin/__init__.py +42 -0
  38. kailash/nodes/admin/audit_log.py +794 -0
  39. kailash/nodes/admin/permission_check.py +864 -0
  40. kailash/nodes/admin/role_management.py +823 -0
  41. kailash/nodes/admin/security_event.py +1523 -0
  42. kailash/nodes/admin/user_management.py +944 -0
  43. kailash/nodes/ai/a2a.py +24 -7
  44. kailash/nodes/ai/ai_providers.py +248 -40
  45. kailash/nodes/ai/embedding_generator.py +11 -11
  46. kailash/nodes/ai/intelligent_agent_orchestrator.py +99 -11
  47. kailash/nodes/ai/llm_agent.py +436 -5
  48. kailash/nodes/ai/self_organizing.py +85 -10
  49. kailash/nodes/ai/vision_utils.py +148 -0
  50. kailash/nodes/alerts/__init__.py +26 -0
  51. kailash/nodes/alerts/base.py +234 -0
  52. kailash/nodes/alerts/discord.py +499 -0
  53. kailash/nodes/api/auth.py +287 -6
  54. kailash/nodes/api/rest.py +151 -0
  55. kailash/nodes/auth/__init__.py +17 -0
  56. kailash/nodes/auth/directory_integration.py +1228 -0
  57. kailash/nodes/auth/enterprise_auth_provider.py +1328 -0
  58. kailash/nodes/auth/mfa.py +2338 -0
  59. kailash/nodes/auth/risk_assessment.py +872 -0
  60. kailash/nodes/auth/session_management.py +1093 -0
  61. kailash/nodes/auth/sso.py +1040 -0
  62. kailash/nodes/base.py +344 -13
  63. kailash/nodes/base_cycle_aware.py +4 -2
  64. kailash/nodes/base_with_acl.py +1 -1
  65. kailash/nodes/code/python.py +283 -10
  66. kailash/nodes/compliance/__init__.py +9 -0
  67. kailash/nodes/compliance/data_retention.py +1888 -0
  68. kailash/nodes/compliance/gdpr.py +2004 -0
  69. kailash/nodes/data/__init__.py +22 -2
  70. kailash/nodes/data/async_connection.py +469 -0
  71. kailash/nodes/data/async_sql.py +757 -0
  72. kailash/nodes/data/async_vector.py +598 -0
  73. kailash/nodes/data/readers.py +767 -0
  74. kailash/nodes/data/retrieval.py +360 -1
  75. kailash/nodes/data/sharepoint_graph.py +397 -21
  76. kailash/nodes/data/sql.py +94 -5
  77. kailash/nodes/data/streaming.py +68 -8
  78. kailash/nodes/data/vector_db.py +54 -4
  79. kailash/nodes/enterprise/__init__.py +13 -0
  80. kailash/nodes/enterprise/batch_processor.py +741 -0
  81. kailash/nodes/enterprise/data_lineage.py +497 -0
  82. kailash/nodes/logic/convergence.py +31 -9
  83. kailash/nodes/logic/operations.py +14 -3
  84. kailash/nodes/mixins/__init__.py +8 -0
  85. kailash/nodes/mixins/event_emitter.py +201 -0
  86. kailash/nodes/mixins/mcp.py +9 -4
  87. kailash/nodes/mixins/security.py +165 -0
  88. kailash/nodes/monitoring/__init__.py +7 -0
  89. kailash/nodes/monitoring/performance_benchmark.py +2497 -0
  90. kailash/nodes/rag/__init__.py +284 -0
  91. kailash/nodes/rag/advanced.py +1615 -0
  92. kailash/nodes/rag/agentic.py +773 -0
  93. kailash/nodes/rag/conversational.py +999 -0
  94. kailash/nodes/rag/evaluation.py +875 -0
  95. kailash/nodes/rag/federated.py +1188 -0
  96. kailash/nodes/rag/graph.py +721 -0
  97. kailash/nodes/rag/multimodal.py +671 -0
  98. kailash/nodes/rag/optimized.py +933 -0
  99. kailash/nodes/rag/privacy.py +1059 -0
  100. kailash/nodes/rag/query_processing.py +1335 -0
  101. kailash/nodes/rag/realtime.py +764 -0
  102. kailash/nodes/rag/registry.py +547 -0
  103. kailash/nodes/rag/router.py +837 -0
  104. kailash/nodes/rag/similarity.py +1854 -0
  105. kailash/nodes/rag/strategies.py +566 -0
  106. kailash/nodes/rag/workflows.py +575 -0
  107. kailash/nodes/security/__init__.py +19 -0
  108. kailash/nodes/security/abac_evaluator.py +1411 -0
  109. kailash/nodes/security/audit_log.py +103 -0
  110. kailash/nodes/security/behavior_analysis.py +1893 -0
  111. kailash/nodes/security/credential_manager.py +401 -0
  112. kailash/nodes/security/rotating_credentials.py +760 -0
  113. kailash/nodes/security/security_event.py +133 -0
  114. kailash/nodes/security/threat_detection.py +1103 -0
  115. kailash/nodes/testing/__init__.py +9 -0
  116. kailash/nodes/testing/credential_testing.py +499 -0
  117. kailash/nodes/transform/__init__.py +10 -2
  118. kailash/nodes/transform/chunkers.py +592 -1
  119. kailash/nodes/transform/processors.py +484 -14
  120. kailash/nodes/validation.py +321 -0
  121. kailash/runtime/access_controlled.py +1 -1
  122. kailash/runtime/async_local.py +41 -7
  123. kailash/runtime/docker.py +1 -1
  124. kailash/runtime/local.py +474 -55
  125. kailash/runtime/parallel.py +1 -1
  126. kailash/runtime/parallel_cyclic.py +1 -1
  127. kailash/runtime/testing.py +210 -2
  128. kailash/security.py +1 -1
  129. kailash/utils/migrations/__init__.py +25 -0
  130. kailash/utils/migrations/generator.py +433 -0
  131. kailash/utils/migrations/models.py +231 -0
  132. kailash/utils/migrations/runner.py +489 -0
  133. kailash/utils/secure_logging.py +342 -0
  134. kailash/workflow/__init__.py +16 -0
  135. kailash/workflow/cyclic_runner.py +3 -4
  136. kailash/workflow/graph.py +70 -2
  137. kailash/workflow/resilience.py +249 -0
  138. kailash/workflow/templates.py +726 -0
  139. {kailash-0.3.2.dist-info → kailash-0.4.1.dist-info}/METADATA +256 -20
  140. kailash-0.4.1.dist-info/RECORD +227 -0
  141. kailash/api/__init__.py +0 -17
  142. kailash/api/__main__.py +0 -6
  143. kailash/api/studio_secure.py +0 -893
  144. kailash/mcp/__main__.py +0 -13
  145. kailash/mcp/server_new.py +0 -336
  146. kailash/mcp/servers/__init__.py +0 -12
  147. kailash-0.3.2.dist-info/RECORD +0 -136
  148. {kailash-0.3.2.dist-info → kailash-0.4.1.dist-info}/WHEEL +0 -0
  149. {kailash-0.3.2.dist-info → kailash-0.4.1.dist-info}/entry_points.txt +0 -0
  150. {kailash-0.3.2.dist-info → kailash-0.4.1.dist-info}/licenses/LICENSE +0 -0
  151. {kailash-0.3.2.dist-info → kailash-0.4.1.dist-info}/top_level.txt +0 -0
@@ -289,6 +289,160 @@ class CSVReaderNode(Node):
 
         return result
 
+    def _infer_type(self, value: str) -> Any:
+        """Infer the appropriate Python type for a CSV value.
+
+        Args:
+            value: String value from CSV
+
+        Returns:
+            Value converted to appropriate type (int, float, bool, or str)
+        """
+        if not value or value.strip() == "":
+            return None
+
+        value = value.strip()
+
+        # Try boolean first (only explicit boolean representations, not numeric 0/1)
+        if value.lower() in ("true", "false", "yes", "no"):
+            return value.lower() in ("true", "yes")
+
+        # Try integer
+        try:
+            if (
+                "." not in value
+                and value.isdigit()
+                or (value.startswith("-") and value[1:].isdigit())
+            ):
+                return int(value)
+        except ValueError:
+            pass
+
+        # Try float
+        try:
+            if "." in value or "e" in value.lower():
+                return float(value)
+        except ValueError:
+            pass
+
+        # Return as string
+        return value
+
+    async def async_run(self, **kwargs) -> dict[str, Any]:
+        """Read CSV file asynchronously for better I/O performance.
+
+        This method provides true async file reading with aiofiles,
+        offering significant performance improvements for large files
+        and concurrent operations.
+
+        Args:
+            Same as run() method
+
+        Returns:
+            Same as run() method
+
+        Raises:
+            Same as run() method
+        """
+        # Import aiofiles for async file operations
+        try:
+            import aiofiles
+        except ImportError:
+            # Fallback to sync version if async dependencies not available
+            return self.run(**kwargs)
+
+        file_path = kwargs.get("file_path")
+        encoding = kwargs.get("encoding", "utf-8")
+        delimiter = kwargs.get("delimiter", ",")
+        has_header = kwargs.get("has_header", True)
+        skip_rows = kwargs.get("skip_rows", 0)
+        max_rows = kwargs.get("max_rows")
+        columns = kwargs.get("columns")
+        index_column = kwargs.get("index_column")
+
+        # Validate inputs using same logic as sync version
+        if not file_path:
+            raise ValueError("file_path is required")
+
+        validate_file_path(file_path)
+
+        try:
+            # Async file reading with aiofiles
+            async with aiofiles.open(file_path, mode="r", encoding=encoding) as file:
+                # Read all lines for CSV parsing
+                content = await file.read()
+
+            # Parse CSV content (CPU-bound, but file I/O is async)
+            import io
+
+            content_io = io.StringIO(content)
+
+            # Skip rows if requested
+            for _ in range(skip_rows):
+                next(content_io, None)
+
+            # Create CSV reader
+            csv_reader = csv.reader(content_io, delimiter=delimiter)
+
+            # Handle header row
+            headers = None
+            if has_header:
+                headers = next(csv_reader, None)
+                if headers and columns:
+                    # Validate that specified columns exist
+                    missing_cols = set(columns) - set(headers)
+                    if missing_cols:
+                        raise ValueError(f"Columns not found: {missing_cols}")
+            elif columns:
+                headers = columns
+
+            # Read data rows
+            data = []
+            data_indexed = {}
+
+            for row_num, row in enumerate(csv_reader):
+                if max_rows and row_num >= max_rows:
+                    break
+
+                if not row:  # Skip empty rows
+                    continue
+
+                # Process row based on whether we have headers
+                if headers:
+                    # Create dictionary with column names
+                    row_data = {}
+                    for i, value in enumerate(row):
+                        if i < len(headers):
+                            col_name = headers[i]
+                            # Only include specified columns if provided
+                            if not columns or col_name in columns:
+                                row_data[col_name] = self._infer_type(
+                                    value.strip() if value else value
+                                )
+                else:
+                    # No headers, return as list with type inference
+                    row_data = [
+                        self._infer_type(cell.strip() if cell else cell) for cell in row
+                    ]
+
+                data.append(row_data)
+
+                # Handle index column for faster lookups
+                if index_column and headers and index_column in headers:
+                    index_value = row_data.get(index_column)
+                    if index_value is not None:
+                        data_indexed[index_value] = row_data
+
+        except Exception as e:
+            raise ValueError(f"Error reading CSV file: {str(e)}")
+
+        # Return same format as sync version
+        result = {"data": data}
+        if index_column:
+            result["data_indexed"] = data_indexed
+
+        return result
+
 
 @register_node()
 class JSONReaderNode(Node):
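
With this hunk, async CSV reads return typed values rather than raw strings, and async_run() degrades silently to the synchronous run() when aiofiles is missing. A minimal usage sketch, not part of the diff, assuming CSVReaderNode is exported from kailash.nodes.data.readers (the file this diff modifies), can be constructed with no arguments, and that aiofiles is installed:

import asyncio

from kailash.nodes.data.readers import CSVReaderNode  # assumed import path

async def main():
    reader = CSVReaderNode()  # assumes a no-argument constructor
    result = await reader.async_run(
        file_path="customers.csv",  # hypothetical input file
        delimiter=",",
        has_header=True,
        index_column="id",  # populates result["data_indexed"] keyed by "id"
    )
    # Values come back with inferred types per _infer_type: "42" -> 42,
    # "3.5" -> 3.5, "true"/"yes" -> True, "" -> None, otherwise str.
    print(result["data"][:3])
    print(result["data_indexed"])

asyncio.run(main())
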
@@ -416,6 +570,51 @@ class JSONReaderNode(Node):
 
         return {"data": data}
 
+    async def async_run(self, **kwargs) -> dict[str, Any]:
+        """Read JSON file asynchronously for better I/O performance.
+
+        This method provides true async file reading with aiofiles,
+        offering significant performance improvements for large files
+        and concurrent operations.
+
+        Args:
+            Same as run() method
+
+        Returns:
+            Same as run() method
+
+        Raises:
+            Same as run() method
+        """
+        # Import aiofiles for async file operations
+        try:
+            import aiofiles
+        except ImportError:
+            # Fallback to sync version if async dependencies not available
+            return self.run(**kwargs)
+
+        file_path = kwargs.get("file_path") or self.config.get("file_path")
+
+        # Validate file path for security (same as sync version)
+        validated_path = validate_file_path(file_path, operation="JSON read")
+
+        try:
+            # Async file reading with aiofiles
+            async with aiofiles.open(
+                validated_path, mode="r", encoding="utf-8"
+            ) as file:
+                content = await file.read()
+
+            # Parse JSON content (CPU-bound, but file I/O is async)
+            data = json.loads(content)
+
+        except json.JSONDecodeError as e:
+            raise ValueError(f"Invalid JSON in file {validated_path}: {str(e)}")
+        except Exception as e:
+            raise ValueError(f"Error reading JSON file {validated_path}: {str(e)}")
+
+        return {"data": data}
+
 
 @register_node()
 class TextReaderNode(Node):
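
JSONReaderNode gains the same optional async path, with file_path resolvable from either the call or the node's config. A minimal sketch under the same assumptions as above (import path, no-argument constructor, aiofiles installed; without aiofiles, async_run() delegates to run()):

import asyncio

from kailash.nodes.data.readers import JSONReaderNode  # assumed import path

async def main():
    reader = JSONReaderNode()  # assumes a no-argument constructor
    # file_path may also come from the node's config, per the diff above
    result = await reader.async_run(file_path="settings.json")  # hypothetical file
    print(result["data"])

asyncio.run(main())
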
@@ -557,3 +756,571 @@ class TextReaderNode(Node):
             text = f.read()
 
         return {"text": text}
+
+
+@register_node()
+class DocumentProcessorNode(Node):
+    """
+    Advanced document processor that reads and processes multiple document formats
+    with automatic format detection, metadata extraction, and structured output.
+
+    This node unifies document reading across formats (PDF, DOCX, MD, TXT, HTML, RTF)
+    and provides consistent structured output with extracted metadata, making it
+    ideal for document analysis workflows, content management, and RAG systems.
+
+    Design Philosophy:
+        The DocumentProcessorNode embodies "universal document accessibility."
+        Rather than requiring format-specific readers, it automatically detects
+        and processes various document types, extracting both content and metadata
+        for comprehensive document understanding.
+
+    Upstream Dependencies:
+        - File system providing documents
+        - Path discovery nodes
+        - Document management systems
+        - User inputs specifying documents
+
+    Downstream Consumers:
+        - Chunking nodes for text segmentation
+        - Embedding nodes for vector processing
+        - LLM nodes for content analysis
+        - Indexing systems for document search
+        - Metadata analyzers for classification
+
+    Supported Formats:
+        - PDF: Full text extraction with metadata
+        - DOCX: Content and document properties
+        - TXT: Plain text with encoding detection
+        - MD: Markdown with structure parsing
+        - HTML: Text extraction from markup
+        - RTF: Rich text format processing
+        - Auto-detection based on file extension
+
+    Configuration:
+        - extract_metadata: Include document properties
+        - preserve_structure: Maintain document sections
+        - encoding: Text encoding for plain text files
+        - extract_images: Include image references (future)
+        - page_numbers: Include page/section numbers
+
+    Examples:
+        >>> processor = DocumentProcessorNode(
+        ...     extract_metadata=True,
+        ...     preserve_structure=True
+        ... )
+        >>> result = processor.run(
+        ...     file_path="document.pdf"
+        ... )
+        >>> content = result["content"]
+        >>> metadata = result["metadata"]
+        >>> sections = result["sections"]
+    """
+
+    def __init__(self, name: str = "document_processor", **kwargs):
+        # Set attributes before calling super().__init__() as Kailash validates during init
+        self.extract_metadata = kwargs.get("extract_metadata", True)
+        self.preserve_structure = kwargs.get("preserve_structure", True)
+        self.encoding = kwargs.get("encoding", "utf-8")
+        self.extract_images = kwargs.get("extract_images", False)
+        self.page_numbers = kwargs.get("page_numbers", True)
+
+        super().__init__(name=name)
+
+    def get_parameters(self) -> dict[str, NodeParameter]:
+        """Define input parameters for document processing."""
+        return {
+            "file_path": NodeParameter(
+                name="file_path",
+                type=str,
+                required=True,
+                description="Path to the document file to process",
+            ),
+            "extract_metadata": NodeParameter(
+                name="extract_metadata",
+                type=bool,
+                required=False,
+                default=self.extract_metadata,
+                description="Extract document metadata (title, author, creation date, etc.)",
+            ),
+            "preserve_structure": NodeParameter(
+                name="preserve_structure",
+                type=bool,
+                required=False,
+                default=self.preserve_structure,
+                description="Preserve document structure (sections, headings, etc.)",
+            ),
+            "encoding": NodeParameter(
+                name="encoding",
+                type=str,
+                required=False,
+                default=self.encoding,
+                description="Text encoding for plain text files",
+            ),
+            "page_numbers": NodeParameter(
+                name="page_numbers",
+                type=bool,
+                required=False,
+                default=self.page_numbers,
+                description="Include page/section numbers in output",
+            ),
+            "extract_images": NodeParameter(
+                name="extract_images",
+                type=bool,
+                required=False,
+                default=self.extract_images,
+                description="Extract image references and descriptions",
+            ),
+        }
+
+    def run(self, **kwargs) -> dict[str, Any]:
+        """Execute document processing operation."""
+        file_path = kwargs.get("file_path", "")
+        extract_metadata = kwargs.get("extract_metadata", self.extract_metadata)
+        preserve_structure = kwargs.get("preserve_structure", self.preserve_structure)
+        encoding = kwargs.get("encoding", self.encoding)
+        page_numbers = kwargs.get("page_numbers", self.page_numbers)
+        extract_images = kwargs.get("extract_images", self.extract_images)
+
+        if not file_path:
+            return {
+                "error": "File path is required",
+                "content": "",
+                "metadata": {},
+                "sections": [],
+            }
+
+        try:
+            # Validate file path for security
+            validated_path = validate_file_path(file_path, operation="document read")
+
+            # Detect document format
+            document_format = self._detect_format(validated_path)
+
+            # Process document based on format
+            if document_format == "pdf":
+                result = self._process_pdf(
+                    validated_path, extract_metadata, preserve_structure, page_numbers
+                )
+            elif document_format == "docx":
+                result = self._process_docx(
+                    validated_path, extract_metadata, preserve_structure
+                )
+            elif document_format == "markdown":
+                result = self._process_markdown(
+                    validated_path, encoding, preserve_structure
+                )
+            elif document_format == "html":
+                result = self._process_html(
+                    validated_path, encoding, preserve_structure
+                )
+            elif document_format == "rtf":
+                result = self._process_rtf(
+                    validated_path, extract_metadata, preserve_structure
+                )
+            else:  # Default to text
+                result = self._process_text(validated_path, encoding, extract_metadata)
+
+            # Add common metadata
+            result["metadata"]["file_path"] = file_path
+            result["metadata"]["document_format"] = document_format
+            result["metadata"]["processing_timestamp"] = self._get_timestamp()
+
+            return result
+
+        except Exception as e:
+            return {
+                "error": f"Document processing failed: {str(e)}",
+                "content": "",
+                "metadata": {"file_path": file_path, "error": str(e)},
+                "sections": [],
+                "document_format": "unknown",
+            }
+
+    def _detect_format(self, file_path: str) -> str:
+        """Detect document format based on file extension."""
+        import os
+
+        extension = os.path.splitext(file_path)[1].lower()
+
+        format_map = {
+            ".pdf": "pdf",
+            ".docx": "docx",
+            ".doc": "docx",  # Treat as docx for now
+            ".md": "markdown",
+            ".markdown": "markdown",
+            ".html": "html",
+            ".htm": "html",
+            ".rtf": "rtf",
+            ".txt": "text",
+            ".log": "text",
+            ".csv": "text",  # Could be enhanced
+            ".json": "text",  # Could be enhanced
+        }
+
+        return format_map.get(extension, "text")
+
+    def _process_pdf(
+        self,
+        file_path: str,
+        extract_metadata: bool,
+        preserve_structure: bool,
+        page_numbers: bool,
+    ) -> dict:
+        """Process PDF document (simplified implementation)."""
+        # In a real implementation, this would use PyPDF2, pdfplumber, or similar
+        # For now, return a structured placeholder
+
+        try:
+            # Placeholder implementation - in reality would use PDF libraries
+            content = f"[PDF Content from {file_path}]"
+
+            metadata = {}
+            if extract_metadata:
+                metadata.update(
+                    {
+                        "title": "Document Title",
+                        "author": "Document Author",
+                        "creation_date": "2024-01-01",
+                        "page_count": 1,
+                        "pdf_version": "1.4",
+                    }
+                )
+
+            sections = []
+            if preserve_structure:
+                sections = [
+                    {
+                        "type": "page",
+                        "number": 1,
+                        "content": content,
+                        "start_position": 0,
+                        "end_position": len(content),
+                    }
+                ]
+
+            return {
+                "content": content,
+                "metadata": metadata,
+                "sections": sections,
+                "document_format": "pdf",
+            }
+
+        except Exception as e:
+            # Fall back to text reading if PDF processing fails
+            return self._process_text(file_path, "utf-8", extract_metadata)
+
+    def _process_docx(
+        self, file_path: str, extract_metadata: bool, preserve_structure: bool
+    ) -> dict:
+        """Process DOCX document (simplified implementation)."""
+        # In a real implementation, this would use python-docx
+        # For now, return a structured placeholder
+
+        try:
+            # Placeholder implementation - in reality would use python-docx
+            content = f"[DOCX Content from {file_path}]"
+
+            metadata = {}
+            if extract_metadata:
+                metadata.update(
+                    {
+                        "title": "Document Title",
+                        "author": "Document Author",
+                        "creation_date": "2024-01-01",
+                        "modification_date": "2024-01-01",
+                        "word_count": len(content.split()),
+                    }
+                )
+
+            sections = []
+            if preserve_structure:
+                sections = [
+                    {
+                        "type": "paragraph",
+                        "style": "Normal",
+                        "content": content,
+                        "start_position": 0,
+                        "end_position": len(content),
+                    }
+                ]
+
+            return {
+                "content": content,
+                "metadata": metadata,
+                "sections": sections,
+                "document_format": "docx",
+            }
+
+        except Exception as e:
+            # Fall back to text reading if DOCX processing fails
+            return self._process_text(file_path, "utf-8", extract_metadata)
+
+    def _process_markdown(
+        self, file_path: str, encoding: str, preserve_structure: bool
+    ) -> dict:
+        """Process Markdown document with structure parsing."""
+        try:
+            with safe_open(file_path, "r", encoding=encoding) as f:
+                content = f.read()
+
+            metadata = {
+                "character_count": len(content),
+                "line_count": len(content.splitlines()),
+                "word_count": len(content.split()),
+            }
+
+            sections = []
+            if preserve_structure:
+                sections = self._parse_markdown_structure(content)
+
+            return {
+                "content": content,
+                "metadata": metadata,
+                "sections": sections,
+                "document_format": "markdown",
+            }
+
+        except Exception as e:
+            return {
+                "content": "",
+                "metadata": {"error": str(e)},
+                "sections": [],
+                "document_format": "markdown",
+            }
+
+    def _process_html(
+        self, file_path: str, encoding: str, preserve_structure: bool
+    ) -> dict:
+        """Process HTML document with text extraction."""
+        try:
+            with safe_open(file_path, "r", encoding=encoding) as f:
+                html_content = f.read()
+
+            # Simple HTML text extraction (in reality would use BeautifulSoup)
+            import re
+
+            # Remove script and style elements
+            html_content = re.sub(
+                r"<script[^>]*>.*?</script>",
+                "",
+                html_content,
+                flags=re.DOTALL | re.IGNORECASE,
+            )
+            html_content = re.sub(
+                r"<style[^>]*>.*?</style>",
+                "",
+                html_content,
+                flags=re.DOTALL | re.IGNORECASE,
+            )
+            # Remove HTML tags
+            content = re.sub(r"<[^>]+>", "", html_content)
+            # Clean up whitespace
+            content = re.sub(r"\s+", " ", content).strip()
+
+            metadata = {
+                "character_count": len(content),
+                "word_count": len(content.split()),
+                "original_html_length": len(html_content),
+            }
+
+            sections = []
+            if preserve_structure:
+                # Simple section detection based on common patterns
+                sections = self._parse_html_structure(html_content, content)
+
+            return {
+                "content": content,
+                "metadata": metadata,
+                "sections": sections,
+                "document_format": "html",
+            }
+
+        except Exception as e:
+            return {
+                "content": "",
+                "metadata": {"error": str(e)},
+                "sections": [],
+                "document_format": "html",
+            }
+
+    def _process_rtf(
+        self, file_path: str, extract_metadata: bool, preserve_structure: bool
+    ) -> dict:
+        """Process RTF document (simplified implementation)."""
+        # In a real implementation, this would use striprtf or similar
+        try:
+            with safe_open(file_path, "r", encoding="utf-8") as f:
+                rtf_content = f.read()
+
+            # Simple RTF text extraction (remove RTF control codes)
+            import re
+
+            content = re.sub(r"\\[a-z]+\d*\s?", "", rtf_content)  # Remove RTF commands
+            content = re.sub(r"[{}]", "", content)  # Remove braces
+            content = re.sub(r"\s+", " ", content).strip()  # Clean whitespace
+
+            metadata = {}
+            if extract_metadata:
+                metadata.update(
+                    {
+                        "character_count": len(content),
+                        "word_count": len(content.split()),
+                        "original_rtf_length": len(rtf_content),
+                    }
+                )
+
+            sections = []
+            if preserve_structure:
+                sections = [
+                    {
+                        "type": "document",
+                        "content": content,
+                        "start_position": 0,
+                        "end_position": len(content),
+                    }
+                ]
+
+            return {
+                "content": content,
+                "metadata": metadata,
+                "sections": sections,
+                "document_format": "rtf",
+            }
+
+        except Exception as e:
+            return {
+                "content": "",
+                "metadata": {"error": str(e)},
+                "sections": [],
+                "document_format": "rtf",
+            }
+
+    def _process_text(
+        self, file_path: str, encoding: str, extract_metadata: bool
+    ) -> dict:
+        """Process plain text document."""
+        try:
+            with safe_open(file_path, "r", encoding=encoding) as f:
+                content = f.read()
+
+            metadata = {}
+            if extract_metadata:
+                lines = content.splitlines()
+                metadata.update(
+                    {
+                        "character_count": len(content),
+                        "line_count": len(lines),
+                        "word_count": len(content.split()),
+                        "encoding": encoding,
+                        "max_line_length": (
+                            max(len(line) for line in lines) if lines else 0
+                        ),
+                        "blank_lines": sum(1 for line in lines if not line.strip()),
+                    }
+                )
+
+            sections = [
+                {
+                    "type": "text",
+                    "content": content,
+                    "start_position": 0,
+                    "end_position": len(content),
+                }
+            ]
+
+            return {
+                "content": content,
+                "metadata": metadata,
+                "sections": sections,
+                "document_format": "text",
+            }
+
+        except Exception as e:
+            return {
+                "content": "",
+                "metadata": {"error": str(e)},
+                "sections": [],
+                "document_format": "text",
+            }
+
+    def _parse_markdown_structure(self, content: str) -> list:
+        """Parse Markdown structure into sections."""
+        import re
+
+        sections = []
+
+        # Find headings
+        heading_pattern = r"^(#{1,6})\s+(.+)$"
+        lines = content.splitlines()
+        current_pos = 0
+
+        for i, line in enumerate(lines):
+            match = re.match(heading_pattern, line)
+            if match:
+                level = len(match.group(1))
+                title = match.group(2)
+
+                # Calculate position in original content
+                line_start = content.find(line, current_pos)
+
+                sections.append(
+                    {
+                        "type": "heading",
+                        "level": level,
+                        "title": title,
+                        "content": line,
+                        "line_number": i + 1,
+                        "start_position": line_start,
+                        "end_position": line_start + len(line),
+                    }
+                )
+
+                current_pos = line_start + len(line)
+
+        return sections
+
+    def _parse_html_structure(self, html_content: str, text_content: str) -> list:
+        """Parse HTML structure into sections (simplified)."""
+        import re
+
+        sections = []
+
+        # Find title
+        title_match = re.search(
+            r"<title[^>]*>([^<]+)</title>", html_content, re.IGNORECASE
+        )
+        if title_match:
+            sections.append(
+                {
+                    "type": "title",
+                    "content": title_match.group(1),
+                    "start_position": 0,
+                    "end_position": len(title_match.group(1)),
+                }
+            )
+
+        # Find headings
+        heading_pattern = r"<(h[1-6])[^>]*>([^<]+)</h[1-6]>"
+        for match in re.finditer(heading_pattern, html_content, re.IGNORECASE):
+            tag = match.group(1)
+            text = match.group(2)
+            level = int(tag[1])
+
+            sections.append(
+                {
+                    "type": "heading",
+                    "level": level,
+                    "title": text,
+                    "content": text,
+                    "start_position": match.start(),
+                    "end_position": match.end(),
+                }
+            )
+
+        return sections
+
+    def _get_timestamp(self) -> str:
+        """Get current timestamp for metadata."""
+        from datetime import datetime
+
+        return datetime.now().isoformat()
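
Taken together: run() dispatches on the extension map in _detect_format, and every _process_* branch returns the same content/metadata/sections shape, with run() stamping file_path, document_format, and processing_timestamp onto the metadata. A minimal end-to-end sketch, not part of the diff, assuming the same import path as above; note the PDF, DOCX, and RTF branches are placeholders in this release, so only text-like formats yield real content:

from kailash.nodes.data.readers import DocumentProcessorNode  # assumed import path

processor = DocumentProcessorNode(extract_metadata=True, preserve_structure=True)
result = processor.run(file_path="notes.md")  # hypothetical Markdown file

print(result["document_format"])           # "markdown", via the extension map
print(result["metadata"]["word_count"])    # whitespace-delimited token count
for section in result["sections"]:         # one entry per "#"-style heading
    print(section["level"], section["title"], section["line_number"])
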