kailash 0.3.2__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kailash/__init__.py +33 -1
- kailash/access_control/__init__.py +129 -0
- kailash/access_control/managers.py +461 -0
- kailash/access_control/rule_evaluators.py +467 -0
- kailash/access_control_abac.py +825 -0
- kailash/config/__init__.py +27 -0
- kailash/config/database_config.py +359 -0
- kailash/database/__init__.py +28 -0
- kailash/database/execution_pipeline.py +499 -0
- kailash/middleware/__init__.py +306 -0
- kailash/middleware/auth/__init__.py +33 -0
- kailash/middleware/auth/access_control.py +436 -0
- kailash/middleware/auth/auth_manager.py +422 -0
- kailash/middleware/auth/jwt_auth.py +477 -0
- kailash/middleware/auth/kailash_jwt_auth.py +616 -0
- kailash/middleware/communication/__init__.py +37 -0
- kailash/middleware/communication/ai_chat.py +989 -0
- kailash/middleware/communication/api_gateway.py +802 -0
- kailash/middleware/communication/events.py +470 -0
- kailash/middleware/communication/realtime.py +710 -0
- kailash/middleware/core/__init__.py +21 -0
- kailash/middleware/core/agent_ui.py +890 -0
- kailash/middleware/core/schema.py +643 -0
- kailash/middleware/core/workflows.py +396 -0
- kailash/middleware/database/__init__.py +63 -0
- kailash/middleware/database/base.py +113 -0
- kailash/middleware/database/base_models.py +525 -0
- kailash/middleware/database/enums.py +106 -0
- kailash/middleware/database/migrations.py +12 -0
- kailash/{api/database.py → middleware/database/models.py} +183 -291
- kailash/middleware/database/repositories.py +685 -0
- kailash/middleware/database/session_manager.py +19 -0
- kailash/middleware/mcp/__init__.py +38 -0
- kailash/middleware/mcp/client_integration.py +585 -0
- kailash/middleware/mcp/enhanced_server.py +576 -0
- kailash/nodes/__init__.py +27 -3
- kailash/nodes/admin/__init__.py +42 -0
- kailash/nodes/admin/audit_log.py +794 -0
- kailash/nodes/admin/permission_check.py +864 -0
- kailash/nodes/admin/role_management.py +823 -0
- kailash/nodes/admin/security_event.py +1523 -0
- kailash/nodes/admin/user_management.py +944 -0
- kailash/nodes/ai/a2a.py +24 -7
- kailash/nodes/ai/ai_providers.py +248 -40
- kailash/nodes/ai/embedding_generator.py +11 -11
- kailash/nodes/ai/intelligent_agent_orchestrator.py +99 -11
- kailash/nodes/ai/llm_agent.py +436 -5
- kailash/nodes/ai/self_organizing.py +85 -10
- kailash/nodes/ai/vision_utils.py +148 -0
- kailash/nodes/alerts/__init__.py +26 -0
- kailash/nodes/alerts/base.py +234 -0
- kailash/nodes/alerts/discord.py +499 -0
- kailash/nodes/api/auth.py +287 -6
- kailash/nodes/api/rest.py +151 -0
- kailash/nodes/auth/__init__.py +17 -0
- kailash/nodes/auth/directory_integration.py +1228 -0
- kailash/nodes/auth/enterprise_auth_provider.py +1328 -0
- kailash/nodes/auth/mfa.py +2338 -0
- kailash/nodes/auth/risk_assessment.py +872 -0
- kailash/nodes/auth/session_management.py +1093 -0
- kailash/nodes/auth/sso.py +1040 -0
- kailash/nodes/base.py +344 -13
- kailash/nodes/base_cycle_aware.py +4 -2
- kailash/nodes/base_with_acl.py +1 -1
- kailash/nodes/code/python.py +283 -10
- kailash/nodes/compliance/__init__.py +9 -0
- kailash/nodes/compliance/data_retention.py +1888 -0
- kailash/nodes/compliance/gdpr.py +2004 -0
- kailash/nodes/data/__init__.py +22 -2
- kailash/nodes/data/async_connection.py +469 -0
- kailash/nodes/data/async_sql.py +757 -0
- kailash/nodes/data/async_vector.py +598 -0
- kailash/nodes/data/readers.py +767 -0
- kailash/nodes/data/retrieval.py +360 -1
- kailash/nodes/data/sharepoint_graph.py +397 -21
- kailash/nodes/data/sql.py +94 -5
- kailash/nodes/data/streaming.py +68 -8
- kailash/nodes/data/vector_db.py +54 -4
- kailash/nodes/enterprise/__init__.py +13 -0
- kailash/nodes/enterprise/batch_processor.py +741 -0
- kailash/nodes/enterprise/data_lineage.py +497 -0
- kailash/nodes/logic/convergence.py +31 -9
- kailash/nodes/logic/operations.py +14 -3
- kailash/nodes/mixins/__init__.py +8 -0
- kailash/nodes/mixins/event_emitter.py +201 -0
- kailash/nodes/mixins/mcp.py +9 -4
- kailash/nodes/mixins/security.py +165 -0
- kailash/nodes/monitoring/__init__.py +7 -0
- kailash/nodes/monitoring/performance_benchmark.py +2497 -0
- kailash/nodes/rag/__init__.py +284 -0
- kailash/nodes/rag/advanced.py +1615 -0
- kailash/nodes/rag/agentic.py +773 -0
- kailash/nodes/rag/conversational.py +999 -0
- kailash/nodes/rag/evaluation.py +875 -0
- kailash/nodes/rag/federated.py +1188 -0
- kailash/nodes/rag/graph.py +721 -0
- kailash/nodes/rag/multimodal.py +671 -0
- kailash/nodes/rag/optimized.py +933 -0
- kailash/nodes/rag/privacy.py +1059 -0
- kailash/nodes/rag/query_processing.py +1335 -0
- kailash/nodes/rag/realtime.py +764 -0
- kailash/nodes/rag/registry.py +547 -0
- kailash/nodes/rag/router.py +837 -0
- kailash/nodes/rag/similarity.py +1854 -0
- kailash/nodes/rag/strategies.py +566 -0
- kailash/nodes/rag/workflows.py +575 -0
- kailash/nodes/security/__init__.py +19 -0
- kailash/nodes/security/abac_evaluator.py +1411 -0
- kailash/nodes/security/audit_log.py +103 -0
- kailash/nodes/security/behavior_analysis.py +1893 -0
- kailash/nodes/security/credential_manager.py +401 -0
- kailash/nodes/security/rotating_credentials.py +760 -0
- kailash/nodes/security/security_event.py +133 -0
- kailash/nodes/security/threat_detection.py +1103 -0
- kailash/nodes/testing/__init__.py +9 -0
- kailash/nodes/testing/credential_testing.py +499 -0
- kailash/nodes/transform/__init__.py +10 -2
- kailash/nodes/transform/chunkers.py +592 -1
- kailash/nodes/transform/processors.py +484 -14
- kailash/nodes/validation.py +321 -0
- kailash/runtime/access_controlled.py +1 -1
- kailash/runtime/async_local.py +41 -7
- kailash/runtime/docker.py +1 -1
- kailash/runtime/local.py +474 -55
- kailash/runtime/parallel.py +1 -1
- kailash/runtime/parallel_cyclic.py +1 -1
- kailash/runtime/testing.py +210 -2
- kailash/security.py +1 -1
- kailash/utils/migrations/__init__.py +25 -0
- kailash/utils/migrations/generator.py +433 -0
- kailash/utils/migrations/models.py +231 -0
- kailash/utils/migrations/runner.py +489 -0
- kailash/utils/secure_logging.py +342 -0
- kailash/workflow/__init__.py +16 -0
- kailash/workflow/cyclic_runner.py +3 -4
- kailash/workflow/graph.py +70 -2
- kailash/workflow/resilience.py +249 -0
- kailash/workflow/templates.py +726 -0
- {kailash-0.3.2.dist-info → kailash-0.4.1.dist-info}/METADATA +256 -20
- kailash-0.4.1.dist-info/RECORD +227 -0
- kailash/api/__init__.py +0 -17
- kailash/api/__main__.py +0 -6
- kailash/api/studio_secure.py +0 -893
- kailash/mcp/__main__.py +0 -13
- kailash/mcp/server_new.py +0 -336
- kailash/mcp/servers/__init__.py +0 -12
- kailash-0.3.2.dist-info/RECORD +0 -136
- {kailash-0.3.2.dist-info → kailash-0.4.1.dist-info}/WHEEL +0 -0
- {kailash-0.3.2.dist-info → kailash-0.4.1.dist-info}/entry_points.txt +0 -0
- {kailash-0.3.2.dist-info → kailash-0.4.1.dist-info}/licenses/LICENSE +0 -0
- {kailash-0.3.2.dist-info → kailash-0.4.1.dist-info}/top_level.txt +0 -0
kailash/nodes/data/readers.py
CHANGED
@@ -289,6 +289,160 @@ class CSVReaderNode(Node):

         return result

+    def _infer_type(self, value: str) -> Any:
+        """Infer the appropriate Python type for a CSV value.
+
+        Args:
+            value: String value from CSV
+
+        Returns:
+            Value converted to appropriate type (int, float, bool, or str)
+        """
+        if not value or value.strip() == "":
+            return None
+
+        value = value.strip()
+
+        # Try boolean first (only explicit boolean representations, not numeric 0/1)
+        if value.lower() in ("true", "false", "yes", "no"):
+            return value.lower() in ("true", "yes")
+
+        # Try integer
+        try:
+            if (
+                "." not in value
+                and value.isdigit()
+                or (value.startswith("-") and value[1:].isdigit())
+            ):
+                return int(value)
+        except ValueError:
+            pass
+
+        # Try float
+        try:
+            if "." in value or "e" in value.lower():
+                return float(value)
+        except ValueError:
+            pass
+
+        # Return as string
+        return value
+
+    async def async_run(self, **kwargs) -> dict[str, Any]:
+        """Read CSV file asynchronously for better I/O performance.
+
+        This method provides true async file reading with aiofiles,
+        offering significant performance improvements for large files
+        and concurrent operations.
+
+        Args:
+            Same as run() method
+
+        Returns:
+            Same as run() method
+
+        Raises:
+            Same as run() method
+        """
+        # Import aiofiles for async file operations
+        try:
+            import aiofiles
+        except ImportError:
+            # Fallback to sync version if async dependencies not available
+            return self.run(**kwargs)
+
+        file_path = kwargs.get("file_path")
+        encoding = kwargs.get("encoding", "utf-8")
+        delimiter = kwargs.get("delimiter", ",")
+        has_header = kwargs.get("has_header", True)
+        skip_rows = kwargs.get("skip_rows", 0)
+        max_rows = kwargs.get("max_rows")
+        columns = kwargs.get("columns")
+        index_column = kwargs.get("index_column")
+
+        # Validate inputs using same logic as sync version
+        if not file_path:
+            raise ValueError("file_path is required")
+
+        validate_file_path(file_path)
+
+        try:
+            # Async file reading with aiofiles
+            async with aiofiles.open(file_path, mode="r", encoding=encoding) as file:
+                # Read all lines for CSV parsing
+                content = await file.read()
+
+            # Parse CSV content (CPU-bound, but file I/O is async)
+            import io
+
+            content_io = io.StringIO(content)
+
+            # Skip rows if requested
+            for _ in range(skip_rows):
+                next(content_io, None)
+
+            # Create CSV reader
+            csv_reader = csv.reader(content_io, delimiter=delimiter)
+
+            # Handle header row
+            headers = None
+            if has_header:
+                headers = next(csv_reader, None)
+                if headers and columns:
+                    # Validate that specified columns exist
+                    missing_cols = set(columns) - set(headers)
+                    if missing_cols:
+                        raise ValueError(f"Columns not found: {missing_cols}")
+            elif columns:
+                headers = columns
+
+            # Read data rows
+            data = []
+            data_indexed = {}
+
+            for row_num, row in enumerate(csv_reader):
+                if max_rows and row_num >= max_rows:
+                    break
+
+                if not row:  # Skip empty rows
+                    continue
+
+                # Process row based on whether we have headers
+                if headers:
+                    # Create dictionary with column names
+                    row_data = {}
+                    for i, value in enumerate(row):
+                        if i < len(headers):
+                            col_name = headers[i]
+                            # Only include specified columns if provided
+                            if not columns or col_name in columns:
+                                row_data[col_name] = self._infer_type(
+                                    value.strip() if value else value
+                                )
+                else:
+                    # No headers, return as list with type inference
+                    row_data = [
+                        self._infer_type(cell.strip() if cell else cell) for cell in row
+                    ]
+
+                data.append(row_data)
+
+                # Handle index column for faster lookups
+                if index_column and headers and index_column in headers:
+                    index_value = row_data.get(index_column)
+                    if index_value is not None:
+                        data_indexed[index_value] = row_data
+
+        except Exception as e:
+            raise ValueError(f"Error reading CSV file: {str(e)}")
+
+        # Return same format as sync version
+        result = {"data": data}
+        if index_column:
+            result["data_indexed"] = data_indexed
+
+        return result
+

 @register_node()
 class JSONReaderNode(Node):
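For orientation, a minimal usage sketch of the new CSVReaderNode.async_run path (not from the package docs; the file name is a placeholder, and it assumes the node can be constructed with defaults). When aiofiles is not installed, async_run falls back to the synchronous run():

import asyncio

from kailash.nodes.data.readers import CSVReaderNode

async def main():
    reader = CSVReaderNode()  # assumed default construction
    # "customers.csv" is a placeholder path; cell values are converted via _infer_type
    result = await reader.async_run(file_path="customers.csv", has_header=True)
    print(result["data"][:3])  # list of dicts keyed by the header row

asyncio.run(main())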
@@ -416,6 +570,51 @@ class JSONReaderNode(Node):

         return {"data": data}

+    async def async_run(self, **kwargs) -> dict[str, Any]:
+        """Read JSON file asynchronously for better I/O performance.
+
+        This method provides true async file reading with aiofiles,
+        offering significant performance improvements for large files
+        and concurrent operations.
+
+        Args:
+            Same as run() method
+
+        Returns:
+            Same as run() method
+
+        Raises:
+            Same as run() method
+        """
+        # Import aiofiles for async file operations
+        try:
+            import aiofiles
+        except ImportError:
+            # Fallback to sync version if async dependencies not available
+            return self.run(**kwargs)
+
+        file_path = kwargs.get("file_path") or self.config.get("file_path")
+
+        # Validate file path for security (same as sync version)
+        validated_path = validate_file_path(file_path, operation="JSON read")
+
+        try:
+            # Async file reading with aiofiles
+            async with aiofiles.open(
+                validated_path, mode="r", encoding="utf-8"
+            ) as file:
+                content = await file.read()
+
+            # Parse JSON content (CPU-bound, but file I/O is async)
+            data = json.loads(content)
+
+        except json.JSONDecodeError as e:
+            raise ValueError(f"Invalid JSON in file {validated_path}: {str(e)}")
+        except Exception as e:
+            raise ValueError(f"Error reading JSON file {validated_path}: {str(e)}")
+
+        return {"data": data}
+

 @register_node()
 class TextReaderNode(Node):
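Because CSVReaderNode and JSONReaderNode now both expose async_run, several files can be read concurrently; a sketch under the same assumptions (placeholder file names), where the file I/O overlaps even though parsing stays CPU-bound:

import asyncio

from kailash.nodes.data.readers import CSVReaderNode, JSONReaderNode

async def load_inputs():
    csv_reader, json_reader = CSVReaderNode(), JSONReaderNode()
    # Both reads are awaited together; each falls back to sync run() without aiofiles
    return await asyncio.gather(
        csv_reader.async_run(file_path="events.csv"),
        json_reader.async_run(file_path="config.json"),
    )

csv_result, json_result = asyncio.run(load_inputs())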
@@ -557,3 +756,571 @@ class TextReaderNode(Node):
             text = f.read()

         return {"text": text}
+
+
+@register_node()
+class DocumentProcessorNode(Node):
+    """
+    Advanced document processor that reads and processes multiple document formats
+    with automatic format detection, metadata extraction, and structured output.
+
+    This node unifies document reading across formats (PDF, DOCX, MD, TXT, HTML, RTF)
+    and provides consistent structured output with extracted metadata, making it
+    ideal for document analysis workflows, content management, and RAG systems.
+
+    Design Philosophy:
+        The DocumentProcessorNode embodies "universal document accessibility."
+        Rather than requiring format-specific readers, it automatically detects
+        and processes various document types, extracting both content and metadata
+        for comprehensive document understanding.
+
+    Upstream Dependencies:
+        - File system providing documents
+        - Path discovery nodes
+        - Document management systems
+        - User inputs specifying documents
+
+    Downstream Consumers:
+        - Chunking nodes for text segmentation
+        - Embedding nodes for vector processing
+        - LLM nodes for content analysis
+        - Indexing systems for document search
+        - Metadata analyzers for classification
+
+    Supported Formats:
+        - PDF: Full text extraction with metadata
+        - DOCX: Content and document properties
+        - TXT: Plain text with encoding detection
+        - MD: Markdown with structure parsing
+        - HTML: Text extraction from markup
+        - RTF: Rich text format processing
+        - Auto-detection based on file extension
+
+    Configuration:
+        - extract_metadata: Include document properties
+        - preserve_structure: Maintain document sections
+        - encoding: Text encoding for plain text files
+        - extract_images: Include image references (future)
+        - page_numbers: Include page/section numbers
+
+    Examples:
+        >>> processor = DocumentProcessorNode(
+        ...     extract_metadata=True,
+        ...     preserve_structure=True
+        ... )
+        >>> result = processor.run(
+        ...     file_path="document.pdf"
+        ... )
+        >>> content = result["content"]
+        >>> metadata = result["metadata"]
+        >>> sections = result["sections"]
+    """
+
+    def __init__(self, name: str = "document_processor", **kwargs):
+        # Set attributes before calling super().__init__() as Kailash validates during init
+        self.extract_metadata = kwargs.get("extract_metadata", True)
+        self.preserve_structure = kwargs.get("preserve_structure", True)
+        self.encoding = kwargs.get("encoding", "utf-8")
+        self.extract_images = kwargs.get("extract_images", False)
+        self.page_numbers = kwargs.get("page_numbers", True)
+
+        super().__init__(name=name)
+
+    def get_parameters(self) -> dict[str, NodeParameter]:
+        """Define input parameters for document processing."""
+        return {
+            "file_path": NodeParameter(
+                name="file_path",
+                type=str,
+                required=True,
+                description="Path to the document file to process",
+            ),
+            "extract_metadata": NodeParameter(
+                name="extract_metadata",
+                type=bool,
+                required=False,
+                default=self.extract_metadata,
+                description="Extract document metadata (title, author, creation date, etc.)",
+            ),
+            "preserve_structure": NodeParameter(
+                name="preserve_structure",
+                type=bool,
+                required=False,
+                default=self.preserve_structure,
+                description="Preserve document structure (sections, headings, etc.)",
+            ),
+            "encoding": NodeParameter(
+                name="encoding",
+                type=str,
+                required=False,
+                default=self.encoding,
+                description="Text encoding for plain text files",
+            ),
+            "page_numbers": NodeParameter(
+                name="page_numbers",
+                type=bool,
+                required=False,
+                default=self.page_numbers,
+                description="Include page/section numbers in output",
+            ),
+            "extract_images": NodeParameter(
+                name="extract_images",
+                type=bool,
+                required=False,
+                default=self.extract_images,
+                description="Extract image references and descriptions",
+            ),
+        }
+
+    def run(self, **kwargs) -> dict[str, Any]:
+        """Execute document processing operation."""
+        file_path = kwargs.get("file_path", "")
+        extract_metadata = kwargs.get("extract_metadata", self.extract_metadata)
+        preserve_structure = kwargs.get("preserve_structure", self.preserve_structure)
+        encoding = kwargs.get("encoding", self.encoding)
+        page_numbers = kwargs.get("page_numbers", self.page_numbers)
+        extract_images = kwargs.get("extract_images", self.extract_images)
+
+        if not file_path:
+            return {
+                "error": "File path is required",
+                "content": "",
+                "metadata": {},
+                "sections": [],
+            }
+
+        try:
+            # Validate file path for security
+            validated_path = validate_file_path(file_path, operation="document read")
+
+            # Detect document format
+            document_format = self._detect_format(validated_path)
+
+            # Process document based on format
+            if document_format == "pdf":
+                result = self._process_pdf(
+                    validated_path, extract_metadata, preserve_structure, page_numbers
+                )
+            elif document_format == "docx":
+                result = self._process_docx(
+                    validated_path, extract_metadata, preserve_structure
+                )
+            elif document_format == "markdown":
+                result = self._process_markdown(
+                    validated_path, encoding, preserve_structure
+                )
+            elif document_format == "html":
+                result = self._process_html(
+                    validated_path, encoding, preserve_structure
+                )
+            elif document_format == "rtf":
+                result = self._process_rtf(
+                    validated_path, extract_metadata, preserve_structure
+                )
+            else:  # Default to text
+                result = self._process_text(validated_path, encoding, extract_metadata)
+
+            # Add common metadata
+            result["metadata"]["file_path"] = file_path
+            result["metadata"]["document_format"] = document_format
+            result["metadata"]["processing_timestamp"] = self._get_timestamp()
+
+            return result
+
+        except Exception as e:
+            return {
+                "error": f"Document processing failed: {str(e)}",
+                "content": "",
+                "metadata": {"file_path": file_path, "error": str(e)},
+                "sections": [],
+                "document_format": "unknown",
+            }
+
+    def _detect_format(self, file_path: str) -> str:
+        """Detect document format based on file extension."""
+        import os
+
+        extension = os.path.splitext(file_path)[1].lower()
+
+        format_map = {
+            ".pdf": "pdf",
+            ".docx": "docx",
+            ".doc": "docx",  # Treat as docx for now
+            ".md": "markdown",
+            ".markdown": "markdown",
+            ".html": "html",
+            ".htm": "html",
+            ".rtf": "rtf",
+            ".txt": "text",
+            ".log": "text",
+            ".csv": "text",  # Could be enhanced
+            ".json": "text",  # Could be enhanced
+        }
+
+        return format_map.get(extension, "text")
+
+    def _process_pdf(
+        self,
+        file_path: str,
+        extract_metadata: bool,
+        preserve_structure: bool,
+        page_numbers: bool,
+    ) -> dict:
+        """Process PDF document (simplified implementation)."""
+        # In a real implementation, this would use PyPDF2, pdfplumber, or similar
+        # For now, return a structured placeholder
+
+        try:
+            # Placeholder implementation - in reality would use PDF libraries
+            content = f"[PDF Content from {file_path}]"
+
+            metadata = {}
+            if extract_metadata:
+                metadata.update(
+                    {
+                        "title": "Document Title",
+                        "author": "Document Author",
+                        "creation_date": "2024-01-01",
+                        "page_count": 1,
+                        "pdf_version": "1.4",
+                    }
+                )
+
+            sections = []
+            if preserve_structure:
+                sections = [
+                    {
+                        "type": "page",
+                        "number": 1,
+                        "content": content,
+                        "start_position": 0,
+                        "end_position": len(content),
+                    }
+                ]
+
+            return {
+                "content": content,
+                "metadata": metadata,
+                "sections": sections,
+                "document_format": "pdf",
+            }
+
+        except Exception as e:
+            # Fall back to text reading if PDF processing fails
+            return self._process_text(file_path, "utf-8", extract_metadata)
+
+    def _process_docx(
+        self, file_path: str, extract_metadata: bool, preserve_structure: bool
+    ) -> dict:
+        """Process DOCX document (simplified implementation)."""
+        # In a real implementation, this would use python-docx
+        # For now, return a structured placeholder
+
+        try:
+            # Placeholder implementation - in reality would use python-docx
+            content = f"[DOCX Content from {file_path}]"
+
+            metadata = {}
+            if extract_metadata:
+                metadata.update(
+                    {
+                        "title": "Document Title",
+                        "author": "Document Author",
+                        "creation_date": "2024-01-01",
+                        "modification_date": "2024-01-01",
+                        "word_count": len(content.split()),
+                    }
+                )
+
+            sections = []
+            if preserve_structure:
+                sections = [
+                    {
+                        "type": "paragraph",
+                        "style": "Normal",
+                        "content": content,
+                        "start_position": 0,
+                        "end_position": len(content),
+                    }
+                ]
+
+            return {
+                "content": content,
+                "metadata": metadata,
+                "sections": sections,
+                "document_format": "docx",
+            }
+
+        except Exception as e:
+            # Fall back to text reading if DOCX processing fails
+            return self._process_text(file_path, "utf-8", extract_metadata)
+
+    def _process_markdown(
+        self, file_path: str, encoding: str, preserve_structure: bool
+    ) -> dict:
+        """Process Markdown document with structure parsing."""
+        try:
+            with safe_open(file_path, "r", encoding=encoding) as f:
+                content = f.read()
+
+            metadata = {
+                "character_count": len(content),
+                "line_count": len(content.splitlines()),
+                "word_count": len(content.split()),
+            }
+
+            sections = []
+            if preserve_structure:
+                sections = self._parse_markdown_structure(content)
+
+            return {
+                "content": content,
+                "metadata": metadata,
+                "sections": sections,
+                "document_format": "markdown",
+            }
+
+        except Exception as e:
+            return {
+                "content": "",
+                "metadata": {"error": str(e)},
+                "sections": [],
+                "document_format": "markdown",
+            }
+
+    def _process_html(
+        self, file_path: str, encoding: str, preserve_structure: bool
+    ) -> dict:
+        """Process HTML document with text extraction."""
+        try:
+            with safe_open(file_path, "r", encoding=encoding) as f:
+                html_content = f.read()
+
+            # Simple HTML text extraction (in reality would use BeautifulSoup)
+            import re
+
+            # Remove script and style elements
+            html_content = re.sub(
+                r"<script[^>]*>.*?</script>",
+                "",
+                html_content,
+                flags=re.DOTALL | re.IGNORECASE,
+            )
+            html_content = re.sub(
+                r"<style[^>]*>.*?</style>",
+                "",
+                html_content,
+                flags=re.DOTALL | re.IGNORECASE,
+            )
+            # Remove HTML tags
+            content = re.sub(r"<[^>]+>", "", html_content)
+            # Clean up whitespace
+            content = re.sub(r"\s+", " ", content).strip()
+
+            metadata = {
+                "character_count": len(content),
+                "word_count": len(content.split()),
+                "original_html_length": len(html_content),
+            }
+
+            sections = []
+            if preserve_structure:
+                # Simple section detection based on common patterns
+                sections = self._parse_html_structure(html_content, content)
+
+            return {
+                "content": content,
+                "metadata": metadata,
+                "sections": sections,
+                "document_format": "html",
+            }
+
+        except Exception as e:
+            return {
+                "content": "",
+                "metadata": {"error": str(e)},
+                "sections": [],
+                "document_format": "html",
+            }
+
+    def _process_rtf(
+        self, file_path: str, extract_metadata: bool, preserve_structure: bool
+    ) -> dict:
+        """Process RTF document (simplified implementation)."""
+        # In a real implementation, this would use striprtf or similar
+        try:
+            with safe_open(file_path, "r", encoding="utf-8") as f:
+                rtf_content = f.read()
+
+            # Simple RTF text extraction (remove RTF control codes)
+            import re
+
+            content = re.sub(r"\\[a-z]+\d*\s?", "", rtf_content)  # Remove RTF commands
+            content = re.sub(r"[{}]", "", content)  # Remove braces
+            content = re.sub(r"\s+", " ", content).strip()  # Clean whitespace
+
+            metadata = {}
+            if extract_metadata:
+                metadata.update(
+                    {
+                        "character_count": len(content),
+                        "word_count": len(content.split()),
+                        "original_rtf_length": len(rtf_content),
+                    }
+                )
+
+            sections = []
+            if preserve_structure:
+                sections = [
+                    {
+                        "type": "document",
+                        "content": content,
+                        "start_position": 0,
+                        "end_position": len(content),
+                    }
+                ]
+
+            return {
+                "content": content,
+                "metadata": metadata,
+                "sections": sections,
+                "document_format": "rtf",
+            }
+
+        except Exception as e:
+            return {
+                "content": "",
+                "metadata": {"error": str(e)},
+                "sections": [],
+                "document_format": "rtf",
+            }
+
+    def _process_text(
+        self, file_path: str, encoding: str, extract_metadata: bool
+    ) -> dict:
+        """Process plain text document."""
+        try:
+            with safe_open(file_path, "r", encoding=encoding) as f:
+                content = f.read()
+
+            metadata = {}
+            if extract_metadata:
+                lines = content.splitlines()
+                metadata.update(
+                    {
+                        "character_count": len(content),
+                        "line_count": len(lines),
+                        "word_count": len(content.split()),
+                        "encoding": encoding,
+                        "max_line_length": (
+                            max(len(line) for line in lines) if lines else 0
+                        ),
+                        "blank_lines": sum(1 for line in lines if not line.strip()),
+                    }
+                )
+
+            sections = [
+                {
+                    "type": "text",
+                    "content": content,
+                    "start_position": 0,
+                    "end_position": len(content),
+                }
+            ]
+
+            return {
+                "content": content,
+                "metadata": metadata,
+                "sections": sections,
+                "document_format": "text",
+            }
+
+        except Exception as e:
+            return {
+                "content": "",
+                "metadata": {"error": str(e)},
+                "sections": [],
+                "document_format": "text",
+            }
+
+    def _parse_markdown_structure(self, content: str) -> list:
+        """Parse Markdown structure into sections."""
+        import re
+
+        sections = []
+
+        # Find headings
+        heading_pattern = r"^(#{1,6})\s+(.+)$"
+        lines = content.splitlines()
+        current_pos = 0
+
+        for i, line in enumerate(lines):
+            match = re.match(heading_pattern, line)
+            if match:
+                level = len(match.group(1))
+                title = match.group(2)
+
+                # Calculate position in original content
+                line_start = content.find(line, current_pos)
+
+                sections.append(
+                    {
+                        "type": "heading",
+                        "level": level,
+                        "title": title,
+                        "content": line,
+                        "line_number": i + 1,
+                        "start_position": line_start,
+                        "end_position": line_start + len(line),
+                    }
+                )
+
+                current_pos = line_start + len(line)
+
+        return sections
+
+    def _parse_html_structure(self, html_content: str, text_content: str) -> list:
+        """Parse HTML structure into sections (simplified)."""
+        import re
+
+        sections = []
+
+        # Find title
+        title_match = re.search(
+            r"<title[^>]*>([^<]+)</title>", html_content, re.IGNORECASE
+        )
+        if title_match:
+            sections.append(
+                {
+                    "type": "title",
+                    "content": title_match.group(1),
+                    "start_position": 0,
+                    "end_position": len(title_match.group(1)),
+                }
+            )
+
+        # Find headings
+        heading_pattern = r"<(h[1-6])[^>]*>([^<]+)</h[1-6]>"
+        for match in re.finditer(heading_pattern, html_content, re.IGNORECASE):
+            tag = match.group(1)
+            text = match.group(2)
+            level = int(tag[1])
+
+            sections.append(
+                {
+                    "type": "heading",
+                    "level": level,
+                    "title": text,
+                    "content": text,
+                    "start_position": match.start(),
+                    "end_position": match.end(),
+                }
+            )
+
+        return sections
+
+    def _get_timestamp(self) -> str:
+        """Get current timestamp for metadata."""
+        from datetime import datetime
+
+        return datetime.now().isoformat()