chunksilo 2.3.2__tar.gz → 2.3.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {chunksilo-2.3.2/src/chunksilo.egg-info → chunksilo-2.3.3}/PKG-INFO +1 -1
- {chunksilo-2.3.2 → chunksilo-2.3.3}/pyproject.toml +1 -1
- {chunksilo-2.3.2 → chunksilo-2.3.3}/src/chunksilo/__init__.py +1 -1
- {chunksilo-2.3.2 → chunksilo-2.3.3}/src/chunksilo/cfgload.py +1 -0
- {chunksilo-2.3.2 → chunksilo-2.3.3}/src/chunksilo/index.py +145 -57
- {chunksilo-2.3.2 → chunksilo-2.3.3/src/chunksilo.egg-info}/PKG-INFO +1 -1
- {chunksilo-2.3.2 → chunksilo-2.3.3}/src/chunksilo.egg-info/SOURCES.txt +1 -0
- {chunksilo-2.3.2 → chunksilo-2.3.3}/test/test_indexing_ui.py +27 -24
- chunksilo-2.3.3/test/test_scan_timeouts.py +369 -0
- {chunksilo-2.3.2 → chunksilo-2.3.3}/LICENSE +0 -0
- {chunksilo-2.3.2 → chunksilo-2.3.3}/NOTICE +0 -0
- {chunksilo-2.3.2 → chunksilo-2.3.3}/README.md +0 -0
- {chunksilo-2.3.2 → chunksilo-2.3.3}/requirements.txt +0 -0
- {chunksilo-2.3.2 → chunksilo-2.3.3}/setup.cfg +0 -0
- {chunksilo-2.3.2 → chunksilo-2.3.3}/src/chunksilo/__main__.py +0 -0
- {chunksilo-2.3.2 → chunksilo-2.3.3}/src/chunksilo/cli.py +0 -0
- {chunksilo-2.3.2 → chunksilo-2.3.3}/src/chunksilo/confluence_html_formatter.py +0 -0
- {chunksilo-2.3.2 → chunksilo-2.3.3}/src/chunksilo/search.py +0 -0
- {chunksilo-2.3.2 → chunksilo-2.3.3}/src/chunksilo/server.py +0 -0
- {chunksilo-2.3.2 → chunksilo-2.3.3}/src/chunksilo.egg-info/dependency_links.txt +0 -0
- {chunksilo-2.3.2 → chunksilo-2.3.3}/src/chunksilo.egg-info/entry_points.txt +0 -0
- {chunksilo-2.3.2 → chunksilo-2.3.3}/src/chunksilo.egg-info/requires.txt +0 -0
- {chunksilo-2.3.2 → chunksilo-2.3.3}/src/chunksilo.egg-info/top_level.txt +0 -0
- {chunksilo-2.3.2 → chunksilo-2.3.3}/test/test_chunk_location.py +0 -0
- {chunksilo-2.3.2 → chunksilo-2.3.3}/test/test_confluence_html_formatter.py +0 -0
- {chunksilo-2.3.2 → chunksilo-2.3.3}/test/test_error_handling.py +0 -0
- {chunksilo-2.3.2 → chunksilo-2.3.3}/test/test_heading_path_integration.py +0 -0
- {chunksilo-2.3.2 → chunksilo-2.3.3}/test/test_incremental_ingest.py +0 -0
- {chunksilo-2.3.2 → chunksilo-2.3.3}/test/test_indexing_benchmark.py +0 -0
- {chunksilo-2.3.2 → chunksilo-2.3.3}/test/test_jira_integration.py +0 -0
- {chunksilo-2.3.2 → chunksilo-2.3.3}/test/test_quoted_phrases.py +0 -0
- {chunksilo-2.3.2 → chunksilo-2.3.3}/test/test_rag_metrics.py +0 -0
- {chunksilo-2.3.2 → chunksilo-2.3.3}/test/test_retrieval_only.py +0 -0
- {chunksilo-2.3.2 → chunksilo-2.3.3}/test/test_system.py +0 -0
- {chunksilo-2.3.2 → chunksilo-2.3.3}/test/test_utils.py +0 -0
|
@@ -75,6 +75,7 @@ _DEFAULTS: dict[str, Any] = {
|
|
|
75
75
|
"per_file_seconds": 300, # 5 minutes per file
|
|
76
76
|
"doc_conversion_seconds": 90, # 90 seconds for .doc conversion
|
|
77
77
|
"heartbeat_interval_seconds": 2,
|
|
78
|
+
"scan_item_seconds": 30, # timeout for stat/hash/walk during scanning
|
|
78
79
|
},
|
|
79
80
|
"logging": {
|
|
80
81
|
"log_slow_files": True,
|
|
@@ -10,6 +10,7 @@ import itertools
|
|
|
10
10
|
import json
|
|
11
11
|
import logging
|
|
12
12
|
import os
|
|
13
|
+
import queue
|
|
13
14
|
import signal
|
|
14
15
|
import sqlite3
|
|
15
16
|
import sys
|
|
@@ -575,48 +576,28 @@ def _extract_pdf_headings_from_outline(
|
|
|
575
576
|
return []
|
|
576
577
|
|
|
577
578
|
|
|
578
|
-
|
|
579
|
-
docx_path: Path,
|
|
580
|
-
ctx: "FileProcessingContext | None",
|
|
581
|
-
timeout_seconds: float,
|
|
582
|
-
) -> List[LlamaIndexDocument]:
|
|
583
|
-
"""Run split_docx_into_heading_documents() in a worker thread with a hard timeout.
|
|
579
|
+
# Unique marker returned by _run_with_timeout() when the caller supplies no
# explicit default, so "timed out" can be told apart from any real return
# value (including None and False).
_SCAN_TIMEOUT_SENTINEL = object()


def _run_with_timeout(fn, timeout_seconds: float, default=_SCAN_TIMEOUT_SENTINEL):
    """Run *fn* in a background thread, returning *default* on timeout.

    Args:
        fn: Zero-argument callable to execute.
        timeout_seconds: Maximum number of seconds to wait for *fn*. ``None``
            waits indefinitely; zero or a negative value only succeeds if
            *fn* has already finished.
        default: Value returned when *fn* has not finished in time. Defaults
            to ``_SCAN_TIMEOUT_SENTINEL``.

    Returns:
        The return value of *fn*, or *default* if the wait timed out.

    Raises:
        Whatever *fn* raises. This includes ``TimeoutError`` raised by *fn*
        itself, which is propagated rather than reported as a wait timeout.

    The work runs on a daemon thread. On timeout the caller returns
    immediately and the thread is abandoned; because it is a daemon thread,
    a filesystem call that never returns cannot keep the interpreter alive
    at shutdown.
    """
    import threading  # local import keeps this helper self-contained

    outcome: dict = {}
    finished = threading.Event()

    def _worker() -> None:
        try:
            outcome["value"] = fn()
        except BaseException as exc:  # handed over to the caller below
            outcome["error"] = exc
        finally:
            finished.set()

    threading.Thread(
        target=_worker, name="run-with-timeout", daemon=True
    ).start()

    # Event.wait() returns False only when the timeout elapsed first, so a
    # timeout is never confused with an exception raised inside fn.
    if not finished.wait(timeout_seconds):
        return default
    if "error" in outcome:
        raise outcome["error"]
    return outcome["value"]
|
|
620
601
|
|
|
621
602
|
|
|
622
603
|
class LocalFileSystemSource(DataSource):
|
|
@@ -627,17 +608,31 @@ class LocalFileSystemSource(DataSource):
|
|
|
627
608
|
self.base_dir = config.path
|
|
628
609
|
|
|
629
610
|
def is_available(self) -> bool:
    """Check if the directory is available and accessible.

    The probe (exists, is a directory, can be listed) runs through
    _run_with_timeout() using ``indexing.timeout.scan_item_seconds`` so an
    unresponsive network mount cannot hang the caller.

    Returns:
        True when ``base_dir`` exists, is a directory and its first entry
        can be listed. False when any of those checks fails, when the probe
        raises ``OSError``, or when the probe does not finish in time.
    """
    timeout = cfgload.get("indexing.timeout.scan_item_seconds", 30)

    def _check() -> bool:
        try:
            if not self.base_dir.exists():
                return False
            if not self.base_dir.is_dir():
                return False
            # Try to list directory to verify access (important for network mounts)
            next(self.base_dir.iterdir(), None)
            return True
        except OSError:  # PermissionError is a subclass of OSError
            return False

    # Use the sentinel default so a timeout can be told apart from a
    # directory that is genuinely missing or unreadable.
    result = _run_with_timeout(_check, timeout_seconds=timeout)
    if result is _SCAN_TIMEOUT_SENTINEL:
        logger.warning(
            f"Availability check timed out after {timeout}s for {self.base_dir}"
        )
        return False
    return result
|
|
641
636
|
|
|
642
637
|
def _matches_patterns(self, file_path: Path) -> bool:
|
|
643
638
|
"""Check if file matches include patterns and doesn't match exclude patterns.
|
|
@@ -700,6 +695,43 @@ class LocalFileSystemSource(DataSource):
|
|
|
700
695
|
return True
|
|
701
696
|
return False
|
|
702
697
|
|
|
698
|
+
def _walk_with_timeout(self):
    """Yield (root, dirs, files) tuples from os.walk with per-iteration timeout.

    Runs os.walk in a daemon thread, feeding results through a queue.
    If no result arrives within ``indexing.timeout.scan_item_seconds``, the
    walk is considered stalled, a warning is logged and iteration stops.

    Excluded directories are pruned inside the walking thread, before the
    entry is handed over. The walking thread runs ahead of the consumer, so
    pruning ``dirs`` only after receiving the tuple would be too late to
    prevent descent.

    Yields:
        The ``(root, dirs, files)`` tuples produced by ``os.walk``, with
        ``dirs`` already filtered through ``_should_skip_directory``.

    Raises:
        Any ``Exception`` raised by the walk itself, re-raised in the caller.
    """
    timeout = cfgload.get("indexing.timeout.scan_item_seconds", 30)
    results: queue.Queue = queue.Queue()
    end_of_walk = object()  # unique marker: signals end of iteration
    cancelled = threading.Event()

    def _producer():
        try:
            for root, dirs, files in os.walk(self.base_dir):
                if cancelled.is_set():
                    # Consumer has gone away; stop walking the tree.
                    return
                # Prune in place so os.walk does not descend into
                # excluded directories.
                dirs[:] = [d for d in dirs if not self._should_skip_directory(d)]
                results.put((root, dirs, files))
            results.put(end_of_walk)
        except Exception as exc:
            results.put(exc)

    threading.Thread(target=_producer, daemon=True).start()

    try:
        while True:
            try:
                item = results.get(timeout=timeout)
            except queue.Empty:
                logger.warning(
                    f"os.walk() stalled for {timeout}s on {self.base_dir}, "
                    "aborting directory scan"
                )
                return
            if item is end_of_walk:
                return
            if isinstance(item, Exception):
                raise item
            yield item
    finally:
        # Runs on normal completion, on stall, and when the generator is
        # closed early; lets the producer exit at its next iteration.
        cancelled.set()
|
|
734
|
+
|
|
703
735
|
def iter_files(self, tracked_files: Dict[str, dict] | None = None) -> Iterator[FileInfo]:
|
|
704
736
|
"""Yield FileInfo for each matching file in the source.
|
|
705
737
|
|
|
@@ -708,8 +740,7 @@ class LocalFileSystemSource(DataSource):
|
|
|
708
740
|
keyed by absolute path. Used for mtime-based fast pre-check.
|
|
709
741
|
"""
|
|
710
742
|
if self.config.recursive:
|
|
711
|
-
|
|
712
|
-
for root, dirs, files in os.walk(self.base_dir):
|
|
743
|
+
for root, dirs, files in self._walk_with_timeout():
|
|
713
744
|
# Prune excluded directories in-place to prevent descent
|
|
714
745
|
dirs[:] = [d for d in dirs if not self._should_skip_directory(d)]
|
|
715
746
|
|
|
@@ -720,7 +751,7 @@ class LocalFileSystemSource(DataSource):
|
|
|
720
751
|
continue
|
|
721
752
|
try:
|
|
722
753
|
yield self._create_file_info(file_path, tracked_files)
|
|
723
|
-
except (OSError, IOError) as e:
|
|
754
|
+
except (OSError, IOError, TimeoutError) as e:
|
|
724
755
|
logger.warning(f"Could not access file {file_path}: {e}")
|
|
725
756
|
continue
|
|
726
757
|
else:
|
|
@@ -738,7 +769,7 @@ class LocalFileSystemSource(DataSource):
|
|
|
738
769
|
continue
|
|
739
770
|
try:
|
|
740
771
|
yield self._create_file_info(f, tracked_files)
|
|
741
|
-
except (OSError, IOError) as e:
|
|
772
|
+
except (OSError, IOError, TimeoutError) as e:
|
|
742
773
|
logger.warning(f"Could not access file {f}: {e}")
|
|
743
774
|
continue
|
|
744
775
|
|
|
@@ -746,6 +777,30 @@ class LocalFileSystemSource(DataSource):
|
|
|
746
777
|
self,
|
|
747
778
|
file_path: Path,
|
|
748
779
|
tracked_files: Dict[str, dict] | None = None,
|
|
780
|
+
) -> FileInfo:
|
|
781
|
+
"""Create FileInfo with timeout protection against stalled mounts.
|
|
782
|
+
|
|
783
|
+
Delegates to _create_file_info_inner in a background thread so that a
|
|
784
|
+
blocking stat() or read() cannot hang the scan indefinitely.
|
|
785
|
+
|
|
786
|
+
Raises TimeoutError if the operation exceeds scan_item_seconds.
|
|
787
|
+
"""
|
|
788
|
+
timeout = cfgload.get("indexing.timeout.scan_item_seconds", 30)
|
|
789
|
+
result = _run_with_timeout(
|
|
790
|
+
lambda: self._create_file_info_inner(file_path, tracked_files),
|
|
791
|
+
timeout_seconds=timeout,
|
|
792
|
+
)
|
|
793
|
+
if result is _SCAN_TIMEOUT_SENTINEL:
|
|
794
|
+
logger.warning(
|
|
795
|
+
f"Timed out after {timeout}s accessing file {file_path}, skipping"
|
|
796
|
+
)
|
|
797
|
+
raise TimeoutError(f"stat/hash timed out for {file_path}")
|
|
798
|
+
return result
|
|
799
|
+
|
|
800
|
+
def _create_file_info_inner(
|
|
801
|
+
self,
|
|
802
|
+
file_path: Path,
|
|
803
|
+
tracked_files: Dict[str, dict] | None = None,
|
|
749
804
|
) -> FileInfo:
|
|
750
805
|
"""Create FileInfo with source directory context.
|
|
751
806
|
|
|
@@ -787,15 +842,30 @@ class LocalFileSystemSource(DataSource):
|
|
|
787
842
|
ctx: "FileProcessingContext | None" = None
|
|
788
843
|
) -> List[LlamaIndexDocument]:
|
|
789
844
|
file_path = Path(file_info.path)
|
|
790
|
-
|
|
791
|
-
|
|
845
|
+
exists_timeout = cfgload.get("indexing.timeout.scan_item_seconds", 30)
|
|
846
|
+
exists_result = _run_with_timeout(
|
|
847
|
+
file_path.exists, timeout_seconds=exists_timeout, default=False,
|
|
848
|
+
)
|
|
849
|
+
if not exists_result:
|
|
850
|
+
if exists_result is False:
|
|
851
|
+
logger.warning(f"Skipping disappeared file: {file_path}")
|
|
792
852
|
return []
|
|
793
853
|
if file_path.suffix.lower() == ".docx":
|
|
794
854
|
if ctx:
|
|
795
855
|
ctx.set_phase("Parsing DOCX")
|
|
796
856
|
remaining = ctx.remaining_seconds() if ctx else None
|
|
797
857
|
if remaining is not None:
|
|
798
|
-
|
|
858
|
+
result = _run_with_timeout(
|
|
859
|
+
lambda: split_docx_into_heading_documents(file_path, ctx),
|
|
860
|
+
timeout_seconds=remaining,
|
|
861
|
+
default=None,
|
|
862
|
+
)
|
|
863
|
+
if result is None:
|
|
864
|
+
logger.warning(
|
|
865
|
+
f"DOCX processing timed out after {remaining:.0f}s: {file_path}"
|
|
866
|
+
)
|
|
867
|
+
return []
|
|
868
|
+
return result
|
|
799
869
|
return split_docx_into_heading_documents(file_path, ctx)
|
|
800
870
|
elif file_path.suffix.lower() == ".doc":
|
|
801
871
|
# Convert .doc to .docx using LibreOffice, then process
|
|
@@ -814,7 +884,16 @@ class LocalFileSystemSource(DataSource):
|
|
|
814
884
|
ctx.set_phase("Parsing converted DOCX")
|
|
815
885
|
remaining = ctx.remaining_seconds() if ctx else None
|
|
816
886
|
if remaining is not None:
|
|
817
|
-
|
|
887
|
+
result = _run_with_timeout(
|
|
888
|
+
lambda: split_docx_into_heading_documents(docx_path, ctx),
|
|
889
|
+
timeout_seconds=remaining,
|
|
890
|
+
default=None,
|
|
891
|
+
)
|
|
892
|
+
if result is None:
|
|
893
|
+
logger.warning(
|
|
894
|
+
f"DOCX processing timed out after {remaining:.0f}s: {docx_path}"
|
|
895
|
+
)
|
|
896
|
+
docs = result if result is not None else []
|
|
818
897
|
else:
|
|
819
898
|
docs = split_docx_into_heading_documents(docx_path, ctx)
|
|
820
899
|
# Update metadata to point to original .doc file
|
|
@@ -834,7 +913,16 @@ class LocalFileSystemSource(DataSource):
|
|
|
834
913
|
)
|
|
835
914
|
remaining = ctx.remaining_seconds() if ctx else None
|
|
836
915
|
if remaining is not None:
|
|
837
|
-
|
|
916
|
+
result = _run_with_timeout(
|
|
917
|
+
reader.load_data,
|
|
918
|
+
timeout_seconds=remaining,
|
|
919
|
+
default=None,
|
|
920
|
+
)
|
|
921
|
+
if result is None:
|
|
922
|
+
logger.warning(
|
|
923
|
+
f"load_data() timed out after {remaining:.0f}s"
|
|
924
|
+
)
|
|
925
|
+
docs = result if result is not None else []
|
|
838
926
|
else:
|
|
839
927
|
docs = reader.load_data()
|
|
840
928
|
# Ensure dates are visible to LLM (remove from exclusion list)
|
|
@@ -779,58 +779,57 @@ class TestFileProcessingContextTimeout:
|
|
|
779
779
|
|
|
780
780
|
|
|
781
781
|
# =============================================================================
|
|
782
|
-
#
|
|
782
|
+
# _run_with_timeout integration tests for load_data / DOCX (Issue #39)
|
|
783
783
|
# =============================================================================
|
|
784
784
|
|
|
785
785
|
|
|
786
786
|
class TestLoadDataWithTimeout:
    """Exercise _run_with_timeout() with a reader-style ``load_data`` callable."""

    def test_returns_docs_on_success(self):
        """_run_with_timeout returns docs when load_data succeeds."""
        from unittest.mock import MagicMock
        from chunksilo.index import _run_with_timeout

        mock_reader = MagicMock()
        mock_reader.load_data.return_value = ["doc1", "doc2"]

        result = _run_with_timeout(mock_reader.load_data, timeout_seconds=5.0, default=None)
        assert result == ["doc1", "doc2"]

    def test_returns_default_on_timeout(self):
        """_run_with_timeout returns default when load_data hangs."""
        import threading
        from unittest.mock import MagicMock
        from chunksilo.index import _run_with_timeout

        # A releasable gate replaces a fixed sleep so the abandoned worker
        # can be let go as soon as the assertion data has been collected.
        release = threading.Event()
        mock_reader = MagicMock()
        mock_reader.load_data.side_effect = lambda: release.wait(10)

        try:
            result = _run_with_timeout(mock_reader.load_data, timeout_seconds=0.1, default=None)
        finally:
            release.set()
        assert result is None
|
|
813
808
|
|
|
814
809
|
|
|
815
810
|
class TestSplitDocxWithTimeout:
|
|
816
811
|
def test_returns_docs_on_success(self):
|
|
817
|
-
"""
|
|
812
|
+
"""_run_with_timeout returns docs when DOCX processing succeeds."""
|
|
818
813
|
from unittest.mock import patch, MagicMock
|
|
819
|
-
from chunksilo.index import
|
|
814
|
+
from chunksilo.index import _run_with_timeout
|
|
820
815
|
|
|
821
816
|
mock_doc = MagicMock()
|
|
822
817
|
with patch(
|
|
823
818
|
"chunksilo.index.split_docx_into_heading_documents",
|
|
824
819
|
return_value=[mock_doc],
|
|
825
|
-
):
|
|
826
|
-
result =
|
|
820
|
+
) as mock_split:
|
|
821
|
+
result = _run_with_timeout(
|
|
822
|
+
lambda: mock_split(Path("/fake/doc.docx"), None),
|
|
823
|
+
timeout_seconds=5.0,
|
|
824
|
+
default=None,
|
|
825
|
+
)
|
|
827
826
|
|
|
828
827
|
assert result == [mock_doc]
|
|
829
828
|
|
|
830
|
-
def
|
|
831
|
-
"""
|
|
829
|
+
def test_returns_default_on_timeout(self):
|
|
830
|
+
"""_run_with_timeout returns default when DOCX processing hangs."""
|
|
832
831
|
from unittest.mock import patch
|
|
833
|
-
from chunksilo.index import
|
|
832
|
+
from chunksilo.index import _run_with_timeout
|
|
834
833
|
|
|
835
834
|
def hang(*args, **kwargs):
|
|
836
835
|
time.sleep(10)
|
|
@@ -838,7 +837,11 @@ class TestSplitDocxWithTimeout:
|
|
|
838
837
|
with patch(
|
|
839
838
|
"chunksilo.index.split_docx_into_heading_documents",
|
|
840
839
|
side_effect=hang,
|
|
841
|
-
):
|
|
842
|
-
result =
|
|
840
|
+
) as mock_split:
|
|
841
|
+
result = _run_with_timeout(
|
|
842
|
+
lambda: mock_split(Path("/fake/doc.docx"), None),
|
|
843
|
+
timeout_seconds=0.1,
|
|
844
|
+
default=None,
|
|
845
|
+
)
|
|
843
846
|
|
|
844
|
-
assert result
|
|
847
|
+
assert result is None
|
|
@@ -0,0 +1,369 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Tests for scan-phase timeout protection.
|
|
3
|
+
|
|
4
|
+
Verifies that filesystem operations during the scanning phase (directory
|
|
5
|
+
traversal, stat, MD5 hashing, file existence checks) cannot hang indefinitely
|
|
6
|
+
when a network mount becomes unresponsive.
|
|
7
|
+
|
|
8
|
+
Uses threading.Event + time.sleep to simulate blocking filesystem calls
|
|
9
|
+
without needing a real stalled network mount.
|
|
10
|
+
"""
|
|
11
|
+
import hashlib
|
|
12
|
+
import os
|
|
13
|
+
import threading
|
|
14
|
+
import time
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
from unittest.mock import patch, MagicMock
|
|
17
|
+
|
|
18
|
+
import pytest
|
|
19
|
+
|
|
20
|
+
from chunksilo.index import (
|
|
21
|
+
_run_with_timeout,
|
|
22
|
+
_SCAN_TIMEOUT_SENTINEL,
|
|
23
|
+
DirectoryConfig,
|
|
24
|
+
FileInfo,
|
|
25
|
+
IndexConfig,
|
|
26
|
+
LocalFileSystemSource,
|
|
27
|
+
MultiDirectoryDataSource,
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
# ---------------------------------------------------------------------------
|
|
32
|
+
# Helpers
|
|
33
|
+
# ---------------------------------------------------------------------------
|
|
34
|
+
|
|
35
|
+
def _block_forever(*_args, **_kwargs):
    """Stand in for a stalled syscall by parking the calling thread.

    Any positional or keyword arguments are accepted and ignored. The wait
    is on an event nobody sets, capped at 60 seconds.
    """
    never_set = threading.Event()
    never_set.wait(timeout=60)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _make_source(tmp_path, **overrides):
    """Build a LocalFileSystemSource rooted at *tmp_path*.

    Extra keyword arguments are forwarded to DirectoryConfig unchanged.
    """
    return LocalFileSystemSource(DirectoryConfig(path=tmp_path, **overrides))
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
# ===========================================================================
|
|
47
|
+
# TestRunWithTimeout — tests for the _run_with_timeout() helper itself
|
|
48
|
+
# ===========================================================================
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class TestRunWithTimeout:
    """Unit tests for the _run_with_timeout() helper itself."""

    def test_fast_call_returns_result(self):
        """Callable that returns immediately yields its value."""
        assert _run_with_timeout(lambda: 42, timeout_seconds=5) == 42

    def test_slow_call_times_out(self):
        """Callable that blocks longer than timeout returns the default."""
        release = threading.Event()
        try:
            result = _run_with_timeout(
                lambda: release.wait(60) or "never",
                timeout_seconds=1,
                default="timed_out",
            )
        finally:
            # Let the abandoned worker finish instead of waiting out 60s.
            release.set()
        assert result == "timed_out"

    def test_slow_call_returns_sentinel_by_default(self):
        """When no default is given, the sentinel object is returned."""
        release = threading.Event()
        try:
            result = _run_with_timeout(
                lambda: release.wait(60),
                timeout_seconds=1,
            )
        finally:
            release.set()
        assert result is _SCAN_TIMEOUT_SENTINEL

    def test_exception_propagates(self):
        """Callable that raises an exception propagates it."""
        def _raise():
            raise ValueError("boom")

        with pytest.raises(ValueError, match="boom"):
            _run_with_timeout(_raise, timeout_seconds=5)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
# ===========================================================================
|
|
83
|
+
# TestIsAvailableTimeout — is_available() doesn't hang on unresponsive mounts
|
|
84
|
+
# ===========================================================================
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
class TestIsAvailableTimeout:
    """is_available() must not hang on an unresponsive mount."""

    @patch("chunksilo.index.cfgload")
    def test_is_available_returns_false_on_timeout(self, mock_cfg, tmp_path):
        """Patch Path.exists() to block; is_available() returns False within
        a reasonable time rather than hanging for 60s."""
        mock_cfg.get.return_value = 2  # 2-second timeout for fast test

        source = _make_source(tmp_path)

        # Gate the blocked call so it can be released once the test is done.
        release = threading.Event()
        original_exists = Path.exists

        def _blocking_exists(self_path, *args, **kwargs):
            # Only the source's base_dir blocks; every other path is real.
            if self_path == tmp_path:
                release.wait(60)
            return original_exists(self_path, *args, **kwargs)

        try:
            with patch.object(Path, "exists", _blocking_exists):
                start = time.monotonic()
                result = source.is_available()
                elapsed = time.monotonic() - start
        finally:
            release.set()

        assert result is False
        assert elapsed < 10, f"is_available() took {elapsed:.1f}s, expected < 10s"

    def test_is_available_returns_true_when_responsive(self, tmp_path):
        """Normal directory on tmp_path still returns True (no false positives)."""
        (tmp_path / "file.txt").write_text("hello")
        source = _make_source(tmp_path)
        assert source.is_available() is True
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
# ===========================================================================
|
|
119
|
+
# TestCreateFileInfoTimeout — _create_file_info() doesn't hang on stat/read
|
|
120
|
+
# ===========================================================================
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
class TestCreateFileInfoTimeout:
    """_create_file_info() must not hang on a blocking stat() or read()."""

    @patch("chunksilo.index.cfgload")
    def test_stat_hang_skips_file(self, mock_cfg, tmp_path):
        """When Path.stat blocks, iter_files skips the file with a warning."""
        mock_cfg.get.return_value = 2  # 2s timeout

        f = tmp_path / "test.txt"
        f.write_text("data")
        source = _make_source(tmp_path)

        release = threading.Event()
        original_stat = Path.stat

        def _blocking_stat(self_path, *a, **kw):
            if self_path == f:
                release.wait(60)
            return original_stat(self_path, *a, **kw)

        try:
            with patch.object(Path, "stat", _blocking_stat):
                start = time.monotonic()
                results = list(source.iter_files())
                elapsed = time.monotonic() - start
        finally:
            # Let the abandoned worker finish instead of waiting out 60s.
            release.set()

        assert len(results) == 0
        assert elapsed < 15, f"iter_files() took {elapsed:.1f}s, expected < 15s"

    @patch("chunksilo.index.cfgload")
    def test_hash_read_hang_skips_file(self, mock_cfg, tmp_path):
        """When open().read() blocks during MD5 hashing, the file is skipped."""
        mock_cfg.get.return_value = 2  # 2s timeout

        f = tmp_path / "test.txt"
        f.write_text("data")
        source = _make_source(tmp_path)

        release = threading.Event()
        original_open = open

        def _blocking_open(path, *args, **kwargs):
            fh = original_open(path, *args, **kwargs)
            # The mode may arrive positionally or as a keyword.
            mode = args[0] if args else kwargs.get("mode", "")
            if str(path) == str(f) and "b" in mode:
                original_read = fh.read

                def _blocking_read(*a, **kw):
                    release.wait(60)
                    return original_read(*a, **kw)

                fh.read = _blocking_read
            return fh

        try:
            with patch("builtins.open", _blocking_open):
                start = time.monotonic()
                results = list(source.iter_files())
                elapsed = time.monotonic() - start
        finally:
            release.set()

        assert len(results) == 0
        assert elapsed < 15, f"iter_files() took {elapsed:.1f}s, expected < 15s"

    def test_normal_files_unaffected(self, tmp_path):
        """Normal local files are returned with correct hashes."""
        files = {}
        for name in ("a.txt", "b.txt", "c.txt"):
            p = tmp_path / name
            p.write_text(f"content of {name}")
            h = hashlib.md5(p.read_bytes()).hexdigest()
            files[str(p.absolute())] = h

        source = _make_source(tmp_path)
        results = list(source.iter_files())

        assert len(results) == 3
        for fi in results:
            assert fi.hash == files[fi.path], f"Hash mismatch for {fi.path}"
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
# ===========================================================================
|
|
193
|
+
# TestIterFilesWithTimeout — os.walk() hangs are recovered from
|
|
194
|
+
# ===========================================================================
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
class TestIterFilesWithTimeout:
    """iter_files() must recover when os.walk() stalls."""

    @patch("chunksilo.index.cfgload")
    def test_walk_hang_on_subdirectory_recovers(self, mock_cfg, tmp_path):
        """When os.walk blocks on a subdirectory, iter_files eventually stops
        rather than hanging forever."""
        mock_cfg.get.return_value = 2  # 2s timeout

        # Create dir structure: subA/file_a.txt, subB/file_b.txt
        sub_a = tmp_path / "subA"
        sub_a.mkdir()
        (sub_a / "file_a.txt").write_text("aaa")
        sub_b = tmp_path / "subB"
        sub_b.mkdir()
        (sub_b / "file_b.txt").write_text("bbb")

        source = _make_source(tmp_path)

        # Replace os.walk to yield the first entries normally, then block
        release = threading.Event()
        original_walk = os.walk

        def _stalling_walk(top, **kw):
            call_count = 0
            for entry in original_walk(top, **kw):
                call_count += 1
                yield entry
                if call_count >= 2:  # After yielding two entries, block
                    release.wait(60)

        try:
            with patch("os.walk", _stalling_walk):
                start = time.monotonic()
                results = list(source.iter_files())
                elapsed = time.monotonic() - start
        finally:
            # Let the walking thread finish instead of waiting out 60s.
            release.set()

        # The scan may return whatever was found before the stall,
        # but should NOT hang for 60s
        assert elapsed < 15, f"iter_files() took {elapsed:.1f}s, expected < 15s"

    def test_all_walk_results_returned_when_fast(self, tmp_path):
        """Normal tmp_path with nested dirs returns all files."""
        sub = tmp_path / "nested"
        sub.mkdir()
        (tmp_path / "top.txt").write_text("top")
        (sub / "deep.txt").write_text("deep")

        source = _make_source(tmp_path)
        results = list(source.iter_files())

        paths = {fi.path for fi in results}
        assert str((tmp_path / "top.txt").absolute()) in paths
        assert str((sub / "deep.txt").absolute()) in paths
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
# ===========================================================================
|
|
250
|
+
# TestMultiDirectorySourceTimeout — stalled source doesn't block others
|
|
251
|
+
# ===========================================================================
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
class TestMultiDirectorySourceTimeout:
    """A stalled directory must not prevent other directories being scanned."""

    def test_stalled_source_does_not_block_others(self, tmp_path):
        """One directory is responsive, the other's is_available reports False.
        Only the working directory's files are indexed."""
        good_dir = tmp_path / "good"
        good_dir.mkdir()
        (good_dir / "file.txt").write_text("hello")

        bad_dir = tmp_path / "bad"
        bad_dir.mkdir()

        config = IndexConfig(
            directories=[
                DirectoryConfig(path=good_dir),
                DirectoryConfig(path=bad_dir),
            ]
        )

        original_is_available = LocalFileSystemSource.is_available

        def _patched_is_available(self):
            if self.base_dir == bad_dir:
                # Simulate stalled mount — return False (as timeout would)
                return False
            return original_is_available(self)

        with patch.object(LocalFileSystemSource, "is_available", _patched_is_available):
            mds = MultiDirectoryDataSource(config)

        assert len(mds.sources) == 1
        assert len(mds.unavailable_dirs) == 1

        results = list(mds.iter_files())
        assert len(results) == 1
        assert results[0].path == str((good_dir / "file.txt").absolute())

    @patch("chunksilo.index.cfgload")
    def test_stalled_source_during_scan_skips_to_next(self, mock_cfg, tmp_path):
        """Both directories pass is_available(), but one's os.walk hangs.
        Files from the other source are still returned."""
        mock_cfg.get.return_value = 2  # 2s timeout

        good_dir = tmp_path / "good"
        good_dir.mkdir()
        (good_dir / "file.txt").write_text("hello")

        bad_dir = tmp_path / "bad"
        bad_dir.mkdir()
        (bad_dir / "file.txt").write_text("stale")

        config = IndexConfig(
            directories=[
                DirectoryConfig(path=good_dir),
                DirectoryConfig(path=bad_dir),
            ]
        )

        mds = MultiDirectoryDataSource(config)
        assert len(mds.sources) == 2

        # Patch os.walk so walks into bad_dir hang, triggering queue timeout
        release = threading.Event()
        original_walk = os.walk

        def _stalling_walk(top, **kw):
            if str(top) == str(bad_dir):
                # Block immediately — queue timeout will abort iteration
                release.wait(60)
                return
            yield from original_walk(top, **kw)

        try:
            with patch("os.walk", _stalling_walk):
                start = time.monotonic()
                results = list(mds.iter_files())
                elapsed = time.monotonic() - start
        finally:
            # Let the walking thread finish instead of waiting out 60s.
            release.set()

        good_paths = {fi.path for fi in results}
        assert str((good_dir / "file.txt").absolute()) in good_paths
        assert elapsed < 15, f"iter_files() took {elapsed:.1f}s, expected < 15s"
|
|
333
|
+
|
|
334
|
+
|
|
335
|
+
# ===========================================================================
|
|
336
|
+
# TestLoadFileExistsTimeout — load_file existence check doesn't hang
|
|
337
|
+
# ===========================================================================
|
|
338
|
+
|
|
339
|
+
|
|
340
|
+
class TestLoadFileExistsTimeout:
    """The existence check in load_file() must not hang."""

    @patch("chunksilo.index.cfgload")
    def test_exists_hang_skips_file(self, mock_cfg, tmp_path):
        """When Path.exists blocks on a file, load_file returns [] rather
        than hanging."""
        mock_cfg.get.return_value = 2  # 2s timeout

        f = tmp_path / "test.txt"
        f.write_text("data")
        source = _make_source(tmp_path)
        fi = FileInfo(
            path=str(f.absolute()),
            hash="abc",
            last_modified=0,
            source_dir=str(tmp_path.absolute()),
        )

        # Gate the blocked call so it can be released once the test is done.
        release = threading.Event()
        original_exists = Path.exists

        def _blocking_exists(self_path, *args, **kwargs):
            if self_path == f:
                release.wait(60)
            return original_exists(self_path, *args, **kwargs)

        try:
            with patch.object(Path, "exists", _blocking_exists):
                start = time.monotonic()
                result = source.load_file(fi)
                elapsed = time.monotonic() - start
        finally:
            release.set()

        assert result == []
        assert elapsed < 10, f"load_file() took {elapsed:.1f}s, expected < 10s"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|