chunksilo 2.3.2__tar.gz → 2.3.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. {chunksilo-2.3.2/src/chunksilo.egg-info → chunksilo-2.3.3}/PKG-INFO +1 -1
  2. {chunksilo-2.3.2 → chunksilo-2.3.3}/pyproject.toml +1 -1
  3. {chunksilo-2.3.2 → chunksilo-2.3.3}/src/chunksilo/__init__.py +1 -1
  4. {chunksilo-2.3.2 → chunksilo-2.3.3}/src/chunksilo/cfgload.py +1 -0
  5. {chunksilo-2.3.2 → chunksilo-2.3.3}/src/chunksilo/index.py +145 -57
  6. {chunksilo-2.3.2 → chunksilo-2.3.3/src/chunksilo.egg-info}/PKG-INFO +1 -1
  7. {chunksilo-2.3.2 → chunksilo-2.3.3}/src/chunksilo.egg-info/SOURCES.txt +1 -0
  8. {chunksilo-2.3.2 → chunksilo-2.3.3}/test/test_indexing_ui.py +27 -24
  9. chunksilo-2.3.3/test/test_scan_timeouts.py +369 -0
  10. {chunksilo-2.3.2 → chunksilo-2.3.3}/LICENSE +0 -0
  11. {chunksilo-2.3.2 → chunksilo-2.3.3}/NOTICE +0 -0
  12. {chunksilo-2.3.2 → chunksilo-2.3.3}/README.md +0 -0
  13. {chunksilo-2.3.2 → chunksilo-2.3.3}/requirements.txt +0 -0
  14. {chunksilo-2.3.2 → chunksilo-2.3.3}/setup.cfg +0 -0
  15. {chunksilo-2.3.2 → chunksilo-2.3.3}/src/chunksilo/__main__.py +0 -0
  16. {chunksilo-2.3.2 → chunksilo-2.3.3}/src/chunksilo/cli.py +0 -0
  17. {chunksilo-2.3.2 → chunksilo-2.3.3}/src/chunksilo/confluence_html_formatter.py +0 -0
  18. {chunksilo-2.3.2 → chunksilo-2.3.3}/src/chunksilo/search.py +0 -0
  19. {chunksilo-2.3.2 → chunksilo-2.3.3}/src/chunksilo/server.py +0 -0
  20. {chunksilo-2.3.2 → chunksilo-2.3.3}/src/chunksilo.egg-info/dependency_links.txt +0 -0
  21. {chunksilo-2.3.2 → chunksilo-2.3.3}/src/chunksilo.egg-info/entry_points.txt +0 -0
  22. {chunksilo-2.3.2 → chunksilo-2.3.3}/src/chunksilo.egg-info/requires.txt +0 -0
  23. {chunksilo-2.3.2 → chunksilo-2.3.3}/src/chunksilo.egg-info/top_level.txt +0 -0
  24. {chunksilo-2.3.2 → chunksilo-2.3.3}/test/test_chunk_location.py +0 -0
  25. {chunksilo-2.3.2 → chunksilo-2.3.3}/test/test_confluence_html_formatter.py +0 -0
  26. {chunksilo-2.3.2 → chunksilo-2.3.3}/test/test_error_handling.py +0 -0
  27. {chunksilo-2.3.2 → chunksilo-2.3.3}/test/test_heading_path_integration.py +0 -0
  28. {chunksilo-2.3.2 → chunksilo-2.3.3}/test/test_incremental_ingest.py +0 -0
  29. {chunksilo-2.3.2 → chunksilo-2.3.3}/test/test_indexing_benchmark.py +0 -0
  30. {chunksilo-2.3.2 → chunksilo-2.3.3}/test/test_jira_integration.py +0 -0
  31. {chunksilo-2.3.2 → chunksilo-2.3.3}/test/test_quoted_phrases.py +0 -0
  32. {chunksilo-2.3.2 → chunksilo-2.3.3}/test/test_rag_metrics.py +0 -0
  33. {chunksilo-2.3.2 → chunksilo-2.3.3}/test/test_retrieval_only.py +0 -0
  34. {chunksilo-2.3.2 → chunksilo-2.3.3}/test/test_system.py +0 -0
  35. {chunksilo-2.3.2 → chunksilo-2.3.3}/test/test_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: chunksilo
3
- Version: 2.3.2
3
+ Version: 2.3.3
4
4
  Summary: Local RAG-based semantic document search with MCP server interface
5
5
  Author: Fredrik Reveny
6
6
  License-Expression: Apache-2.0
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "chunksilo"
7
- version = "2.3.2"
7
+ version = "2.3.3"
8
8
  description = "Local RAG-based semantic document search with MCP server interface"
9
9
  license = "Apache-2.0"
10
10
  requires-python = ">=3.11"
@@ -1,4 +1,4 @@
1
1
  # SPDX-License-Identifier: Apache-2.0
2
2
  """ChunkSilo - Local RAG-based semantic document search."""
3
3
 
4
- __version__ = "2.3.2"
4
+ __version__ = "2.3.3"
@@ -75,6 +75,7 @@ _DEFAULTS: dict[str, Any] = {
75
75
  "per_file_seconds": 300, # 5 minutes per file
76
76
  "doc_conversion_seconds": 90, # 90 seconds for .doc conversion
77
77
  "heartbeat_interval_seconds": 2,
78
+ "scan_item_seconds": 30, # timeout for stat/hash/walk during scanning
78
79
  },
79
80
  "logging": {
80
81
  "log_slow_files": True,
@@ -10,6 +10,7 @@ import itertools
10
10
  import json
11
11
  import logging
12
12
  import os
13
+ import queue
13
14
  import signal
14
15
  import sqlite3
15
16
  import sys
@@ -575,48 +576,28 @@ def _extract_pdf_headings_from_outline(
575
576
  return []
576
577
 
577
578
 
578
- def _split_docx_with_timeout(
579
- docx_path: Path,
580
- ctx: "FileProcessingContext | None",
581
- timeout_seconds: float,
582
- ) -> List[LlamaIndexDocument]:
583
- """Run split_docx_into_heading_documents() in a worker thread with a hard timeout.
579
+ _SCAN_TIMEOUT_SENTINEL = object()
584
580
 
585
- Returns the loaded documents, or an empty list if the call times out.
586
- """
587
- with ThreadPoolExecutor(max_workers=1) as pool:
588
- future = pool.submit(split_docx_into_heading_documents, docx_path, ctx)
589
- try:
590
- return future.result(timeout=timeout_seconds)
591
- except Exception as e:
592
- future.cancel()
593
- if "TimeoutError" in type(e).__name__ or isinstance(e, TimeoutError):
594
- logger.warning(
595
- f"DOCX processing timed out after {timeout_seconds:.0f}s: {docx_path}"
596
- )
597
- return []
598
- raise
599
581
 
582
+ def _run_with_timeout(fn, timeout_seconds: float, default=_SCAN_TIMEOUT_SENTINEL):
583
+ """Run *fn* in a background thread, returning *default* on timeout.
600
584
 
601
- def _load_data_with_timeout(
602
- reader: SimpleDirectoryReader, timeout_seconds: float
603
- ) -> List[LlamaIndexDocument]:
604
- """Run reader.load_data() in a worker thread with a hard timeout.
605
-
606
- Returns the loaded documents, or an empty list if the call times out.
585
+ If *fn* raises an exception it is re-raised in the caller.
586
+ On timeout the pool is shut down without waiting so the caller is not
587
+ blocked by a still-running filesystem call.
607
588
  """
608
- with ThreadPoolExecutor(max_workers=1) as pool:
609
- future = pool.submit(reader.load_data)
610
- try:
611
- return future.result(timeout=timeout_seconds)
612
- except Exception as e:
613
- future.cancel()
614
- if "TimeoutError" in type(e).__name__ or isinstance(e, TimeoutError):
615
- logger.warning(
616
- f"load_data() timed out after {timeout_seconds:.0f}s"
617
- )
618
- return []
619
- raise
589
+ pool = ThreadPoolExecutor(max_workers=1)
590
+ future = pool.submit(fn)
591
+ try:
592
+ result = future.result(timeout=timeout_seconds)
593
+ pool.shutdown(wait=False)
594
+ return result
595
+ except Exception as exc:
596
+ future.cancel()
597
+ pool.shutdown(wait=False, cancel_futures=True)
598
+ if "TimeoutError" in type(exc).__name__ or isinstance(exc, TimeoutError):
599
+ return default
600
+ raise
620
601
 
621
602
 
622
603
  class LocalFileSystemSource(DataSource):
@@ -627,17 +608,31 @@ class LocalFileSystemSource(DataSource):
627
608
  self.base_dir = config.path
628
609
 
629
610
  def is_available(self) -> bool:
630
- """Check if the directory is available and accessible."""
631
- try:
632
- if not self.base_dir.exists():
633
- return False
634
- if not self.base_dir.is_dir():
611
+ """Check if the directory is available and accessible.
612
+
613
+ Runs with a timeout to avoid hanging on unresponsive network mounts.
614
+ """
615
+ timeout = cfgload.get("indexing.timeout.scan_item_seconds", 30)
616
+
617
+ def _check():
618
+ try:
619
+ if not self.base_dir.exists():
620
+ return False
621
+ if not self.base_dir.is_dir():
622
+ return False
623
+ # Try to list directory to verify access (important for network mounts)
624
+ next(self.base_dir.iterdir(), None)
625
+ return True
626
+ except (OSError, PermissionError):
635
627
  return False
636
- # Try to list directory to verify access (important for network mounts)
637
- next(self.base_dir.iterdir(), None)
638
- return True
639
- except (OSError, PermissionError):
640
- return False
628
+
629
+ result = _run_with_timeout(_check, timeout_seconds=timeout, default=False)
630
+ if result is False and timeout > 0:
631
+ # Distinguish genuine "not a dir" from timeout — log only for timeout
632
+ # (the sentinel default=False means we can't distinguish here, but
633
+ # _run_with_timeout already logged nothing; let callers log.)
634
+ pass
635
+ return result
641
636
 
642
637
  def _matches_patterns(self, file_path: Path) -> bool:
643
638
  """Check if file matches include patterns and doesn't match exclude patterns.
@@ -700,6 +695,43 @@ class LocalFileSystemSource(DataSource):
700
695
  return True
701
696
  return False
702
697
 
698
+ def _walk_with_timeout(self):
699
+ """Yield (root, dirs, files) tuples from os.walk with per-iteration timeout.
700
+
701
+ Runs os.walk in a daemon thread, feeding results through a queue.
702
+ If no result arrives within scan_item_seconds, the walk is considered
703
+ stalled and iteration stops.
704
+ """
705
+ timeout = cfgload.get("indexing.timeout.scan_item_seconds", 30)
706
+ q: queue.Queue = queue.Queue()
707
+ _sentinel = None # signals end of iteration
708
+
709
+ def _producer():
710
+ try:
711
+ for entry in os.walk(self.base_dir):
712
+ q.put(entry)
713
+ q.put(_sentinel)
714
+ except Exception as exc:
715
+ q.put(exc)
716
+
717
+ t = threading.Thread(target=_producer, daemon=True)
718
+ t.start()
719
+
720
+ while True:
721
+ try:
722
+ item = q.get(timeout=timeout)
723
+ except queue.Empty:
724
+ logger.warning(
725
+ f"os.walk() stalled for {timeout}s on {self.base_dir}, "
726
+ "aborting directory scan"
727
+ )
728
+ return
729
+ if item is _sentinel:
730
+ return
731
+ if isinstance(item, Exception):
732
+ raise item
733
+ yield item
734
+
703
735
  def iter_files(self, tracked_files: Dict[str, dict] | None = None) -> Iterator[FileInfo]:
704
736
  """Yield FileInfo for each matching file in the source.
705
737
 
@@ -708,8 +740,7 @@ class LocalFileSystemSource(DataSource):
708
740
  keyed by absolute path. Used for mtime-based fast pre-check.
709
741
  """
710
742
  if self.config.recursive:
711
- # topdown=True (default) allows pruning dirs in-place
712
- for root, dirs, files in os.walk(self.base_dir):
743
+ for root, dirs, files in self._walk_with_timeout():
713
744
  # Prune excluded directories in-place to prevent descent
714
745
  dirs[:] = [d for d in dirs if not self._should_skip_directory(d)]
715
746
 
@@ -720,7 +751,7 @@ class LocalFileSystemSource(DataSource):
720
751
  continue
721
752
  try:
722
753
  yield self._create_file_info(file_path, tracked_files)
723
- except (OSError, IOError) as e:
754
+ except (OSError, IOError, TimeoutError) as e:
724
755
  logger.warning(f"Could not access file {file_path}: {e}")
725
756
  continue
726
757
  else:
@@ -738,7 +769,7 @@ class LocalFileSystemSource(DataSource):
738
769
  continue
739
770
  try:
740
771
  yield self._create_file_info(f, tracked_files)
741
- except (OSError, IOError) as e:
772
+ except (OSError, IOError, TimeoutError) as e:
742
773
  logger.warning(f"Could not access file {f}: {e}")
743
774
  continue
744
775
 
@@ -746,6 +777,30 @@ class LocalFileSystemSource(DataSource):
746
777
  self,
747
778
  file_path: Path,
748
779
  tracked_files: Dict[str, dict] | None = None,
780
+ ) -> FileInfo:
781
+ """Create FileInfo with timeout protection against stalled mounts.
782
+
783
+ Delegates to _create_file_info_inner in a background thread so that a
784
+ blocking stat() or read() cannot hang the scan indefinitely.
785
+
786
+ Raises TimeoutError if the operation exceeds scan_item_seconds.
787
+ """
788
+ timeout = cfgload.get("indexing.timeout.scan_item_seconds", 30)
789
+ result = _run_with_timeout(
790
+ lambda: self._create_file_info_inner(file_path, tracked_files),
791
+ timeout_seconds=timeout,
792
+ )
793
+ if result is _SCAN_TIMEOUT_SENTINEL:
794
+ logger.warning(
795
+ f"Timed out after {timeout}s accessing file {file_path}, skipping"
796
+ )
797
+ raise TimeoutError(f"stat/hash timed out for {file_path}")
798
+ return result
799
+
800
+ def _create_file_info_inner(
801
+ self,
802
+ file_path: Path,
803
+ tracked_files: Dict[str, dict] | None = None,
749
804
  ) -> FileInfo:
750
805
  """Create FileInfo with source directory context.
751
806
 
@@ -787,15 +842,30 @@ class LocalFileSystemSource(DataSource):
787
842
  ctx: "FileProcessingContext | None" = None
788
843
  ) -> List[LlamaIndexDocument]:
789
844
  file_path = Path(file_info.path)
790
- if not file_path.exists():
791
- logger.warning(f"Skipping disappeared file: {file_path}")
845
+ exists_timeout = cfgload.get("indexing.timeout.scan_item_seconds", 30)
846
+ exists_result = _run_with_timeout(
847
+ file_path.exists, timeout_seconds=exists_timeout, default=False,
848
+ )
849
+ if not exists_result:
850
+ if exists_result is False:
851
+ logger.warning(f"Skipping disappeared file: {file_path}")
792
852
  return []
793
853
  if file_path.suffix.lower() == ".docx":
794
854
  if ctx:
795
855
  ctx.set_phase("Parsing DOCX")
796
856
  remaining = ctx.remaining_seconds() if ctx else None
797
857
  if remaining is not None:
798
- return _split_docx_with_timeout(file_path, ctx, remaining)
858
+ result = _run_with_timeout(
859
+ lambda: split_docx_into_heading_documents(file_path, ctx),
860
+ timeout_seconds=remaining,
861
+ default=None,
862
+ )
863
+ if result is None:
864
+ logger.warning(
865
+ f"DOCX processing timed out after {remaining:.0f}s: {file_path}"
866
+ )
867
+ return []
868
+ return result
799
869
  return split_docx_into_heading_documents(file_path, ctx)
800
870
  elif file_path.suffix.lower() == ".doc":
801
871
  # Convert .doc to .docx using LibreOffice, then process
@@ -814,7 +884,16 @@ class LocalFileSystemSource(DataSource):
814
884
  ctx.set_phase("Parsing converted DOCX")
815
885
  remaining = ctx.remaining_seconds() if ctx else None
816
886
  if remaining is not None:
817
- docs = _split_docx_with_timeout(docx_path, ctx, remaining)
887
+ result = _run_with_timeout(
888
+ lambda: split_docx_into_heading_documents(docx_path, ctx),
889
+ timeout_seconds=remaining,
890
+ default=None,
891
+ )
892
+ if result is None:
893
+ logger.warning(
894
+ f"DOCX processing timed out after {remaining:.0f}s: {docx_path}"
895
+ )
896
+ docs = result if result is not None else []
818
897
  else:
819
898
  docs = split_docx_into_heading_documents(docx_path, ctx)
820
899
  # Update metadata to point to original .doc file
@@ -834,7 +913,16 @@ class LocalFileSystemSource(DataSource):
834
913
  )
835
914
  remaining = ctx.remaining_seconds() if ctx else None
836
915
  if remaining is not None:
837
- docs = _load_data_with_timeout(reader, remaining)
916
+ result = _run_with_timeout(
917
+ reader.load_data,
918
+ timeout_seconds=remaining,
919
+ default=None,
920
+ )
921
+ if result is None:
922
+ logger.warning(
923
+ f"load_data() timed out after {remaining:.0f}s"
924
+ )
925
+ docs = result if result is not None else []
838
926
  else:
839
927
  docs = reader.load_data()
840
928
  # Ensure dates are visible to LLM (remove from exclusion list)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: chunksilo
3
- Version: 2.3.2
3
+ Version: 2.3.3
4
4
  Summary: Local RAG-based semantic document search with MCP server interface
5
5
  Author: Fredrik Reveny
6
6
  License-Expression: Apache-2.0
@@ -28,5 +28,6 @@ test/test_jira_integration.py
28
28
  test/test_quoted_phrases.py
29
29
  test/test_rag_metrics.py
30
30
  test/test_retrieval_only.py
31
+ test/test_scan_timeouts.py
31
32
  test/test_system.py
32
33
  test/test_utils.py
@@ -779,58 +779,57 @@ class TestFileProcessingContextTimeout:
779
779
 
780
780
 
781
781
  # =============================================================================
782
- # _load_data_with_timeout tests (Issue #39)
782
+ # _run_with_timeout integration tests for load_data / DOCX (Issue #39)
783
783
  # =============================================================================
784
784
 
785
785
 
786
786
  class TestLoadDataWithTimeout:
787
787
  def test_returns_docs_on_success(self):
788
- """_load_data_with_timeout returns docs when load_data succeeds."""
788
+ """_run_with_timeout returns docs when load_data succeeds."""
789
789
  from unittest.mock import MagicMock
790
- from chunksilo.index import _load_data_with_timeout
790
+ from chunksilo.index import _run_with_timeout
791
791
 
792
792
  mock_reader = MagicMock()
793
793
  mock_reader.load_data.return_value = ["doc1", "doc2"]
794
794
 
795
- result = _load_data_with_timeout(mock_reader, timeout_seconds=5.0)
795
+ result = _run_with_timeout(mock_reader.load_data, timeout_seconds=5.0, default=None)
796
796
  assert result == ["doc1", "doc2"]
797
797
 
798
- def test_returns_empty_on_timeout(self):
799
- """_load_data_with_timeout returns [] when load_data hangs."""
798
+ def test_returns_default_on_timeout(self):
799
+ """_run_with_timeout returns default when load_data hangs."""
800
800
  from unittest.mock import MagicMock
801
- from chunksilo.index import _load_data_with_timeout
801
+ from chunksilo.index import _run_with_timeout
802
802
 
803
803
  mock_reader = MagicMock()
804
804
  mock_reader.load_data.side_effect = lambda: time.sleep(10)
805
805
 
806
- result = _load_data_with_timeout(mock_reader, timeout_seconds=0.1)
807
- assert result == []
808
-
809
-
810
- # =============================================================================
811
- # _split_docx_with_timeout tests (Issue #39)
812
- # =============================================================================
806
+ result = _run_with_timeout(mock_reader.load_data, timeout_seconds=0.1, default=None)
807
+ assert result is None
813
808
 
814
809
 
815
810
  class TestSplitDocxWithTimeout:
816
811
  def test_returns_docs_on_success(self):
817
- """_split_docx_with_timeout returns docs when processing succeeds."""
812
+ """_run_with_timeout returns docs when DOCX processing succeeds."""
818
813
  from unittest.mock import patch, MagicMock
819
- from chunksilo.index import _split_docx_with_timeout
814
+ from chunksilo.index import _run_with_timeout
820
815
 
821
816
  mock_doc = MagicMock()
822
817
  with patch(
823
818
  "chunksilo.index.split_docx_into_heading_documents",
824
819
  return_value=[mock_doc],
825
- ):
826
- result = _split_docx_with_timeout(Path("/fake/doc.docx"), None, 5.0)
820
+ ) as mock_split:
821
+ result = _run_with_timeout(
822
+ lambda: mock_split(Path("/fake/doc.docx"), None),
823
+ timeout_seconds=5.0,
824
+ default=None,
825
+ )
827
826
 
828
827
  assert result == [mock_doc]
829
828
 
830
- def test_returns_empty_on_timeout(self):
831
- """_split_docx_with_timeout returns [] when processing hangs."""
829
+ def test_returns_default_on_timeout(self):
830
+ """_run_with_timeout returns default when DOCX processing hangs."""
832
831
  from unittest.mock import patch
833
- from chunksilo.index import _split_docx_with_timeout
832
+ from chunksilo.index import _run_with_timeout
834
833
 
835
834
  def hang(*args, **kwargs):
836
835
  time.sleep(10)
@@ -838,7 +837,11 @@ class TestSplitDocxWithTimeout:
838
837
  with patch(
839
838
  "chunksilo.index.split_docx_into_heading_documents",
840
839
  side_effect=hang,
841
- ):
842
- result = _split_docx_with_timeout(Path("/fake/doc.docx"), None, 0.1)
840
+ ) as mock_split:
841
+ result = _run_with_timeout(
842
+ lambda: mock_split(Path("/fake/doc.docx"), None),
843
+ timeout_seconds=0.1,
844
+ default=None,
845
+ )
843
846
 
844
- assert result == []
847
+ assert result is None
@@ -0,0 +1,369 @@
1
+ #!/usr/bin/env python3
2
+ """Tests for scan-phase timeout protection.
3
+
4
+ Verifies that filesystem operations during the scanning phase (directory
5
+ traversal, stat, MD5 hashing, file existence checks) cannot hang indefinitely
6
+ when a network mount becomes unresponsive.
7
+
8
+ Uses threading.Event + time.sleep to simulate blocking filesystem calls
9
+ without needing a real stalled network mount.
10
+ """
11
+ import hashlib
12
+ import os
13
+ import threading
14
+ import time
15
+ from pathlib import Path
16
+ from unittest.mock import patch, MagicMock
17
+
18
+ import pytest
19
+
20
+ from chunksilo.index import (
21
+ _run_with_timeout,
22
+ _SCAN_TIMEOUT_SENTINEL,
23
+ DirectoryConfig,
24
+ FileInfo,
25
+ IndexConfig,
26
+ LocalFileSystemSource,
27
+ MultiDirectoryDataSource,
28
+ )
29
+
30
+
31
+ # ---------------------------------------------------------------------------
32
+ # Helpers
33
+ # ---------------------------------------------------------------------------
34
+
35
+ def _block_forever(*_args, **_kwargs):
36
+ """Simulate a blocking syscall that never returns."""
37
+ threading.Event().wait(timeout=60)
38
+
39
+
40
+ def _make_source(tmp_path, **overrides):
41
+ """Create a LocalFileSystemSource pointing at tmp_path."""
42
+ cfg = DirectoryConfig(path=tmp_path, **overrides)
43
+ return LocalFileSystemSource(cfg)
44
+
45
+
46
+ # ===========================================================================
47
+ # TestRunWithTimeout — tests for the _run_with_timeout() helper itself
48
+ # ===========================================================================
49
+
50
+
51
+ class TestRunWithTimeout:
52
+ def test_fast_call_returns_result(self):
53
+ """Callable that returns immediately yields its value."""
54
+ assert _run_with_timeout(lambda: 42, timeout_seconds=5) == 42
55
+
56
+ def test_slow_call_times_out(self):
57
+ """Callable that sleeps longer than timeout returns the default."""
58
+ result = _run_with_timeout(
59
+ lambda: threading.Event().wait(60) or "never",
60
+ timeout_seconds=1,
61
+ default="timed_out",
62
+ )
63
+ assert result == "timed_out"
64
+
65
+ def test_slow_call_returns_sentinel_by_default(self):
66
+ """When no default is given, the sentinel object is returned."""
67
+ result = _run_with_timeout(
68
+ lambda: threading.Event().wait(60),
69
+ timeout_seconds=1,
70
+ )
71
+ assert result is _SCAN_TIMEOUT_SENTINEL
72
+
73
+ def test_exception_propagates(self):
74
+ """Callable that raises an exception propagates it."""
75
+ def _raise():
76
+ raise ValueError("boom")
77
+
78
+ with pytest.raises(ValueError, match="boom"):
79
+ _run_with_timeout(_raise, timeout_seconds=5)
80
+
81
+
82
+ # ===========================================================================
83
+ # TestIsAvailableTimeout — is_available() doesn't hang on unresponsive mounts
84
+ # ===========================================================================
85
+
86
+
87
+ class TestIsAvailableTimeout:
88
+ @patch("chunksilo.index.cfgload")
89
+ def test_is_available_returns_false_on_timeout(self, mock_cfg, tmp_path):
90
+ """Patch Path.exists() to block; is_available() returns False within
91
+ a reasonable time rather than hanging for 60s."""
92
+ mock_cfg.get.return_value = 2 # 2-second timeout for fast test
93
+
94
+ source = _make_source(tmp_path)
95
+
96
+ # Patch the specific base_dir's exists to block
97
+ original_exists = Path.exists
98
+ def _blocking_exists(self_path):
99
+ if self_path == tmp_path:
100
+ threading.Event().wait(60)
101
+ return original_exists(self_path)
102
+
103
+ with patch.object(Path, "exists", _blocking_exists):
104
+ start = time.monotonic()
105
+ result = source.is_available()
106
+ elapsed = time.monotonic() - start
107
+
108
+ assert result is False
109
+ assert elapsed < 10, f"is_available() took {elapsed:.1f}s, expected < 10s"
110
+
111
+ def test_is_available_returns_true_when_responsive(self, tmp_path):
112
+ """Normal directory on tmp_path still returns True (no false positives)."""
113
+ (tmp_path / "file.txt").write_text("hello")
114
+ source = _make_source(tmp_path)
115
+ assert source.is_available() is True
116
+
117
+
118
+ # ===========================================================================
119
+ # TestCreateFileInfoTimeout — _create_file_info() doesn't hang on stat/read
120
+ # ===========================================================================
121
+
122
+
123
+ class TestCreateFileInfoTimeout:
124
+ @patch("chunksilo.index.cfgload")
125
+ def test_stat_hang_skips_file(self, mock_cfg, tmp_path):
126
+ """When Path.stat blocks, iter_files skips the file with a warning."""
127
+ mock_cfg.get.return_value = 2 # 2s timeout
128
+
129
+ f = tmp_path / "test.txt"
130
+ f.write_text("data")
131
+ source = _make_source(tmp_path)
132
+
133
+ original_stat = Path.stat
134
+ def _blocking_stat(self_path, *a, **kw):
135
+ if self_path == f:
136
+ threading.Event().wait(60)
137
+ return original_stat(self_path, *a, **kw)
138
+
139
+ with patch.object(Path, "stat", _blocking_stat):
140
+ start = time.monotonic()
141
+ results = list(source.iter_files())
142
+ elapsed = time.monotonic() - start
143
+
144
+ assert len(results) == 0
145
+ assert elapsed < 15, f"iter_files() took {elapsed:.1f}s, expected < 15s"
146
+
147
+ @patch("chunksilo.index.cfgload")
148
+ def test_hash_read_hang_skips_file(self, mock_cfg, tmp_path):
149
+ """When open().read() blocks during MD5 hashing, the file is skipped."""
150
+ mock_cfg.get.return_value = 2 # 2s timeout
151
+
152
+ f = tmp_path / "test.txt"
153
+ f.write_text("data")
154
+ source = _make_source(tmp_path)
155
+
156
+ original_open = open
157
+ def _blocking_open(path, *args, **kwargs):
158
+ fh = original_open(path, *args, **kwargs)
159
+ if str(path) == str(f) and "b" in (args[0] if args else ""):
160
+ original_read = fh.read
161
+ def _blocking_read(*a, **kw):
162
+ threading.Event().wait(60)
163
+ return original_read(*a, **kw)
164
+ fh.read = _blocking_read
165
+ return fh
166
+
167
+ with patch("builtins.open", _blocking_open):
168
+ start = time.monotonic()
169
+ results = list(source.iter_files())
170
+ elapsed = time.monotonic() - start
171
+
172
+ assert len(results) == 0
173
+ assert elapsed < 15, f"iter_files() took {elapsed:.1f}s, expected < 15s"
174
+
175
+ def test_normal_files_unaffected(self, tmp_path):
176
+ """Normal local files are returned with correct hashes."""
177
+ files = {}
178
+ for name in ("a.txt", "b.txt", "c.txt"):
179
+ p = tmp_path / name
180
+ p.write_text(f"content of {name}")
181
+ h = hashlib.md5(p.read_bytes()).hexdigest()
182
+ files[str(p.absolute())] = h
183
+
184
+ source = _make_source(tmp_path)
185
+ results = list(source.iter_files())
186
+
187
+ assert len(results) == 3
188
+ for fi in results:
189
+ assert fi.hash == files[fi.path], f"Hash mismatch for {fi.path}"
190
+
191
+
192
+ # ===========================================================================
193
+ # TestIterFilesWithTimeout — os.walk() hangs are recovered from
194
+ # ===========================================================================
195
+
196
+
197
+ class TestIterFilesWithTimeout:
198
+ @patch("chunksilo.index.cfgload")
199
+ def test_walk_hang_on_subdirectory_recovers(self, mock_cfg, tmp_path):
200
+ """When os.walk blocks on a subdirectory, iter_files eventually stops
201
+ rather than hanging forever."""
202
+ mock_cfg.get.return_value = 2 # 2s timeout
203
+
204
+ # Create dir structure: subA/file_a.txt, subB/ (will hang)
205
+ sub_a = tmp_path / "subA"
206
+ sub_a.mkdir()
207
+ (sub_a / "file_a.txt").write_text("aaa")
208
+ sub_b = tmp_path / "subB"
209
+ sub_b.mkdir()
210
+ (sub_b / "file_b.txt").write_text("bbb")
211
+
212
+ source = _make_source(tmp_path)
213
+
214
+ # Replace os.walk to yield first entry normally, then block
215
+ original_walk = os.walk
216
+
217
+ def _stalling_walk(top, **kw):
218
+ call_count = 0
219
+ for entry in original_walk(top, **kw):
220
+ call_count += 1
221
+ yield entry
222
+ if call_count >= 2: # After yielding top + subA, block
223
+ threading.Event().wait(60)
224
+
225
+ with patch("os.walk", _stalling_walk):
226
+ start = time.monotonic()
227
+ results = list(source.iter_files())
228
+ elapsed = time.monotonic() - start
229
+
230
+ # Should have at least the file from subA (and maybe top-level)
231
+ # but should NOT hang for 60s
232
+ assert elapsed < 15, f"iter_files() took {elapsed:.1f}s, expected < 15s"
233
+
234
+ def test_all_walk_results_returned_when_fast(self, tmp_path):
235
+ """Normal tmp_path with nested dirs returns all files."""
236
+ sub = tmp_path / "nested"
237
+ sub.mkdir()
238
+ (tmp_path / "top.txt").write_text("top")
239
+ (sub / "deep.txt").write_text("deep")
240
+
241
+ source = _make_source(tmp_path)
242
+ results = list(source.iter_files())
243
+
244
+ paths = {fi.path for fi in results}
245
+ assert str((tmp_path / "top.txt").absolute()) in paths
246
+ assert str((sub / "deep.txt").absolute()) in paths
247
+
248
+
249
+ # ===========================================================================
250
+ # TestMultiDirectorySourceTimeout — stalled source doesn't block others
251
+ # ===========================================================================
252
+
253
+
254
+ class TestMultiDirectorySourceTimeout:
255
+ def test_stalled_source_does_not_block_others(self, tmp_path):
256
+ """One directory is responsive, the other's is_available blocks.
257
+ Only the working directory's files are indexed."""
258
+ good_dir = tmp_path / "good"
259
+ good_dir.mkdir()
260
+ (good_dir / "file.txt").write_text("hello")
261
+
262
+ bad_dir = tmp_path / "bad"
263
+ bad_dir.mkdir()
264
+
265
+ config = IndexConfig(
266
+ directories=[
267
+ DirectoryConfig(path=good_dir),
268
+ DirectoryConfig(path=bad_dir),
269
+ ]
270
+ )
271
+
272
+ original_is_available = LocalFileSystemSource.is_available
273
+
274
+ def _patched_is_available(self):
275
+ if self.base_dir == bad_dir:
276
+ # Simulate stalled mount — return False (as timeout would)
277
+ return False
278
+ return original_is_available(self)
279
+
280
+ with patch.object(LocalFileSystemSource, "is_available", _patched_is_available):
281
+ mds = MultiDirectoryDataSource(config)
282
+
283
+ assert len(mds.sources) == 1
284
+ assert len(mds.unavailable_dirs) == 1
285
+
286
+ results = list(mds.iter_files())
287
+ assert len(results) == 1
288
+ assert results[0].path == str((good_dir / "file.txt").absolute())
289
+
290
+ @patch("chunksilo.index.cfgload")
291
+ def test_stalled_source_during_scan_skips_to_next(self, mock_cfg, tmp_path):
292
+ """Both directories pass is_available(), but one's os.walk hangs.
293
+ Files from the other source are still returned."""
294
+ mock_cfg.get.return_value = 2 # 2s timeout
295
+
296
+ good_dir = tmp_path / "good"
297
+ good_dir.mkdir()
298
+ (good_dir / "file.txt").write_text("hello")
299
+
300
+ bad_dir = tmp_path / "bad"
301
+ bad_dir.mkdir()
302
+ (bad_dir / "file.txt").write_text("stale")
303
+
304
+ config = IndexConfig(
305
+ directories=[
306
+ DirectoryConfig(path=good_dir),
307
+ DirectoryConfig(path=bad_dir),
308
+ ]
309
+ )
310
+
311
+ mds = MultiDirectoryDataSource(config)
312
+ assert len(mds.sources) == 2
313
+
314
+ # Patch os.walk so walks into bad_dir hang, triggering queue timeout
315
+ original_walk = os.walk
316
+
317
+ def _stalling_walk(top, **kw):
318
+ if str(top) == str(bad_dir):
319
+ # Block immediately — queue timeout will abort iteration
320
+ threading.Event().wait(60)
321
+ return
322
+ yield # make it a generator
323
+ yield from original_walk(top, **kw)
324
+
325
+ with patch("os.walk", _stalling_walk):
326
+ start = time.monotonic()
327
+ results = list(mds.iter_files())
328
+ elapsed = time.monotonic() - start
329
+
330
+ good_paths = {fi.path for fi in results}
331
+ assert str((good_dir / "file.txt").absolute()) in good_paths
332
+ assert elapsed < 15, f"iter_files() took {elapsed:.1f}s, expected < 15s"
333
+
334
+
335
+ # ===========================================================================
336
+ # TestLoadFileExistsTimeout — load_file existence check doesn't hang
337
+ # ===========================================================================
338
+
339
+
340
+ class TestLoadFileExistsTimeout:
341
+ @patch("chunksilo.index.cfgload")
342
+ def test_exists_hang_skips_file(self, mock_cfg, tmp_path):
343
+ """When Path.exists blocks on a file, load_file returns [] rather
344
+ than hanging."""
345
+ mock_cfg.get.return_value = 2 # 2s timeout
346
+
347
+ f = tmp_path / "test.txt"
348
+ f.write_text("data")
349
+ source = _make_source(tmp_path)
350
+ fi = FileInfo(
351
+ path=str(f.absolute()),
352
+ hash="abc",
353
+ last_modified=0,
354
+ source_dir=str(tmp_path.absolute()),
355
+ )
356
+
357
+ original_exists = Path.exists
358
+ def _blocking_exists(self_path):
359
+ if self_path == f:
360
+ threading.Event().wait(60)
361
+ return original_exists(self_path)
362
+
363
+ with patch.object(Path, "exists", _blocking_exists):
364
+ start = time.monotonic()
365
+ result = source.load_file(fi)
366
+ elapsed = time.monotonic() - start
367
+
368
+ assert result == []
369
+ assert elapsed < 10, f"load_file() took {elapsed:.1f}s, expected < 10s"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes