kssrag 0.1.2__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,33 @@
1
+ kssrag/__init__.py,sha256=N1XfR8IRKtEJAzcOVyHnKXtgx-ztlrSFtFwiVkGbAX8,2041
2
+ kssrag/cli.py,sha256=9AbtUEV9X63bhRj4EU-eHhud8iPM7LJAGSbu_IPlMUE,9703
3
+ kssrag/config.py,sha256=zd978GZQ66TlLZnk9yP7uvoXoWD89BS8VHi7w_yGXrM,6529
4
+ kssrag/kssrag.py,sha256=vy3oCHeHFAp_dJW0JjLbTxeEwCcwtXuOL_Ejmv0qz8Y,5251
5
+ kssrag/server.py,sha256=CbnC0WhIKIi6iJ3q448swEdLDcvmUf80lsdlSKp0GpM,5942
6
+ kssrag/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
+ kssrag/core/agents.py,sha256=5zRSudh_4tbp4lDfAhaczeOe-INCpgfm6OJEhE6Ut4I,5421
8
+ kssrag/core/chunkers.py,sha256=HmWL3y2DhhobV5zIlIdZP2KK2N7TASqeirPqmc3_inI,7324
9
+ kssrag/core/retrievers.py,sha256=1e9c7ukUD4pFSVasOMTXSKoz_rapXQTl-FrSHK6Osqg,3037
10
+ kssrag/core/vectorstores.py,sha256=H8hTpjc6hAFMhqAO2Cjq-Jp6xrJhsJKiRN9qxb_-6XM,21003
11
+ kssrag/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
+ kssrag/models/local_llms.py,sha256=IsthEwiNG1QcvHrTpQWdd1kZuHa4-0bfGTxHe8F3i2M,1178
13
+ kssrag/models/openrouter.py,sha256=tplACtQ5J9YTemk0616dhg6H81_eAdsfeLs3AEytKE0,6429
14
+ kssrag/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
+ kssrag/utils/document_loaders.py,sha256=grXtU4sy8j23zJwanadO8rFXlsRJ2e2JF6MvoczsmqQ,4192
16
+ kssrag/utils/helpers.py,sha256=MoTZRgTTiHDnbELHLrDKOBoxxiwFyLKJXUnZeWOtHFg,3806
17
+ kssrag/utils/ocr.py,sha256=T2yZM-Z8B-1Y7K7CKxM5BrvNFPB5Cx0vjlk-XZnc3p8,1425
18
+ kssrag/utils/ocr_loader.py,sha256=0RvY56aSNulo4U1eHwSzOUBZUIzc1nBLt7395OYdkXM,6930
19
+ kssrag/utils/preprocessors.py,sha256=_kbeZOWnbqbKKSBiyRP8QZAKx9uYMXgHfARcWBqC3JU,938
20
+ tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
21
+ tests/test_basic.py,sha256=JdBBRpP9wOo4BvvZTisidP40gGyK_azUoewJpoJaa5M,1275
22
+ tests/test_bm25s.py,sha256=tfvhWGxippGmNiLujc2OiaFewvvkJoOwrGXzBGZQMtU,2749
23
+ tests/test_config.py,sha256=zIawdV9xb-EuDl1BXKKOvgZY-uUc5Q0KeyJHBP85eIE,1398
24
+ tests/test_image_chunker.py,sha256=7cY3HucIFdNzcOVI2WA0nY5QmGcsv5umfE4c_yNnLfw,741
25
+ tests/test_integration.py,sha256=TY7MrTcAiu1KG4MlgIC7VVlzUTnOoqp9pieK8rhBNrg,1059
26
+ tests/test_ocr.py,sha256=PoGKLNISpAwaoPvGuS7qiOf6dsVnsFRFtYkG1WFi6TU,6202
27
+ tests/test_streaming.py,sha256=rMQ0w8_HQFFV0PbHDqQXRBqaNfbd3WqJVNT2hKVbsqw,1442
28
+ tests/test_vectorstores.py,sha256=YOwI2bfqprzbq8ahIw4pbbbEOaKGcg-XPcLCO7WiLxE,1474
29
+ kssrag-0.2.0.dist-info/METADATA,sha256=MK2r6XV1oT76WuQJ01vdAbJxER9ZkCDalAITwMo3tNg,24016
30
+ kssrag-0.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
31
+ kssrag-0.2.0.dist-info/entry_points.txt,sha256=g4tQj5YUqPK3Osb9BI85tsErxleSBUENiqlnX0fWK5M,43
32
+ kssrag-0.2.0.dist-info/top_level.txt,sha256=sO9LGINa0GEjLoHTtufpz01yM5SmeTw6M4zWHEF0R2s,13
33
+ kssrag-0.2.0.dist-info/RECORD,,
tests/test_bm25s.py ADDED
@@ -0,0 +1,74 @@
1
+ import pytest
2
+ import numpy as np
3
+ import tempfile
4
+ import os
5
+ from kssrag.core.vectorstores import BM25SVectorStore
6
+
7
def test_bm25s_vector_store_basic():
    """Index three documents and query them through BM25SVectorStore.

    Expects exactly ``top_k`` results, the Python document ranked first,
    and a ``metadata`` key on every returned result.
    """
    corpus = [
        {"content": text, "metadata": {"source": source}}
        for text, source in (
            ("This is a test document about Python programming.", "test1"),
            ("Another document about machine learning and AI.", "test2"),
            ("A third document on web development with JavaScript.", "test3"),
        )
    ]

    store = BM25SVectorStore()
    store.add_documents(corpus)

    hits = store.retrieve("Python programming", top_k=2)

    assert len(hits) == 2
    assert "Python" in hits[0]["content"]
    assert all("metadata" in hit for hit in hits)
23
+
24
def test_bm25s_persistence():
    """Persist a BM25S store to a temp file, reload it, and query the copy.

    The temporary file is created with ``delete=False`` so its path stays
    usable after the handle is closed; it is removed in the ``finally``.
    """
    corpus = [
        {"content": "Test document for persistence.", "metadata": {"source": "test1"}},
        {"content": "Another test document.", "metadata": {"source": "test2"}},
    ]

    with tempfile.NamedTemporaryFile(suffix='.pkl', delete=False) as handle:
        pkl_path = handle.name

    try:
        # Write side: build the index and save it.
        writer = BM25SVectorStore(persist_path=pkl_path)
        writer.add_documents(corpus)
        writer.persist()

        # Read side: a fresh instance pointed at the same path.
        reader = BM25SVectorStore(persist_path=pkl_path)
        reader.load()

        hits = reader.retrieve("persistence", top_k=1)
        assert len(hits) == 1
        assert "persistence" in hits[0]["content"]
    finally:
        if os.path.exists(pkl_path):
            os.unlink(pkl_path)
51
+
52
def test_bm25s_empty_query():
    """Query a one-document BM25S store with an empty string.

    Both outcomes are accepted: no results at all, or the single indexed
    document coming back as the first result.
    """
    store = BM25SVectorStore()
    store.add_documents(
        [{"content": "Test document.", "metadata": {"source": "test1"}}]
    )

    hits = store.retrieve("", top_k=1)

    # An empty result set is acceptable; nothing more to check.
    if len(hits) == 0:
        return

    # Otherwise the result must be the document that was added.
    assert hits[0]["content"] == "Test document."
68
+
69
def test_bm25s_no_documents():
    """Retrieving from a store with no documents must raise ValueError."""
    empty_store = BM25SVectorStore()

    expect_uninitialized = pytest.raises(
        ValueError, match="BM25S index not initialized"
    )
    with expect_uninitialized:
        empty_store.retrieve("test query")
tests/test_config.py ADDED
@@ -0,0 +1,42 @@
1
+ import pytest
2
+ import os
3
+ from kssrag.config import Config, VectorStoreType, ChunkerType
4
+
5
def test_config_new_options():
    """Config accepts the OCR mode, streaming flag and BM25S store type."""
    overrides = {
        "OCR_DEFAULT_MODE": "handwritten",
        "ENABLE_STREAMING": True,
        "VECTOR_STORE_TYPE": VectorStoreType.BM25S,
    }

    config = Config(**overrides)

    assert config.OCR_DEFAULT_MODE == "handwritten"
    assert config.ENABLE_STREAMING == True
    assert config.VECTOR_STORE_TYPE == VectorStoreType.BM25S
16
+
17
def test_config_vector_store_types():
    """Each VectorStoreType member compares equal to its string value."""
    expectations = (
        (VectorStoreType.BM25S, "bm25s"),
        (VectorStoreType.BM25, "bm25"),
    )

    for member, raw_value in expectations:
        config = Config(VECTOR_STORE_TYPE=member)
        assert config.VECTOR_STORE_TYPE == raw_value
24
+
25
def test_config_chunker_types():
    """The IMAGE chunker type compares equal to the string "image"."""
    image_config = Config(CHUNKER_TYPE=ChunkerType.IMAGE)
    configured_type = image_config.CHUNKER_TYPE

    assert configured_type == "image"
29
+
30
def test_config_environment_variables():
    """Config picks up OCR_DEFAULT_MODE and ENABLE_STREAMING from the environment.

    The variables are applied with ``patch.dict`` so that ``os.environ`` is
    restored to its prior state when the block exits, including when an
    assertion fails, and so that values that were already set before the
    test are put back rather than deleted.
    """
    from unittest.mock import patch

    env_overrides = {
        "OCR_DEFAULT_MODE": "handwritten",
        "ENABLE_STREAMING": "true",
    }

    with patch.dict(os.environ, env_overrides):
        config = Config()

        assert config.OCR_DEFAULT_MODE == "handwritten"
        assert config.ENABLE_STREAMING == True
@@ -0,0 +1,17 @@
1
+ import pytest
2
+ from kssrag.core.chunkers import ImageChunker, OCR_AVAILABLE
3
+
4
@pytest.mark.skipif(not OCR_AVAILABLE, reason="OCR dependencies not available")
def test_image_chunker_basic():
    """ImageChunker stores the OCR mode it was constructed with."""
    requested_mode = "typed"

    chunker = ImageChunker(ocr_mode=requested_mode)

    assert chunker.ocr_mode == requested_mode
9
+
10
@pytest.mark.skipif(not OCR_AVAILABLE, reason="OCR dependencies not available")
def test_image_chunker_modes():
    """ImageChunker can be built in both "typed" and "handwritten" modes."""
    # Construct every chunker first, then verify, mirroring build-then-check order.
    chunkers = {
        mode: ImageChunker(ocr_mode=mode) for mode in ("typed", "handwritten")
    }

    for mode, chunker in chunkers.items():
        assert chunker.ocr_mode == mode
@@ -0,0 +1,35 @@
1
+ import pytest
2
+ import tempfile
3
+ import os
4
+ from kssrag import KSSRAG, Config
5
+
6
def test_bm25s_integration():
    """Load a text file into KSSRAG configured for BM25S and run one query.

    Only checks that the response is a non-empty string. The temporary
    document is always removed afterwards.
    """
    with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as handle:
        doc_path = handle.name
        handle.write("Test document about Python programming and machine learning.")

    try:
        settings = Config(VECTOR_STORE_TYPE="bm25s", MAX_DOCS_FOR_TESTING=1)

        rag = KSSRAG(config=settings)
        rag.load_document(doc_path, format="text")

        answer = rag.query("Python programming")

        assert isinstance(answer, str)
        assert len(answer) > 0
    finally:
        os.unlink(doc_path)
28
+
29
def test_streaming_integration():
    """Config accepts ENABLE_STREAMING=True and reports it back.

    This only exercises configuration; no streaming request is made here.
    """
    streaming_config = Config(ENABLE_STREAMING=True)
    streaming_flag = streaming_config.ENABLE_STREAMING

    assert streaming_flag == True
tests/test_ocr.py ADDED
@@ -0,0 +1,142 @@
1
+ import pytest
2
+ import tempfile
3
+ import os
4
+ from unittest.mock import Mock, patch, MagicMock
5
+ from kssrag.utils.ocr_loader import OCRLoader
6
+
7
def test_ocr_loader_initialization():
    """OCRLoader exposes the PaddleOCR instance it created as ``paddle_ocr``."""
    with patch('kssrag.utils.ocr_loader.PaddleOCR') as paddle_cls:
        fake_engine = Mock()
        paddle_cls.return_value = fake_engine

        created_loader = OCRLoader()

        assert created_loader.paddle_ocr == fake_engine
15
+
16
def test_ocr_loader_invalid_mode():
    """extract_text raises ValueError for an unrecognized OCR mode."""
    with patch('kssrag.utils.ocr_loader.PaddleOCR') as paddle_cls:
        paddle_cls.return_value = Mock()
        loader = OCRLoader()

        expect_invalid_mode = pytest.raises(ValueError, match="Invalid OCR mode")
        with expect_invalid_mode:
            loader.extract_text("test.jpg", "invalid_mode")
26
+
27
def test_ocr_loader_file_not_found():
    """extract_text raises FileNotFoundError for a path that does not exist."""
    with patch('kssrag.utils.ocr_loader.PaddleOCR') as paddle_cls:
        paddle_cls.return_value = Mock()
        loader = OCRLoader()

        expect_missing_file = pytest.raises(FileNotFoundError)
        with expect_missing_file:
            loader.extract_text("nonexistent.jpg", "typed")
37
+
38
def test_ocr_loader_typed_mode():
    """Typed mode returns pytesseract's text for the opened image.

    PaddleOCR, pytesseract, PIL's Image and the file-existence check are
    all patched inside ``kssrag.utils.ocr_loader``.
    """
    with patch('kssrag.utils.ocr_loader.PaddleOCR') as paddle_cls, \
            patch('kssrag.utils.ocr_loader.pytesseract') as tesseract, \
            patch('kssrag.utils.ocr_loader.Image') as image_module, \
            patch('kssrag.utils.ocr_loader.os.path.exists') as path_exists:
        # Pretend the image file is present on disk.
        path_exists.return_value = True

        # Image.open hands back a stand-in image object.
        opened_image = MagicMock()
        image_module.open.return_value = opened_image

        # Canned OCR output.
        tesseract.image_to_string.return_value = "Typed text content"

        paddle_cls.return_value = Mock()

        loader = OCRLoader()
        extracted = loader.extract_text("test.jpg", "typed")

        assert extracted == "Typed text content"
        tesseract.image_to_string.assert_called_once_with(opened_image)
62
+
63
def test_ocr_loader_handwritten_mode():
    """Handwritten mode returns the text from the PaddleOCR result.

    Also verifies the engine is called once with the image returned by
    ``cv2.imread`` and ``cls=True``.
    """
    with patch('kssrag.utils.ocr_loader.PaddleOCR') as paddle_cls, \
            patch('kssrag.utils.ocr_loader.cv2') as cv2_module, \
            patch('kssrag.utils.ocr_loader.os.path.exists') as path_exists:
        # Pretend the image file is present on disk.
        path_exists.return_value = True

        # imread returns a placeholder instead of pixel data.
        cv2_module.imread.return_value = "mock_image"

        # Canned PaddleOCR output: one detection carrying text and a score.
        fake_engine = Mock()
        fake_engine.ocr.return_value = [[[None, ["Handwritten text", 0.9]]]]
        paddle_cls.return_value = fake_engine

        loader = OCRLoader()
        extracted = loader.extract_text("test.jpg", "handwritten")

        assert extracted == "Handwritten text"
        fake_engine.ocr.assert_called_once_with("mock_image", cls=True)
84
+
85
def test_ocr_loader_paddle_not_initialized():
    """Handwritten extraction raises RuntimeError when ``paddle_ocr`` is None."""
    with patch('kssrag.utils.ocr_loader.PaddleOCR') as paddle_cls:
        # Constructor yields None to simulate a failed engine start-up.
        paddle_cls.return_value = None

        loader = OCRLoader()
        # Set the attribute explicitly so the failure state is guaranteed.
        loader.paddle_ocr = None

        expect_uninitialized = pytest.raises(
            RuntimeError, match="PaddleOCR not initialized"
        )
        with expect_uninitialized:
            loader.extract_text("test.jpg", "handwritten")
95
+
96
def test_ocr_loader_empty_text():
    """Typed mode returns an empty string when OCR yields only whitespace."""
    with patch('kssrag.utils.ocr_loader.PaddleOCR') as paddle_cls, \
            patch('kssrag.utils.ocr_loader.pytesseract') as tesseract, \
            patch('kssrag.utils.ocr_loader.Image') as image_module, \
            patch('kssrag.utils.ocr_loader.os.path.exists') as path_exists:
        # Pretend the image file is present on disk.
        path_exists.return_value = True

        # Image.open hands back a stand-in image object.
        opened_image = MagicMock()
        image_module.open.return_value = opened_image

        # OCR output consisting solely of whitespace.
        tesseract.image_to_string.return_value = " "

        paddle_cls.return_value = Mock()

        loader = OCRLoader()
        extracted = loader.extract_text("test.jpg", "typed")

        assert extracted == ""
119
+
120
@pytest.mark.skipif(not os.getenv('TEST_OCR'), reason="OCR tests require actual OCR dependencies")
def test_ocr_loader_integration():
    """Run OCRLoader in typed mode against a generated PNG.

    Skipped unless the ``TEST_OCR`` environment variable is set. Only the
    result type is asserted because recognized text may vary.
    """
    with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as handle:
        png_path = handle.name

    try:
        # Render black text on a white canvas and save it to the temp path.
        from PIL import Image, ImageDraw
        canvas = Image.new('RGB', (400, 100), color='white')
        pen = ImageDraw.Draw(canvas)
        pen.text((50, 40), "OCR Test Text", fill='black')
        canvas.save(png_path)

        loader = OCRLoader()
        recognized = loader.extract_text(png_path, "typed")

        assert isinstance(recognized, str)
    finally:
        if os.path.exists(png_path):
            os.unlink(png_path)
@@ -0,0 +1,41 @@
1
+ import pytest
2
+ import asyncio
3
+ from kssrag.models.openrouter import OpenRouterLLM
4
+
5
def test_openrouter_streaming_initialization():
    """OpenRouterLLM built with stream=True reports streaming as enabled."""
    streaming_llm = OpenRouterLLM(stream=True)
    stream_flag = streaming_llm.stream

    assert stream_flag == True
9
+
10
def test_openrouter_non_streaming_initialization():
    """OpenRouterLLM built with stream=False reports streaming as disabled."""
    blocking_llm = OpenRouterLLM(stream=False)
    stream_flag = blocking_llm.stream

    assert stream_flag == False
14
+
15
def test_streaming_generator():
    """A subclass overriding predict_stream can be consumed as a generator."""
    # Local subclass replaces predict_stream with fixed chunks.
    class _CannedStreamLLM(OpenRouterLLM):
        def predict_stream(self, messages):
            yield from ("Hello ", "World", "!")

    llm = _CannedStreamLLM(stream=True)
    conversation = [{"role": "user", "content": "test"}]

    received = [piece for piece in llm.predict_stream(conversation)]

    assert received == ["Hello ", "World", "!"]
29
+
30
def test_streaming_fallback_to_non_streaming():
    """predict() still returns a string when predict_stream yields no chunks."""
    class _SilentStreamLLM(OpenRouterLLM):
        def predict_stream(self, messages):
            # The unreachable yield keeps this a generator that produces nothing.
            return
            yield "test"

    llm = _SilentStreamLLM(stream=True)
    conversation = [{"role": "user", "content": "test"}]

    # The call itself must complete without raising.
    outcome = llm.predict(conversation)

    assert isinstance(outcome, str)