kiln-ai 0.19.0__py3-none-any.whl → 0.21.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kiln-ai might be problematic. Click here for more details.

Files changed (158) hide show
  1. kiln_ai/adapters/__init__.py +8 -2
  2. kiln_ai/adapters/adapter_registry.py +43 -208
  3. kiln_ai/adapters/chat/chat_formatter.py +8 -12
  4. kiln_ai/adapters/chat/test_chat_formatter.py +6 -2
  5. kiln_ai/adapters/chunkers/__init__.py +13 -0
  6. kiln_ai/adapters/chunkers/base_chunker.py +42 -0
  7. kiln_ai/adapters/chunkers/chunker_registry.py +16 -0
  8. kiln_ai/adapters/chunkers/fixed_window_chunker.py +39 -0
  9. kiln_ai/adapters/chunkers/helpers.py +23 -0
  10. kiln_ai/adapters/chunkers/test_base_chunker.py +63 -0
  11. kiln_ai/adapters/chunkers/test_chunker_registry.py +28 -0
  12. kiln_ai/adapters/chunkers/test_fixed_window_chunker.py +346 -0
  13. kiln_ai/adapters/chunkers/test_helpers.py +75 -0
  14. kiln_ai/adapters/data_gen/test_data_gen_task.py +9 -3
  15. kiln_ai/adapters/docker_model_runner_tools.py +119 -0
  16. kiln_ai/adapters/embedding/__init__.py +0 -0
  17. kiln_ai/adapters/embedding/base_embedding_adapter.py +44 -0
  18. kiln_ai/adapters/embedding/embedding_registry.py +32 -0
  19. kiln_ai/adapters/embedding/litellm_embedding_adapter.py +199 -0
  20. kiln_ai/adapters/embedding/test_base_embedding_adapter.py +283 -0
  21. kiln_ai/adapters/embedding/test_embedding_registry.py +166 -0
  22. kiln_ai/adapters/embedding/test_litellm_embedding_adapter.py +1149 -0
  23. kiln_ai/adapters/eval/base_eval.py +2 -2
  24. kiln_ai/adapters/eval/eval_runner.py +9 -3
  25. kiln_ai/adapters/eval/g_eval.py +2 -2
  26. kiln_ai/adapters/eval/test_base_eval.py +2 -4
  27. kiln_ai/adapters/eval/test_g_eval.py +4 -5
  28. kiln_ai/adapters/extractors/__init__.py +18 -0
  29. kiln_ai/adapters/extractors/base_extractor.py +72 -0
  30. kiln_ai/adapters/extractors/encoding.py +20 -0
  31. kiln_ai/adapters/extractors/extractor_registry.py +44 -0
  32. kiln_ai/adapters/extractors/extractor_runner.py +112 -0
  33. kiln_ai/adapters/extractors/litellm_extractor.py +386 -0
  34. kiln_ai/adapters/extractors/test_base_extractor.py +244 -0
  35. kiln_ai/adapters/extractors/test_encoding.py +54 -0
  36. kiln_ai/adapters/extractors/test_extractor_registry.py +181 -0
  37. kiln_ai/adapters/extractors/test_extractor_runner.py +181 -0
  38. kiln_ai/adapters/extractors/test_litellm_extractor.py +1192 -0
  39. kiln_ai/adapters/fine_tune/__init__.py +1 -1
  40. kiln_ai/adapters/fine_tune/openai_finetune.py +14 -4
  41. kiln_ai/adapters/fine_tune/test_dataset_formatter.py +2 -2
  42. kiln_ai/adapters/fine_tune/test_fireworks_tinetune.py +2 -6
  43. kiln_ai/adapters/fine_tune/test_openai_finetune.py +108 -111
  44. kiln_ai/adapters/fine_tune/test_together_finetune.py +2 -6
  45. kiln_ai/adapters/ml_embedding_model_list.py +192 -0
  46. kiln_ai/adapters/ml_model_list.py +761 -37
  47. kiln_ai/adapters/model_adapters/base_adapter.py +51 -21
  48. kiln_ai/adapters/model_adapters/litellm_adapter.py +380 -138
  49. kiln_ai/adapters/model_adapters/test_base_adapter.py +193 -17
  50. kiln_ai/adapters/model_adapters/test_litellm_adapter.py +407 -2
  51. kiln_ai/adapters/model_adapters/test_litellm_adapter_tools.py +1103 -0
  52. kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +5 -5
  53. kiln_ai/adapters/model_adapters/test_structured_output.py +113 -5
  54. kiln_ai/adapters/ollama_tools.py +69 -12
  55. kiln_ai/adapters/parsers/__init__.py +1 -1
  56. kiln_ai/adapters/provider_tools.py +205 -47
  57. kiln_ai/adapters/rag/deduplication.py +49 -0
  58. kiln_ai/adapters/rag/progress.py +252 -0
  59. kiln_ai/adapters/rag/rag_runners.py +844 -0
  60. kiln_ai/adapters/rag/test_deduplication.py +195 -0
  61. kiln_ai/adapters/rag/test_progress.py +785 -0
  62. kiln_ai/adapters/rag/test_rag_runners.py +2376 -0
  63. kiln_ai/adapters/remote_config.py +80 -8
  64. kiln_ai/adapters/repair/test_repair_task.py +12 -9
  65. kiln_ai/adapters/run_output.py +3 -0
  66. kiln_ai/adapters/test_adapter_registry.py +657 -85
  67. kiln_ai/adapters/test_docker_model_runner_tools.py +305 -0
  68. kiln_ai/adapters/test_ml_embedding_model_list.py +429 -0
  69. kiln_ai/adapters/test_ml_model_list.py +251 -1
  70. kiln_ai/adapters/test_ollama_tools.py +340 -1
  71. kiln_ai/adapters/test_prompt_adaptors.py +13 -6
  72. kiln_ai/adapters/test_prompt_builders.py +1 -1
  73. kiln_ai/adapters/test_provider_tools.py +254 -8
  74. kiln_ai/adapters/test_remote_config.py +651 -58
  75. kiln_ai/adapters/vector_store/__init__.py +1 -0
  76. kiln_ai/adapters/vector_store/base_vector_store_adapter.py +83 -0
  77. kiln_ai/adapters/vector_store/lancedb_adapter.py +389 -0
  78. kiln_ai/adapters/vector_store/test_base_vector_store.py +160 -0
  79. kiln_ai/adapters/vector_store/test_lancedb_adapter.py +1841 -0
  80. kiln_ai/adapters/vector_store/test_vector_store_registry.py +199 -0
  81. kiln_ai/adapters/vector_store/vector_store_registry.py +33 -0
  82. kiln_ai/datamodel/__init__.py +39 -34
  83. kiln_ai/datamodel/basemodel.py +170 -1
  84. kiln_ai/datamodel/chunk.py +158 -0
  85. kiln_ai/datamodel/datamodel_enums.py +28 -0
  86. kiln_ai/datamodel/embedding.py +64 -0
  87. kiln_ai/datamodel/eval.py +1 -1
  88. kiln_ai/datamodel/external_tool_server.py +298 -0
  89. kiln_ai/datamodel/extraction.py +303 -0
  90. kiln_ai/datamodel/json_schema.py +25 -10
  91. kiln_ai/datamodel/project.py +40 -1
  92. kiln_ai/datamodel/rag.py +79 -0
  93. kiln_ai/datamodel/registry.py +0 -15
  94. kiln_ai/datamodel/run_config.py +62 -0
  95. kiln_ai/datamodel/task.py +2 -77
  96. kiln_ai/datamodel/task_output.py +6 -1
  97. kiln_ai/datamodel/task_run.py +41 -0
  98. kiln_ai/datamodel/test_attachment.py +649 -0
  99. kiln_ai/datamodel/test_basemodel.py +4 -4
  100. kiln_ai/datamodel/test_chunk_models.py +317 -0
  101. kiln_ai/datamodel/test_dataset_split.py +1 -1
  102. kiln_ai/datamodel/test_embedding_models.py +448 -0
  103. kiln_ai/datamodel/test_eval_model.py +6 -6
  104. kiln_ai/datamodel/test_example_models.py +175 -0
  105. kiln_ai/datamodel/test_external_tool_server.py +691 -0
  106. kiln_ai/datamodel/test_extraction_chunk.py +206 -0
  107. kiln_ai/datamodel/test_extraction_model.py +470 -0
  108. kiln_ai/datamodel/test_rag.py +641 -0
  109. kiln_ai/datamodel/test_registry.py +8 -3
  110. kiln_ai/datamodel/test_task.py +15 -47
  111. kiln_ai/datamodel/test_tool_id.py +320 -0
  112. kiln_ai/datamodel/test_vector_store.py +320 -0
  113. kiln_ai/datamodel/tool_id.py +105 -0
  114. kiln_ai/datamodel/vector_store.py +141 -0
  115. kiln_ai/tools/__init__.py +8 -0
  116. kiln_ai/tools/base_tool.py +82 -0
  117. kiln_ai/tools/built_in_tools/__init__.py +13 -0
  118. kiln_ai/tools/built_in_tools/math_tools.py +124 -0
  119. kiln_ai/tools/built_in_tools/test_math_tools.py +204 -0
  120. kiln_ai/tools/mcp_server_tool.py +95 -0
  121. kiln_ai/tools/mcp_session_manager.py +246 -0
  122. kiln_ai/tools/rag_tools.py +157 -0
  123. kiln_ai/tools/test_base_tools.py +199 -0
  124. kiln_ai/tools/test_mcp_server_tool.py +457 -0
  125. kiln_ai/tools/test_mcp_session_manager.py +1585 -0
  126. kiln_ai/tools/test_rag_tools.py +848 -0
  127. kiln_ai/tools/test_tool_registry.py +562 -0
  128. kiln_ai/tools/tool_registry.py +85 -0
  129. kiln_ai/utils/__init__.py +3 -0
  130. kiln_ai/utils/async_job_runner.py +62 -17
  131. kiln_ai/utils/config.py +24 -2
  132. kiln_ai/utils/env.py +15 -0
  133. kiln_ai/utils/filesystem.py +14 -0
  134. kiln_ai/utils/filesystem_cache.py +60 -0
  135. kiln_ai/utils/litellm.py +94 -0
  136. kiln_ai/utils/lock.py +100 -0
  137. kiln_ai/utils/mime_type.py +38 -0
  138. kiln_ai/utils/open_ai_types.py +94 -0
  139. kiln_ai/utils/pdf_utils.py +38 -0
  140. kiln_ai/utils/project_utils.py +17 -0
  141. kiln_ai/utils/test_async_job_runner.py +151 -35
  142. kiln_ai/utils/test_config.py +138 -1
  143. kiln_ai/utils/test_env.py +142 -0
  144. kiln_ai/utils/test_filesystem_cache.py +316 -0
  145. kiln_ai/utils/test_litellm.py +206 -0
  146. kiln_ai/utils/test_lock.py +185 -0
  147. kiln_ai/utils/test_mime_type.py +66 -0
  148. kiln_ai/utils/test_open_ai_types.py +131 -0
  149. kiln_ai/utils/test_pdf_utils.py +73 -0
  150. kiln_ai/utils/test_uuid.py +111 -0
  151. kiln_ai/utils/test_validation.py +524 -0
  152. kiln_ai/utils/uuid.py +9 -0
  153. kiln_ai/utils/validation.py +90 -0
  154. {kiln_ai-0.19.0.dist-info → kiln_ai-0.21.0.dist-info}/METADATA +12 -5
  155. kiln_ai-0.21.0.dist-info/RECORD +211 -0
  156. kiln_ai-0.19.0.dist-info/RECORD +0 -115
  157. {kiln_ai-0.19.0.dist-info → kiln_ai-0.21.0.dist-info}/WHEEL +0 -0
  158. {kiln_ai-0.19.0.dist-info → kiln_ai-0.21.0.dist-info}/licenses/LICENSE.txt +0 -0
@@ -0,0 +1,346 @@
1
+ from typing import Callable
2
+ from unittest.mock import patch
3
+
4
+ import pytest
5
+ from llama_index.core.text_splitter import SentenceSplitter
6
+
7
+ from kiln_ai.adapters.chunkers.base_chunker import ChunkingResult
8
+ from kiln_ai.adapters.chunkers.fixed_window_chunker import FixedWindowChunker
9
+ from kiln_ai.adapters.chunkers.helpers import clean_up_text
10
+ from kiln_ai.datamodel.chunk import ChunkerConfig, ChunkerType
11
+
12
+
13
@pytest.fixture
def mock_fixed_window_chunker_factory() -> Callable[[int, int], FixedWindowChunker]:
    """Fixture: a factory building a FixedWindowChunker for a given size/overlap."""

    def _build(chunk_size: int, chunk_overlap: int) -> FixedWindowChunker:
        # Minimal config carrying only the fields the chunker reads.
        config = ChunkerConfig(
            name="test-chunker",
            chunker_type=ChunkerType.FIXED_WINDOW,
            properties={"chunk_size": chunk_size, "chunk_overlap": chunk_overlap},
        )
        return FixedWindowChunker(config)

    return _build
25
+
26
+
27
async def test_fixed_window_chunker_wrong_chunker_type(
    mock_fixed_window_chunker_factory,
):
    """A chunker type other than FIXED_WINDOW must be rejected with ValueError."""
    props = {"chunk_size": 100, "chunk_overlap": 10}
    with pytest.raises(ValueError):
        # Construction stays inside the raises block: either config validation
        # or the chunker constructor may be the one that raises.
        FixedWindowChunker(
            ChunkerConfig(
                name="test-chunker",
                chunker_type="wrong-chunker-type",  # type: ignore
                properties=props,
            )
        )
38
+
39
+
40
async def test_fixed_window_chunker_chunk_empty_text(
    mock_fixed_window_chunker_factory,
):
    """Empty input short-circuits: empty result, splitter never invoked."""
    chunker = mock_fixed_window_chunker_factory(100, 10)
    with patch.object(SentenceSplitter, "split_text") as mock_split_text:
        result = await chunker.chunk("")
        assert result == ChunkingResult(chunks=[])
        # The underlying splitter must not be called for empty text.
        mock_split_text.assert_not_called()
48
+
49
+
50
@pytest.mark.parametrize(
    "chunk_size,chunk_overlap,expected_chunks",
    [(12, 6, 43), (256, 12, 2), (1024, 64, 1), (2048, 128, 1)],
)
async def test_fixed_window_chunker_concrete_chunker(
    chunk_size, chunk_overlap, expected_chunks, mock_fixed_window_chunker_factory
):
    """
    Sanity-check splitting of markdown-flavored English text. The expected counts
    pin the current chunk boundaries; they are illustrative, not values we
    particularly care about.
    """
    text_to_chunk = """# How Ice Cubes Make Drinks Colder

## Introduction

When you drop an ice cube into a drink, it does more than just float and look refreshing. It changes the thermal state of the liquid in a precise and physically predictable way. While it may seem like a simple act, the science behind how ice cubes make drinks colder is a fascinating interplay of thermodynamics, phase change, and heat transfer.

## The Science of Cooling

### Heat Transfer Basics

At the core of the process is the concept of **heat exchange**. Heat naturally flows from warmer objects to colder ones until thermal equilibrium is reached. When an ice cube, which is at 0°C (32°F), is placed in a drink that is warmer than that, heat begins to flow from the liquid to the ice. This transfer of energy cools the drink while simultaneously warming the ice.

### Latent Heat of Fusion

However, it's not just about the ice warming up. The real magic happens because of **latent heat**—specifically, the heat of fusion. When ice melts, it doesn't instantly become the same temperature as the liquid around it. Instead, it absorbs a significant amount of energy just to change from a solid to a liquid, without its temperature rising. This phase change requires approximately 334 joules per gram of ice, all taken from the drink, which cools as a result."""

    chunker = mock_fixed_window_chunker_factory(chunk_size, chunk_overlap)
    result = await chunker.chunk(text_to_chunk)
    chunk_count = len(result.chunks)
    assert chunk_count == expected_chunks, (
        f"Expected {expected_chunks} chunks, got {chunk_count}. If this is the result of an intentional change to chunk boundaries, please update the expected number of chunks in the test. Note that changes to chunk boundaries can have a downstream impact on retrieval."
    )
82
+
83
+
84
@pytest.mark.parametrize(
    "chunk_size,chunk_overlap,expected_chunks",
    [(12, 6, 120), (256, 12, 4), (1024, 64, 1), (2048, 128, 1)],
)
async def test_fixed_window_chunker_concrete_chunker_zh(
    chunk_size, chunk_overlap, expected_chunks, mock_fixed_window_chunker_factory
):
    """
    Sanity-check splitting of Chinese text. The expected counts pin the current
    chunk boundaries; they are illustrative, not values we particularly care about.
    """
    text_to_chunk = """火山是地表下在岩浆库中的高温岩浆及其有关的气体、碎屑从行星的地壳中喷出而形成的,具有特殊形態的地质结构。

岩石圈由若干板块组成,它们漂浮在地幔的软流层之上,在板块的交界处岩石圈比较破碎,地下岩浆容易在此喷发形成火山。[1] 火山可以分为死火山、休眠火山和活火山。在一段时间内,没有出現喷发事件的活火山叫做睡火山(休眠火山)。另外还有一种泥火山,它在科学上严格来说不属于火山,但是许多社会大众也把它看作是火山的一种类型。

火山爆发可能会造成许多危害,常伴有地震,影响范围不仅在火山爆发附近。其中一个危险是火山灰可能对飞机构成威胁,特别是那些喷气发动机,其中灰尘颗粒可以在高温下熔化; 熔化的颗粒随后粘附到涡轮机叶片并改变它们的形状,从而中断涡轮发动机的操作。大型爆发可能会影响气温,火山灰和硫酸液滴遮挡太阳辐射并冷却地球的低层大气(或对流层); 然而,它们也吸收地球辐射的热量,从而使高层大气(或平流层)变暖。 历史上,火山冬天造成了灾难性的饥荒。

虽然火山喷发会对人类造成危害,但同时它也带来一些好处。例如:可以促进宝石的形成;扩大陆地的面积(夏威夷群岛就是由火山喷发而形成的);作为观光旅游考察景点,推动旅游业,如日本的富士山。[2] 专门研究火山活动的学科称为火山学[3]。
"""  # noqa: RUF001

    chunker = mock_fixed_window_chunker_factory(chunk_size, chunk_overlap)
    result = await chunker.chunk(text_to_chunk)
    chunk_count = len(result.chunks)
    assert chunk_count == expected_chunks, (
        f"Expected {expected_chunks} chunks, got {chunk_count}. If this is the result of an intentional change to chunk boundaries, please update the expected number of chunks in the test. Note that changes to chunk boundaries can have a downstream impact on retrieval."
    )
109
+
110
+
111
@pytest.mark.parametrize(
    "chunk_size,chunk_overlap,expected_chunks",
    [(12, 6, 39), (256, 12, 1), (1024, 64, 1), (2048, 128, 1)],
)
async def test_fixed_window_chunker_concrete_chunker_no_punctuation(
    chunk_size, chunk_overlap, expected_chunks, mock_fixed_window_chunker_factory
):
    """
    The chunker must still split text that carries no punctuation at all.
    The expected counts pin current boundaries and are illustrative only.
    """
    text_to_chunk = """how ice cubes make drinks colder introduction when you drop an ice cube into a drink it does more than just float and look refreshing it changes the thermal state of the liquid in a precise and physically predictable way while it may seem like a simple act the science behind how ice cubes make drinks colder is a fascinating interplay of thermodynamics phase change and heat transfer the science of cooling heat transfer basics at the core of the process is the concept of heat exchange heat naturally flows from warmer objects to colder ones until thermal equilibrium is reached when an ice cube which is at 0c 32f is placed in a drink that is warmer than that heat begins to flow from the liquid to the ice this transfer of energy cools the drink while simultaneously warming the ice latent heat of fusion however its not just about the ice warming up the real magic happens because of latent heat specifically the heat of fusion when ice melts it doesnt instantly become the same temperature as the liquid around it instead it absorbs a significant amount of energy just to change from a solid to a liquid without its temperature rising this phase change requires approximately 334 joules per gram of ice all taken from the drink which cools as a result"""

    chunker = mock_fixed_window_chunker_factory(chunk_size, chunk_overlap)
    result = await chunker.chunk(text_to_chunk)
    chunk_count = len(result.chunks)
    assert chunk_count == expected_chunks, (
        f"Expected {expected_chunks} chunks, got {chunk_count}. If this is the result of an intentional change to chunk boundaries, please update the expected number of chunks in the test. Note that changes to chunk boundaries can have a downstream impact on retrieval."
    )
129
+
130
+
131
@pytest.mark.parametrize(
    "chunk_size,chunk_overlap,expected_chunks",
    [(12, 6, 106), (256, 12, 3), (1024, 64, 1), (2048, 128, 1)],
)
async def test_fixed_window_chunker_concrete_chunker_no_punctuation_zh(
    chunk_size, chunk_overlap, expected_chunks, mock_fixed_window_chunker_factory
):
    """
    The chunker must still split Chinese text that carries no punctuation.
    The expected counts pin current boundaries and are illustrative only.
    """
    text_to_chunk = "火山是地表下在岩浆库中的高温岩浆及其有关的气体碎屑从行星的地壳中喷出而形成的具有特殊形態的地质结构岩石圈由若干板块组成它们漂浮在地幔的软流层之上在板块的交界处岩石圈比较破碎地下岩浆容易在此喷发形成火山火山可以分为死火山休眠火山和活火山在一段时间内没有出現喷发事件的活火山叫做睡火山休眠火山另外还有一种泥火山它在科学上严格来说不属于火山但是许多社会大众也把它看作是火山的一种类型火山爆发可能会造成许多危害常伴有地震影响范围不仅在火山爆发附近其中一个危险是火山灰可能对飞机构成威胁特别是那些喷气发动机其中灰尘颗粒可以在高温下熔化熔化的颗粒随后粘附到涡轮机叶片并改变它们的形状从而中断涡轮发动机的操作大型爆发可能会影响气温火山灰和硫酸液滴遮挡太阳辐射并冷却地球的低层大气或对流层然而它们也吸收地球辐射的热量从而使高层大气或平流层变暖历史上火山冬天造成了灾难性的饥荒虽然火山喷发会对人类造成危害但同时它也带来一些好处例如可以促进宝石的形成扩大陆地的面积夏威夷群岛就是由火山喷发而形成的作为观光旅游考察景点推动旅游业如日本的富士山专门研究火山活动的学科称为火山学"

    chunker = mock_fixed_window_chunker_factory(chunk_size, chunk_overlap)
    result = await chunker.chunk(text_to_chunk)
    chunk_count = len(result.chunks)
    assert chunk_count == expected_chunks, (
        f"Expected {expected_chunks} chunks, got {chunk_count}. If this is the result of an intentional change to chunk boundaries, please update the expected number of chunks in the test. Note that changes to chunk boundaries can have a downstream impact on retrieval."
    )
148
+
149
+
150
async def test_fixed_window_chunker_preserves_text_content(
    mock_fixed_window_chunker_factory,
):
    """
    Chunks, when stitched back together, must still contain every original sentence.
    """
    chunker = mock_fixed_window_chunker_factory(100, 10)
    source = (
        "This is a test sentence. This is another test sentence. And a third one."
    )

    result = await chunker.chunk(source)
    rejoined = " ".join([c.text for c in result.chunks])

    # Spacing may differ across chunk boundaries, but no content may be lost.
    for fragment in (
        "This is a test sentence",
        "This is another test sentence",
        "And a third one",
    ):
        assert fragment in rejoined
171
+
172
+
173
@pytest.mark.parametrize(
    "text",
    [" ", "\n\n\n", "\t\t\t", " \n\t "],
)
async def test_fixed_window_chunker_handles_whitespace_only(
    mock_fixed_window_chunker_factory,
    text,
):
    """Whitespace-only input must produce no chunks at all."""
    chunker = mock_fixed_window_chunker_factory(100, 10)

    result = await chunker.chunk(text)

    assert not result.chunks
190
+
191
+
192
async def test_fixed_window_chunker_handles_special_characters(
    mock_fixed_window_chunker_factory,
):
    """Emojis, symbols and non-latin unicode must survive chunking intact."""
    chunker = mock_fixed_window_chunker_factory(50, 5)
    text_with_special_chars = (
        "Hello 🌍! This has emojis 🚀 and symbols ©®™. Also unicode: αβγδε."
    )

    result = await chunker.chunk(text_with_special_chars)

    # At least one chunk must come back.
    assert len(result.chunks) > 0

    # Reassemble and verify none of the special characters were mangled.
    rejoined = " ".join(c.text for c in result.chunks)
    for token in ("Hello", "This has emojis", "🌍", "🚀", "©®™", "αβγδε"):
        assert token in rejoined
216
+
217
+
218
async def test_fixed_window_chunker_handles_single_character(
    mock_fixed_window_chunker_factory,
):
    """A one-character input comes back as exactly one chunk holding that character."""
    chunker = mock_fixed_window_chunker_factory(100, 10)

    result = await chunker.chunk("A")

    assert len(result.chunks) == 1
    assert result.chunks[0].text == "A"
229
+
230
+
231
async def test_fixed_window_chunker_handles_single_word(
    mock_fixed_window_chunker_factory,
):
    """A one-word input comes back as exactly one chunk holding that word."""
    chunker = mock_fixed_window_chunker_factory(100, 10)

    result = await chunker.chunk("Hello")

    assert len(result.chunks) == 1
    assert result.chunks[0].text == "Hello"
242
+
243
+
244
async def test_fixed_window_chunker_handles_single_sentence(
    mock_fixed_window_chunker_factory,
):
    """A single short sentence fits one window and comes back unmodified."""
    chunker = mock_fixed_window_chunker_factory(100, 10)

    result = await chunker.chunk("This is a single sentence.")

    assert len(result.chunks) == 1
    assert result.chunks[0].text == "This is a single sentence."
255
+
256
+
257
async def test_fixed_window_chunker_very_large_text(mock_fixed_window_chunker_factory):
    """A large document (1000 repeated sentences) chunks without errors."""
    chunker = mock_fixed_window_chunker_factory(100, 10)

    # Build bulk input by repeating one sentence.
    large_text = "This is a test sentence. " * 1000

    result = await chunker.chunk(large_text)

    # Must split into multiple chunks, each carrying non-whitespace content.
    assert len(result.chunks) > 1
    assert all(c.text.strip() != "" for c in result.chunks)
274
+
275
+
276
@pytest.mark.parametrize(
    "whitespace_length",
    [10_000],
)
async def test_fixed_window_chunker_removes_consecutive_whitespace(
    mock_fixed_window_chunker_factory, whitespace_length
):
    """
    Huge runs of consecutive whitespace used to crash the underlying splitter
    with a rust error; clean_up_text must be applied to collapse them first.
    """
    template = """Water plays an important role in the world economy. Approximately 70% of the fresh water used by humans goes to agriculture.[26] Fishing in salt and fresh water bodies has been, and continues to be, a major source of food for many parts of the world, providing 6.5% of global protein.[27] Much of the long-distance trade of commodities (such as oil, natural gas, and manufactured products) is transported by boats through seas, rivers, lakes, and canals. Large quantities of water, ice, and steam are used for cooling and heating in industry and homes. Water is an excellent solvent for a wide variety of substances, both mineral and organic; as such, it is widely used in industrial processes and in cooking and washing. Water, ice, and snow are also central to many sports and other forms of entertainment, such as swimming, pleasure boating, boat racing, surfing, sport fishing, diving, ice skating, snowboarding, and skiing.
{WHITESPACE_PROBLEM_HERE}
The word water comes from Old English wæter, from Proto-Germanic *watar (source also of Old Saxon watar, Old Frisian wetir, Dutch water, Old High German wazzar, German Wasser, vatn, Gothic 𐍅𐌰𐍄𐍉 (wato)), from Proto-Indo-European *wod-or, suffixed form of root *wed- ('water'; 'wet').[28] Also cognate, through the Indo-European root, with Greek ύδωρ (ýdor; from Ancient Greek ὕδωρ (hýdōr), whence English 'hydro-'), Russian вода́ (vodá), Irish uisce, and Albanian ujë.
"""
    text = template.replace("{WHITESPACE_PROBLEM_HERE}", " " * whitespace_length)

    chunker = mock_fixed_window_chunker_factory(32, 8)

    with patch(
        "kiln_ai.adapters.chunkers.base_chunker.clean_up_text"
    ) as mock_clean_up_text:
        mock_clean_up_text.side_effect = clean_up_text
        output = await chunker.chunk(text)

    mock_clean_up_text.assert_called_once_with(text)
    assert len(output.chunks) > 1
299
+
300
+
301
@pytest.mark.parametrize(
    "whitespace_length",
    [100_000, 1_000_000, 5_000_000, 10_000_000],
)
@pytest.mark.paid
async def test_fixed_window_chunker_removes_consecutive_whitespace_heavy_load(
    mock_fixed_window_chunker_factory, whitespace_length
):
    """
    Heavy-load variant of the consecutive-whitespace test: 100K to 10M whitespace
    characters. Without clean_up_text the splitter crashes with a rust error.
    """
    template = """Water plays an important role in the world economy. Approximately 70% of the fresh water used by humans goes to agriculture.[26] Fishing in salt and fresh water bodies has been, and continues to be, a major source of food for many parts of the world, providing 6.5% of global protein.[27] Much of the long-distance trade of commodities (such as oil, natural gas, and manufactured products) is transported by boats through seas, rivers, lakes, and canals. Large quantities of water, ice, and steam are used for cooling and heating in industry and homes. Water is an excellent solvent for a wide variety of substances, both mineral and organic; as such, it is widely used in industrial processes and in cooking and washing. Water, ice, and snow are also central to many sports and other forms of entertainment, such as swimming, pleasure boating, boat racing, surfing, sport fishing, diving, ice skating, snowboarding, and skiing.
{WHITESPACE_PROBLEM_HERE}
The word water comes from Old English wæter, from Proto-Germanic *watar (source also of Old Saxon watar, Old Frisian wetir, Dutch water, Old High German wazzar, German Wasser, vatn, Gothic 𐍅𐌰𐍄𐍉 (wato)), from Proto-Indo-European *wod-or, suffixed form of root *wed- ('water'; 'wet').[28] Also cognate, through the Indo-European root, with Greek ύδωρ (ýdor; from Ancient Greek ὕδωρ (hýdōr), whence English 'hydro-'), Russian вода́ (vodá), Irish uisce, and Albanian ujë.
"""
    text = template.replace("{WHITESPACE_PROBLEM_HERE}", " " * whitespace_length)

    chunker = mock_fixed_window_chunker_factory(32, 8)

    with patch(
        "kiln_ai.adapters.chunkers.base_chunker.clean_up_text"
    ) as mock_clean_up_text:
        mock_clean_up_text.side_effect = clean_up_text
        output = await chunker.chunk(text)

    mock_clean_up_text.assert_called_once_with(text)
    assert len(output.chunks) > 1
325
+
326
+
327
# this test takes a long time to run
@pytest.mark.paid
@pytest.mark.parametrize(
    "number_of_sentences",
    [10, 100, 1_000, 10_000],
)
async def test_fixed_window_chunker_handle_large_text(
    mock_fixed_window_chunker_factory, number_of_sentences
):
    """Stress test: chunk a very large document built by repeating one paragraph."""
    sentence = """Water plays an important role in the world economy. Approximately 70% of the fresh water used by humans goes to agriculture.[26] Fishing in salt and fresh water bodies has been, and continues to be, a major source of food for many parts of the world, providing 6.5% of global protein.[27] Much of the long-distance trade of commodities (such as oil, natural gas, and manufactured products) is transported by boats through seas, rivers, lakes, and canals. Large quantities of water, ice, and steam are used for cooling and heating in industry and homes. Water is an excellent solvent for a wide variety of substances, both mineral and organic; as such, it is widely used in industrial processes and in cooking and washing. Water, ice, and snow are also central to many sports and other forms of entertainment, such as swimming, pleasure boating, boat racing, surfing, sport fishing, diving, ice skating, snowboarding, and skiing."""
    text = sentence * number_of_sentences

    chunker = mock_fixed_window_chunker_factory(32, 8)

    with patch(
        "kiln_ai.adapters.chunkers.base_chunker.clean_up_text"
    ) as mock_clean_up_text:
        mock_clean_up_text.side_effect = clean_up_text
        output = await chunker.chunk(text)

    mock_clean_up_text.assert_called_once_with(text)
    assert len(output.chunks) > 1
@@ -0,0 +1,75 @@
1
+ import pytest
2
+
3
+ from kiln_ai.adapters.chunkers.helpers import clean_up_text
4
+
5
+
6
def generate_consecutive_char_string(length: int, char: str) -> str:
    """Return a string made of `length` consecutive repetitions of `char`."""
    return "".join([char] * length)
8
+
9
+
10
@pytest.mark.parametrize(
    "text,expected",
    [
        # 1-5 consecutive newlines are below the cap and pass through unchanged.
        ("Hello\nWorld", "Hello\nWorld"),
        ("Hello\n\nWorld", "Hello\n\nWorld"),
        ("Hello\n\n\nWorld", "Hello\n\n\nWorld"),
        ("Hello\n\n\n\nWorld", "Hello\n\n\n\nWorld"),
        ("Hello\n\n\n\n\nWorld", "Hello\n\n\n\n\nWorld"),
        # Runs of 6+ consecutive newlines are collapsed to exactly 6.
        ("Hello\n\n\n\n\n\nWorld", "Hello\n\n\n\n\n\nWorld"),  # exactly 6, unchanged
        ("Hello\n\n\n\n\n\n\nWorld", "Hello\n\n\n\n\n\nWorld"),  # 7 newlines -> 6
        ("Hello\n\n\n\n\n\n\n\nWorld", "Hello\n\n\n\n\n\nWorld"),  # 8 newlines -> 6
        (
            "Hello\n\n\n\n\n\n\n\n\n\nWorld",
            "Hello\n\n\n\n\n\nWorld",
        ),  # 10 newlines -> 6
        # Single spaces should remain unchanged.
        # NOTE(review): the five identical pairs below look like 1-5 consecutive
        # spaces collapsed by the rendering this file was recovered from —
        # confirm against the repository source.
        ("Hello World", "Hello World"),
        ("Hello World", "Hello World"),
        ("Hello World", "Hello World"),
        ("Hello World", "Hello World"),
        ("Hello World", "Hello World"),
        # Runs of 50+ consecutive spaces are collapsed to exactly 50.
        (
            "Hello" + " " * 50 + "World",
            "Hello" + " " * 50 + "World",
        ),  # exactly 50, unchanged
        ("Hello" + " " * 51 + "World", "Hello" + " " * 50 + "World"),  # 51 spaces -> 50
        (
            "Hello" + " " * 100 + "World",
            "Hello" + " " * 50 + "World",
        ),  # 100 spaces -> 50
        # Mixed cases: newline runs and space runs are each capped independently.
        (
            "Hello\n\n\n\n\n\n\nWorld" + " " * 60 + "Test",
            "Hello\n\n\n\n\n\nWorld" + " " * 50 + "Test",
        ),
        (
            "Text\n\n\n\n\n\n\n\n\n\nMore" + " " * 30 + "Text",
            "Text\n\n\n\n\n\nMore" + " " * 30 + "Text",
        ),
    ],
)
def test_clean_up_text(text, expected):
    """clean_up_text caps consecutive newlines at 6 and consecutive spaces at 50."""
    assert clean_up_text(text) == expected
56
+
57
+
58
text = """Water is an inorganic compound with the chemical formula H2O. It is a transparent, tasteless, odorless,[c] and nearly colorless chemical substance. It is the main constituent of Earth's hydrosphere and the fluids of all known living organisms in which it acts as a solvent. Water, being a polar molecule, undergoes strong intermolecular hydrogen bonding which is a large contributor to its physical and chemical properties.[20] It is vital for all known forms of life, despite not providing food energy or being an organic micronutrient. Due to its presence in all organisms, its chemical stability, its worldwide abundance and its strong polarity relative to its small molecular size; water is often referred to as the "universal solvent".[21]"""

# 1000-character runs, far beyond the caps clean_up_text enforces.
long_whitespace_string = generate_consecutive_char_string(1000, " ")
long_newlines_string = generate_consecutive_char_string(1000, "\n")

string_with_whitespace = f"{text}{long_whitespace_string}{text}"
string_with_newlines = f"{text}{long_newlines_string}{text}"


@pytest.mark.parametrize(
    "text,expected",
    [
        # A 1000-space run collapses to exactly 50 spaces.
        (string_with_whitespace, text + " " * 50 + text),
        # A 1000-newline run collapses to exactly 6 newlines.
        (string_with_newlines, text + "\n" * 6 + text),
    ],
)
def test_clean_up_text_large_text(text, expected):
    """clean_up_text handles pathological whitespace runs embedded in real prose."""
    assert clean_up_text(text) == expected
@@ -111,7 +111,9 @@ async def test_data_gen_all_models_providers(
111
111
  _, provider = get_model_and_provider(model_name, provider_name)
112
112
  if not provider.supports_data_gen:
113
113
  # pass if the model doesn't support data gen (testing the support flag is part of this)
114
- return
114
+ pytest.skip(
115
+ f"Skipping {model_name} {provider_name} because it does not support data gen"
116
+ )
115
117
 
116
118
  data_gen_task = DataGenCategoriesTask(gen_type="training", guidance=None)
117
119
  data_gen_input = DataGenCategoriesTaskInput.from_task(base_task, num_subtopics=6)
@@ -257,7 +259,9 @@ async def test_data_gen_sample_all_models_providers(
257
259
  _, provider = get_model_and_provider(model_name, provider_name)
258
260
  if provider is None or not provider.supports_data_gen:
259
261
  # pass if the model doesn't support data gen (testing the support flag is part of this)
260
- return
262
+ pytest.skip(
263
+ f"Skipping {model_name} {provider_name} because it does not support data gen"
264
+ )
261
265
 
262
266
  data_gen_task = DataGenSampleTask(
263
267
  target_task=base_task, gen_type="training", guidance=None
@@ -313,7 +317,9 @@ async def test_data_gen_sample_all_models_providers_with_structured_output(
313
317
  _, provider = get_model_and_provider(model_name, provider_name)
314
318
  if not provider.supports_data_gen:
315
319
  # pass if the model doesn't support data gen (testing the support flag is part of this)
316
- return
320
+ pytest.skip(
321
+ f"Skipping {model_name} {provider_name} because it does not support data gen"
322
+ )
317
323
 
318
324
  data_gen_task = DataGenSampleTask(
319
325
  target_task=task, gen_type="training", guidance=None
@@ -0,0 +1,119 @@
1
import asyncio
from typing import List

import httpx
import openai
from pydantic import BaseModel, Field

from kiln_ai.adapters.ml_model_list import ModelProviderName, built_in_models
from kiln_ai.utils.config import Config
9
+
10
+
11
def docker_model_runner_base_url() -> str:
    """
    Return the base URL for Docker Model Runner API connections.

    Returns:
        The configured base URL when one is set, otherwise the default
        localhost endpoint.
    """
    configured = Config.shared().docker_model_runner_base_url
    # Fall back to the default local endpoint when no URL is configured.
    return configured if configured else "http://localhost:12434/engines/llama.cpp"
23
+
24
+
25
async def docker_model_runner_online() -> bool:
    """
    Check whether the Docker Model Runner service is available and responding.

    Returns:
        True if the /v1/models endpoint answers with a success status,
        False on connection failures or HTTP error responses.
    """
    try:
        base_url = docker_model_runner_base_url()
        # Docker Model Runner uses OpenAI-compatible endpoints
        async with httpx.AsyncClient() as client:
            response = await client.get(f"{base_url}/v1/models", timeout=5.0)
            response.raise_for_status()
    except httpx.HTTPError:
        # httpx.HTTPError is the common base of RequestError (transport
        # failures) and HTTPStatusError (raised by raise_for_status on
        # 4xx/5xx). Catching only RequestError let error responses escape
        # and crash the caller instead of reporting "offline".
        return False
    return True
41
+
42
+
43
class DockerModelRunnerConnection(BaseModel):
    """Connection status and model inventory reported by Docker Model Runner."""

    message: str
    version: str | None = None
    supported_models: List[str]
    untested_models: List[str] = Field(default_factory=list)

    def all_models(self) -> List[str]:
        """Return supported and untested model names as one combined list."""
        return [*self.supported_models, *self.untested_models]
51
+
52
+
53
# Parse the Docker Model Runner /v1/models response
def parse_docker_model_runner_models(
    models: List[openai.types.Model],
) -> DockerModelRunnerConnection | None:
    """
    Partition the models reported by Docker Model Runner into those present in
    Kiln's built-in model list and those that are untested.

    Args:
        models: Model entries returned by the /v1/models endpoint.

    Returns:
        A DockerModelRunnerConnection describing the available models; when no
        models are reported at all, a connection with a guidance message and
        empty model lists.
    """
    # Build the set of model IDs we support for Docker Model Runner from the
    # built-in model list. A set gives O(1) membership tests in the loop below.
    supported_docker_models = {
        provider.model_id
        for model in built_in_models
        for provider in model.providers
        if provider.name == ModelProviderName.docker_model_runner
    }
    # Note: Docker Model Runner aliases will be added when we configure models

    available_supported_models = []
    untested_models = []
    for model in models:
        if model.id in supported_docker_models:
            available_supported_models.append(model.id)
        else:
            untested_models.append(model.id)

    if available_supported_models or untested_models:
        return DockerModelRunnerConnection(
            message="Docker Model Runner connected",
            supported_models=available_supported_models,
            untested_models=untested_models,
        )

    # Reachable only when the endpoint returned zero models.
    return DockerModelRunnerConnection(
        message="Docker Model Runner is running, but no supported models are available. Ensure models like 'ai/llama3.2:3B-Q4_K_M', 'ai/qwen3:8B-Q4_K_M', or 'ai/gemma3n:4B-Q4_K_M' are loaded.",
        supported_models=[],
        untested_models=[],
    )
88
+
89
+
90
async def get_docker_model_runner_connection(
    custom_url: str | None = None,
) -> DockerModelRunnerConnection | None:
    """
    Gets the connection status for Docker Model Runner.

    Args:
        custom_url: Optional custom URL to use instead of the configured one

    Returns:
        The parsed connection info, or None when the service is unreachable
        or errors out.
    """
    try:
        base_url = custom_url or docker_model_runner_base_url()
        # Use OpenAI client to get models list
        client = openai.OpenAI(
            api_key="dummy",  # Docker Model Runner doesn't require API key
            base_url=f"{base_url}/v1",
            max_retries=0,
        )
        # client.models.list() performs a blocking network call; run it in a
        # worker thread so it doesn't stall the event loop this coroutine
        # runs on.
        models_response = await asyncio.to_thread(client.models.list)

    except (openai.APIConnectionError, openai.APIError, httpx.RequestError):
        return None

    return parse_docker_model_runner_models(list(models_response))
113
+
114
+
115
def docker_model_runner_model_installed(
    conn: DockerModelRunnerConnection, model_name: str
) -> bool:
    """Return True when model_name appears in the connection's model lists."""
    return model_name in conn.all_models()
File without changes
@@ -0,0 +1,44 @@
1
+ import logging
2
+ from abc import ABC, abstractmethod
3
+ from typing import List
4
+
5
+ from litellm import Usage
6
+ from pydantic import BaseModel, Field
7
+
8
+ from kiln_ai.datamodel.embedding import EmbeddingConfig
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
class Embedding(BaseModel):
    """A single embedding vector produced for one input text."""

    vector: list[float] = Field(description="The vector of the embedding.")
15
+
16
+
17
class EmbeddingResult(BaseModel):
    """Embeddings for a batch of input texts, with optional usage info."""

    embeddings: list[Embedding] = Field(description="The embeddings of the text.")

    usage: Usage | None = Field(default=None, description="The usage of the embedding.")
21
+
22
+
23
class BaseEmbeddingAdapter(ABC):
    """
    Base class for all embedding adapters.

    Should be subclassed by each embedding adapter.
    """

    def __init__(self, embedding_config: EmbeddingConfig):
        self.embedding_config = embedding_config

    async def generate_embeddings(self, input_texts: List[str]) -> EmbeddingResult:
        """Embed input_texts, short-circuiting to an empty result for empty input."""
        if input_texts:
            return await self._generate_embeddings(input_texts)
        # Nothing to embed — skip the provider call entirely.
        return EmbeddingResult(embeddings=[], usage=None)

    @abstractmethod
    async def _generate_embeddings(self, input_texts: List[str]) -> EmbeddingResult:
        """Adapter-specific embedding implementation; receives non-empty input."""
        pass
@@ -0,0 +1,32 @@
1
+ from kiln_ai.adapters.embedding.base_embedding_adapter import BaseEmbeddingAdapter
2
+ from kiln_ai.adapters.embedding.litellm_embedding_adapter import LitellmEmbeddingAdapter
3
+ from kiln_ai.adapters.provider_tools import (
4
+ core_provider,
5
+ lite_llm_core_config_for_provider,
6
+ )
7
+ from kiln_ai.datamodel.datamodel_enums import ModelProviderName
8
+ from kiln_ai.datamodel.embedding import EmbeddingConfig
9
+
10
+
11
+ def embedding_adapter_from_type(
12
+ embedding_config: EmbeddingConfig,
13
+ ) -> BaseEmbeddingAdapter:
14
+ try:
15
+ provider_enum = ModelProviderName(embedding_config.model_provider_name)
16
+ except ValueError:
17
+ raise ValueError(
18
+ f"Unsupported model provider name: {embedding_config.model_provider_name.value}. "
19
+ )
20
+
21
+ core_provider_name = core_provider(embedding_config.model_name, provider_enum)
22
+
23
+ provider_config = lite_llm_core_config_for_provider(core_provider_name)
24
+ if provider_config is None:
25
+ raise ValueError(
26
+ f"No configuration found for core provider: {core_provider_name.value}. "
27
+ )
28
+
29
+ return LitellmEmbeddingAdapter(
30
+ embedding_config,
31
+ provider_config,
32
+ )