kiln-ai 0.19.0__py3-none-any.whl → 0.21.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kiln-ai might be problematic. Click here for more details.

Files changed (158) hide show
  1. kiln_ai/adapters/__init__.py +8 -2
  2. kiln_ai/adapters/adapter_registry.py +43 -208
  3. kiln_ai/adapters/chat/chat_formatter.py +8 -12
  4. kiln_ai/adapters/chat/test_chat_formatter.py +6 -2
  5. kiln_ai/adapters/chunkers/__init__.py +13 -0
  6. kiln_ai/adapters/chunkers/base_chunker.py +42 -0
  7. kiln_ai/adapters/chunkers/chunker_registry.py +16 -0
  8. kiln_ai/adapters/chunkers/fixed_window_chunker.py +39 -0
  9. kiln_ai/adapters/chunkers/helpers.py +23 -0
  10. kiln_ai/adapters/chunkers/test_base_chunker.py +63 -0
  11. kiln_ai/adapters/chunkers/test_chunker_registry.py +28 -0
  12. kiln_ai/adapters/chunkers/test_fixed_window_chunker.py +346 -0
  13. kiln_ai/adapters/chunkers/test_helpers.py +75 -0
  14. kiln_ai/adapters/data_gen/test_data_gen_task.py +9 -3
  15. kiln_ai/adapters/docker_model_runner_tools.py +119 -0
  16. kiln_ai/adapters/embedding/__init__.py +0 -0
  17. kiln_ai/adapters/embedding/base_embedding_adapter.py +44 -0
  18. kiln_ai/adapters/embedding/embedding_registry.py +32 -0
  19. kiln_ai/adapters/embedding/litellm_embedding_adapter.py +199 -0
  20. kiln_ai/adapters/embedding/test_base_embedding_adapter.py +283 -0
  21. kiln_ai/adapters/embedding/test_embedding_registry.py +166 -0
  22. kiln_ai/adapters/embedding/test_litellm_embedding_adapter.py +1149 -0
  23. kiln_ai/adapters/eval/base_eval.py +2 -2
  24. kiln_ai/adapters/eval/eval_runner.py +9 -3
  25. kiln_ai/adapters/eval/g_eval.py +2 -2
  26. kiln_ai/adapters/eval/test_base_eval.py +2 -4
  27. kiln_ai/adapters/eval/test_g_eval.py +4 -5
  28. kiln_ai/adapters/extractors/__init__.py +18 -0
  29. kiln_ai/adapters/extractors/base_extractor.py +72 -0
  30. kiln_ai/adapters/extractors/encoding.py +20 -0
  31. kiln_ai/adapters/extractors/extractor_registry.py +44 -0
  32. kiln_ai/adapters/extractors/extractor_runner.py +112 -0
  33. kiln_ai/adapters/extractors/litellm_extractor.py +386 -0
  34. kiln_ai/adapters/extractors/test_base_extractor.py +244 -0
  35. kiln_ai/adapters/extractors/test_encoding.py +54 -0
  36. kiln_ai/adapters/extractors/test_extractor_registry.py +181 -0
  37. kiln_ai/adapters/extractors/test_extractor_runner.py +181 -0
  38. kiln_ai/adapters/extractors/test_litellm_extractor.py +1192 -0
  39. kiln_ai/adapters/fine_tune/__init__.py +1 -1
  40. kiln_ai/adapters/fine_tune/openai_finetune.py +14 -4
  41. kiln_ai/adapters/fine_tune/test_dataset_formatter.py +2 -2
  42. kiln_ai/adapters/fine_tune/test_fireworks_tinetune.py +2 -6
  43. kiln_ai/adapters/fine_tune/test_openai_finetune.py +108 -111
  44. kiln_ai/adapters/fine_tune/test_together_finetune.py +2 -6
  45. kiln_ai/adapters/ml_embedding_model_list.py +192 -0
  46. kiln_ai/adapters/ml_model_list.py +761 -37
  47. kiln_ai/adapters/model_adapters/base_adapter.py +51 -21
  48. kiln_ai/adapters/model_adapters/litellm_adapter.py +380 -138
  49. kiln_ai/adapters/model_adapters/test_base_adapter.py +193 -17
  50. kiln_ai/adapters/model_adapters/test_litellm_adapter.py +407 -2
  51. kiln_ai/adapters/model_adapters/test_litellm_adapter_tools.py +1103 -0
  52. kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +5 -5
  53. kiln_ai/adapters/model_adapters/test_structured_output.py +113 -5
  54. kiln_ai/adapters/ollama_tools.py +69 -12
  55. kiln_ai/adapters/parsers/__init__.py +1 -1
  56. kiln_ai/adapters/provider_tools.py +205 -47
  57. kiln_ai/adapters/rag/deduplication.py +49 -0
  58. kiln_ai/adapters/rag/progress.py +252 -0
  59. kiln_ai/adapters/rag/rag_runners.py +844 -0
  60. kiln_ai/adapters/rag/test_deduplication.py +195 -0
  61. kiln_ai/adapters/rag/test_progress.py +785 -0
  62. kiln_ai/adapters/rag/test_rag_runners.py +2376 -0
  63. kiln_ai/adapters/remote_config.py +80 -8
  64. kiln_ai/adapters/repair/test_repair_task.py +12 -9
  65. kiln_ai/adapters/run_output.py +3 -0
  66. kiln_ai/adapters/test_adapter_registry.py +657 -85
  67. kiln_ai/adapters/test_docker_model_runner_tools.py +305 -0
  68. kiln_ai/adapters/test_ml_embedding_model_list.py +429 -0
  69. kiln_ai/adapters/test_ml_model_list.py +251 -1
  70. kiln_ai/adapters/test_ollama_tools.py +340 -1
  71. kiln_ai/adapters/test_prompt_adaptors.py +13 -6
  72. kiln_ai/adapters/test_prompt_builders.py +1 -1
  73. kiln_ai/adapters/test_provider_tools.py +254 -8
  74. kiln_ai/adapters/test_remote_config.py +651 -58
  75. kiln_ai/adapters/vector_store/__init__.py +1 -0
  76. kiln_ai/adapters/vector_store/base_vector_store_adapter.py +83 -0
  77. kiln_ai/adapters/vector_store/lancedb_adapter.py +389 -0
  78. kiln_ai/adapters/vector_store/test_base_vector_store.py +160 -0
  79. kiln_ai/adapters/vector_store/test_lancedb_adapter.py +1841 -0
  80. kiln_ai/adapters/vector_store/test_vector_store_registry.py +199 -0
  81. kiln_ai/adapters/vector_store/vector_store_registry.py +33 -0
  82. kiln_ai/datamodel/__init__.py +39 -34
  83. kiln_ai/datamodel/basemodel.py +170 -1
  84. kiln_ai/datamodel/chunk.py +158 -0
  85. kiln_ai/datamodel/datamodel_enums.py +28 -0
  86. kiln_ai/datamodel/embedding.py +64 -0
  87. kiln_ai/datamodel/eval.py +1 -1
  88. kiln_ai/datamodel/external_tool_server.py +298 -0
  89. kiln_ai/datamodel/extraction.py +303 -0
  90. kiln_ai/datamodel/json_schema.py +25 -10
  91. kiln_ai/datamodel/project.py +40 -1
  92. kiln_ai/datamodel/rag.py +79 -0
  93. kiln_ai/datamodel/registry.py +0 -15
  94. kiln_ai/datamodel/run_config.py +62 -0
  95. kiln_ai/datamodel/task.py +2 -77
  96. kiln_ai/datamodel/task_output.py +6 -1
  97. kiln_ai/datamodel/task_run.py +41 -0
  98. kiln_ai/datamodel/test_attachment.py +649 -0
  99. kiln_ai/datamodel/test_basemodel.py +4 -4
  100. kiln_ai/datamodel/test_chunk_models.py +317 -0
  101. kiln_ai/datamodel/test_dataset_split.py +1 -1
  102. kiln_ai/datamodel/test_embedding_models.py +448 -0
  103. kiln_ai/datamodel/test_eval_model.py +6 -6
  104. kiln_ai/datamodel/test_example_models.py +175 -0
  105. kiln_ai/datamodel/test_external_tool_server.py +691 -0
  106. kiln_ai/datamodel/test_extraction_chunk.py +206 -0
  107. kiln_ai/datamodel/test_extraction_model.py +470 -0
  108. kiln_ai/datamodel/test_rag.py +641 -0
  109. kiln_ai/datamodel/test_registry.py +8 -3
  110. kiln_ai/datamodel/test_task.py +15 -47
  111. kiln_ai/datamodel/test_tool_id.py +320 -0
  112. kiln_ai/datamodel/test_vector_store.py +320 -0
  113. kiln_ai/datamodel/tool_id.py +105 -0
  114. kiln_ai/datamodel/vector_store.py +141 -0
  115. kiln_ai/tools/__init__.py +8 -0
  116. kiln_ai/tools/base_tool.py +82 -0
  117. kiln_ai/tools/built_in_tools/__init__.py +13 -0
  118. kiln_ai/tools/built_in_tools/math_tools.py +124 -0
  119. kiln_ai/tools/built_in_tools/test_math_tools.py +204 -0
  120. kiln_ai/tools/mcp_server_tool.py +95 -0
  121. kiln_ai/tools/mcp_session_manager.py +246 -0
  122. kiln_ai/tools/rag_tools.py +157 -0
  123. kiln_ai/tools/test_base_tools.py +199 -0
  124. kiln_ai/tools/test_mcp_server_tool.py +457 -0
  125. kiln_ai/tools/test_mcp_session_manager.py +1585 -0
  126. kiln_ai/tools/test_rag_tools.py +848 -0
  127. kiln_ai/tools/test_tool_registry.py +562 -0
  128. kiln_ai/tools/tool_registry.py +85 -0
  129. kiln_ai/utils/__init__.py +3 -0
  130. kiln_ai/utils/async_job_runner.py +62 -17
  131. kiln_ai/utils/config.py +24 -2
  132. kiln_ai/utils/env.py +15 -0
  133. kiln_ai/utils/filesystem.py +14 -0
  134. kiln_ai/utils/filesystem_cache.py +60 -0
  135. kiln_ai/utils/litellm.py +94 -0
  136. kiln_ai/utils/lock.py +100 -0
  137. kiln_ai/utils/mime_type.py +38 -0
  138. kiln_ai/utils/open_ai_types.py +94 -0
  139. kiln_ai/utils/pdf_utils.py +38 -0
  140. kiln_ai/utils/project_utils.py +17 -0
  141. kiln_ai/utils/test_async_job_runner.py +151 -35
  142. kiln_ai/utils/test_config.py +138 -1
  143. kiln_ai/utils/test_env.py +142 -0
  144. kiln_ai/utils/test_filesystem_cache.py +316 -0
  145. kiln_ai/utils/test_litellm.py +206 -0
  146. kiln_ai/utils/test_lock.py +185 -0
  147. kiln_ai/utils/test_mime_type.py +66 -0
  148. kiln_ai/utils/test_open_ai_types.py +131 -0
  149. kiln_ai/utils/test_pdf_utils.py +73 -0
  150. kiln_ai/utils/test_uuid.py +111 -0
  151. kiln_ai/utils/test_validation.py +524 -0
  152. kiln_ai/utils/uuid.py +9 -0
  153. kiln_ai/utils/validation.py +90 -0
  154. {kiln_ai-0.19.0.dist-info → kiln_ai-0.21.0.dist-info}/METADATA +12 -5
  155. kiln_ai-0.21.0.dist-info/RECORD +211 -0
  156. kiln_ai-0.19.0.dist-info/RECORD +0 -115
  157. {kiln_ai-0.19.0.dist-info → kiln_ai-0.21.0.dist-info}/WHEEL +0 -0
  158. {kiln_ai-0.19.0.dist-info → kiln_ai-0.21.0.dist-info}/licenses/LICENSE.txt +0 -0
@@ -0,0 +1,346 @@
1
+ from typing import Callable
2
+ from unittest.mock import patch
3
+
4
+ import pytest
5
+ from llama_index.core.text_splitter import SentenceSplitter
6
+
7
+ from kiln_ai.adapters.chunkers.base_chunker import ChunkingResult
8
+ from kiln_ai.adapters.chunkers.fixed_window_chunker import FixedWindowChunker
9
+ from kiln_ai.adapters.chunkers.helpers import clean_up_text
10
+ from kiln_ai.datamodel.chunk import ChunkerConfig, ChunkerType
11
+
12
+
13
@pytest.fixture
def mock_fixed_window_chunker_factory() -> Callable[[int, int], FixedWindowChunker]:
    """Fixture: a factory building a FixedWindowChunker for a given size/overlap."""

    def _build(chunk_size: int, chunk_overlap: int) -> FixedWindowChunker:
        # Minimal config carrying only the fields the chunker reads.
        config = ChunkerConfig(
            name="test-chunker",
            chunker_type=ChunkerType.FIXED_WINDOW,
            properties={"chunk_size": chunk_size, "chunk_overlap": chunk_overlap},
        )
        return FixedWindowChunker(config)

    return _build
25
+
26
+
27
async def test_fixed_window_chunker_wrong_chunker_type(
    mock_fixed_window_chunker_factory,
):
    """A chunker type other than FIXED_WINDOW must be rejected with ValueError."""
    props = {"chunk_size": 100, "chunk_overlap": 10}
    with pytest.raises(ValueError):
        # Construction stays inside the raises block: either config validation
        # or the chunker constructor may be the one that raises.
        FixedWindowChunker(
            ChunkerConfig(
                name="test-chunker",
                chunker_type="wrong-chunker-type",  # type: ignore
                properties=props,
            )
        )
38
+
39
+
40
async def test_fixed_window_chunker_chunk_empty_text(
    mock_fixed_window_chunker_factory,
):
    """Empty input short-circuits: empty result, splitter never invoked."""
    chunker = mock_fixed_window_chunker_factory(100, 10)
    with patch.object(SentenceSplitter, "split_text") as mock_split_text:
        result = await chunker.chunk("")
        assert result == ChunkingResult(chunks=[])
        # The underlying splitter must not be called for empty text.
        mock_split_text.assert_not_called()
48
+
49
+
50
@pytest.mark.parametrize(
    "chunk_size,chunk_overlap,expected_chunks",
    [(12, 6, 43), (256, 12, 2), (1024, 64, 1), (2048, 128, 1)],
)
async def test_fixed_window_chunker_concrete_chunker(
    chunk_size, chunk_overlap, expected_chunks, mock_fixed_window_chunker_factory
):
    """
    Sanity-check splitting of markdown-flavored English text. The expected counts
    pin the current chunk boundaries; they are illustrative, not values we
    particularly care about.
    """
    text_to_chunk = """# How Ice Cubes Make Drinks Colder

## Introduction

When you drop an ice cube into a drink, it does more than just float and look refreshing. It changes the thermal state of the liquid in a precise and physically predictable way. While it may seem like a simple act, the science behind how ice cubes make drinks colder is a fascinating interplay of thermodynamics, phase change, and heat transfer.

## The Science of Cooling

### Heat Transfer Basics

At the core of the process is the concept of **heat exchange**. Heat naturally flows from warmer objects to colder ones until thermal equilibrium is reached. When an ice cube, which is at 0°C (32°F), is placed in a drink that is warmer than that, heat begins to flow from the liquid to the ice. This transfer of energy cools the drink while simultaneously warming the ice.

### Latent Heat of Fusion

However, it's not just about the ice warming up. The real magic happens because of **latent heat**—specifically, the heat of fusion. When ice melts, it doesn't instantly become the same temperature as the liquid around it. Instead, it absorbs a significant amount of energy just to change from a solid to a liquid, without its temperature rising. This phase change requires approximately 334 joules per gram of ice, all taken from the drink, which cools as a result."""

    chunker = mock_fixed_window_chunker_factory(chunk_size, chunk_overlap)
    result = await chunker.chunk(text_to_chunk)
    chunk_count = len(result.chunks)
    assert chunk_count == expected_chunks, (
        f"Expected {expected_chunks} chunks, got {chunk_count}. If this is the result of an intentional change to chunk boundaries, please update the expected number of chunks in the test. Note that changes to chunk boundaries can have a downstream impact on retrieval."
    )
82
+
83
+
84
@pytest.mark.parametrize(
    "chunk_size,chunk_overlap,expected_chunks",
    [(12, 6, 120), (256, 12, 4), (1024, 64, 1), (2048, 128, 1)],
)
async def test_fixed_window_chunker_concrete_chunker_zh(
    chunk_size, chunk_overlap, expected_chunks, mock_fixed_window_chunker_factory
):
    """
    Sanity-check splitting of Chinese text. The expected counts pin the current
    chunk boundaries; they are illustrative, not values we particularly care about.
    """
    text_to_chunk = """火山是地表下在岩浆库中的高温岩浆及其有关的气体、碎屑从行星的地壳中喷出而形成的,具有特殊形態的地质结构。

岩石圈由若干板块组成,它们漂浮在地幔的软流层之上,在板块的交界处岩石圈比较破碎,地下岩浆容易在此喷发形成火山。[1] 火山可以分为死火山、休眠火山和活火山。在一段时间内,没有出現喷发事件的活火山叫做睡火山(休眠火山)。另外还有一种泥火山,它在科学上严格来说不属于火山,但是许多社会大众也把它看作是火山的一种类型。

火山爆发可能会造成许多危害,常伴有地震,影响范围不仅在火山爆发附近。其中一个危险是火山灰可能对飞机构成威胁,特别是那些喷气发动机,其中灰尘颗粒可以在高温下熔化; 熔化的颗粒随后粘附到涡轮机叶片并改变它们的形状,从而中断涡轮发动机的操作。大型爆发可能会影响气温,火山灰和硫酸液滴遮挡太阳辐射并冷却地球的低层大气(或对流层); 然而,它们也吸收地球辐射的热量,从而使高层大气(或平流层)变暖。 历史上,火山冬天造成了灾难性的饥荒。

虽然火山喷发会对人类造成危害,但同时它也带来一些好处。例如:可以促进宝石的形成;扩大陆地的面积(夏威夷群岛就是由火山喷发而形成的);作为观光旅游考察景点,推动旅游业,如日本的富士山。[2] 专门研究火山活动的学科称为火山学[3]。
"""  # noqa: RUF001

    chunker = mock_fixed_window_chunker_factory(chunk_size, chunk_overlap)
    result = await chunker.chunk(text_to_chunk)
    chunk_count = len(result.chunks)
    assert chunk_count == expected_chunks, (
        f"Expected {expected_chunks} chunks, got {chunk_count}. If this is the result of an intentional change to chunk boundaries, please update the expected number of chunks in the test. Note that changes to chunk boundaries can have a downstream impact on retrieval."
    )
109
+
110
+
111
@pytest.mark.parametrize(
    "chunk_size,chunk_overlap,expected_chunks",
    [(12, 6, 39), (256, 12, 1), (1024, 64, 1), (2048, 128, 1)],
)
async def test_fixed_window_chunker_concrete_chunker_no_punctuation(
    chunk_size, chunk_overlap, expected_chunks, mock_fixed_window_chunker_factory
):
    """
    The chunker must still split text that carries no punctuation at all.
    The expected counts pin current boundaries and are illustrative only.
    """
    text_to_chunk = """how ice cubes make drinks colder introduction when you drop an ice cube into a drink it does more than just float and look refreshing it changes the thermal state of the liquid in a precise and physically predictable way while it may seem like a simple act the science behind how ice cubes make drinks colder is a fascinating interplay of thermodynamics phase change and heat transfer the science of cooling heat transfer basics at the core of the process is the concept of heat exchange heat naturally flows from warmer objects to colder ones until thermal equilibrium is reached when an ice cube which is at 0c 32f is placed in a drink that is warmer than that heat begins to flow from the liquid to the ice this transfer of energy cools the drink while simultaneously warming the ice latent heat of fusion however its not just about the ice warming up the real magic happens because of latent heat specifically the heat of fusion when ice melts it doesnt instantly become the same temperature as the liquid around it instead it absorbs a significant amount of energy just to change from a solid to a liquid without its temperature rising this phase change requires approximately 334 joules per gram of ice all taken from the drink which cools as a result"""

    chunker = mock_fixed_window_chunker_factory(chunk_size, chunk_overlap)
    result = await chunker.chunk(text_to_chunk)
    chunk_count = len(result.chunks)
    assert chunk_count == expected_chunks, (
        f"Expected {expected_chunks} chunks, got {chunk_count}. If this is the result of an intentional change to chunk boundaries, please update the expected number of chunks in the test. Note that changes to chunk boundaries can have a downstream impact on retrieval."
    )
129
+
130
+
131
@pytest.mark.parametrize(
    "chunk_size,chunk_overlap,expected_chunks",
    [(12, 6, 106), (256, 12, 3), (1024, 64, 1), (2048, 128, 1)],
)
async def test_fixed_window_chunker_concrete_chunker_no_punctuation_zh(
    chunk_size, chunk_overlap, expected_chunks, mock_fixed_window_chunker_factory
):
    """
    The chunker must still split Chinese text that carries no punctuation.
    The expected counts pin current boundaries and are illustrative only.
    """
    text_to_chunk = "火山是地表下在岩浆库中的高温岩浆及其有关的气体碎屑从行星的地壳中喷出而形成的具有特殊形態的地质结构岩石圈由若干板块组成它们漂浮在地幔的软流层之上在板块的交界处岩石圈比较破碎地下岩浆容易在此喷发形成火山火山可以分为死火山休眠火山和活火山在一段时间内没有出現喷发事件的活火山叫做睡火山休眠火山另外还有一种泥火山它在科学上严格来说不属于火山但是许多社会大众也把它看作是火山的一种类型火山爆发可能会造成许多危害常伴有地震影响范围不仅在火山爆发附近其中一个危险是火山灰可能对飞机构成威胁特别是那些喷气发动机其中灰尘颗粒可以在高温下熔化熔化的颗粒随后粘附到涡轮机叶片并改变它们的形状从而中断涡轮发动机的操作大型爆发可能会影响气温火山灰和硫酸液滴遮挡太阳辐射并冷却地球的低层大气或对流层然而它们也吸收地球辐射的热量从而使高层大气或平流层变暖历史上火山冬天造成了灾难性的饥荒虽然火山喷发会对人类造成危害但同时它也带来一些好处例如可以促进宝石的形成扩大陆地的面积夏威夷群岛就是由火山喷发而形成的作为观光旅游考察景点推动旅游业如日本的富士山专门研究火山活动的学科称为火山学"

    chunker = mock_fixed_window_chunker_factory(chunk_size, chunk_overlap)
    result = await chunker.chunk(text_to_chunk)
    chunk_count = len(result.chunks)
    assert chunk_count == expected_chunks, (
        f"Expected {expected_chunks} chunks, got {chunk_count}. If this is the result of an intentional change to chunk boundaries, please update the expected number of chunks in the test. Note that changes to chunk boundaries can have a downstream impact on retrieval."
    )
148
+
149
+
150
async def test_fixed_window_chunker_preserves_text_content(
    mock_fixed_window_chunker_factory,
):
    """
    Chunks, when stitched back together, must still contain every original sentence.
    """
    chunker = mock_fixed_window_chunker_factory(100, 10)
    source = (
        "This is a test sentence. This is another test sentence. And a third one."
    )

    result = await chunker.chunk(source)
    rejoined = " ".join([c.text for c in result.chunks])

    # Spacing may differ across chunk boundaries, but no content may be lost.
    for fragment in (
        "This is a test sentence",
        "This is another test sentence",
        "And a third one",
    ):
        assert fragment in rejoined
171
+
172
+
173
@pytest.mark.parametrize(
    "text",
    [" ", "\n\n\n", "\t\t\t", " \n\t "],
)
async def test_fixed_window_chunker_handles_whitespace_only(
    mock_fixed_window_chunker_factory,
    text,
):
    """Whitespace-only input must produce no chunks at all."""
    chunker = mock_fixed_window_chunker_factory(100, 10)

    result = await chunker.chunk(text)

    assert not result.chunks
190
+
191
+
192
async def test_fixed_window_chunker_handles_special_characters(
    mock_fixed_window_chunker_factory,
):
    """Emojis, symbols and non-latin unicode must survive chunking intact."""
    chunker = mock_fixed_window_chunker_factory(50, 5)
    text_with_special_chars = (
        "Hello 🌍! This has emojis 🚀 and symbols ©®™. Also unicode: αβγδε."
    )

    result = await chunker.chunk(text_with_special_chars)

    # At least one chunk must come back.
    assert len(result.chunks) > 0

    # Reassemble and verify none of the special characters were mangled.
    rejoined = " ".join(c.text for c in result.chunks)
    for token in ("Hello", "This has emojis", "🌍", "🚀", "©®™", "αβγδε"):
        assert token in rejoined
216
+
217
+
218
async def test_fixed_window_chunker_handles_single_character(
    mock_fixed_window_chunker_factory,
):
    """A one-character input comes back as exactly one chunk holding that character."""
    chunker = mock_fixed_window_chunker_factory(100, 10)

    result = await chunker.chunk("A")

    assert len(result.chunks) == 1
    assert result.chunks[0].text == "A"
229
+
230
+
231
async def test_fixed_window_chunker_handles_single_word(
    mock_fixed_window_chunker_factory,
):
    """A one-word input comes back as exactly one chunk holding that word."""
    chunker = mock_fixed_window_chunker_factory(100, 10)

    result = await chunker.chunk("Hello")

    assert len(result.chunks) == 1
    assert result.chunks[0].text == "Hello"
242
+
243
+
244
async def test_fixed_window_chunker_handles_single_sentence(
    mock_fixed_window_chunker_factory,
):
    """A single short sentence fits one window and comes back unmodified."""
    chunker = mock_fixed_window_chunker_factory(100, 10)

    result = await chunker.chunk("This is a single sentence.")

    assert len(result.chunks) == 1
    assert result.chunks[0].text == "This is a single sentence."
255
+
256
+
257
async def test_fixed_window_chunker_very_large_text(mock_fixed_window_chunker_factory):
    """A large document (1000 repeated sentences) chunks without errors."""
    chunker = mock_fixed_window_chunker_factory(100, 10)

    # Build bulk input by repeating one sentence.
    large_text = "This is a test sentence. " * 1000

    result = await chunker.chunk(large_text)

    # Must split into multiple chunks, each carrying non-whitespace content.
    assert len(result.chunks) > 1
    assert all(c.text.strip() != "" for c in result.chunks)
274
+
275
+
276
@pytest.mark.parametrize(
    "whitespace_length",
    [10_000],
)
async def test_fixed_window_chunker_removes_consecutive_whitespace(
    mock_fixed_window_chunker_factory, whitespace_length
):
    """
    Huge runs of consecutive whitespace used to crash the underlying splitter
    with a rust error; clean_up_text must be applied to collapse them first.
    """
    template = """Water plays an important role in the world economy. Approximately 70% of the fresh water used by humans goes to agriculture.[26] Fishing in salt and fresh water bodies has been, and continues to be, a major source of food for many parts of the world, providing 6.5% of global protein.[27] Much of the long-distance trade of commodities (such as oil, natural gas, and manufactured products) is transported by boats through seas, rivers, lakes, and canals. Large quantities of water, ice, and steam are used for cooling and heating in industry and homes. Water is an excellent solvent for a wide variety of substances, both mineral and organic; as such, it is widely used in industrial processes and in cooking and washing. Water, ice, and snow are also central to many sports and other forms of entertainment, such as swimming, pleasure boating, boat racing, surfing, sport fishing, diving, ice skating, snowboarding, and skiing.
{WHITESPACE_PROBLEM_HERE}
The word water comes from Old English wæter, from Proto-Germanic *watar (source also of Old Saxon watar, Old Frisian wetir, Dutch water, Old High German wazzar, German Wasser, vatn, Gothic 𐍅𐌰𐍄𐍉 (wato)), from Proto-Indo-European *wod-or, suffixed form of root *wed- ('water'; 'wet').[28] Also cognate, through the Indo-European root, with Greek ύδωρ (ýdor; from Ancient Greek ὕδωρ (hýdōr), whence English 'hydro-'), Russian вода́ (vodá), Irish uisce, and Albanian ujë.
"""
    text = template.replace("{WHITESPACE_PROBLEM_HERE}", " " * whitespace_length)

    chunker = mock_fixed_window_chunker_factory(32, 8)

    with patch(
        "kiln_ai.adapters.chunkers.base_chunker.clean_up_text"
    ) as mock_clean_up_text:
        mock_clean_up_text.side_effect = clean_up_text
        output = await chunker.chunk(text)

    mock_clean_up_text.assert_called_once_with(text)
    assert len(output.chunks) > 1
299
+
300
+
301
@pytest.mark.parametrize(
    "whitespace_length",
    [100_000, 1_000_000, 5_000_000, 10_000_000],
)
@pytest.mark.paid
async def test_fixed_window_chunker_removes_consecutive_whitespace_heavy_load(
    mock_fixed_window_chunker_factory, whitespace_length
):
    """
    Heavy-load variant of the consecutive-whitespace test: 100K to 10M whitespace
    characters. Without clean_up_text the splitter crashes with a rust error.
    """
    template = """Water plays an important role in the world economy. Approximately 70% of the fresh water used by humans goes to agriculture.[26] Fishing in salt and fresh water bodies has been, and continues to be, a major source of food for many parts of the world, providing 6.5% of global protein.[27] Much of the long-distance trade of commodities (such as oil, natural gas, and manufactured products) is transported by boats through seas, rivers, lakes, and canals. Large quantities of water, ice, and steam are used for cooling and heating in industry and homes. Water is an excellent solvent for a wide variety of substances, both mineral and organic; as such, it is widely used in industrial processes and in cooking and washing. Water, ice, and snow are also central to many sports and other forms of entertainment, such as swimming, pleasure boating, boat racing, surfing, sport fishing, diving, ice skating, snowboarding, and skiing.
{WHITESPACE_PROBLEM_HERE}
The word water comes from Old English wæter, from Proto-Germanic *watar (source also of Old Saxon watar, Old Frisian wetir, Dutch water, Old High German wazzar, German Wasser, vatn, Gothic 𐍅𐌰𐍄𐍉 (wato)), from Proto-Indo-European *wod-or, suffixed form of root *wed- ('water'; 'wet').[28] Also cognate, through the Indo-European root, with Greek ύδωρ (ýdor; from Ancient Greek ὕδωρ (hýdōr), whence English 'hydro-'), Russian вода́ (vodá), Irish uisce, and Albanian ujë.
"""
    text = template.replace("{WHITESPACE_PROBLEM_HERE}", " " * whitespace_length)

    chunker = mock_fixed_window_chunker_factory(32, 8)

    with patch(
        "kiln_ai.adapters.chunkers.base_chunker.clean_up_text"
    ) as mock_clean_up_text:
        mock_clean_up_text.side_effect = clean_up_text
        output = await chunker.chunk(text)

    mock_clean_up_text.assert_called_once_with(text)
    assert len(output.chunks) > 1
325
+
326
+
327
# this test takes a long time to run
@pytest.mark.paid
@pytest.mark.parametrize(
    "number_of_sentences",
    [10, 100, 1_000, 10_000],
)
async def test_fixed_window_chunker_handle_large_text(
    mock_fixed_window_chunker_factory, number_of_sentences
):
    """Stress test: chunk a very large document built by repeating one paragraph."""
    sentence = """Water plays an important role in the world economy. Approximately 70% of the fresh water used by humans goes to agriculture.[26] Fishing in salt and fresh water bodies has been, and continues to be, a major source of food for many parts of the world, providing 6.5% of global protein.[27] Much of the long-distance trade of commodities (such as oil, natural gas, and manufactured products) is transported by boats through seas, rivers, lakes, and canals. Large quantities of water, ice, and steam are used for cooling and heating in industry and homes. Water is an excellent solvent for a wide variety of substances, both mineral and organic; as such, it is widely used in industrial processes and in cooking and washing. Water, ice, and snow are also central to many sports and other forms of entertainment, such as swimming, pleasure boating, boat racing, surfing, sport fishing, diving, ice skating, snowboarding, and skiing."""
    text = sentence * number_of_sentences

    chunker = mock_fixed_window_chunker_factory(32, 8)

    with patch(
        "kiln_ai.adapters.chunkers.base_chunker.clean_up_text"
    ) as mock_clean_up_text:
        mock_clean_up_text.side_effect = clean_up_text
        output = await chunker.chunk(text)

    mock_clean_up_text.assert_called_once_with(text)
    assert len(output.chunks) > 1
@@ -0,0 +1,75 @@
1
+ import pytest
2
+
3
+ from kiln_ai.adapters.chunkers.helpers import clean_up_text
4
+
5
+
6
def generate_consecutive_char_string(length: int, char: str) -> str:
    """Return a string made of `length` consecutive repetitions of `char`."""
    return "".join([char] * length)
8
+
9
+
10
@pytest.mark.parametrize(
    "text,expected",
    [
        # 1-5 consecutive newlines are below the cap and pass through unchanged.
        ("Hello\nWorld", "Hello\nWorld"),
        ("Hello\n\nWorld", "Hello\n\nWorld"),
        ("Hello\n\n\nWorld", "Hello\n\n\nWorld"),
        ("Hello\n\n\n\nWorld", "Hello\n\n\n\nWorld"),
        ("Hello\n\n\n\n\nWorld", "Hello\n\n\n\n\nWorld"),
        # Runs of 6+ consecutive newlines are collapsed to exactly 6.
        ("Hello\n\n\n\n\n\nWorld", "Hello\n\n\n\n\n\nWorld"),  # exactly 6, unchanged
        ("Hello\n\n\n\n\n\n\nWorld", "Hello\n\n\n\n\n\nWorld"),  # 7 newlines -> 6
        ("Hello\n\n\n\n\n\n\n\nWorld", "Hello\n\n\n\n\n\nWorld"),  # 8 newlines -> 6
        (
            "Hello\n\n\n\n\n\n\n\n\n\nWorld",
            "Hello\n\n\n\n\n\nWorld",
        ),  # 10 newlines -> 6
        # Single spaces should remain unchanged.
        # NOTE(review): the five identical pairs below look like 1-5 consecutive
        # spaces collapsed by the rendering this file was recovered from —
        # confirm against the repository source.
        ("Hello World", "Hello World"),
        ("Hello World", "Hello World"),
        ("Hello World", "Hello World"),
        ("Hello World", "Hello World"),
        ("Hello World", "Hello World"),
        # Runs of 50+ consecutive spaces are collapsed to exactly 50.
        (
            "Hello" + " " * 50 + "World",
            "Hello" + " " * 50 + "World",
        ),  # exactly 50, unchanged
        ("Hello" + " " * 51 + "World", "Hello" + " " * 50 + "World"),  # 51 spaces -> 50
        (
            "Hello" + " " * 100 + "World",
            "Hello" + " " * 50 + "World",
        ),  # 100 spaces -> 50
        # Mixed cases: newline runs and space runs are each capped independently.
        (
            "Hello\n\n\n\n\n\n\nWorld" + " " * 60 + "Test",
            "Hello\n\n\n\n\n\nWorld" + " " * 50 + "Test",
        ),
        (
            "Text\n\n\n\n\n\n\n\n\n\nMore" + " " * 30 + "Text",
            "Text\n\n\n\n\n\nMore" + " " * 30 + "Text",
        ),
    ],
)
def test_clean_up_text(text, expected):
    """clean_up_text caps consecutive newlines at 6 and consecutive spaces at 50."""
    assert clean_up_text(text) == expected
56
+
57
+
58
text = """Water is an inorganic compound with the chemical formula H2O. It is a transparent, tasteless, odorless,[c] and nearly colorless chemical substance. It is the main constituent of Earth's hydrosphere and the fluids of all known living organisms in which it acts as a solvent. Water, being a polar molecule, undergoes strong intermolecular hydrogen bonding which is a large contributor to its physical and chemical properties.[20] It is vital for all known forms of life, despite not providing food energy or being an organic micronutrient. Due to its presence in all organisms, its chemical stability, its worldwide abundance and its strong polarity relative to its small molecular size; water is often referred to as the "universal solvent".[21]"""

# 1000-character runs, far beyond the caps clean_up_text enforces.
long_whitespace_string = generate_consecutive_char_string(1000, " ")
long_newlines_string = generate_consecutive_char_string(1000, "\n")

string_with_whitespace = f"{text}{long_whitespace_string}{text}"
string_with_newlines = f"{text}{long_newlines_string}{text}"


@pytest.mark.parametrize(
    "text,expected",
    [
        # A 1000-space run collapses to exactly 50 spaces.
        (string_with_whitespace, text + " " * 50 + text),
        # A 1000-newline run collapses to exactly 6 newlines.
        (string_with_newlines, text + "\n" * 6 + text),
    ],
)
def test_clean_up_text_large_text(text, expected):
    """clean_up_text handles pathological whitespace runs embedded in real prose."""
    assert clean_up_text(text) == expected
@@ -111,7 +111,9 @@ async def test_data_gen_all_models_providers(
111
111
  _, provider = get_model_and_provider(model_name, provider_name)
112
112
  if not provider.supports_data_gen:
113
113
  # pass if the model doesn't support data gen (testing the support flag is part of this)
114
- return
114
+ pytest.skip(
115
+ f"Skipping {model_name} {provider_name} because it does not support data gen"
116
+ )
115
117
 
116
118
  data_gen_task = DataGenCategoriesTask(gen_type="training", guidance=None)
117
119
  data_gen_input = DataGenCategoriesTaskInput.from_task(base_task, num_subtopics=6)
@@ -257,7 +259,9 @@ async def test_data_gen_sample_all_models_providers(
257
259
  _, provider = get_model_and_provider(model_name, provider_name)
258
260
  if provider is None or not provider.supports_data_gen:
259
261
  # pass if the model doesn't support data gen (testing the support flag is part of this)
260
- return
262
+ pytest.skip(
263
+ f"Skipping {model_name} {provider_name} because it does not support data gen"
264
+ )
261
265
 
262
266
  data_gen_task = DataGenSampleTask(
263
267
  target_task=base_task, gen_type="training", guidance=None
@@ -313,7 +317,9 @@ async def test_data_gen_sample_all_models_providers_with_structured_output(
313
317
  _, provider = get_model_and_provider(model_name, provider_name)
314
318
  if not provider.supports_data_gen:
315
319
  # pass if the model doesn't support data gen (testing the support flag is part of this)
316
- return
320
+ pytest.skip(
321
+ f"Skipping {model_name} {provider_name} because it does not support data gen"
322
+ )
317
323
 
318
324
  data_gen_task = DataGenSampleTask(
319
325
  target_task=task, gen_type="training", guidance=None
@@ -0,0 +1,119 @@
1
import asyncio
from typing import List

import httpx
import openai
from pydantic import BaseModel, Field

from kiln_ai.adapters.ml_model_list import ModelProviderName, built_in_models
from kiln_ai.utils.config import Config
9
+
10
+
11
def docker_model_runner_base_url() -> str:
    """
    Return the base URL for Docker Model Runner API connections.

    Returns:
        The configured base URL when one is set, otherwise the default
        localhost endpoint.
    """
    configured = Config.shared().docker_model_runner_base_url
    # Fall back to the default local endpoint when no URL is configured.
    return configured if configured else "http://localhost:12434/engines/llama.cpp"
23
+
24
+
25
async def docker_model_runner_online() -> bool:
    """
    Check whether the Docker Model Runner service is available and responding.

    Returns:
        True if the /v1/models endpoint answers with a success status,
        False on connection failures or HTTP error responses.
    """
    try:
        base_url = docker_model_runner_base_url()
        # Docker Model Runner uses OpenAI-compatible endpoints
        async with httpx.AsyncClient() as client:
            response = await client.get(f"{base_url}/v1/models", timeout=5.0)
            response.raise_for_status()
    except httpx.HTTPError:
        # httpx.HTTPError is the common base of RequestError (transport
        # failures) and HTTPStatusError (raised by raise_for_status on
        # 4xx/5xx). Catching only RequestError let error responses escape
        # and crash the caller instead of reporting "offline".
        return False
    return True
41
+
42
+
43
class DockerModelRunnerConnection(BaseModel):
    """Connection status and model inventory reported by Docker Model Runner."""

    message: str
    version: str | None = None
    supported_models: List[str]
    untested_models: List[str] = Field(default_factory=list)

    def all_models(self) -> List[str]:
        """Return supported and untested model names as one combined list."""
        return [*self.supported_models, *self.untested_models]
51
+
52
+
53
# Parse the Docker Model Runner /v1/models response
def parse_docker_model_runner_models(
    models: List[openai.types.Model],
) -> DockerModelRunnerConnection | None:
    """
    Partition the models reported by Docker Model Runner into those present in
    Kiln's built-in model list and those that are untested.

    Args:
        models: Model entries returned by the /v1/models endpoint.

    Returns:
        A DockerModelRunnerConnection describing the available models; when no
        models are reported at all, a connection with a guidance message and
        empty model lists.
    """
    # Build the set of model IDs we support for Docker Model Runner from the
    # built-in model list. A set gives O(1) membership tests in the loop below.
    supported_docker_models = {
        provider.model_id
        for model in built_in_models
        for provider in model.providers
        if provider.name == ModelProviderName.docker_model_runner
    }
    # Note: Docker Model Runner aliases will be added when we configure models

    available_supported_models = []
    untested_models = []
    for model in models:
        if model.id in supported_docker_models:
            available_supported_models.append(model.id)
        else:
            untested_models.append(model.id)

    if available_supported_models or untested_models:
        return DockerModelRunnerConnection(
            message="Docker Model Runner connected",
            supported_models=available_supported_models,
            untested_models=untested_models,
        )

    # Reachable only when the endpoint returned zero models.
    return DockerModelRunnerConnection(
        message="Docker Model Runner is running, but no supported models are available. Ensure models like 'ai/llama3.2:3B-Q4_K_M', 'ai/qwen3:8B-Q4_K_M', or 'ai/gemma3n:4B-Q4_K_M' are loaded.",
        supported_models=[],
        untested_models=[],
    )
88
+
89
+
90
async def get_docker_model_runner_connection(
    custom_url: str | None = None,
) -> DockerModelRunnerConnection | None:
    """
    Gets the connection status for Docker Model Runner.

    Args:
        custom_url: Optional custom URL to use instead of the configured one

    Returns:
        The parsed connection info, or None when the service is unreachable
        or errors out.
    """
    try:
        base_url = custom_url or docker_model_runner_base_url()
        # Use OpenAI client to get models list
        client = openai.OpenAI(
            api_key="dummy",  # Docker Model Runner doesn't require API key
            base_url=f"{base_url}/v1",
            max_retries=0,
        )
        # client.models.list() performs a blocking network call; run it in a
        # worker thread so it doesn't stall the event loop this coroutine
        # runs on.
        models_response = await asyncio.to_thread(client.models.list)

    except (openai.APIConnectionError, openai.APIError, httpx.RequestError):
        return None

    return parse_docker_model_runner_models(list(models_response))
113
+
114
+
115
def docker_model_runner_model_installed(
    conn: DockerModelRunnerConnection, model_name: str
) -> bool:
    """Return True when model_name appears in the connection's model lists."""
    return model_name in conn.all_models()
File without changes
@@ -0,0 +1,44 @@
1
+ import logging
2
+ from abc import ABC, abstractmethod
3
+ from typing import List
4
+
5
+ from litellm import Usage
6
+ from pydantic import BaseModel, Field
7
+
8
+ from kiln_ai.datamodel.embedding import EmbeddingConfig
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
class Embedding(BaseModel):
    """A single embedding vector produced for one input text."""

    vector: list[float] = Field(description="The vector of the embedding.")
15
+
16
+
17
class EmbeddingResult(BaseModel):
    """Embeddings for a batch of input texts, with optional usage info."""

    embeddings: list[Embedding] = Field(description="The embeddings of the text.")

    usage: Usage | None = Field(default=None, description="The usage of the embedding.")
21
+
22
+
23
class BaseEmbeddingAdapter(ABC):
    """
    Base class for all embedding adapters.

    Should be subclassed by each embedding adapter.
    """

    def __init__(self, embedding_config: EmbeddingConfig):
        self.embedding_config = embedding_config

    async def generate_embeddings(self, input_texts: List[str]) -> EmbeddingResult:
        """Embed input_texts, short-circuiting to an empty result for empty input."""
        if input_texts:
            return await self._generate_embeddings(input_texts)
        # Nothing to embed — skip the provider call entirely.
        return EmbeddingResult(embeddings=[], usage=None)

    @abstractmethod
    async def _generate_embeddings(self, input_texts: List[str]) -> EmbeddingResult:
        """Adapter-specific embedding implementation; receives non-empty input."""
        pass
@@ -0,0 +1,32 @@
1
+ from kiln_ai.adapters.embedding.base_embedding_adapter import BaseEmbeddingAdapter
2
+ from kiln_ai.adapters.embedding.litellm_embedding_adapter import LitellmEmbeddingAdapter
3
+ from kiln_ai.adapters.provider_tools import (
4
+ core_provider,
5
+ lite_llm_core_config_for_provider,
6
+ )
7
+ from kiln_ai.datamodel.datamodel_enums import ModelProviderName
8
+ from kiln_ai.datamodel.embedding import EmbeddingConfig
9
+
10
+
11
+ def embedding_adapter_from_type(
12
+ embedding_config: EmbeddingConfig,
13
+ ) -> BaseEmbeddingAdapter:
14
+ try:
15
+ provider_enum = ModelProviderName(embedding_config.model_provider_name)
16
+ except ValueError:
17
+ raise ValueError(
18
+ f"Unsupported model provider name: {embedding_config.model_provider_name.value}. "
19
+ )
20
+
21
+ core_provider_name = core_provider(embedding_config.model_name, provider_enum)
22
+
23
+ provider_config = lite_llm_core_config_for_provider(core_provider_name)
24
+ if provider_config is None:
25
+ raise ValueError(
26
+ f"No configuration found for core provider: {core_provider_name.value}. "
27
+ )
28
+
29
+ return LitellmEmbeddingAdapter(
30
+ embedding_config,
31
+ provider_config,
32
+ )