kiln-ai: kiln_ai-0.20.1-py3-none-any.whl → kiln_ai-0.22.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kiln-ai might be problematic. Click here for more details.

Files changed (133)
  1. kiln_ai/adapters/__init__.py +6 -0
  2. kiln_ai/adapters/adapter_registry.py +43 -226
  3. kiln_ai/adapters/chunkers/__init__.py +13 -0
  4. kiln_ai/adapters/chunkers/base_chunker.py +42 -0
  5. kiln_ai/adapters/chunkers/chunker_registry.py +16 -0
  6. kiln_ai/adapters/chunkers/fixed_window_chunker.py +39 -0
  7. kiln_ai/adapters/chunkers/helpers.py +23 -0
  8. kiln_ai/adapters/chunkers/test_base_chunker.py +63 -0
  9. kiln_ai/adapters/chunkers/test_chunker_registry.py +28 -0
  10. kiln_ai/adapters/chunkers/test_fixed_window_chunker.py +346 -0
  11. kiln_ai/adapters/chunkers/test_helpers.py +75 -0
  12. kiln_ai/adapters/data_gen/test_data_gen_task.py +9 -3
  13. kiln_ai/adapters/embedding/__init__.py +0 -0
  14. kiln_ai/adapters/embedding/base_embedding_adapter.py +44 -0
  15. kiln_ai/adapters/embedding/embedding_registry.py +32 -0
  16. kiln_ai/adapters/embedding/litellm_embedding_adapter.py +199 -0
  17. kiln_ai/adapters/embedding/test_base_embedding_adapter.py +283 -0
  18. kiln_ai/adapters/embedding/test_embedding_registry.py +166 -0
  19. kiln_ai/adapters/embedding/test_litellm_embedding_adapter.py +1149 -0
  20. kiln_ai/adapters/eval/eval_runner.py +6 -2
  21. kiln_ai/adapters/eval/test_base_eval.py +1 -3
  22. kiln_ai/adapters/eval/test_g_eval.py +1 -1
  23. kiln_ai/adapters/extractors/__init__.py +18 -0
  24. kiln_ai/adapters/extractors/base_extractor.py +72 -0
  25. kiln_ai/adapters/extractors/encoding.py +20 -0
  26. kiln_ai/adapters/extractors/extractor_registry.py +44 -0
  27. kiln_ai/adapters/extractors/extractor_runner.py +112 -0
  28. kiln_ai/adapters/extractors/litellm_extractor.py +406 -0
  29. kiln_ai/adapters/extractors/test_base_extractor.py +244 -0
  30. kiln_ai/adapters/extractors/test_encoding.py +54 -0
  31. kiln_ai/adapters/extractors/test_extractor_registry.py +181 -0
  32. kiln_ai/adapters/extractors/test_extractor_runner.py +181 -0
  33. kiln_ai/adapters/extractors/test_litellm_extractor.py +1290 -0
  34. kiln_ai/adapters/fine_tune/test_dataset_formatter.py +2 -2
  35. kiln_ai/adapters/fine_tune/test_fireworks_tinetune.py +2 -6
  36. kiln_ai/adapters/fine_tune/test_together_finetune.py +2 -6
  37. kiln_ai/adapters/ml_embedding_model_list.py +494 -0
  38. kiln_ai/adapters/ml_model_list.py +876 -18
  39. kiln_ai/adapters/model_adapters/litellm_adapter.py +40 -75
  40. kiln_ai/adapters/model_adapters/test_litellm_adapter.py +79 -1
  41. kiln_ai/adapters/model_adapters/test_litellm_adapter_tools.py +119 -5
  42. kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +9 -3
  43. kiln_ai/adapters/model_adapters/test_structured_output.py +9 -10
  44. kiln_ai/adapters/ollama_tools.py +69 -12
  45. kiln_ai/adapters/provider_tools.py +190 -46
  46. kiln_ai/adapters/rag/deduplication.py +49 -0
  47. kiln_ai/adapters/rag/progress.py +252 -0
  48. kiln_ai/adapters/rag/rag_runners.py +844 -0
  49. kiln_ai/adapters/rag/test_deduplication.py +195 -0
  50. kiln_ai/adapters/rag/test_progress.py +785 -0
  51. kiln_ai/adapters/rag/test_rag_runners.py +2376 -0
  52. kiln_ai/adapters/remote_config.py +80 -8
  53. kiln_ai/adapters/test_adapter_registry.py +579 -86
  54. kiln_ai/adapters/test_ml_embedding_model_list.py +239 -0
  55. kiln_ai/adapters/test_ml_model_list.py +202 -0
  56. kiln_ai/adapters/test_ollama_tools.py +340 -1
  57. kiln_ai/adapters/test_prompt_builders.py +1 -1
  58. kiln_ai/adapters/test_provider_tools.py +199 -8
  59. kiln_ai/adapters/test_remote_config.py +551 -56
  60. kiln_ai/adapters/vector_store/__init__.py +1 -0
  61. kiln_ai/adapters/vector_store/base_vector_store_adapter.py +83 -0
  62. kiln_ai/adapters/vector_store/lancedb_adapter.py +389 -0
  63. kiln_ai/adapters/vector_store/test_base_vector_store.py +160 -0
  64. kiln_ai/adapters/vector_store/test_lancedb_adapter.py +1841 -0
  65. kiln_ai/adapters/vector_store/test_vector_store_registry.py +199 -0
  66. kiln_ai/adapters/vector_store/vector_store_registry.py +33 -0
  67. kiln_ai/datamodel/__init__.py +16 -13
  68. kiln_ai/datamodel/basemodel.py +201 -4
  69. kiln_ai/datamodel/chunk.py +158 -0
  70. kiln_ai/datamodel/datamodel_enums.py +27 -0
  71. kiln_ai/datamodel/embedding.py +64 -0
  72. kiln_ai/datamodel/external_tool_server.py +206 -54
  73. kiln_ai/datamodel/extraction.py +317 -0
  74. kiln_ai/datamodel/project.py +33 -1
  75. kiln_ai/datamodel/rag.py +79 -0
  76. kiln_ai/datamodel/task.py +5 -0
  77. kiln_ai/datamodel/task_output.py +41 -11
  78. kiln_ai/datamodel/test_attachment.py +649 -0
  79. kiln_ai/datamodel/test_basemodel.py +270 -14
  80. kiln_ai/datamodel/test_chunk_models.py +317 -0
  81. kiln_ai/datamodel/test_dataset_split.py +1 -1
  82. kiln_ai/datamodel/test_datasource.py +50 -0
  83. kiln_ai/datamodel/test_embedding_models.py +448 -0
  84. kiln_ai/datamodel/test_eval_model.py +6 -6
  85. kiln_ai/datamodel/test_external_tool_server.py +534 -152
  86. kiln_ai/datamodel/test_extraction_chunk.py +206 -0
  87. kiln_ai/datamodel/test_extraction_model.py +501 -0
  88. kiln_ai/datamodel/test_rag.py +641 -0
  89. kiln_ai/datamodel/test_task.py +35 -1
  90. kiln_ai/datamodel/test_tool_id.py +187 -1
  91. kiln_ai/datamodel/test_vector_store.py +320 -0
  92. kiln_ai/datamodel/tool_id.py +58 -0
  93. kiln_ai/datamodel/vector_store.py +141 -0
  94. kiln_ai/tools/base_tool.py +12 -3
  95. kiln_ai/tools/built_in_tools/math_tools.py +12 -4
  96. kiln_ai/tools/kiln_task_tool.py +158 -0
  97. kiln_ai/tools/mcp_server_tool.py +2 -2
  98. kiln_ai/tools/mcp_session_manager.py +51 -22
  99. kiln_ai/tools/rag_tools.py +164 -0
  100. kiln_ai/tools/test_kiln_task_tool.py +527 -0
  101. kiln_ai/tools/test_mcp_server_tool.py +4 -15
  102. kiln_ai/tools/test_mcp_session_manager.py +187 -227
  103. kiln_ai/tools/test_rag_tools.py +929 -0
  104. kiln_ai/tools/test_tool_registry.py +290 -7
  105. kiln_ai/tools/tool_registry.py +69 -16
  106. kiln_ai/utils/__init__.py +3 -0
  107. kiln_ai/utils/async_job_runner.py +62 -17
  108. kiln_ai/utils/config.py +2 -2
  109. kiln_ai/utils/env.py +15 -0
  110. kiln_ai/utils/filesystem.py +14 -0
  111. kiln_ai/utils/filesystem_cache.py +60 -0
  112. kiln_ai/utils/litellm.py +94 -0
  113. kiln_ai/utils/lock.py +100 -0
  114. kiln_ai/utils/mime_type.py +38 -0
  115. kiln_ai/utils/open_ai_types.py +19 -2
  116. kiln_ai/utils/pdf_utils.py +59 -0
  117. kiln_ai/utils/test_async_job_runner.py +151 -35
  118. kiln_ai/utils/test_env.py +142 -0
  119. kiln_ai/utils/test_filesystem_cache.py +316 -0
  120. kiln_ai/utils/test_litellm.py +206 -0
  121. kiln_ai/utils/test_lock.py +185 -0
  122. kiln_ai/utils/test_mime_type.py +66 -0
  123. kiln_ai/utils/test_open_ai_types.py +88 -12
  124. kiln_ai/utils/test_pdf_utils.py +86 -0
  125. kiln_ai/utils/test_uuid.py +111 -0
  126. kiln_ai/utils/test_validation.py +524 -0
  127. kiln_ai/utils/uuid.py +9 -0
  128. kiln_ai/utils/validation.py +90 -0
  129. {kiln_ai-0.20.1.dist-info → kiln_ai-0.22.0.dist-info}/METADATA +9 -1
  130. kiln_ai-0.22.0.dist-info/RECORD +213 -0
  131. kiln_ai-0.20.1.dist-info/RECORD +0 -138
  132. {kiln_ai-0.20.1.dist-info → kiln_ai-0.22.0.dist-info}/WHEEL +0 -0
  133. {kiln_ai-0.20.1.dist-info → kiln_ai-0.22.0.dist-info}/licenses/LICENSE.txt +0 -0
@@ -18,9 +18,12 @@ The eval submodule contains the code for evaluating the performance of a model.
18
18
 
19
19
  from . import (
20
20
  chat,
21
+ chunkers,
21
22
  data_gen,
22
23
  eval,
24
+ extractors,
23
25
  fine_tune,
26
+ ml_embedding_model_list,
24
27
  ml_model_list,
25
28
  model_adapters,
26
29
  prompt_builders,
@@ -29,9 +32,12 @@ from . import (
29
32
 
30
33
  __all__ = [
31
34
  "chat",
35
+ "chunkers",
32
36
  "data_gen",
33
37
  "eval",
38
+ "extractors",
34
39
  "fine_tune",
40
+ "ml_embedding_model_list",
35
41
  "ml_model_list",
36
42
  "model_adapters",
37
43
  "prompt_builders",
@@ -1,5 +1,3 @@
1
- from os import getenv
2
-
3
1
  from kiln_ai import datamodel
4
2
  from kiln_ai.adapters.ml_model_list import ModelProviderName
5
3
  from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig, BaseAdapter
@@ -9,11 +7,47 @@ from kiln_ai.adapters.model_adapters.litellm_adapter import (
9
7
  )
10
8
  from kiln_ai.adapters.provider_tools import (
11
9
  core_provider,
12
- lite_llm_config_for_openai_compatible,
10
+ lite_llm_core_config_for_provider,
13
11
  )
14
12
  from kiln_ai.datamodel.task import RunConfigProperties
15
- from kiln_ai.utils.config import Config
16
- from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error
13
+
14
+
15
+ def litellm_core_provider_config(
16
+ run_config_properties: RunConfigProperties,
17
+ ) -> LiteLlmConfig:
18
+ # For things like the fine-tune provider, we want to run the underlying provider (e.g. openai)
19
+ core_provider_name = core_provider(
20
+ run_config_properties.model_name, run_config_properties.model_provider_name
21
+ )
22
+
23
+ # For OpenAI compatible providers, we want to retrieve the underlying provider and update the run config properties to match
24
+ openai_compatible_provider_name = None
25
+ if run_config_properties.model_provider_name == ModelProviderName.openai_compatible:
26
+ model_id = run_config_properties.model_name
27
+ try:
28
+ openai_compatible_provider_name, model_id = model_id.split("::")
29
+ except Exception:
30
+ raise ValueError(f"Invalid openai compatible model ID: {model_id}")
31
+
32
+ # Update a copy of the run config properties to use the openai compatible provider
33
+ updated_run_config_properties = run_config_properties.model_copy(deep=True)
34
+ updated_run_config_properties.model_name = model_id
35
+ run_config_properties = updated_run_config_properties
36
+
37
+ config = lite_llm_core_config_for_provider(
38
+ core_provider_name, openai_compatible_provider_name
39
+ )
40
+ if config is None:
41
+ raise ValueError(
42
+ "Fine tune or custom openai compatible provider is not a core provider. The underlying provider should be used when requesting the adapter litellm config instead."
43
+ )
44
+
45
+ return LiteLlmConfig(
46
+ run_config_properties=run_config_properties,
47
+ base_url=config.base_url,
48
+ default_headers=config.default_headers,
49
+ additional_body_options=config.additional_body_options or {},
50
+ )
17
51
 
18
52
 
19
53
  def adapter_for_task(
@@ -21,225 +55,8 @@ def adapter_for_task(
21
55
  run_config_properties: RunConfigProperties,
22
56
  base_adapter_config: AdapterConfig | None = None,
23
57
  ) -> BaseAdapter:
24
- # Get the provider to run. For things like the fine-tune provider, we want to run the underlying provider
25
- core_provider_name = core_provider(
26
- run_config_properties.model_name, run_config_properties.model_provider_name
58
+ return LiteLlmAdapter(
59
+ kiln_task=kiln_task,
60
+ config=litellm_core_provider_config(run_config_properties),
61
+ base_adapter_config=base_adapter_config,
27
62
  )
28
-
29
- match core_provider_name:
30
- case ModelProviderName.openrouter:
31
- return LiteLlmAdapter(
32
- kiln_task=kiln_task,
33
- config=LiteLlmConfig(
34
- run_config_properties=run_config_properties,
35
- base_url=getenv("OPENROUTER_BASE_URL")
36
- or "https://openrouter.ai/api/v1",
37
- default_headers={
38
- "HTTP-Referer": "https://getkiln.ai/openrouter",
39
- "X-Title": "KilnAI",
40
- },
41
- additional_body_options={
42
- "api_key": Config.shared().open_router_api_key,
43
- },
44
- ),
45
- base_adapter_config=base_adapter_config,
46
- )
47
- case ModelProviderName.siliconflow_cn:
48
- return LiteLlmAdapter(
49
- kiln_task=kiln_task,
50
- config=LiteLlmConfig(
51
- run_config_properties=run_config_properties,
52
- base_url=getenv("SILICONFLOW_BASE_URL")
53
- or "https://api.siliconflow.cn/v1",
54
- default_headers={
55
- "HTTP-Referer": "https://kiln.tech/siliconflow",
56
- "X-Title": "KilnAI",
57
- },
58
- additional_body_options={
59
- "api_key": Config.shared().siliconflow_cn_api_key,
60
- },
61
- ),
62
- base_adapter_config=base_adapter_config,
63
- )
64
- case ModelProviderName.openai:
65
- return LiteLlmAdapter(
66
- kiln_task=kiln_task,
67
- config=LiteLlmConfig(
68
- run_config_properties=run_config_properties,
69
- additional_body_options={
70
- "api_key": Config.shared().open_ai_api_key,
71
- },
72
- ),
73
- base_adapter_config=base_adapter_config,
74
- )
75
- case ModelProviderName.openai_compatible:
76
- config = lite_llm_config_for_openai_compatible(run_config_properties)
77
- return LiteLlmAdapter(
78
- kiln_task=kiln_task,
79
- config=config,
80
- base_adapter_config=base_adapter_config,
81
- )
82
- case ModelProviderName.groq:
83
- return LiteLlmAdapter(
84
- kiln_task=kiln_task,
85
- base_adapter_config=base_adapter_config,
86
- config=LiteLlmConfig(
87
- run_config_properties=run_config_properties,
88
- additional_body_options={
89
- "api_key": Config.shared().groq_api_key,
90
- },
91
- ),
92
- )
93
- case ModelProviderName.amazon_bedrock:
94
- return LiteLlmAdapter(
95
- kiln_task=kiln_task,
96
- base_adapter_config=base_adapter_config,
97
- config=LiteLlmConfig(
98
- run_config_properties=run_config_properties,
99
- additional_body_options={
100
- "aws_access_key_id": Config.shared().bedrock_access_key,
101
- "aws_secret_access_key": Config.shared().bedrock_secret_key,
102
- # The only region that's widely supported for bedrock
103
- "aws_region_name": "us-west-2",
104
- },
105
- ),
106
- )
107
- case ModelProviderName.ollama:
108
- ollama_base_url = (
109
- Config.shared().ollama_base_url or "http://localhost:11434"
110
- )
111
- return LiteLlmAdapter(
112
- kiln_task=kiln_task,
113
- base_adapter_config=base_adapter_config,
114
- config=LiteLlmConfig(
115
- run_config_properties=run_config_properties,
116
- # Set the Ollama base URL for 2 reasons:
117
- # 1. To use the correct base URL
118
- # 2. We use Ollama's OpenAI compatible API (/v1), and don't just let litellm use the Ollama API. We use more advanced features like json_schema.
119
- base_url=ollama_base_url + "/v1",
120
- additional_body_options={
121
- # LiteLLM errors without an api_key, even though Ollama doesn't support one.
122
- "api_key": "NA",
123
- },
124
- ),
125
- )
126
- case ModelProviderName.docker_model_runner:
127
- docker_base_url = (
128
- Config.shared().docker_model_runner_base_url
129
- or "http://localhost:12434/engines/llama.cpp"
130
- )
131
- return LiteLlmAdapter(
132
- kiln_task=kiln_task,
133
- base_adapter_config=base_adapter_config,
134
- config=LiteLlmConfig(
135
- run_config_properties=run_config_properties,
136
- # Docker Model Runner uses OpenAI-compatible API at /v1 endpoint
137
- base_url=docker_base_url + "/v1",
138
- additional_body_options={
139
- # LiteLLM errors without an api_key, even though Docker Model Runner doesn't require one.
140
- "api_key": "DMR",
141
- },
142
- ),
143
- )
144
- case ModelProviderName.fireworks_ai:
145
- return LiteLlmAdapter(
146
- kiln_task=kiln_task,
147
- base_adapter_config=base_adapter_config,
148
- config=LiteLlmConfig(
149
- run_config_properties=run_config_properties,
150
- additional_body_options={
151
- "api_key": Config.shared().fireworks_api_key,
152
- },
153
- ),
154
- )
155
- case ModelProviderName.anthropic:
156
- return LiteLlmAdapter(
157
- kiln_task=kiln_task,
158
- base_adapter_config=base_adapter_config,
159
- config=LiteLlmConfig(
160
- run_config_properties=run_config_properties,
161
- additional_body_options={
162
- "api_key": Config.shared().anthropic_api_key,
163
- },
164
- ),
165
- )
166
- case ModelProviderName.gemini_api:
167
- return LiteLlmAdapter(
168
- kiln_task=kiln_task,
169
- base_adapter_config=base_adapter_config,
170
- config=LiteLlmConfig(
171
- run_config_properties=run_config_properties,
172
- additional_body_options={
173
- "api_key": Config.shared().gemini_api_key,
174
- },
175
- ),
176
- )
177
- case ModelProviderName.vertex:
178
- return LiteLlmAdapter(
179
- kiln_task=kiln_task,
180
- base_adapter_config=base_adapter_config,
181
- config=LiteLlmConfig(
182
- run_config_properties=run_config_properties,
183
- additional_body_options={
184
- "vertex_project": Config.shared().vertex_project_id,
185
- "vertex_location": Config.shared().vertex_location,
186
- },
187
- ),
188
- )
189
- case ModelProviderName.together_ai:
190
- return LiteLlmAdapter(
191
- kiln_task=kiln_task,
192
- base_adapter_config=base_adapter_config,
193
- config=LiteLlmConfig(
194
- run_config_properties=run_config_properties,
195
- additional_body_options={
196
- "api_key": Config.shared().together_api_key,
197
- },
198
- ),
199
- )
200
- case ModelProviderName.azure_openai:
201
- return LiteLlmAdapter(
202
- kiln_task=kiln_task,
203
- base_adapter_config=base_adapter_config,
204
- config=LiteLlmConfig(
205
- base_url=Config.shared().azure_openai_endpoint,
206
- run_config_properties=run_config_properties,
207
- additional_body_options={
208
- "api_key": Config.shared().azure_openai_api_key,
209
- "api_version": "2025-02-01-preview",
210
- },
211
- ),
212
- )
213
- case ModelProviderName.huggingface:
214
- return LiteLlmAdapter(
215
- kiln_task=kiln_task,
216
- base_adapter_config=base_adapter_config,
217
- config=LiteLlmConfig(
218
- run_config_properties=run_config_properties,
219
- additional_body_options={
220
- "api_key": Config.shared().huggingface_api_key,
221
- },
222
- ),
223
- )
224
- case ModelProviderName.cerebras:
225
- return LiteLlmAdapter(
226
- kiln_task=kiln_task,
227
- base_adapter_config=base_adapter_config,
228
- config=LiteLlmConfig(
229
- run_config_properties=run_config_properties,
230
- additional_body_options={
231
- "api_key": Config.shared().cerebras_api_key,
232
- },
233
- ),
234
- )
235
- # These are virtual providers that should have mapped to an actual provider in core_provider
236
- case ModelProviderName.kiln_fine_tune:
237
- raise ValueError(
238
- "Fine tune is not a supported core provider. It should map to an actual provider."
239
- )
240
- case ModelProviderName.kiln_custom_registry:
241
- raise ValueError(
242
- "Custom openai compatible provider is not a supported core provider. It should map to an actual provider."
243
- )
244
- case _:
245
- raise_exhaustive_enum_error(core_provider_name)
@@ -0,0 +1,13 @@
1
+ """
2
+ Chunkers for processing different document types.
3
+
4
+ This package provides a framework for chunking text into smaller chunks.
5
+ """
6
+
7
+ from . import base_chunker, chunker_registry, fixed_window_chunker
8
+
9
+ __all__ = [
10
+ "base_chunker",
11
+ "chunker_registry",
12
+ "fixed_window_chunker",
13
+ ]
@@ -0,0 +1,42 @@
1
+ import logging
2
+ from abc import ABC, abstractmethod
3
+
4
+ from pydantic import BaseModel, Field
5
+
6
+ from kiln_ai.adapters.chunkers.helpers import clean_up_text
7
+ from kiln_ai.datamodel.chunk import ChunkerConfig
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
+ class TextChunk(BaseModel):
13
+ text: str = Field(description="The text of the chunk.")
14
+
15
+
16
+ class ChunkingResult(BaseModel):
17
+ chunks: list[TextChunk] = Field(description="The chunks of the text.")
18
+
19
+
20
+ class BaseChunker(ABC):
21
+ """
22
+ Base class for all chunkers.
23
+
24
+ Should be subclassed by each chunker.
25
+ """
26
+
27
+ def __init__(self, chunker_config: ChunkerConfig):
28
+ self.chunker_config = chunker_config
29
+
30
+ async def chunk(self, text: str) -> ChunkingResult:
31
+ if not text:
32
+ return ChunkingResult(chunks=[])
33
+
34
+ sanitized_text = clean_up_text(text)
35
+ if not sanitized_text:
36
+ return ChunkingResult(chunks=[])
37
+
38
+ return await self._chunk(sanitized_text)
39
+
40
+ @abstractmethod
41
+ async def _chunk(self, text: str) -> ChunkingResult:
42
+ pass
@@ -0,0 +1,16 @@
1
+ from kiln_ai.adapters.chunkers.base_chunker import BaseChunker
2
+ from kiln_ai.adapters.chunkers.fixed_window_chunker import FixedWindowChunker
3
+ from kiln_ai.datamodel.chunk import ChunkerConfig, ChunkerType
4
+ from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error
5
+
6
+
7
+ def chunker_adapter_from_type(
8
+ chunker_type: ChunkerType,
9
+ chunker_config: ChunkerConfig,
10
+ ) -> BaseChunker:
11
+ match chunker_type:
12
+ case ChunkerType.FIXED_WINDOW:
13
+ return FixedWindowChunker(chunker_config)
14
+ case _:
15
+ # type checking will catch missing cases
16
+ raise_exhaustive_enum_error(chunker_type)
@@ -0,0 +1,39 @@
1
+ from typing import List
2
+
3
+ from llama_index.core.text_splitter import SentenceSplitter
4
+
5
+ from kiln_ai.adapters.chunkers.base_chunker import (
6
+ BaseChunker,
7
+ ChunkingResult,
8
+ TextChunk,
9
+ )
10
+ from kiln_ai.datamodel.chunk import ChunkerConfig, ChunkerType
11
+
12
+
13
+ class FixedWindowChunker(BaseChunker):
14
+ def __init__(self, chunker_config: ChunkerConfig):
15
+ if chunker_config.chunker_type != ChunkerType.FIXED_WINDOW:
16
+ raise ValueError("Chunker type must be FIXED_WINDOW")
17
+
18
+ chunk_size = chunker_config.chunk_size()
19
+ if chunk_size is None:
20
+ raise ValueError("Chunk size must be set")
21
+
22
+ chunk_overlap = chunker_config.chunk_overlap()
23
+ if chunk_overlap is None:
24
+ raise ValueError("Chunk overlap must be set")
25
+
26
+ super().__init__(chunker_config)
27
+ self.splitter = SentenceSplitter(
28
+ chunk_size=chunk_size,
29
+ chunk_overlap=chunk_overlap,
30
+ )
31
+
32
+ async def _chunk(self, text: str) -> ChunkingResult:
33
+ sentences = self.splitter.split_text(text)
34
+
35
+ chunks: List[TextChunk] = []
36
+ for sentence in sentences:
37
+ chunks.append(TextChunk(text=sentence))
38
+
39
+ return ChunkingResult(chunks=chunks)
@@ -0,0 +1,23 @@
1
+ import re
2
+
3
+
4
+ def clean_up_text(text: str) -> str:
5
+ """
6
+ Clean up text by limiting consecutive newlines and consecutive whitespace. Models sometimes send a lot of those.
7
+ It seems to happen more when the transcription is done at low temperature.
8
+
9
+ - Replaces 6+ consecutive newlines with exactly 6 newlines
10
+ - Replaces 50+ consecutive spaces with exactly 50 spaces
11
+ - Leaves 1-5 consecutive newlines unchanged
12
+ - Leaves 1-49 consecutive spaces unchanged
13
+ """
14
+ max_consecutive_newlines = 6
15
+ max_consecutive_whitespace = 50
16
+
17
+ # Replace 6+ consecutive newlines with exactly 6 newlines
18
+ text = re.sub(r"\n{6,}", "\n" * max_consecutive_newlines, text)
19
+
20
+ # Replace 50+ consecutive spaces with exactly 50 spaces
21
+ text = re.sub(r" {50,}", " " * max_consecutive_whitespace, text)
22
+
23
+ return text.strip()
@@ -0,0 +1,63 @@
1
+ from unittest.mock import patch
2
+
3
+ import pytest
4
+
5
+ from kiln_ai.adapters.chunkers.base_chunker import (
6
+ BaseChunker,
7
+ ChunkingResult,
8
+ TextChunk,
9
+ )
10
+ from kiln_ai.adapters.chunkers.helpers import clean_up_text
11
+ from kiln_ai.datamodel.chunk import ChunkerConfig, ChunkerType
12
+
13
+
14
+ @pytest.fixture
15
+ def config() -> ChunkerConfig:
16
+ return ChunkerConfig(
17
+ name="test-chunker",
18
+ chunker_type=ChunkerType.FIXED_WINDOW,
19
+ properties={"chunk_size": 100, "chunk_overlap": 10},
20
+ )
21
+
22
+
23
+ class WhitespaceChunker(BaseChunker):
24
+ async def _chunk(self, text: str) -> ChunkingResult:
25
+ return ChunkingResult(chunks=[TextChunk(text=chunk) for chunk in text.split()])
26
+
27
+
28
+ @pytest.fixture
29
+ def chunker(config: ChunkerConfig) -> WhitespaceChunker:
30
+ return WhitespaceChunker(config)
31
+
32
+
33
+ async def test_base_chunker_chunk_empty_text(chunker: WhitespaceChunker):
34
+ assert await chunker.chunk("") == ChunkingResult(chunks=[])
35
+
36
+
37
+ async def test_base_chunker_concrete_chunker(chunker: WhitespaceChunker):
38
+ output = await chunker.chunk("Hello, world!")
39
+ assert len(output.chunks) == 2
40
+
41
+
42
+ async def test_base_chunker_calls_clean_up_text(chunker: WhitespaceChunker):
43
+ with patch(
44
+ "kiln_ai.adapters.chunkers.base_chunker.clean_up_text"
45
+ ) as mock_clean_up_text:
46
+ mock_clean_up_text.side_effect = clean_up_text
47
+ await chunker.chunk("Hello, world!")
48
+ mock_clean_up_text.assert_called_once_with("Hello, world!")
49
+
50
+
51
+ async def test_base_chunker_empty_text(chunker: WhitespaceChunker):
52
+ chunks = await chunker.chunk("")
53
+ assert chunks == ChunkingResult(chunks=[])
54
+
55
+
56
+ async def test_base_chunker_empty_text_after_clean_up(chunker: WhitespaceChunker):
57
+ with patch(
58
+ "kiln_ai.adapters.chunkers.base_chunker.clean_up_text"
59
+ ) as mock_clean_up_text:
60
+ mock_clean_up_text.side_effect = clean_up_text
61
+ chunks = await chunker.chunk("\n\n ")
62
+ mock_clean_up_text.assert_called_once_with("\n\n ")
63
+ assert chunks == ChunkingResult(chunks=[])
@@ -0,0 +1,28 @@
1
+ import pytest
2
+
3
+ from kiln_ai.adapters.chunkers.chunker_registry import chunker_adapter_from_type
4
+ from kiln_ai.adapters.chunkers.fixed_window_chunker import FixedWindowChunker
5
+ from kiln_ai.datamodel.chunk import ChunkerConfig, ChunkerType
6
+
7
+
8
+ def test_chunker_adapter_from_type():
9
+ chunker = chunker_adapter_from_type(
10
+ ChunkerType.FIXED_WINDOW,
11
+ ChunkerConfig(
12
+ name="test-chunker",
13
+ chunker_type=ChunkerType.FIXED_WINDOW,
14
+ properties={
15
+ # do not use these values in production!
16
+ "chunk_size": 5555,
17
+ "chunk_overlap": 1111,
18
+ },
19
+ ),
20
+ )
21
+ assert isinstance(chunker, FixedWindowChunker)
22
+ assert chunker.chunker_config.chunk_size() == 5555
23
+ assert chunker.chunker_config.chunk_overlap() == 1111
24
+
25
+
26
+ def test_chunker_adapter_from_type_invalid():
27
+ with pytest.raises(ValueError):
28
+ chunker_adapter_from_type("invalid-type", {})