kiln-ai 0.20.1__py3-none-any.whl → 0.22.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kiln-ai might be problematic. Click here for more details.

Files changed (133) hide show
  1. kiln_ai/adapters/__init__.py +6 -0
  2. kiln_ai/adapters/adapter_registry.py +43 -226
  3. kiln_ai/adapters/chunkers/__init__.py +13 -0
  4. kiln_ai/adapters/chunkers/base_chunker.py +42 -0
  5. kiln_ai/adapters/chunkers/chunker_registry.py +16 -0
  6. kiln_ai/adapters/chunkers/fixed_window_chunker.py +39 -0
  7. kiln_ai/adapters/chunkers/helpers.py +23 -0
  8. kiln_ai/adapters/chunkers/test_base_chunker.py +63 -0
  9. kiln_ai/adapters/chunkers/test_chunker_registry.py +28 -0
  10. kiln_ai/adapters/chunkers/test_fixed_window_chunker.py +346 -0
  11. kiln_ai/adapters/chunkers/test_helpers.py +75 -0
  12. kiln_ai/adapters/data_gen/test_data_gen_task.py +9 -3
  13. kiln_ai/adapters/embedding/__init__.py +0 -0
  14. kiln_ai/adapters/embedding/base_embedding_adapter.py +44 -0
  15. kiln_ai/adapters/embedding/embedding_registry.py +32 -0
  16. kiln_ai/adapters/embedding/litellm_embedding_adapter.py +199 -0
  17. kiln_ai/adapters/embedding/test_base_embedding_adapter.py +283 -0
  18. kiln_ai/adapters/embedding/test_embedding_registry.py +166 -0
  19. kiln_ai/adapters/embedding/test_litellm_embedding_adapter.py +1149 -0
  20. kiln_ai/adapters/eval/eval_runner.py +6 -2
  21. kiln_ai/adapters/eval/test_base_eval.py +1 -3
  22. kiln_ai/adapters/eval/test_g_eval.py +1 -1
  23. kiln_ai/adapters/extractors/__init__.py +18 -0
  24. kiln_ai/adapters/extractors/base_extractor.py +72 -0
  25. kiln_ai/adapters/extractors/encoding.py +20 -0
  26. kiln_ai/adapters/extractors/extractor_registry.py +44 -0
  27. kiln_ai/adapters/extractors/extractor_runner.py +112 -0
  28. kiln_ai/adapters/extractors/litellm_extractor.py +406 -0
  29. kiln_ai/adapters/extractors/test_base_extractor.py +244 -0
  30. kiln_ai/adapters/extractors/test_encoding.py +54 -0
  31. kiln_ai/adapters/extractors/test_extractor_registry.py +181 -0
  32. kiln_ai/adapters/extractors/test_extractor_runner.py +181 -0
  33. kiln_ai/adapters/extractors/test_litellm_extractor.py +1290 -0
  34. kiln_ai/adapters/fine_tune/test_dataset_formatter.py +2 -2
  35. kiln_ai/adapters/fine_tune/test_fireworks_tinetune.py +2 -6
  36. kiln_ai/adapters/fine_tune/test_together_finetune.py +2 -6
  37. kiln_ai/adapters/ml_embedding_model_list.py +494 -0
  38. kiln_ai/adapters/ml_model_list.py +876 -18
  39. kiln_ai/adapters/model_adapters/litellm_adapter.py +40 -75
  40. kiln_ai/adapters/model_adapters/test_litellm_adapter.py +79 -1
  41. kiln_ai/adapters/model_adapters/test_litellm_adapter_tools.py +119 -5
  42. kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +9 -3
  43. kiln_ai/adapters/model_adapters/test_structured_output.py +9 -10
  44. kiln_ai/adapters/ollama_tools.py +69 -12
  45. kiln_ai/adapters/provider_tools.py +190 -46
  46. kiln_ai/adapters/rag/deduplication.py +49 -0
  47. kiln_ai/adapters/rag/progress.py +252 -0
  48. kiln_ai/adapters/rag/rag_runners.py +844 -0
  49. kiln_ai/adapters/rag/test_deduplication.py +195 -0
  50. kiln_ai/adapters/rag/test_progress.py +785 -0
  51. kiln_ai/adapters/rag/test_rag_runners.py +2376 -0
  52. kiln_ai/adapters/remote_config.py +80 -8
  53. kiln_ai/adapters/test_adapter_registry.py +579 -86
  54. kiln_ai/adapters/test_ml_embedding_model_list.py +239 -0
  55. kiln_ai/adapters/test_ml_model_list.py +202 -0
  56. kiln_ai/adapters/test_ollama_tools.py +340 -1
  57. kiln_ai/adapters/test_prompt_builders.py +1 -1
  58. kiln_ai/adapters/test_provider_tools.py +199 -8
  59. kiln_ai/adapters/test_remote_config.py +551 -56
  60. kiln_ai/adapters/vector_store/__init__.py +1 -0
  61. kiln_ai/adapters/vector_store/base_vector_store_adapter.py +83 -0
  62. kiln_ai/adapters/vector_store/lancedb_adapter.py +389 -0
  63. kiln_ai/adapters/vector_store/test_base_vector_store.py +160 -0
  64. kiln_ai/adapters/vector_store/test_lancedb_adapter.py +1841 -0
  65. kiln_ai/adapters/vector_store/test_vector_store_registry.py +199 -0
  66. kiln_ai/adapters/vector_store/vector_store_registry.py +33 -0
  67. kiln_ai/datamodel/__init__.py +16 -13
  68. kiln_ai/datamodel/basemodel.py +201 -4
  69. kiln_ai/datamodel/chunk.py +158 -0
  70. kiln_ai/datamodel/datamodel_enums.py +27 -0
  71. kiln_ai/datamodel/embedding.py +64 -0
  72. kiln_ai/datamodel/external_tool_server.py +206 -54
  73. kiln_ai/datamodel/extraction.py +317 -0
  74. kiln_ai/datamodel/project.py +33 -1
  75. kiln_ai/datamodel/rag.py +79 -0
  76. kiln_ai/datamodel/task.py +5 -0
  77. kiln_ai/datamodel/task_output.py +41 -11
  78. kiln_ai/datamodel/test_attachment.py +649 -0
  79. kiln_ai/datamodel/test_basemodel.py +270 -14
  80. kiln_ai/datamodel/test_chunk_models.py +317 -0
  81. kiln_ai/datamodel/test_dataset_split.py +1 -1
  82. kiln_ai/datamodel/test_datasource.py +50 -0
  83. kiln_ai/datamodel/test_embedding_models.py +448 -0
  84. kiln_ai/datamodel/test_eval_model.py +6 -6
  85. kiln_ai/datamodel/test_external_tool_server.py +534 -152
  86. kiln_ai/datamodel/test_extraction_chunk.py +206 -0
  87. kiln_ai/datamodel/test_extraction_model.py +501 -0
  88. kiln_ai/datamodel/test_rag.py +641 -0
  89. kiln_ai/datamodel/test_task.py +35 -1
  90. kiln_ai/datamodel/test_tool_id.py +187 -1
  91. kiln_ai/datamodel/test_vector_store.py +320 -0
  92. kiln_ai/datamodel/tool_id.py +58 -0
  93. kiln_ai/datamodel/vector_store.py +141 -0
  94. kiln_ai/tools/base_tool.py +12 -3
  95. kiln_ai/tools/built_in_tools/math_tools.py +12 -4
  96. kiln_ai/tools/kiln_task_tool.py +158 -0
  97. kiln_ai/tools/mcp_server_tool.py +2 -2
  98. kiln_ai/tools/mcp_session_manager.py +51 -22
  99. kiln_ai/tools/rag_tools.py +164 -0
  100. kiln_ai/tools/test_kiln_task_tool.py +527 -0
  101. kiln_ai/tools/test_mcp_server_tool.py +4 -15
  102. kiln_ai/tools/test_mcp_session_manager.py +187 -227
  103. kiln_ai/tools/test_rag_tools.py +929 -0
  104. kiln_ai/tools/test_tool_registry.py +290 -7
  105. kiln_ai/tools/tool_registry.py +69 -16
  106. kiln_ai/utils/__init__.py +3 -0
  107. kiln_ai/utils/async_job_runner.py +62 -17
  108. kiln_ai/utils/config.py +2 -2
  109. kiln_ai/utils/env.py +15 -0
  110. kiln_ai/utils/filesystem.py +14 -0
  111. kiln_ai/utils/filesystem_cache.py +60 -0
  112. kiln_ai/utils/litellm.py +94 -0
  113. kiln_ai/utils/lock.py +100 -0
  114. kiln_ai/utils/mime_type.py +38 -0
  115. kiln_ai/utils/open_ai_types.py +19 -2
  116. kiln_ai/utils/pdf_utils.py +59 -0
  117. kiln_ai/utils/test_async_job_runner.py +151 -35
  118. kiln_ai/utils/test_env.py +142 -0
  119. kiln_ai/utils/test_filesystem_cache.py +316 -0
  120. kiln_ai/utils/test_litellm.py +206 -0
  121. kiln_ai/utils/test_lock.py +185 -0
  122. kiln_ai/utils/test_mime_type.py +66 -0
  123. kiln_ai/utils/test_open_ai_types.py +88 -12
  124. kiln_ai/utils/test_pdf_utils.py +86 -0
  125. kiln_ai/utils/test_uuid.py +111 -0
  126. kiln_ai/utils/test_validation.py +524 -0
  127. kiln_ai/utils/uuid.py +9 -0
  128. kiln_ai/utils/validation.py +90 -0
  129. {kiln_ai-0.20.1.dist-info → kiln_ai-0.22.0.dist-info}/METADATA +9 -1
  130. kiln_ai-0.22.0.dist-info/RECORD +213 -0
  131. kiln_ai-0.20.1.dist-info/RECORD +0 -138
  132. {kiln_ai-0.20.1.dist-info → kiln_ai-0.22.0.dist-info}/WHEEL +0 -0
  133. {kiln_ai-0.20.1.dist-info → kiln_ai-0.22.0.dist-info}/licenses/LICENSE.txt +0 -0
@@ -0,0 +1,181 @@
1
+ from unittest.mock import patch
2
+
3
+ import pytest
4
+
5
+ from kiln_ai.adapters.extractors.extractor_registry import extractor_adapter_from_type
6
+ from kiln_ai.adapters.extractors.litellm_extractor import LitellmExtractor
7
+ from kiln_ai.adapters.ml_model_list import ModelProviderName
8
+ from kiln_ai.adapters.provider_tools import LiteLlmCoreConfig
9
+ from kiln_ai.datamodel.extraction import ExtractorConfig, ExtractorType
10
+
11
+
12
@pytest.fixture
def mock_provider_configs():
    """Patch ``Config.shared`` and set placeholder credentials on the object it returns.

    Yields the mock that replaces ``Config.shared``; the placeholder values are
    attributes of its ``return_value``.
    """
    placeholder_credentials = {
        "open_ai_api_key": "test-openai-key",
        "gemini_api_key": "test-gemini-key",
        "anthropic_api_key": "test-anthropic-key",
        "bedrock_access_key": "test-amazon-bedrock-key",
        "bedrock_secret_key": "test-amazon-bedrock-secret-key",
        "fireworks_api_key": "test-fireworks-key",
        "groq_api_key": "test-groq-key",
        "huggingface_api_key": "test-huggingface-key",
    }
    with patch("kiln_ai.utils.config.Config.shared") as mock_config:
        shared_config = mock_config.return_value
        for attr_name, placeholder in placeholder_credentials.items():
            setattr(shared_config, attr_name, placeholder)
        yield mock_config
24
+
25
+
26
def test_extractor_adapter_from_type(mock_provider_configs):
    """The LITELLM extractor type produces a LitellmExtractor carrying the given config."""
    prompts = {
        "prompt_document": "Extract the text from the document",
        "prompt_image": "Extract the text from the image",
        "prompt_video": "Extract the text from the video",
        "prompt_audio": "Extract the text from the audio",
    }
    config = ExtractorConfig(
        name="test-extractor",
        extractor_type=ExtractorType.LITELLM,
        model_provider_name="gemini_api",
        model_name="gemini-2.0-flash",
        properties=prompts,
    )

    extractor = extractor_adapter_from_type(ExtractorType.LITELLM, config)

    assert isinstance(extractor, LitellmExtractor)
    attached_config = extractor.extractor_config
    assert attached_config.model_name == "gemini-2.0-flash"
    assert attached_config.model_provider_name == "gemini_api"
45
+
46
+
47
@patch(
    "kiln_ai.adapters.extractors.extractor_registry.lite_llm_core_config_for_provider"
)
def test_extractor_adapter_from_type_uses_litellm_core_config(
    mock_get_litellm_core_config,
):
    """The extractor is handed the core config returned by the patched provider lookup."""
    expected_core_config = LiteLlmCoreConfig(
        base_url="https://test.com",
        additional_body_options={"api_key": "test-key"},
        default_headers={},
    )
    mock_get_litellm_core_config.return_value = expected_core_config

    prompts = {
        "prompt_document": "Extract the text from the document",
        "prompt_image": "Extract the text from the image",
        "prompt_video": "Extract the text from the video",
        "prompt_audio": "Extract the text from the audio",
    }
    config = ExtractorConfig(
        name="test-extractor",
        extractor_type=ExtractorType.LITELLM,
        model_provider_name="openai",
        model_name="gpt-4",
        properties=prompts,
    )

    extractor = extractor_adapter_from_type(ExtractorType.LITELLM, config)

    assert isinstance(extractor, LitellmExtractor)
    assert extractor.litellm_core_config == expected_core_config
    # The lookup must be made exactly once, with the enum member for "openai".
    mock_get_litellm_core_config.assert_called_once_with(ModelProviderName.openai)
80
+
81
+
82
def test_extractor_adapter_from_type_invalid_provider():
    """An unknown model provider name makes the registry raise a descriptive ValueError."""
    # Build the config before entering pytest.raises so that only the registry
    # call itself can satisfy the expected exception.
    config = ExtractorConfig(
        name="test-extractor",
        extractor_type=ExtractorType.LITELLM,
        model_provider_name="invalid_provider",
        model_name="some-model",
        properties={
            "prompt_document": "Extract the text from the document",
            "prompt_image": "Extract the text from the image",
            "prompt_video": "Extract the text from the video",
            "prompt_audio": "Extract the text from the audio",
        },
    )

    with pytest.raises(
        ValueError, match="Unsupported model provider name: invalid_provider"
    ):
        extractor_adapter_from_type(ExtractorType.LITELLM, config)
102
+
103
+
104
def test_extractor_adapter_from_type_invalid():
    """An extractor type outside the enum makes the registry raise a ValueError."""
    # Build the config before entering pytest.raises so that only the registry
    # call itself can satisfy the expected exception.
    config = ExtractorConfig(
        name="test-extractor",
        extractor_type=ExtractorType.LITELLM,
        model_provider_name="invalid_provider",
        model_name="some-model",
        properties={
            "prompt_document": "Extract the text from the document",
            "prompt_image": "Extract the text from the image",
            "prompt_video": "Extract the text from the video",
            "prompt_audio": "Extract the text from the audio",
        },
    )

    with pytest.raises(ValueError, match="Unhandled enum value: fake_type"):
        # A plain string is passed on purpose in place of an ExtractorType member.
        extractor_adapter_from_type("fake_type", config)
121
+
122
+
123
@pytest.mark.parametrize(
    "provider_name",
    [
        "openai",
        "anthropic",
        "gemini_api",
        "amazon_bedrock",
        "fireworks_ai",
        "groq",
        "huggingface",
    ],
)
def test_extractor_adapter_from_type_different_providers(
    provider_name, mock_provider_configs
):
    """Each listed provider name yields a LitellmExtractor that keeps that provider name."""
    prompts = {
        "prompt_document": "Extract the text from the document",
        "prompt_image": "Extract the text from the image",
        "prompt_video": "Extract the text from the video",
        "prompt_audio": "Extract the text from the audio",
    }
    config = ExtractorConfig(
        name="test-extractor",
        extractor_type=ExtractorType.LITELLM,
        model_provider_name=provider_name,
        model_name="test-model",
        properties=prompts,
    )

    extractor = extractor_adapter_from_type(ExtractorType.LITELLM, config)

    assert isinstance(extractor, LitellmExtractor)
    assert extractor.extractor_config.model_provider_name == provider_name
157
+
158
+
159
def test_extractor_adapter_from_type_no_config_found(mock_provider_configs):
    """A provider lookup that returns None makes the registry raise a ValueError."""
    # Build the config up front so that only the registry call can satisfy the
    # expected exception inside pytest.raises.
    config = ExtractorConfig(
        name="test-extractor",
        extractor_type=ExtractorType.LITELLM,
        model_provider_name="openai",
        model_name="gpt-4",
        properties={
            "prompt_document": "Extract the text from the document",
            "prompt_image": "Extract the text from the image",
            "prompt_video": "Extract the text from the video",
            "prompt_audio": "Extract the text from the audio",
        },
    )

    with patch(
        "kiln_ai.adapters.extractors.extractor_registry.lite_llm_core_config_for_provider"
    ) as mock_lookup:
        # Simulate a provider for which no core configuration is available.
        mock_lookup.return_value = None
        with pytest.raises(
            ValueError, match="No configuration found for core provider: openai"
        ):
            extractor_adapter_from_type(ExtractorType.LITELLM, config)
@@ -0,0 +1,181 @@
1
+ from unittest.mock import AsyncMock
2
+
3
+ import pytest
4
+
5
+ from conftest import MockFileFactoryMimeType
6
+ from kiln_ai.adapters.extractors.extractor_runner import ExtractorRunner
7
+ from kiln_ai.datamodel.basemodel import KilnAttachmentModel
8
+ from kiln_ai.datamodel.extraction import (
9
+ Document,
10
+ Extraction,
11
+ ExtractionSource,
12
+ ExtractorConfig,
13
+ ExtractorType,
14
+ FileInfo,
15
+ Kind,
16
+ OutputFormat,
17
+ )
18
+ from kiln_ai.datamodel.project import Project
19
+
20
+
21
@pytest.fixture
def mock_project(tmp_path):
    """Create a Project saved to ``project.kiln`` under pytest's temp directory and return it."""
    project_file = tmp_path / "project.kiln"
    saved_project = Project(name="test", description="test", path=project_file)
    saved_project.save_to_file()
    return saved_project
30
+
31
+
32
@pytest.fixture
def mock_extractor_config(mock_project):
    """Create, save and return a LITELLM ExtractorConfig whose parent is ``mock_project``."""
    prompts = {
        "prompt_document": "Extract the text from the document",
        "prompt_image": "Extract the text from the image",
        "prompt_video": "Extract the text from the video",
        "prompt_audio": "Extract the text from the audio",
    }
    saved_config = ExtractorConfig(
        name="test",
        description="test",
        output_format=OutputFormat.TEXT,
        passthrough_mimetypes=[],
        extractor_type=ExtractorType.LITELLM,
        model_provider_name="gemini_api",
        model_name="gemini-2.0-flash",
        parent=mock_project,
        properties=prompts,
    )
    saved_config.save_to_file()
    return saved_config
52
+
53
+
54
@pytest.fixture
def mock_document(mock_project, mock_file_factory) -> Document:
    """Create, save and return a PDF-backed Document whose parent is ``mock_project``."""
    pdf_path = mock_file_factory(MockFileFactoryMimeType.PDF)
    pdf_info = FileInfo(
        filename="test.pdf",
        size=100,
        mime_type="application/pdf",
        attachment=KilnAttachmentModel.from_file(pdf_path),
    )
    saved_document = Document(
        name="test",
        description="test",
        kind=Kind.DOCUMENT,
        original_file=pdf_info,
        parent=mock_project,
    )
    saved_document.save_to_file()
    return saved_document
71
+
72
+
73
@pytest.fixture
def mock_extractor_runner(mock_extractor_config, mock_document):
    """Return an ExtractorRunner over the single fixture config and the single fixture document."""
    runner = ExtractorRunner(
        extractor_configs=[mock_extractor_config],
        documents=[mock_document],
    )
    return runner
79
+
80
+
81
# Exercise the runner both serially (1) and with concurrency (25)
@pytest.mark.parametrize("concurrency", [1, 25])
@pytest.mark.asyncio
async def test_async_extractor_runner_status_updates(
    mock_extractor_runner, concurrency
):
    """Progress updates arrive in order, one per job plus a final one, with no errors."""
    total_jobs = 50
    # Placeholder job objects: run_job is mocked below, so their type is irrelevant.
    placeholder_jobs = [{} for _ in range(total_jobs)]

    def fake_collect_jobs():
        return placeholder_jobs

    # Replace collect_jobs so the runner sees only the placeholder jobs.
    mock_extractor_runner.collect_jobs = fake_collect_jobs

    # Replace run_job with an AsyncMock that resolves to True.
    mock_extractor_runner.run_job = AsyncMock(return_value=True)

    # Each update must report exactly as many completions as updates seen so far.
    updates_seen = 0
    async for progress in mock_extractor_runner.run(concurrency=concurrency):
        assert progress.complete == updates_seen
        assert progress.errors == 0
        assert progress.total == total_jobs
        updates_seen += 1

    # Updates run from complete == 0 through complete == total_jobs inclusive.
    assert updates_seen == total_jobs + 1

    # run_job must have been invoked once per job.
    assert mock_extractor_runner.run_job.call_count == total_jobs
112
+
113
+
114
def test_collect_jobs_excludes_already_run_extraction(
    mock_extractor_runner, mock_document, mock_extractor_config
):
    """A document is skipped only once an extraction exists for the runner's own config."""

    def save_extraction_for(config_id):
        # Persist a processed extraction under the document for the given config id.
        Extraction(
            parent=mock_document,
            source=ExtractionSource.PROCESSED,
            extractor_config_id=config_id,
            output=KilnAttachmentModel.from_data(
                "test extraction output", "text/plain"
            ),
        ).save_to_file()

    # An extraction from an unrelated config must not suppress the job.
    save_extraction_for("other-extractor-config-id")

    pending = mock_extractor_runner.collect_jobs()
    assert len(pending) == 1
    only_job = pending[0]
    assert only_job.doc.id == mock_document.id
    assert only_job.extractor_config.id == mock_extractor_config.id

    # Now record an extraction made with the runner's own config.
    save_extraction_for(mock_extractor_config.id)

    pending = mock_extractor_runner.collect_jobs()

    # The document has been extracted with this config, so nothing is left to run.
    assert len(pending) == 0
143
+
144
+
145
def test_collect_jobs_multiple_extractor_configs(
    mock_extractor_runner,
    mock_document,
    mock_extractor_config,
    mock_project,
):
    """With two extractor configs and one document, one job is collected per config."""
    prompts = {
        "prompt_document": "Extract the text from the document",
        "prompt_image": "Extract the text from the image",
        "prompt_video": "Extract the text from the video",
        "prompt_audio": "Extract the text from the audio",
    }
    second_config = ExtractorConfig(
        name="test2",
        description="test2",
        output_format=OutputFormat.TEXT,
        passthrough_mimetypes=[],
        extractor_type=ExtractorType.LITELLM,
        parent=mock_project,
        model_provider_name="gemini_api",
        model_name="gemini-2.0-flash",
        properties=prompts,
    )
    second_config.save_to_file()

    two_config_runner = ExtractorRunner(
        extractor_configs=[mock_extractor_config, second_config],
        documents=[mock_document],
    )
    collected = two_config_runner.collect_jobs()

    # Expect exactly one job per config, compared order-independently by config id.
    assert len(collected) == 2
    expected_config_ids = {second_config.id, mock_extractor_config.id}
    collected_config_ids = {job.extractor_config.id for job in collected}
    assert collected_config_ids == expected_config_ids