kiln-ai 0.20.1__py3-none-any.whl → 0.22.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kiln-ai might be problematic. Click here for more details.

Files changed (133) hide show
  1. kiln_ai/adapters/__init__.py +6 -0
  2. kiln_ai/adapters/adapter_registry.py +43 -226
  3. kiln_ai/adapters/chunkers/__init__.py +13 -0
  4. kiln_ai/adapters/chunkers/base_chunker.py +42 -0
  5. kiln_ai/adapters/chunkers/chunker_registry.py +16 -0
  6. kiln_ai/adapters/chunkers/fixed_window_chunker.py +39 -0
  7. kiln_ai/adapters/chunkers/helpers.py +23 -0
  8. kiln_ai/adapters/chunkers/test_base_chunker.py +63 -0
  9. kiln_ai/adapters/chunkers/test_chunker_registry.py +28 -0
  10. kiln_ai/adapters/chunkers/test_fixed_window_chunker.py +346 -0
  11. kiln_ai/adapters/chunkers/test_helpers.py +75 -0
  12. kiln_ai/adapters/data_gen/test_data_gen_task.py +9 -3
  13. kiln_ai/adapters/embedding/__init__.py +0 -0
  14. kiln_ai/adapters/embedding/base_embedding_adapter.py +44 -0
  15. kiln_ai/adapters/embedding/embedding_registry.py +32 -0
  16. kiln_ai/adapters/embedding/litellm_embedding_adapter.py +199 -0
  17. kiln_ai/adapters/embedding/test_base_embedding_adapter.py +283 -0
  18. kiln_ai/adapters/embedding/test_embedding_registry.py +166 -0
  19. kiln_ai/adapters/embedding/test_litellm_embedding_adapter.py +1149 -0
  20. kiln_ai/adapters/eval/eval_runner.py +6 -2
  21. kiln_ai/adapters/eval/test_base_eval.py +1 -3
  22. kiln_ai/adapters/eval/test_g_eval.py +1 -1
  23. kiln_ai/adapters/extractors/__init__.py +18 -0
  24. kiln_ai/adapters/extractors/base_extractor.py +72 -0
  25. kiln_ai/adapters/extractors/encoding.py +20 -0
  26. kiln_ai/adapters/extractors/extractor_registry.py +44 -0
  27. kiln_ai/adapters/extractors/extractor_runner.py +112 -0
  28. kiln_ai/adapters/extractors/litellm_extractor.py +406 -0
  29. kiln_ai/adapters/extractors/test_base_extractor.py +244 -0
  30. kiln_ai/adapters/extractors/test_encoding.py +54 -0
  31. kiln_ai/adapters/extractors/test_extractor_registry.py +181 -0
  32. kiln_ai/adapters/extractors/test_extractor_runner.py +181 -0
  33. kiln_ai/adapters/extractors/test_litellm_extractor.py +1290 -0
  34. kiln_ai/adapters/fine_tune/test_dataset_formatter.py +2 -2
  35. kiln_ai/adapters/fine_tune/test_fireworks_tinetune.py +2 -6
  36. kiln_ai/adapters/fine_tune/test_together_finetune.py +2 -6
  37. kiln_ai/adapters/ml_embedding_model_list.py +494 -0
  38. kiln_ai/adapters/ml_model_list.py +876 -18
  39. kiln_ai/adapters/model_adapters/litellm_adapter.py +40 -75
  40. kiln_ai/adapters/model_adapters/test_litellm_adapter.py +79 -1
  41. kiln_ai/adapters/model_adapters/test_litellm_adapter_tools.py +119 -5
  42. kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +9 -3
  43. kiln_ai/adapters/model_adapters/test_structured_output.py +9 -10
  44. kiln_ai/adapters/ollama_tools.py +69 -12
  45. kiln_ai/adapters/provider_tools.py +190 -46
  46. kiln_ai/adapters/rag/deduplication.py +49 -0
  47. kiln_ai/adapters/rag/progress.py +252 -0
  48. kiln_ai/adapters/rag/rag_runners.py +844 -0
  49. kiln_ai/adapters/rag/test_deduplication.py +195 -0
  50. kiln_ai/adapters/rag/test_progress.py +785 -0
  51. kiln_ai/adapters/rag/test_rag_runners.py +2376 -0
  52. kiln_ai/adapters/remote_config.py +80 -8
  53. kiln_ai/adapters/test_adapter_registry.py +579 -86
  54. kiln_ai/adapters/test_ml_embedding_model_list.py +239 -0
  55. kiln_ai/adapters/test_ml_model_list.py +202 -0
  56. kiln_ai/adapters/test_ollama_tools.py +340 -1
  57. kiln_ai/adapters/test_prompt_builders.py +1 -1
  58. kiln_ai/adapters/test_provider_tools.py +199 -8
  59. kiln_ai/adapters/test_remote_config.py +551 -56
  60. kiln_ai/adapters/vector_store/__init__.py +1 -0
  61. kiln_ai/adapters/vector_store/base_vector_store_adapter.py +83 -0
  62. kiln_ai/adapters/vector_store/lancedb_adapter.py +389 -0
  63. kiln_ai/adapters/vector_store/test_base_vector_store.py +160 -0
  64. kiln_ai/adapters/vector_store/test_lancedb_adapter.py +1841 -0
  65. kiln_ai/adapters/vector_store/test_vector_store_registry.py +199 -0
  66. kiln_ai/adapters/vector_store/vector_store_registry.py +33 -0
  67. kiln_ai/datamodel/__init__.py +16 -13
  68. kiln_ai/datamodel/basemodel.py +201 -4
  69. kiln_ai/datamodel/chunk.py +158 -0
  70. kiln_ai/datamodel/datamodel_enums.py +27 -0
  71. kiln_ai/datamodel/embedding.py +64 -0
  72. kiln_ai/datamodel/external_tool_server.py +206 -54
  73. kiln_ai/datamodel/extraction.py +317 -0
  74. kiln_ai/datamodel/project.py +33 -1
  75. kiln_ai/datamodel/rag.py +79 -0
  76. kiln_ai/datamodel/task.py +5 -0
  77. kiln_ai/datamodel/task_output.py +41 -11
  78. kiln_ai/datamodel/test_attachment.py +649 -0
  79. kiln_ai/datamodel/test_basemodel.py +270 -14
  80. kiln_ai/datamodel/test_chunk_models.py +317 -0
  81. kiln_ai/datamodel/test_dataset_split.py +1 -1
  82. kiln_ai/datamodel/test_datasource.py +50 -0
  83. kiln_ai/datamodel/test_embedding_models.py +448 -0
  84. kiln_ai/datamodel/test_eval_model.py +6 -6
  85. kiln_ai/datamodel/test_external_tool_server.py +534 -152
  86. kiln_ai/datamodel/test_extraction_chunk.py +206 -0
  87. kiln_ai/datamodel/test_extraction_model.py +501 -0
  88. kiln_ai/datamodel/test_rag.py +641 -0
  89. kiln_ai/datamodel/test_task.py +35 -1
  90. kiln_ai/datamodel/test_tool_id.py +187 -1
  91. kiln_ai/datamodel/test_vector_store.py +320 -0
  92. kiln_ai/datamodel/tool_id.py +58 -0
  93. kiln_ai/datamodel/vector_store.py +141 -0
  94. kiln_ai/tools/base_tool.py +12 -3
  95. kiln_ai/tools/built_in_tools/math_tools.py +12 -4
  96. kiln_ai/tools/kiln_task_tool.py +158 -0
  97. kiln_ai/tools/mcp_server_tool.py +2 -2
  98. kiln_ai/tools/mcp_session_manager.py +51 -22
  99. kiln_ai/tools/rag_tools.py +164 -0
  100. kiln_ai/tools/test_kiln_task_tool.py +527 -0
  101. kiln_ai/tools/test_mcp_server_tool.py +4 -15
  102. kiln_ai/tools/test_mcp_session_manager.py +187 -227
  103. kiln_ai/tools/test_rag_tools.py +929 -0
  104. kiln_ai/tools/test_tool_registry.py +290 -7
  105. kiln_ai/tools/tool_registry.py +69 -16
  106. kiln_ai/utils/__init__.py +3 -0
  107. kiln_ai/utils/async_job_runner.py +62 -17
  108. kiln_ai/utils/config.py +2 -2
  109. kiln_ai/utils/env.py +15 -0
  110. kiln_ai/utils/filesystem.py +14 -0
  111. kiln_ai/utils/filesystem_cache.py +60 -0
  112. kiln_ai/utils/litellm.py +94 -0
  113. kiln_ai/utils/lock.py +100 -0
  114. kiln_ai/utils/mime_type.py +38 -0
  115. kiln_ai/utils/open_ai_types.py +19 -2
  116. kiln_ai/utils/pdf_utils.py +59 -0
  117. kiln_ai/utils/test_async_job_runner.py +151 -35
  118. kiln_ai/utils/test_env.py +142 -0
  119. kiln_ai/utils/test_filesystem_cache.py +316 -0
  120. kiln_ai/utils/test_litellm.py +206 -0
  121. kiln_ai/utils/test_lock.py +185 -0
  122. kiln_ai/utils/test_mime_type.py +66 -0
  123. kiln_ai/utils/test_open_ai_types.py +88 -12
  124. kiln_ai/utils/test_pdf_utils.py +86 -0
  125. kiln_ai/utils/test_uuid.py +111 -0
  126. kiln_ai/utils/test_validation.py +524 -0
  127. kiln_ai/utils/uuid.py +9 -0
  128. kiln_ai/utils/validation.py +90 -0
  129. {kiln_ai-0.20.1.dist-info → kiln_ai-0.22.0.dist-info}/METADATA +9 -1
  130. kiln_ai-0.22.0.dist-info/RECORD +213 -0
  131. kiln_ai-0.20.1.dist-info/RECORD +0 -138
  132. {kiln_ai-0.20.1.dist-info → kiln_ai-0.22.0.dist-info}/WHEEL +0 -0
  133. {kiln_ai-0.20.1.dist-info → kiln_ai-0.22.0.dist-info}/licenses/LICENSE.txt +0 -0
@@ -160,8 +160,12 @@ class EvalRunner:
160
160
  """
161
161
  jobs = self.collect_tasks()
162
162
 
163
- runner = AsyncJobRunner(concurrency=concurrency)
164
- async for progress in runner.run(jobs, self.run_job):
163
+ runner = AsyncJobRunner(
164
+ concurrency=concurrency,
165
+ jobs=jobs,
166
+ run_job_fn=self.run_job,
167
+ )
168
+ async for progress in runner.run():
165
169
  yield progress
166
170
 
167
171
  async def run_job(self, job: EvalJob) -> bool:
@@ -307,9 +307,7 @@ async def test_run_method():
307
307
  evaluator = EvalTester(eval_config, run_config.run_config())
308
308
 
309
309
  # Run the evaluation
310
- task_run, eval_scores, intermediate_outputs = await evaluator.run_task_and_eval(
311
- "test input"
312
- )
310
+ task_run, eval_scores, _ = await evaluator.run_task_and_eval("test input")
313
311
 
314
312
  # Verify task run was created
315
313
  assert task_run.input == "test input"
@@ -188,7 +188,7 @@ async def test_run_g_eval_e2e(
188
188
  g_eval = GEval(test_eval_config, test_run_config)
189
189
 
190
190
  # Run the evaluation
191
- task_run, scores, intermediate_outputs = await g_eval.run_task_and_eval("chickens")
191
+ _, scores, intermediate_outputs = await g_eval.run_task_and_eval("chickens")
192
192
 
193
193
  # Verify the evaluation results
194
194
  assert isinstance(scores, dict)
@@ -0,0 +1,18 @@
1
+ """
2
+ File extractors for processing different document types.
3
+
4
+ This package provides a framework for extracting content from files
5
+ using different extraction methods.
6
+ """
7
+
8
+ from . import base_extractor, extractor_registry, extractor_runner, litellm_extractor
9
+ from .base_extractor import ExtractionInput, ExtractionOutput
10
+
11
+ __all__ = [
12
+ "ExtractionInput",
13
+ "ExtractionOutput",
14
+ "base_extractor",
15
+ "extractor_registry",
16
+ "extractor_runner",
17
+ "litellm_extractor",
18
+ ]
@@ -0,0 +1,72 @@
1
+ import logging
2
+ from abc import ABC, abstractmethod
3
+ from pathlib import Path
4
+
5
+ from pydantic import BaseModel, Field
6
+
7
+ from kiln_ai.datamodel.extraction import ExtractorConfig, OutputFormat
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
class ExtractionInput(BaseModel):
    """
    Input handed to an extractor: which file to read and what mime type it has.
    """

    # Location of the file to extract; accepts either a Path or a plain string.
    path: Path | str = Field(description="The absolute path to the file to extract.")
    # Mime type of the file found at `path`.
    mime_type: str = Field(description="The mime type of the file.")
15
+
16
+
17
class ExtractionOutput(BaseModel):
    """
    The output of an extraction. This is the data that will be saved to the data store.
    """

    # True when the file's own text was returned unchanged; defaults to False.
    is_passthrough: bool = Field(
        default=False, description="Whether the extractor returned the file as is."
    )
    # Required: the format of `content`, expressed as an OutputFormat value.
    content_format: OutputFormat = Field(
        description="The format of the extracted data."
    )
    # Required: the extracted text itself.
    content: str = Field(description="The extracted data.")
29
+
30
+
31
class BaseExtractor(ABC):
    """
    Common parent of every extractor.

    Subclasses implement `_extract`; callers go through `extract`, which adds
    the passthrough shortcut and uniform error wrapping.
    """

    def __init__(self, extractor_config: ExtractorConfig):
        self.extractor_config = extractor_config

    @abstractmethod
    async def _extract(self, extraction_input: ExtractionInput) -> ExtractionOutput:
        pass

    async def extract(
        self,
        extraction_input: ExtractionInput,
    ) -> ExtractionOutput:
        """
        Extract content from the file described by `extraction_input`.

        When the input's mime type is one of the configured passthrough mime
        types, the file is read as UTF-8 text and returned as-is; otherwise the
        work is delegated to the subclass's `_extract`.

        Raises:
            ValueError: wrapping any exception raised while reading the file or
                running the concrete extractor.
        """
        try:
            if not self._should_passthrough(extraction_input.mime_type):
                return await self._extract(extraction_input)

            source_file = Path(extraction_input.path)
            return ExtractionOutput(
                is_passthrough=True,
                content=source_file.read_text(encoding="utf-8"),
                content_format=self.extractor_config.output_format,
            )
        except Exception as e:
            raise ValueError(f"Error extracting {extraction_input.path}: {e}") from e

    def _should_passthrough(self, mime_type: str) -> bool:
        """Return True if `mime_type` matches a configured passthrough mime type, ignoring case."""
        wanted = mime_type.lower()
        passthrough_types = {
            configured.lower()
            for configured in self.extractor_config.passthrough_mimetypes
        }
        return wanted in passthrough_types

    def output_format(self) -> OutputFormat:
        """Return the output format declared by the extractor config."""
        return self.extractor_config.output_format
@@ -0,0 +1,20 @@
1
+ import base64
2
+
3
+
4
def to_base64_url(mime_type: str, bytes: bytes) -> str:
    """Return a `data:` URL that embeds `bytes` as base64 under the given mime type."""
    encoded_payload = base64.b64encode(bytes).decode("utf-8")
    return f"data:{mime_type};base64,{encoded_payload}"
7
+
8
+
9
def from_base64_url(base64_url: str) -> bytes:
    """
    Decode the payload of a base64 `data:` URL into raw bytes.

    Args:
        base64_url: A string of the form `data:<mime type>;base64,<payload>`.

    Returns:
        The decoded payload bytes.

    Raises:
        ValueError: if the string does not start with `data:`, does not contain
            exactly one comma, or the payload cannot be base64-decoded. In the
            last case the underlying decoding error is attached as `__cause__`.
    """
    if not base64_url.startswith("data:") or "," not in base64_url:
        raise ValueError("Invalid base64 URL format")

    # Exactly one comma is expected: it separates the header from the payload.
    parts = base64_url.split(",")
    if len(parts) != 2:
        raise ValueError("Invalid base64 URL format")

    try:
        return base64.b64decode(parts[1])
    except Exception as e:
        # Chain explicitly so the original decoding error stays visible to callers.
        raise ValueError(f"Failed to decode base64 data: {e}") from e
@@ -0,0 +1,44 @@
1
+ from kiln_ai.adapters.extractors.base_extractor import BaseExtractor
2
+ from kiln_ai.adapters.extractors.litellm_extractor import LitellmExtractor
3
+ from kiln_ai.adapters.ml_model_list import ModelProviderName
4
+ from kiln_ai.adapters.provider_tools import (
5
+ core_provider,
6
+ lite_llm_core_config_for_provider,
7
+ )
8
+ from kiln_ai.datamodel.extraction import ExtractorConfig, ExtractorType
9
+ from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error
10
+ from kiln_ai.utils.filesystem_cache import FilesystemCache
11
+
12
+
13
+ def extractor_adapter_from_type(
14
+ extractor_type: ExtractorType,
15
+ extractor_config: ExtractorConfig,
16
+ filesystem_cache: FilesystemCache | None = None,
17
+ ) -> BaseExtractor:
18
+ match extractor_type:
19
+ case ExtractorType.LITELLM:
20
+ try:
21
+ provider_enum = ModelProviderName(extractor_config.model_provider_name)
22
+ except ValueError:
23
+ raise ValueError(
24
+ f"Unsupported model provider name: {extractor_config.model_provider_name}. "
25
+ )
26
+
27
+ core_provider_name = core_provider(
28
+ extractor_config.model_name, provider_enum
29
+ )
30
+
31
+ provider_config = lite_llm_core_config_for_provider(core_provider_name)
32
+ if provider_config is None:
33
+ raise ValueError(
34
+ f"No configuration found for core provider: {core_provider_name.value}. "
35
+ )
36
+
37
+ return LitellmExtractor(
38
+ extractor_config,
39
+ provider_config,
40
+ filesystem_cache,
41
+ )
42
+ case _:
43
+ # type checking will catch missing cases
44
+ raise_exhaustive_enum_error(extractor_type)
@@ -0,0 +1,112 @@
1
+ import logging
2
+ from collections import defaultdict
3
+ from dataclasses import dataclass
4
+ from pathlib import Path
5
+ from typing import AsyncGenerator, Dict, List, Set
6
+
7
+ from kiln_ai.adapters.extractors.base_extractor import BaseExtractor, ExtractionInput
8
+ from kiln_ai.adapters.extractors.extractor_registry import extractor_adapter_from_type
9
+ from kiln_ai.datamodel.basemodel import ID_TYPE, KilnAttachmentModel
10
+ from kiln_ai.datamodel.extraction import (
11
+ Document,
12
+ Extraction,
13
+ ExtractionSource,
14
+ ExtractorConfig,
15
+ )
16
+ from kiln_ai.utils.async_job_runner import AsyncJobRunner, Progress
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
@dataclass
class ExtractorJob:
    """A single unit of extraction work: one document paired with one extractor config."""

    # The document to extract.
    doc: Document
    # The extractor configuration to apply to `doc`.
    extractor_config: ExtractorConfig
25
+
26
+
27
class ExtractorRunner:
    """
    Runs every given extractor config against every given document, skipping
    (document, extractor config) pairs that already have a saved extraction.
    """

    def __init__(
        self,
        documents: List[Document],
        extractor_configs: List[ExtractorConfig],
    ):
        if len(extractor_configs) == 0:
            raise ValueError("Extractor runner requires at least one extractor config")

        self.extractor_configs = extractor_configs
        self.documents = documents

    def collect_jobs(self) -> List[ExtractorJob]:
        """Return one job per (extractor config, document) pair that has no extraction yet."""
        # Maps extractor config id -> ids of documents already extracted with it,
        # so the same document is not re-run for the same extractor config.
        done_by_config: Dict[ID_TYPE, Set[ID_TYPE]] = defaultdict(set)
        for doc in self.documents:
            for existing in doc.extractions():
                done_by_config[existing.extractor_config_id].add(doc.id)

        return [
            ExtractorJob(doc=doc, extractor_config=config)
            for config in self.extractor_configs
            for doc in self.documents
            if doc.id not in done_by_config[config.id]
        ]

    async def run(self, concurrency: int = 25) -> AsyncGenerator[Progress, None]:
        """Run all pending jobs through an AsyncJobRunner, yielding its progress updates."""
        job_runner = AsyncJobRunner(
            concurrency=concurrency,
            jobs=self.collect_jobs(),
            run_job_fn=self.run_job,
        )
        async for update in job_runner.run():
            yield update

    async def run_job(self, job: ExtractorJob) -> bool:
        """
        Extract one document with one extractor config and save the result.

        Returns True on success. Any failure is logged and reported as False
        rather than raised.
        """
        try:
            config = job.extractor_config
            extractor = extractor_adapter_from_type(config.extractor_type, config)
            if not isinstance(extractor, BaseExtractor):
                raise ValueError("Not able to create extractor from extractor config")

            doc_path = job.doc.path
            if doc_path is None:
                raise ValueError("Document path is not set")

            # The attachment path is resolved against the document's directory.
            original_file = job.doc.original_file
            resolved_path = original_file.attachment.resolve_path(doc_path.parent)
            output = await extractor.extract(
                extraction_input=ExtractionInput(
                    path=Path(resolved_path),
                    mime_type=original_file.mime_type,
                )
            )

            saved_output = KilnAttachmentModel.from_data(
                data=output.content,
                mime_type=output.content_format,
            )
            if output.is_passthrough:
                source = ExtractionSource.PASSTHROUGH
            else:
                source = ExtractionSource.PROCESSED

            extraction = Extraction(
                parent=job.doc,
                extractor_config_id=config.id,
                output=saved_output,
                source=source,
            )
            extraction.save_to_file()

            return True
        except Exception as e:
            logger.error(
                f"Error running extraction job for dataset item {job.doc.id}: {e}"
            )
            return False