kiln-ai 0.19.0__py3-none-any.whl → 0.21.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of kiln-ai might be problematic.
- kiln_ai/adapters/__init__.py +8 -2
- kiln_ai/adapters/adapter_registry.py +43 -208
- kiln_ai/adapters/chat/chat_formatter.py +8 -12
- kiln_ai/adapters/chat/test_chat_formatter.py +6 -2
- kiln_ai/adapters/chunkers/__init__.py +13 -0
- kiln_ai/adapters/chunkers/base_chunker.py +42 -0
- kiln_ai/adapters/chunkers/chunker_registry.py +16 -0
- kiln_ai/adapters/chunkers/fixed_window_chunker.py +39 -0
- kiln_ai/adapters/chunkers/helpers.py +23 -0
- kiln_ai/adapters/chunkers/test_base_chunker.py +63 -0
- kiln_ai/adapters/chunkers/test_chunker_registry.py +28 -0
- kiln_ai/adapters/chunkers/test_fixed_window_chunker.py +346 -0
- kiln_ai/adapters/chunkers/test_helpers.py +75 -0
- kiln_ai/adapters/data_gen/test_data_gen_task.py +9 -3
- kiln_ai/adapters/docker_model_runner_tools.py +119 -0
- kiln_ai/adapters/embedding/__init__.py +0 -0
- kiln_ai/adapters/embedding/base_embedding_adapter.py +44 -0
- kiln_ai/adapters/embedding/embedding_registry.py +32 -0
- kiln_ai/adapters/embedding/litellm_embedding_adapter.py +199 -0
- kiln_ai/adapters/embedding/test_base_embedding_adapter.py +283 -0
- kiln_ai/adapters/embedding/test_embedding_registry.py +166 -0
- kiln_ai/adapters/embedding/test_litellm_embedding_adapter.py +1149 -0
- kiln_ai/adapters/eval/base_eval.py +2 -2
- kiln_ai/adapters/eval/eval_runner.py +9 -3
- kiln_ai/adapters/eval/g_eval.py +2 -2
- kiln_ai/adapters/eval/test_base_eval.py +2 -4
- kiln_ai/adapters/eval/test_g_eval.py +4 -5
- kiln_ai/adapters/extractors/__init__.py +18 -0
- kiln_ai/adapters/extractors/base_extractor.py +72 -0
- kiln_ai/adapters/extractors/encoding.py +20 -0
- kiln_ai/adapters/extractors/extractor_registry.py +44 -0
- kiln_ai/adapters/extractors/extractor_runner.py +112 -0
- kiln_ai/adapters/extractors/litellm_extractor.py +386 -0
- kiln_ai/adapters/extractors/test_base_extractor.py +244 -0
- kiln_ai/adapters/extractors/test_encoding.py +54 -0
- kiln_ai/adapters/extractors/test_extractor_registry.py +181 -0
- kiln_ai/adapters/extractors/test_extractor_runner.py +181 -0
- kiln_ai/adapters/extractors/test_litellm_extractor.py +1192 -0
- kiln_ai/adapters/fine_tune/__init__.py +1 -1
- kiln_ai/adapters/fine_tune/openai_finetune.py +14 -4
- kiln_ai/adapters/fine_tune/test_dataset_formatter.py +2 -2
- kiln_ai/adapters/fine_tune/test_fireworks_finetune.py +2 -6
- kiln_ai/adapters/fine_tune/test_openai_finetune.py +108 -111
- kiln_ai/adapters/fine_tune/test_together_finetune.py +2 -6
- kiln_ai/adapters/ml_embedding_model_list.py +192 -0
- kiln_ai/adapters/ml_model_list.py +761 -37
- kiln_ai/adapters/model_adapters/base_adapter.py +51 -21
- kiln_ai/adapters/model_adapters/litellm_adapter.py +380 -138
- kiln_ai/adapters/model_adapters/test_base_adapter.py +193 -17
- kiln_ai/adapters/model_adapters/test_litellm_adapter.py +407 -2
- kiln_ai/adapters/model_adapters/test_litellm_adapter_tools.py +1103 -0
- kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +5 -5
- kiln_ai/adapters/model_adapters/test_structured_output.py +113 -5
- kiln_ai/adapters/ollama_tools.py +69 -12
- kiln_ai/adapters/parsers/__init__.py +1 -1
- kiln_ai/adapters/provider_tools.py +205 -47
- kiln_ai/adapters/rag/deduplication.py +49 -0
- kiln_ai/adapters/rag/progress.py +252 -0
- kiln_ai/adapters/rag/rag_runners.py +844 -0
- kiln_ai/adapters/rag/test_deduplication.py +195 -0
- kiln_ai/adapters/rag/test_progress.py +785 -0
- kiln_ai/adapters/rag/test_rag_runners.py +2376 -0
- kiln_ai/adapters/remote_config.py +80 -8
- kiln_ai/adapters/repair/test_repair_task.py +12 -9
- kiln_ai/adapters/run_output.py +3 -0
- kiln_ai/adapters/test_adapter_registry.py +657 -85
- kiln_ai/adapters/test_docker_model_runner_tools.py +305 -0
- kiln_ai/adapters/test_ml_embedding_model_list.py +429 -0
- kiln_ai/adapters/test_ml_model_list.py +251 -1
- kiln_ai/adapters/test_ollama_tools.py +340 -1
- kiln_ai/adapters/test_prompt_adaptors.py +13 -6
- kiln_ai/adapters/test_prompt_builders.py +1 -1
- kiln_ai/adapters/test_provider_tools.py +254 -8
- kiln_ai/adapters/test_remote_config.py +651 -58
- kiln_ai/adapters/vector_store/__init__.py +1 -0
- kiln_ai/adapters/vector_store/base_vector_store_adapter.py +83 -0
- kiln_ai/adapters/vector_store/lancedb_adapter.py +389 -0
- kiln_ai/adapters/vector_store/test_base_vector_store.py +160 -0
- kiln_ai/adapters/vector_store/test_lancedb_adapter.py +1841 -0
- kiln_ai/adapters/vector_store/test_vector_store_registry.py +199 -0
- kiln_ai/adapters/vector_store/vector_store_registry.py +33 -0
- kiln_ai/datamodel/__init__.py +39 -34
- kiln_ai/datamodel/basemodel.py +170 -1
- kiln_ai/datamodel/chunk.py +158 -0
- kiln_ai/datamodel/datamodel_enums.py +28 -0
- kiln_ai/datamodel/embedding.py +64 -0
- kiln_ai/datamodel/eval.py +1 -1
- kiln_ai/datamodel/external_tool_server.py +298 -0
- kiln_ai/datamodel/extraction.py +303 -0
- kiln_ai/datamodel/json_schema.py +25 -10
- kiln_ai/datamodel/project.py +40 -1
- kiln_ai/datamodel/rag.py +79 -0
- kiln_ai/datamodel/registry.py +0 -15
- kiln_ai/datamodel/run_config.py +62 -0
- kiln_ai/datamodel/task.py +2 -77
- kiln_ai/datamodel/task_output.py +6 -1
- kiln_ai/datamodel/task_run.py +41 -0
- kiln_ai/datamodel/test_attachment.py +649 -0
- kiln_ai/datamodel/test_basemodel.py +4 -4
- kiln_ai/datamodel/test_chunk_models.py +317 -0
- kiln_ai/datamodel/test_dataset_split.py +1 -1
- kiln_ai/datamodel/test_embedding_models.py +448 -0
- kiln_ai/datamodel/test_eval_model.py +6 -6
- kiln_ai/datamodel/test_example_models.py +175 -0
- kiln_ai/datamodel/test_external_tool_server.py +691 -0
- kiln_ai/datamodel/test_extraction_chunk.py +206 -0
- kiln_ai/datamodel/test_extraction_model.py +470 -0
- kiln_ai/datamodel/test_rag.py +641 -0
- kiln_ai/datamodel/test_registry.py +8 -3
- kiln_ai/datamodel/test_task.py +15 -47
- kiln_ai/datamodel/test_tool_id.py +320 -0
- kiln_ai/datamodel/test_vector_store.py +320 -0
- kiln_ai/datamodel/tool_id.py +105 -0
- kiln_ai/datamodel/vector_store.py +141 -0
- kiln_ai/tools/__init__.py +8 -0
- kiln_ai/tools/base_tool.py +82 -0
- kiln_ai/tools/built_in_tools/__init__.py +13 -0
- kiln_ai/tools/built_in_tools/math_tools.py +124 -0
- kiln_ai/tools/built_in_tools/test_math_tools.py +204 -0
- kiln_ai/tools/mcp_server_tool.py +95 -0
- kiln_ai/tools/mcp_session_manager.py +246 -0
- kiln_ai/tools/rag_tools.py +157 -0
- kiln_ai/tools/test_base_tools.py +199 -0
- kiln_ai/tools/test_mcp_server_tool.py +457 -0
- kiln_ai/tools/test_mcp_session_manager.py +1585 -0
- kiln_ai/tools/test_rag_tools.py +848 -0
- kiln_ai/tools/test_tool_registry.py +562 -0
- kiln_ai/tools/tool_registry.py +85 -0
- kiln_ai/utils/__init__.py +3 -0
- kiln_ai/utils/async_job_runner.py +62 -17
- kiln_ai/utils/config.py +24 -2
- kiln_ai/utils/env.py +15 -0
- kiln_ai/utils/filesystem.py +14 -0
- kiln_ai/utils/filesystem_cache.py +60 -0
- kiln_ai/utils/litellm.py +94 -0
- kiln_ai/utils/lock.py +100 -0
- kiln_ai/utils/mime_type.py +38 -0
- kiln_ai/utils/open_ai_types.py +94 -0
- kiln_ai/utils/pdf_utils.py +38 -0
- kiln_ai/utils/project_utils.py +17 -0
- kiln_ai/utils/test_async_job_runner.py +151 -35
- kiln_ai/utils/test_config.py +138 -1
- kiln_ai/utils/test_env.py +142 -0
- kiln_ai/utils/test_filesystem_cache.py +316 -0
- kiln_ai/utils/test_litellm.py +206 -0
- kiln_ai/utils/test_lock.py +185 -0
- kiln_ai/utils/test_mime_type.py +66 -0
- kiln_ai/utils/test_open_ai_types.py +131 -0
- kiln_ai/utils/test_pdf_utils.py +73 -0
- kiln_ai/utils/test_uuid.py +111 -0
- kiln_ai/utils/test_validation.py +524 -0
- kiln_ai/utils/uuid.py +9 -0
- kiln_ai/utils/validation.py +90 -0
- {kiln_ai-0.19.0.dist-info → kiln_ai-0.21.0.dist-info}/METADATA +12 -5
- kiln_ai-0.21.0.dist-info/RECORD +211 -0
- kiln_ai-0.19.0.dist-info/RECORD +0 -115
- {kiln_ai-0.19.0.dist-info → kiln_ai-0.21.0.dist-info}/WHEEL +0 -0
- {kiln_ai-0.19.0.dist-info → kiln_ai-0.21.0.dist-info}/licenses/LICENSE.txt +0 -0
kiln_ai/adapters/rag/progress.py
@@ -0,0 +1,252 @@
+import logging
+from collections import defaultdict
+from typing import Dict, Literal
+
+from kiln_ai.adapters.rag.deduplication import (
+    deduplicate_chunk_embeddings,
+    deduplicate_chunked_documents,
+    deduplicate_extractions,
+    filter_documents_by_tags,
+)
+from kiln_ai.adapters.vector_store.vector_store_registry import (
+    vector_store_adapter_for_config,
+)
+from kiln_ai.datamodel.project import Project
+from kiln_ai.datamodel.rag import RagConfig
+from kiln_ai.datamodel.vector_store import VectorStoreConfig
+from pydantic import BaseModel, Field
+
+logger = logging.getLogger(__name__)
+
+
+class LogMessage(BaseModel):
+    level: Literal["info", "error", "warning"] = Field(
+        description="The level of the log message",
+    )
+    message: str = Field(
+        description="The message to display to the user",
+    )
+
+
+class RagProgress(BaseModel):
+    total_document_count: int = Field(
+        description="The total number of items to process",
+        default=0,
+    )
+
+    total_document_completed_count: int = Field(
+        description="The number of items that have been processed",
+        default=0,
+    )
+
+    # Progress for indexing is tracked in terms of chunks, not documents. After the initial run
+    # the only info we have is how many chunks are in the vector store, and so we need to know
+    # the total number of chunks that should be indexed to know if it is completed or not.
+    # So we need toset and send that through to the client once we know it (after completing chunking).
+    total_chunk_count: int = Field(
+        description="The number of chunks that should be indexed for the indexing to be completed.",
+        default=0,
+    )
+
+    total_chunk_completed_count: int = Field(
+        description="The number of chunks that have been indexed",
+        default=0,
+    )
+
+    total_document_extracted_count: int = Field(
+        description="The number of items that have been extracted",
+        default=0,
+    )
+
+    total_document_extracted_error_count: int = Field(
+        description="The number of items that have errored during extraction",
+        default=0,
+    )
+
+    total_document_chunked_count: int = Field(
+        description="The number of items that have been chunked",
+        default=0,
+    )
+
+    total_document_chunked_error_count: int = Field(
+        description="The number of items that have errored during chunking",
+        default=0,
+    )
+
+    total_document_embedded_count: int = Field(
+        description="The number of items that have been embedded",
+        default=0,
+    )
+
+    total_document_embedded_error_count: int = Field(
+        description="The number of items that have errored during embedding",
+        default=0,
+    )
+
+    total_chunks_indexed_count: int = Field(
+        description="The number of chunks that have been indexed",
+        default=0,
+    )
+
+    total_chunks_indexed_error_count: int = Field(
+        description="The number of chunks that have errored during indexing",
+        default=0,
+    )
+
+    logs: list[LogMessage] | None = Field(
+        description="A list of log messages to display to the user",
+        default=None,
+    )
+
+
+async def count_records_in_vector_store(
+    rag_config: RagConfig,
+    vector_store_config: VectorStoreConfig,
+) -> int:
+    vector_store = await vector_store_adapter_for_config(
+        rag_config, vector_store_config
+    )
+    count = await vector_store.count_records()
+    return count
+
+
+async def count_records_in_vector_store_for_rag_config(
+    project: Project,
+    rag_config: RagConfig,
+) -> int:
+    vector_store_config = VectorStoreConfig.from_id_and_parent_path(
+        str(rag_config.vector_store_config_id),
+        project.path,
+    )
+    if vector_store_config is None:
+        raise ValueError(f"Rag config {rag_config.id} has no vector store config")
+    return await count_records_in_vector_store(rag_config, vector_store_config)
+
+
+async def compute_current_progress_for_rag_configs(
+    project: Project,
+    rag_configs: list[RagConfig],
+) -> Dict[str, RagProgress]:
+    # each RAG config represents a unique path: extractor::chunker::embedding
+    # different configs can share common prefixes
+    # (e.g., extractor-1::chunker-2 for both extractor-1::chunker-2::embedding-3 and extractor-1::chunker-2::embedding-4)
+    # we store prefix -> [rag config ids] mappings so at every step
+    # we know all the configs that share the same upstream steps
+    path_prefixes: dict[str, set[str]] = defaultdict(set)
+    for rag_config in rag_configs:
+        complete_path: list[str] = [
+            str(rag_config.extractor_config_id),
+            str(rag_config.chunker_config_id),
+            str(rag_config.embedding_config_id),
+        ]
+        for i in range(len(complete_path)):
+            prefix = "::".join(complete_path[: i + 1])
+            path_prefixes[prefix].add(str(rag_config.id))
+
+    rag_config_progress_map: dict[str, RagProgress] = {}
+    for rag_config in rag_configs:
+        all_documents = project.documents(readonly=True)
+        if rag_config.tags:
+            all_documents = filter_documents_by_tags(all_documents, rag_config.tags)
+
+        rag_config_progress_map[str(rag_config.id)] = RagProgress(
+            total_document_count=len(all_documents),
+            total_document_completed_count=0,
+            total_chunk_count=0,
+            total_chunk_completed_count=0,
+            total_document_extracted_count=0,
+            total_document_chunked_count=0,
+            total_document_embedded_count=0,
+            total_chunks_indexed_count=await count_records_in_vector_store_for_rag_config(
+                project, rag_config
+            ),
+        )
+
+    # Create a mapping of rag_config_id to its tags for efficient lookup
+    rag_config_tags_map = {
+        str(rag_config.id): rag_config.tags for rag_config in rag_configs
+    }
+
+    for document in project.documents(readonly=True):
+        for extraction in deduplicate_extractions(document.extractions(readonly=True)):
+            extraction_path_prefix = str(extraction.extractor_config_id)
+
+            # increment the extraction count for every rag config that has this extractor
+            # and includes this document based on its tags
+            for matching_rag_config_id in path_prefixes[extraction_path_prefix]:
+                rag_config_tags = rag_config_tags_map[matching_rag_config_id]
+                if not rag_config_tags or (
+                    document.tags
+                    and any(tag in document.tags for tag in rag_config_tags)
+                ):
+                    rag_config_progress_map[
+                        matching_rag_config_id
+                    ].total_document_extracted_count += 1
+
+            for chunked_document in deduplicate_chunked_documents(
+                extraction.chunked_documents(readonly=True)
+            ):
+                # increment the chunked count for every rag config that has this extractor+chunker combo
+                # and includes this document based on its tags
+                chunking_path_prefix = (
+                    f"{extraction_path_prefix}::{chunked_document.chunker_config_id}"
+                )
+                for matching_rag_config_id in path_prefixes[chunking_path_prefix]:
+                    rag_config_tags = rag_config_tags_map[matching_rag_config_id]
+                    if not rag_config_tags or (
+                        document.tags
+                        and any(tag in document.tags for tag in rag_config_tags)
+                    ):
+                        rag_config_progress_map[
+                            matching_rag_config_id
+                        ].total_document_chunked_count += 1
+
+                        rag_config_progress_map[
+                            matching_rag_config_id
+                        ].total_chunk_count += len(chunked_document.chunks)
+
+                for embedding in deduplicate_chunk_embeddings(
+                    chunked_document.chunk_embeddings(readonly=True)
+                ):
+                    # increment the embedding count for every rag config that has this extractor+chunker+embedding combo
+                    # and includes this document based on its tags
+                    embedding_path_prefix = (
+                        f"{chunking_path_prefix}::{embedding.embedding_config_id}"
+                    )
+
+                    for matching_rag_config_id in path_prefixes[embedding_path_prefix]:
+                        rag_config_tags = rag_config_tags_map[matching_rag_config_id]
+                        if not rag_config_tags or (
+                            document.tags
+                            and any(tag in document.tags for tag in rag_config_tags)
+                        ):
+                            rag_config_progress_map[
+                                matching_rag_config_id
+                            ].total_document_embedded_count += 1
+
+    # a document is completed only when all steps are completed, so overall progress is the same
+    # as the least complete step
+    for _, rag_config_progress in rag_config_progress_map.items():
+        rag_config_progress.total_document_completed_count = min(
+            rag_config_progress.total_document_extracted_count,
+            rag_config_progress.total_document_chunked_count,
+            rag_config_progress.total_document_embedded_count,
+        )
+
+        rag_config_progress.total_chunk_completed_count = (
+            rag_config_progress.total_chunks_indexed_count
+        )
+
+    return dict(rag_config_progress_map)
+
+
+async def compute_current_progress_for_rag_config(
+    project: Project,
+    rag_config: RagConfig,
+) -> RagProgress:
+    config_progress = await compute_current_progress_for_rag_configs(
+        project, [rag_config]
+    )
+    if str(rag_config.id) not in config_progress:
+        raise ValueError(f"Failed to compute progress for rag config {rag_config.id}")
+    return config_progress[str(rag_config.id)]