kiln-ai 0.21.0__tar.gz → 0.22.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (251)
  1. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/PKG-INFO +3 -1
  2. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/extractors/litellm_extractor.py +52 -32
  3. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/extractors/test_litellm_extractor.py +169 -71
  4. kiln_ai-0.22.0/kiln_ai/adapters/ml_embedding_model_list.py +494 -0
  5. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/ml_model_list.py +503 -23
  6. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/model_adapters/litellm_adapter.py +34 -7
  7. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/model_adapters/test_litellm_adapter.py +78 -0
  8. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/model_adapters/test_litellm_adapter_tools.py +119 -5
  9. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +9 -3
  10. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/model_adapters/test_structured_output.py +6 -9
  11. kiln_ai-0.22.0/kiln_ai/adapters/test_ml_embedding_model_list.py +239 -0
  12. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/test_ml_model_list.py +0 -10
  13. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/basemodel.py +31 -3
  14. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/external_tool_server.py +206 -54
  15. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/extraction.py +14 -0
  16. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/task.py +5 -0
  17. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/task_output.py +41 -11
  18. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/test_attachment.py +3 -3
  19. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/test_basemodel.py +269 -13
  20. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/test_datasource.py +50 -0
  21. kiln_ai-0.22.0/kiln_ai/datamodel/test_external_tool_server.py +1073 -0
  22. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/test_extraction_model.py +31 -0
  23. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/test_task.py +35 -1
  24. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/test_tool_id.py +106 -1
  25. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/tool_id.py +36 -0
  26. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/tools/base_tool.py +12 -3
  27. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/tools/built_in_tools/math_tools.py +12 -4
  28. kiln_ai-0.22.0/kiln_ai/tools/kiln_task_tool.py +158 -0
  29. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/tools/mcp_server_tool.py +2 -2
  30. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/tools/mcp_session_manager.py +50 -24
  31. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/tools/rag_tools.py +12 -5
  32. kiln_ai-0.22.0/kiln_ai/tools/test_kiln_task_tool.py +527 -0
  33. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/tools/test_mcp_server_tool.py +4 -15
  34. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/tools/test_mcp_session_manager.py +186 -226
  35. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/tools/test_rag_tools.py +86 -5
  36. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/tools/test_tool_registry.py +199 -5
  37. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/tools/tool_registry.py +49 -17
  38. kiln_ai-0.22.0/kiln_ai/utils/filesystem.py +14 -0
  39. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/open_ai_types.py +19 -2
  40. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/pdf_utils.py +21 -0
  41. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/test_open_ai_types.py +88 -12
  42. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/test_pdf_utils.py +14 -1
  43. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/pyproject.toml +3 -1
  44. kiln_ai-0.21.0/kiln_ai/adapters/ml_embedding_model_list.py +0 -192
  45. kiln_ai-0.21.0/kiln_ai/adapters/test_ml_embedding_model_list.py +0 -429
  46. kiln_ai-0.21.0/kiln_ai/datamodel/test_external_tool_server.py +0 -691
  47. kiln_ai-0.21.0/kiln_ai/utils/filesystem.py +0 -14
  48. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/.gitignore +0 -0
  49. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/.python-version +0 -0
  50. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/LICENSE.txt +0 -0
  51. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/README.md +0 -0
  52. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/docs/kiln_core_docs/index.html +0 -0
  53. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/docs/kiln_core_docs/kiln_ai/adapters/data_gen/data_gen_task.html +0 -0
  54. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/docs/kiln_core_docs/kiln_ai/adapters/data_gen.html +0 -0
  55. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/docs/kiln_core_docs/kiln_ai/adapters/eval/base_eval.html +0 -0
  56. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/docs/kiln_core_docs/kiln_ai/adapters/eval/eval_runner.html +0 -0
  57. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/docs/kiln_core_docs/kiln_ai/adapters/eval/g_eval.html +0 -0
  58. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/docs/kiln_core_docs/kiln_ai/adapters/eval/registry.html +0 -0
  59. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/docs/kiln_core_docs/kiln_ai/adapters/eval.html +0 -0
  60. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/docs/kiln_core_docs/kiln_ai/adapters/fine_tune/base_finetune.html +0 -0
  61. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/docs/kiln_core_docs/kiln_ai/adapters/fine_tune/dataset_formatter.html +0 -0
  62. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/docs/kiln_core_docs/kiln_ai/adapters/fine_tune/finetune_registry.html +0 -0
  63. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/docs/kiln_core_docs/kiln_ai/adapters/fine_tune/openai_finetune.html +0 -0
  64. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/docs/kiln_core_docs/kiln_ai/adapters/fine_tune.html +0 -0
  65. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/docs/kiln_core_docs/kiln_ai/adapters/ml_model_list.html +0 -0
  66. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/docs/kiln_core_docs/kiln_ai/adapters/model_adapters/base_adapter.html +0 -0
  67. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/docs/kiln_core_docs/kiln_ai/adapters/model_adapters/litellm_adapter.html +0 -0
  68. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/docs/kiln_core_docs/kiln_ai/adapters/model_adapters.html +0 -0
  69. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/docs/kiln_core_docs/kiln_ai/adapters/prompt_builders.html +0 -0
  70. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/docs/kiln_core_docs/kiln_ai/adapters/repair/repair_task.html +0 -0
  71. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/docs/kiln_core_docs/kiln_ai/adapters/repair.html +0 -0
  72. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/docs/kiln_core_docs/kiln_ai/adapters.html +0 -0
  73. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/docs/kiln_core_docs/kiln_ai/datamodel/dataset_split.html +0 -0
  74. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/docs/kiln_core_docs/kiln_ai/datamodel/eval.html +0 -0
  75. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/docs/kiln_core_docs/kiln_ai/datamodel/strict_mode.html +0 -0
  76. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/docs/kiln_core_docs/kiln_ai/datamodel.html +0 -0
  77. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/docs/kiln_core_docs/kiln_ai/utils/config.html +0 -0
  78. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/docs/kiln_core_docs/kiln_ai/utils/formatting.html +0 -0
  79. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/docs/kiln_core_docs/kiln_ai/utils.html +0 -0
  80. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/docs/kiln_core_docs/kiln_ai.html +0 -0
  81. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/docs/kiln_core_docs/search.js +0 -0
  82. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/__init__.py +0 -0
  83. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/__init__.py +0 -0
  84. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/adapter_registry.py +0 -0
  85. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/chat/__init__.py +0 -0
  86. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/chat/chat_formatter.py +0 -0
  87. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/chat/test_chat_formatter.py +0 -0
  88. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/chunkers/__init__.py +0 -0
  89. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/chunkers/base_chunker.py +0 -0
  90. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/chunkers/chunker_registry.py +0 -0
  91. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/chunkers/fixed_window_chunker.py +0 -0
  92. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/chunkers/helpers.py +0 -0
  93. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/chunkers/test_base_chunker.py +0 -0
  94. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/chunkers/test_chunker_registry.py +0 -0
  95. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/chunkers/test_fixed_window_chunker.py +0 -0
  96. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/chunkers/test_helpers.py +0 -0
  97. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/data_gen/__init__.py +0 -0
  98. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/data_gen/data_gen_prompts.py +0 -0
  99. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/data_gen/data_gen_task.py +0 -0
  100. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/data_gen/test_data_gen_task.py +0 -0
  101. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/docker_model_runner_tools.py +0 -0
  102. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/embedding/__init__.py +0 -0
  103. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/embedding/base_embedding_adapter.py +0 -0
  104. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/embedding/embedding_registry.py +0 -0
  105. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/embedding/litellm_embedding_adapter.py +0 -0
  106. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/embedding/test_base_embedding_adapter.py +0 -0
  107. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/embedding/test_embedding_registry.py +0 -0
  108. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/embedding/test_litellm_embedding_adapter.py +0 -0
  109. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/eval/__init__.py +0 -0
  110. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/eval/base_eval.py +0 -0
  111. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/eval/eval_runner.py +0 -0
  112. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/eval/g_eval.py +0 -0
  113. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/eval/registry.py +0 -0
  114. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/eval/test_base_eval.py +0 -0
  115. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/eval/test_eval_runner.py +0 -0
  116. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/eval/test_g_eval.py +0 -0
  117. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/eval/test_g_eval_data.py +0 -0
  118. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/extractors/__init__.py +0 -0
  119. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/extractors/base_extractor.py +0 -0
  120. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/extractors/encoding.py +0 -0
  121. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/extractors/extractor_registry.py +0 -0
  122. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/extractors/extractor_runner.py +0 -0
  123. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/extractors/test_base_extractor.py +0 -0
  124. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/extractors/test_encoding.py +0 -0
  125. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/extractors/test_extractor_registry.py +0 -0
  126. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/extractors/test_extractor_runner.py +0 -0
  127. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/fine_tune/__init__.py +0 -0
  128. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/fine_tune/base_finetune.py +0 -0
  129. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/fine_tune/dataset_formatter.py +0 -0
  130. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/fine_tune/finetune_registry.py +0 -0
  131. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/fine_tune/fireworks_finetune.py +0 -0
  132. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/fine_tune/openai_finetune.py +0 -0
  133. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/fine_tune/test_base_finetune.py +0 -0
  134. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/fine_tune/test_dataset_formatter.py +0 -0
  135. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/fine_tune/test_fireworks_tinetune.py +0 -0
  136. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/fine_tune/test_openai_finetune.py +0 -0
  137. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/fine_tune/test_together_finetune.py +0 -0
  138. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/fine_tune/test_vertex_finetune.py +0 -0
  139. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/fine_tune/together_finetune.py +0 -0
  140. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/fine_tune/vertex_finetune.py +0 -0
  141. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/model_adapters/__init__.py +0 -0
  142. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/model_adapters/base_adapter.py +0 -0
  143. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/model_adapters/litellm_config.py +0 -0
  144. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/model_adapters/test_base_adapter.py +0 -0
  145. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/ollama_tools.py +0 -0
  146. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/parsers/__init__.py +0 -0
  147. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/parsers/base_parser.py +0 -0
  148. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/parsers/json_parser.py +0 -0
  149. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/parsers/parser_registry.py +0 -0
  150. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/parsers/r1_parser.py +0 -0
  151. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/parsers/request_formatters.py +0 -0
  152. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/parsers/test_json_parser.py +0 -0
  153. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/parsers/test_parser_registry.py +0 -0
  154. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/parsers/test_r1_parser.py +0 -0
  155. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/parsers/test_request_formatters.py +0 -0
  156. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/prompt_builders.py +0 -0
  157. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/provider_tools.py +0 -0
  158. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/rag/deduplication.py +0 -0
  159. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/rag/progress.py +0 -0
  160. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/rag/rag_runners.py +0 -0
  161. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/rag/test_deduplication.py +0 -0
  162. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/rag/test_progress.py +0 -0
  163. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/rag/test_rag_runners.py +0 -0
  164. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/remote_config.py +0 -0
  165. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/repair/__init__.py +0 -0
  166. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/repair/repair_task.py +0 -0
  167. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/repair/test_repair_task.py +0 -0
  168. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/run_output.py +0 -0
  169. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/test_adapter_registry.py +0 -0
  170. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/test_docker_model_runner_tools.py +0 -0
  171. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/test_ollama_tools.py +0 -0
  172. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/test_prompt_adaptors.py +0 -0
  173. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/test_prompt_builders.py +0 -0
  174. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/test_provider_tools.py +0 -0
  175. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/test_remote_config.py +0 -0
  176. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/vector_store/__init__.py +0 -0
  177. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/vector_store/base_vector_store_adapter.py +0 -0
  178. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/vector_store/lancedb_adapter.py +0 -0
  179. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/vector_store/test_base_vector_store.py +0 -0
  180. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/vector_store/test_lancedb_adapter.py +0 -0
  181. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/vector_store/test_vector_store_registry.py +0 -0
  182. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/vector_store/vector_store_registry.py +0 -0
  183. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/__init__.py +0 -0
  184. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/chunk.py +0 -0
  185. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/datamodel_enums.py +0 -0
  186. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/dataset_filters.py +0 -0
  187. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/dataset_split.py +0 -0
  188. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/embedding.py +0 -0
  189. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/eval.py +0 -0
  190. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/finetune.py +0 -0
  191. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/json_schema.py +0 -0
  192. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/model_cache.py +0 -0
  193. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/project.py +0 -0
  194. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/prompt.py +0 -0
  195. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/prompt_id.py +0 -0
  196. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/rag.py +0 -0
  197. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/registry.py +0 -0
  198. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/run_config.py +0 -0
  199. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/strict_mode.py +0 -0
  200. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/task_run.py +0 -0
  201. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/test_chunk_models.py +0 -0
  202. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/test_dataset_filters.py +0 -0
  203. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/test_dataset_split.py +0 -0
  204. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/test_embedding_models.py +0 -0
  205. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/test_eval_model.py +0 -0
  206. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/test_example_models.py +0 -0
  207. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/test_extraction_chunk.py +0 -0
  208. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/test_json_schema.py +0 -0
  209. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/test_model_cache.py +0 -0
  210. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/test_model_perf.py +0 -0
  211. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/test_models.py +0 -0
  212. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/test_nested_save.py +0 -0
  213. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/test_output_rating.py +0 -0
  214. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/test_prompt_id.py +0 -0
  215. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/test_rag.py +0 -0
  216. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/test_registry.py +0 -0
  217. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/test_vector_store.py +0 -0
  218. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/vector_store.py +0 -0
  219. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/tools/__init__.py +0 -0
  220. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/tools/built_in_tools/__init__.py +0 -0
  221. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/tools/built_in_tools/test_math_tools.py +0 -0
  222. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/tools/test_base_tools.py +0 -0
  223. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/__init__.py +0 -0
  224. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/async_job_runner.py +0 -0
  225. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/config.py +0 -0
  226. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/dataset_import.py +0 -0
  227. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/env.py +0 -0
  228. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/exhaustive_error.py +0 -0
  229. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/filesystem_cache.py +0 -0
  230. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/formatting.py +0 -0
  231. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/litellm.py +0 -0
  232. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/lock.py +0 -0
  233. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/logging.py +0 -0
  234. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/mime_type.py +0 -0
  235. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/name_generator.py +0 -0
  236. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/project_utils.py +0 -0
  237. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/test_async_job_runner.py +0 -0
  238. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/test_config.py +0 -0
  239. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/test_dataset_import.py +0 -0
  240. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/test_env.py +0 -0
  241. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/test_filesystem_cache.py +0 -0
  242. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/test_litellm.py +0 -0
  243. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/test_lock.py +0 -0
  244. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/test_mime_type.py +0 -0
  245. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/test_name_geneator.py +0 -0
  246. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/test_uuid.py +0 -0
  247. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/test_validation.py +0 -0
  248. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/uuid.py +0 -0
  249. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/validation.py +0 -0
  250. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/setup.cfg +0 -0
  251. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/uv.lock +0 -0
--- kiln_ai-0.21.0/PKG-INFO
+++ kiln_ai-0.22.0/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: kiln-ai
-Version: 0.21.0
+Version: 0.22.0
 Summary: Kiln AI
 Project-URL: Homepage, https://kiln.tech
 Project-URL: Repository, https://github.com/Kiln-AI/kiln
@@ -28,8 +28,10 @@ Requires-Dist: llama-index-vector-stores-lancedb>=0.3.3
 Requires-Dist: llama-index>=0.13.3
 Requires-Dist: openai>=1.53.0
 Requires-Dist: pdoc>=15.0.0
+Requires-Dist: pillow>=11.1.0
 Requires-Dist: pydantic>=2.9.2
 Requires-Dist: pypdf>=6.0.0
+Requires-Dist: pypdfium2>=4.30.0
 Requires-Dist: pytest-benchmark>=5.1.0
 Requires-Dist: pytest-cov>=6.0.0
 Requires-Dist: pyyaml>=6.0.2
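
The two new runtime dependencies above (pillow and pypdfium2) line up with the convert_pdf_to_images helper imported in the extractor diff below and with the 21 lines added to kiln_ai/utils/pdf_utils.py, which are not shown in this diff. The snippet below is only a rough sketch of what such a helper could look like under the assumption that pypdfium2 renders each page and Pillow writes the PNGs; the name and signature are inferred from the call site convert_pdf_to_images(page_path, page_path.parent) seen later, and everything else is illustrative rather than the package's actual implementation.

# Hypothetical sketch only: the real helper lives in kiln_ai/utils/pdf_utils.py
# and is not part of this diff. Signature inferred from the call site
# convert_pdf_to_images(page_path, page_path.parent).
from pathlib import Path

import pypdfium2 as pdfium  # new dependency in 0.22.0; Pillow backs to_pil()


async def convert_pdf_to_images(pdf_path: Path, output_dir: Path) -> list[Path]:
    # async to match the awaited call site; rendering itself is synchronous here
    image_paths: list[Path] = []
    pdf = pdfium.PdfDocument(str(pdf_path))
    try:
        for index in range(len(pdf)):
            # render() returns a PdfBitmap; to_pil() hands back a Pillow image
            image = pdf[index].render(scale=2.0).to_pil()
            image_path = output_dir / f"{pdf_path.stem}-page-{index}.png"
            image.save(image_path)
            image_paths.append(image_path)
    finally:
        pdf.close()
    return image_paths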
--- kiln_ai-0.21.0/kiln_ai/adapters/extractors/litellm_extractor.py
+++ kiln_ai-0.22.0/kiln_ai/adapters/extractors/litellm_extractor.py
@@ -1,6 +1,7 @@
 import asyncio
 import hashlib
 import logging
+from functools import cached_property
 from pathlib import Path
 from typing import Any, List
 
@@ -13,23 +14,16 @@ from kiln_ai.adapters.extractors.base_extractor import (
     ExtractionOutput,
 )
 from kiln_ai.adapters.extractors.encoding import to_base64_url
-from kiln_ai.adapters.ml_model_list import built_in_models_from_provider
+from kiln_ai.adapters.ml_model_list import (
+    KilnModelProvider,
+    built_in_models_from_provider,
+)
 from kiln_ai.adapters.provider_tools import LiteLlmCoreConfig
 from kiln_ai.datamodel.datamodel_enums import ModelProviderName
 from kiln_ai.datamodel.extraction import ExtractorConfig, ExtractorType, Kind
 from kiln_ai.utils.filesystem_cache import FilesystemCache
 from kiln_ai.utils.litellm import get_litellm_provider_info
-from kiln_ai.utils.pdf_utils import split_pdf_into_pages
-
-
-def max_pdf_page_concurrency_for_model(model_name: str) -> int:
-    # we assume each batch takes ~5s to complete (likely more in practice)
-    # lowest rate limit is 150 RPM for Tier 1 accounts for gemini-2.5-pro
-    if model_name == "gemini/gemini-2.5-pro":
-        return 2
-    # other models support at least 500 RPM for lowest tier accounts
-    return 5
-
+from kiln_ai.utils.pdf_utils import convert_pdf_to_images, split_pdf_into_pages
 
 logger = logging.getLogger(__name__)
 
@@ -74,11 +68,11 @@ def encode_file_litellm_format(path: Path, mime_type: str) -> dict[str, Any]:
         "text/markdown",
         "text/plain",
     ] or any(mime_type.startswith(m) for m in ["video/", "audio/"]):
-        pdf_bytes = path.read_bytes()
+        file_bytes = path.read_bytes()
         return {
            "type": "file",
            "file": {
-                "file_data": to_base64_url(mime_type, pdf_bytes),
+                "file_data": to_base64_url(mime_type, file_bytes),
            },
        }
 
@@ -101,6 +95,7 @@ class LitellmExtractor(BaseExtractor):
         extractor_config: ExtractorConfig,
         litellm_core_config: LiteLlmCoreConfig,
         filesystem_cache: FilesystemCache | None = None,
+        default_max_parallel_requests: int = 5,
     ):
         if extractor_config.extractor_type != ExtractorType.LITELLM:
             raise ValueError(
@@ -133,6 +128,7 @@
         }
 
         self.litellm_core_config = litellm_core_config
+        self.default_max_parallel_requests = default_max_parallel_requests
 
     def pdf_page_cache_key(self, pdf_path: Path, page_number: int) -> str:
         """
@@ -171,13 +167,35 @@
             logger.debug(f"Cache miss for page {page_number} of {pdf_path}")
             return None
 
+    async def convert_pdf_page_to_image_input(
+        self, page_path: Path, page_number: int
+    ) -> ExtractionInput:
+        image_paths = await convert_pdf_to_images(page_path, page_path.parent)
+        if len(image_paths) != 1:
+            raise ValueError(
+                f"Expected 1 image, got {len(image_paths)} for page {page_number} in {page_path}"
+            )
+        image_path = image_paths[0]
+        page_input = ExtractionInput(path=str(image_path), mime_type="image/png")
+        return page_input
+
     async def _extract_single_pdf_page(
-        self, pdf_path: Path, page_path: Path, prompt: str, page_number: int
+        self,
+        pdf_path: Path,
+        page_path: Path,
+        prompt: str,
+        page_number: int,
     ) -> str:
         try:
-            page_input = ExtractionInput(
-                path=str(page_path), mime_type="application/pdf"
-            )
+            if self.model_provider.multimodal_requires_pdf_as_image:
+                page_input = await self.convert_pdf_page_to_image_input(
+                    page_path, page_number
+                )
+            else:
+                page_input = ExtractionInput(
+                    path=str(page_path), mime_type="application/pdf"
+                )
+
             completion_kwargs = self._build_completion_kwargs(prompt, page_input)
             response = await litellm.acompletion(**completion_kwargs)
         except Exception as e:
@@ -201,11 +219,6 @@
             )
 
         content = response.choices[0].message.content
-        if not content:
-            raise ValueError(
-                f"No text returned from extraction model when extracting page {page_number} for {page_path}"
-            )
-
         if self.filesystem_cache is not None:
             # we don't want to fail the whole extraction just because cache write fails
             # as that would block the whole flow
@@ -242,13 +255,14 @@
                 continue
 
             extract_page_jobs.append(
-                self._extract_single_pdf_page(pdf_path, page_path, prompt, i)
+                self._extract_single_pdf_page(
+                    pdf_path, page_path, prompt, page_number=i
+                )
             )
             page_indices_for_jobs.append(i)
 
             if (
-                len(extract_page_jobs)
-                >= max_pdf_page_concurrency_for_model(self.litellm_model_slug())
+                len(extract_page_jobs) >= self.max_parallel_requests_for_model
                 or i == len(page_paths) - 1
             ):
                 extraction_results = await asyncio.gather(
@@ -295,7 +309,7 @@
         self, prompt: str, extraction_input: ExtractionInput
     ) -> dict[str, Any]:
         completion_kwargs = {
-            "model": self.litellm_model_slug(),
+            "model": self.litellm_model_slug,
             "messages": [
                 {
                     "role": "user",
@@ -367,20 +381,26 @@
             content_format=self.extractor_config.output_format,
         )
 
-    def litellm_model_slug(self) -> str:
+    @cached_property
+    def model_provider(self) -> KilnModelProvider:
         kiln_model_provider = built_in_models_from_provider(
             ModelProviderName(self.extractor_config.model_provider_name),
             self.extractor_config.model_name,
         )
-
         if kiln_model_provider is None:
             raise ValueError(
                 f"Model provider {self.extractor_config.model_provider_name} not found in the list of built-in models"
             )
+        return kiln_model_provider
+
+    @cached_property
+    def max_parallel_requests_for_model(self) -> int:
+        value = self.model_provider.max_parallel_requests
+        return value if value is not None else self.default_max_parallel_requests
 
-        # need to translate into LiteLLM model slug
+    @cached_property
+    def litellm_model_slug(self) -> str:
         litellm_provider_name = get_litellm_provider_info(
-            kiln_model_provider,
+            self.model_provider,
         )
-
         return litellm_provider_name.litellm_model_id
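
The hunks above replace the hard-coded max_pdf_page_concurrency_for_model() helper with a per-provider limit: max_parallel_requests_for_model reads max_parallel_requests from the KilnModelProvider entry and falls back to the constructor's default_max_parallel_requests (5). Pages are still submitted in batches awaited with asyncio.gather. Below is a minimal standalone sketch of that batching pattern, with illustrative names (run_in_batches is not part of the package), not the extractor's actual code.

import asyncio
from typing import Awaitable, Callable, TypeVar

T = TypeVar("T")


async def run_in_batches(
    jobs: list[Callable[[], Awaitable[T]]], batch_size: int
) -> list[T]:
    # Await at most `batch_size` coroutines at a time, mirroring how the
    # extractor flushes extract_page_jobs once it reaches
    # max_parallel_requests_for_model (or on the last page).
    results: list[T] = []
    for start in range(0, len(jobs), batch_size):
        batch = jobs[start : start + batch_size]
        results.extend(await asyncio.gather(*(job() for job in batch)))
    return results

A caller would pass one zero-argument coroutine factory per PDF page and the provider's parallel-request limit, so a provider with max_parallel_requests = 2 never has more than two page extractions in flight.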
--- kiln_ai-0.21.0/kiln_ai/adapters/extractors/test_litellm_extractor.py
+++ kiln_ai-0.22.0/kiln_ai/adapters/extractors/test_litellm_extractor.py
@@ -1,5 +1,5 @@
 from pathlib import Path
-from unittest.mock import AsyncMock, patch
+from unittest.mock import AsyncMock, MagicMock, patch
 
 import pytest
 from litellm.types.utils import Choices, ModelResponse
@@ -7,13 +7,17 @@ from litellm.types.utils import Choices, ModelResponse
 from conftest import MockFileFactoryMimeType
 from kiln_ai.adapters.extractors.base_extractor import ExtractionInput, OutputFormat
 from kiln_ai.adapters.extractors.encoding import to_base64_url
+from kiln_ai.adapters.extractors.extractor_registry import extractor_adapter_from_type
 from kiln_ai.adapters.extractors.litellm_extractor import (
     ExtractorConfig,
     Kind,
     LitellmExtractor,
     encode_file_litellm_format,
 )
-from kiln_ai.adapters.ml_model_list import built_in_models
+from kiln_ai.adapters.ml_model_list import (
+    built_in_models,
+    built_in_models_from_provider,
+)
 from kiln_ai.adapters.provider_tools import LiteLlmCoreConfig
 from kiln_ai.datamodel.extraction import ExtractorType
 from kiln_ai.utils.filesystem_cache import FilesystemCache
@@ -405,7 +409,7 @@ def test_litellm_model_slug_success(mock_litellm_extractor):
             return_value=mock_provider_info,
         ) as mock_get_provider_info,
     ):
-        result = mock_litellm_extractor.litellm_model_slug()
+        result = mock_litellm_extractor.litellm_model_slug
 
     assert result == "test-provider/test-model"
 
@@ -414,6 +418,38 @@
     mock_get_provider_info.assert_called_once_with(mock_model_provider)
 
 
+@pytest.mark.parametrize(
+    "max_parallel_requests, expected_result",
+    [
+        (10, 10),
+        (0, 0),
+        # 5 is the current default, it may change in the future if we have
+        # a better modeling of rate limit constraints
+        (None, 5),
+    ],
+)
+def test_litellm_model_max_parallel_requests(
+    mock_litellm_extractor, max_parallel_requests, expected_result
+):
+    """Test that max_parallel_requests_for_model returns the provider's limit."""
+    # Mock the built_in_models_from_provider function to return a valid model provider
+    mock_model_provider = MagicMock()
+    mock_model_provider.name = "test-provider"
+    mock_model_provider.max_parallel_requests = max_parallel_requests
+
+    with (
+        patch(
+            "kiln_ai.adapters.extractors.litellm_extractor.built_in_models_from_provider",
+            return_value=mock_model_provider,
+        ) as mock_built_in_models,
+    ):
+        result = mock_litellm_extractor.max_parallel_requests_for_model
+
+    assert result == expected_result
+
+    mock_built_in_models.assert_called_once()
+
+
 def test_litellm_model_slug_model_provider_not_found(mock_litellm_extractor):
     """Test that litellm_model_slug raises ValueError when model provider is not found."""
     with patch(
@@ -424,7 +460,7 @@ def test_litellm_model_slug_model_provider_not_found(mock_litellm_extractor):
         ValueError,
         match="Model provider openai not found in the list of built-in models",
     ):
-        mock_litellm_extractor.litellm_model_slug()
+        mock_litellm_extractor.litellm_model_slug
 
 
 def test_litellm_model_slug_with_different_provider_names(mock_litellm_core_config):
@@ -468,35 +504,28 @@ def test_litellm_model_slug_with_different_provider_names(mock_litellm_core_conf
             return_value=mock_provider_info,
         ),
     ):
-        result = extractor.litellm_model_slug()
+        result = extractor.litellm_model_slug
         assert result == expected_slug
 
 
 def paid_litellm_extractor(model_name: str, provider_name: str):
-    return LitellmExtractor(
-        extractor_config=ExtractorConfig(
+    extractor = extractor_adapter_from_type(
+        ExtractorType.LITELLM,
+        ExtractorConfig(
             name="paid-litellm",
             extractor_type=ExtractorType.LITELLM,
             model_provider_name=provider_name,
             model_name=model_name,
             properties={
-                # in the paid tests, we can check which prompt is used by checking if the Kind shows up
-                # in the output - not ideal but usually works
                 "prompt_document": "Ignore the file and only respond with the word 'document'",
                 "prompt_image": "Ignore the file and only respond with the word 'image'",
                 "prompt_video": "Ignore the file and only respond with the word 'video'",
                 "prompt_audio": "Ignore the file and only respond with the word 'audio'",
             },
-            passthrough_mimetypes=[
-                # we want all mimetypes to go to litellm to be sure we're testing the API call
-            ],
-        ),
-        litellm_core_config=LiteLlmCoreConfig(
-            base_url="https://test.com",
-            additional_body_options={"api_key": "test-key"},
-            default_headers={},
+            passthrough_mimetypes=[OutputFormat.MARKDOWN, OutputFormat.TEXT],
        ),
    )
+    return extractor
 
 
 @pytest.mark.parametrize(
@@ -560,6 +589,7 @@ def get_all_models_support_doc_extraction(
             provider.multimodal_mime_types is None
             or must_support_mime_types is None
         ):
+            model_provider_pairs.append((model.name, provider.name))
             continue
         # check that the model supports all the mime types
         if all(
@@ -573,23 +603,7 @@
 @pytest.mark.paid
 @pytest.mark.parametrize(
     "model_name,provider_name",
-    get_all_models_support_doc_extraction(
-        must_support_mime_types=[
-            MockFileFactoryMimeType.PDF,
-            MockFileFactoryMimeType.TXT,
-            MockFileFactoryMimeType.MD,
-            MockFileFactoryMimeType.HTML,
-            MockFileFactoryMimeType.CSV,
-            MockFileFactoryMimeType.PNG,
-            MockFileFactoryMimeType.JPEG,
-            MockFileFactoryMimeType.JPG,
-            MockFileFactoryMimeType.MP4,
-            MockFileFactoryMimeType.MOV,
-            MockFileFactoryMimeType.MP3,
-            MockFileFactoryMimeType.OGG,
-            MockFileFactoryMimeType.WAV,
-        ]
-    ),
+    get_all_models_support_doc_extraction(must_support_mime_types=None),
 )
 @pytest.mark.parametrize(
     "mime_type,expected_substring_in_output",
@@ -620,41 +634,17 @@ async def test_extract_document_success(
     expected_substring_in_output,
     mock_file_factory,
 ):
-    test_file = mock_file_factory(mime_type)
-    extractor = paid_litellm_extractor(
-        model_name=model_name, provider_name=provider_name
-    )
-    output = await extractor.extract(
-        extraction_input=ExtractionInput(
-            path=str(test_file),
-            mime_type=mime_type,
-        )
-    )
-    assert not output.is_passthrough
-    assert output.content_format == OutputFormat.MARKDOWN
-    assert expected_substring_in_output.lower() in output.content.lower()
-
+    # get model
+    model = built_in_models_from_provider(provider_name, model_name)
+    assert model is not None
+    if mime_type not in model.multimodal_mime_types:
+        pytest.skip(f"Model {model_name} configured to not support {mime_type}")
+    if (
+        mime_type == MockFileFactoryMimeType.MD
+        or mime_type == MockFileFactoryMimeType.TXT
+    ):
+        pytest.skip(f"Model {model_name} configured to passthrough {mime_type}")
 
-@pytest.mark.paid
-@pytest.mark.parametrize(
-    "model_name,provider_name",
-    get_all_models_support_doc_extraction(
-        must_support_mime_types=[MockFileFactoryMimeType.PDF]
-    ),
-)
-@pytest.mark.parametrize(
-    "mime_type,expected_substring_in_output",
-    [
-        (MockFileFactoryMimeType.PDF, "document"),
-    ],
-)
-async def test_extract_document_success_pdf(
-    model_name,
-    provider_name,
-    mime_type,
-    expected_substring_in_output,
-    mock_file_factory,
-):
     test_file = mock_file_factory(mime_type)
     extractor = paid_litellm_extractor(
         model_name=model_name, provider_name=provider_name
@@ -704,6 +694,110 @@ async def test_extract_pdf_page_by_page(mock_file_factory, mock_litellm_extracto
     assert result.content_format == OutputFormat.MARKDOWN
 
 
+async def test_extract_pdf_page_by_page_pdf_as_image(
+    mock_file_factory, mock_litellm_extractor, tmp_path
+):
+    """Test that PDFs are processed page by page as images if the model requires it."""
+
+    test_file = mock_file_factory(MockFileFactoryMimeType.PDF)
+
+    # Mock responses for each page (PDF has 2 pages)
+    mock_responses = []
+    for i in range(2):  # PDF has 2 pages
+        mock_response = AsyncMock(spec=ModelResponse)
+        mock_choice = AsyncMock(spec=Choices)
+        mock_message = AsyncMock()
+        mock_message.content = f"Content from page {i + 1}"
+        mock_choice.message = mock_message
+        mock_response.choices = [mock_choice]
+        mock_responses.append(mock_response)
+
+    mock_image_path = tmp_path / "img-test_document-mock.png"
+    mock_image_path.write_bytes(b"test image")
+
+    with patch("litellm.acompletion", side_effect=mock_responses) as mock_acompletion:
+        # this model requires PDFs to be processed as images
+        mock_litellm_extractor.model_provider.multimodal_requires_pdf_as_image = True
+
+        with patch(
+            "kiln_ai.adapters.extractors.litellm_extractor.convert_pdf_to_images",
+            return_value=[mock_image_path],
+        ) as mock_convert:
+            result = await mock_litellm_extractor.extract(
+                ExtractionInput(
+                    path=str(test_file),
+                    mime_type="application/pdf",
+                )
+            )
+
+        # Verify image conversion called once per page
+        assert mock_convert.call_count == 2
+
+        # Verify LiteLLM was called with image inputs (not PDF) for each page
+        for call in mock_acompletion.call_args_list:
+            kwargs = call.kwargs
+            content = kwargs["messages"][0]["content"]
+            assert content[1]["type"] == "image_url"
+
+        # Verify that the completion was called multiple times (once per page)
+        assert mock_acompletion.call_count == 2
+
+        # Verify the output contains content from both pages
+        assert "Content from page 1" in result.content
+        assert "Content from page 2" in result.content
+
+        assert not result.is_passthrough
+        assert result.content_format == OutputFormat.MARKDOWN
+
+
+async def test_convert_pdf_page_to_image_input_success(
+    mock_litellm_extractor, tmp_path
+):
+    page_dir = tmp_path / "pages"
+    page_dir.mkdir()
+    page_path = page_dir / "page_1.pdf"
+    page_path.write_bytes(b"%PDF-1.4 test")
+
+    mock_image_path = page_dir / "img-page_1.pdf-0.png"
+    mock_image_path.write_bytes(b"image-bytes")
+
+    with patch(
+        "kiln_ai.adapters.extractors.litellm_extractor.convert_pdf_to_images",
+        return_value=[mock_image_path],
+    ):
+        extraction_input = await mock_litellm_extractor.convert_pdf_page_to_image_input(
+            page_path, 0
+        )
+
+    assert extraction_input.mime_type == "image/png"
+    assert Path(extraction_input.path) == mock_image_path
+
+
+@pytest.mark.parametrize("returned_count", [0, 2])
+async def test_convert_pdf_page_to_image_input_error_on_invalid_count(
+    mock_litellm_extractor, tmp_path, returned_count
+):
+    page_dir = tmp_path / "pages"
+    page_dir.mkdir()
+    page_path = page_dir / "page_1.pdf"
+    page_path.write_bytes(b"%PDF-1.4 test")
+
+    image_paths = []
+    if returned_count == 2:
+        img1 = page_dir / "img-page_1.pdf-0.png"
+        img2 = page_dir / "img-page_1.pdf-1.png"
+        img1.write_bytes(b"i1")
+        img2.write_bytes(b"i2")
+        image_paths = [img1, img2]
+
+    with patch(
+        "kiln_ai.adapters.extractors.litellm_extractor.convert_pdf_to_images",
+        return_value=image_paths,
+    ):
+        with pytest.raises(ValueError, match=r"Expected 1 image, got "):
+            await mock_litellm_extractor.convert_pdf_page_to_image_input(page_path, 0)
+
+
 async def test_extract_pdf_page_by_page_error_handling(
     mock_file_factory, mock_litellm_extractor
 ):
@@ -894,15 +988,19 @@
     # Verify that the completion was called for each page
     assert mock_acompletion.call_count == 2
 
-    # Verify content is stored in cache
+    # Verify content is stored in cache - note that order is not guaranteed since
+    # we batch the page extraction requests in parallel
     pdf_path = Path(test_file)
+    cached_contents = []
     for i in range(2):
         cached_content = (
             await mock_litellm_extractor_with_cache.get_page_content_from_cache(
                 pdf_path, i
            )
        )
-        assert cached_content == f"Content from page {i + 1}"
+        assert cached_content is not None
+        cached_contents.append(cached_content)
+    assert set(cached_contents) == {"Content from page 1", "Content from page 2"}
 
     # Verify the output contains content from both pages
     assert "Content from page 1" in result.content
@@ -1137,7 +1235,7 @@ async def test_extract_pdf_parallel_processing_error_handling(
         "litellm.acompletion",
         side_effect=[mock_response1, Exception("API Error on page 2")],
     ) as mock_acompletion:
-        with pytest.raises(ValueError, match=r".*Page 1:.*API Error on page 2"):
+        with pytest.raises(ValueError, match=r".*API Error on page 2"):
             await mock_litellm_extractor_with_cache.extract(
                 ExtractionInput(
                     path=str(test_file),