kiln-ai 0.20.1__py3-none-any.whl → 0.22.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kiln-ai might be problematic. Click here for more details.

Files changed (133) hide show
  1. kiln_ai/adapters/__init__.py +6 -0
  2. kiln_ai/adapters/adapter_registry.py +43 -226
  3. kiln_ai/adapters/chunkers/__init__.py +13 -0
  4. kiln_ai/adapters/chunkers/base_chunker.py +42 -0
  5. kiln_ai/adapters/chunkers/chunker_registry.py +16 -0
  6. kiln_ai/adapters/chunkers/fixed_window_chunker.py +39 -0
  7. kiln_ai/adapters/chunkers/helpers.py +23 -0
  8. kiln_ai/adapters/chunkers/test_base_chunker.py +63 -0
  9. kiln_ai/adapters/chunkers/test_chunker_registry.py +28 -0
  10. kiln_ai/adapters/chunkers/test_fixed_window_chunker.py +346 -0
  11. kiln_ai/adapters/chunkers/test_helpers.py +75 -0
  12. kiln_ai/adapters/data_gen/test_data_gen_task.py +9 -3
  13. kiln_ai/adapters/embedding/__init__.py +0 -0
  14. kiln_ai/adapters/embedding/base_embedding_adapter.py +44 -0
  15. kiln_ai/adapters/embedding/embedding_registry.py +32 -0
  16. kiln_ai/adapters/embedding/litellm_embedding_adapter.py +199 -0
  17. kiln_ai/adapters/embedding/test_base_embedding_adapter.py +283 -0
  18. kiln_ai/adapters/embedding/test_embedding_registry.py +166 -0
  19. kiln_ai/adapters/embedding/test_litellm_embedding_adapter.py +1149 -0
  20. kiln_ai/adapters/eval/eval_runner.py +6 -2
  21. kiln_ai/adapters/eval/test_base_eval.py +1 -3
  22. kiln_ai/adapters/eval/test_g_eval.py +1 -1
  23. kiln_ai/adapters/extractors/__init__.py +18 -0
  24. kiln_ai/adapters/extractors/base_extractor.py +72 -0
  25. kiln_ai/adapters/extractors/encoding.py +20 -0
  26. kiln_ai/adapters/extractors/extractor_registry.py +44 -0
  27. kiln_ai/adapters/extractors/extractor_runner.py +112 -0
  28. kiln_ai/adapters/extractors/litellm_extractor.py +406 -0
  29. kiln_ai/adapters/extractors/test_base_extractor.py +244 -0
  30. kiln_ai/adapters/extractors/test_encoding.py +54 -0
  31. kiln_ai/adapters/extractors/test_extractor_registry.py +181 -0
  32. kiln_ai/adapters/extractors/test_extractor_runner.py +181 -0
  33. kiln_ai/adapters/extractors/test_litellm_extractor.py +1290 -0
  34. kiln_ai/adapters/fine_tune/test_dataset_formatter.py +2 -2
  35. kiln_ai/adapters/fine_tune/test_fireworks_tinetune.py +2 -6
  36. kiln_ai/adapters/fine_tune/test_together_finetune.py +2 -6
  37. kiln_ai/adapters/ml_embedding_model_list.py +494 -0
  38. kiln_ai/adapters/ml_model_list.py +876 -18
  39. kiln_ai/adapters/model_adapters/litellm_adapter.py +40 -75
  40. kiln_ai/adapters/model_adapters/test_litellm_adapter.py +79 -1
  41. kiln_ai/adapters/model_adapters/test_litellm_adapter_tools.py +119 -5
  42. kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +9 -3
  43. kiln_ai/adapters/model_adapters/test_structured_output.py +9 -10
  44. kiln_ai/adapters/ollama_tools.py +69 -12
  45. kiln_ai/adapters/provider_tools.py +190 -46
  46. kiln_ai/adapters/rag/deduplication.py +49 -0
  47. kiln_ai/adapters/rag/progress.py +252 -0
  48. kiln_ai/adapters/rag/rag_runners.py +844 -0
  49. kiln_ai/adapters/rag/test_deduplication.py +195 -0
  50. kiln_ai/adapters/rag/test_progress.py +785 -0
  51. kiln_ai/adapters/rag/test_rag_runners.py +2376 -0
  52. kiln_ai/adapters/remote_config.py +80 -8
  53. kiln_ai/adapters/test_adapter_registry.py +579 -86
  54. kiln_ai/adapters/test_ml_embedding_model_list.py +239 -0
  55. kiln_ai/adapters/test_ml_model_list.py +202 -0
  56. kiln_ai/adapters/test_ollama_tools.py +340 -1
  57. kiln_ai/adapters/test_prompt_builders.py +1 -1
  58. kiln_ai/adapters/test_provider_tools.py +199 -8
  59. kiln_ai/adapters/test_remote_config.py +551 -56
  60. kiln_ai/adapters/vector_store/__init__.py +1 -0
  61. kiln_ai/adapters/vector_store/base_vector_store_adapter.py +83 -0
  62. kiln_ai/adapters/vector_store/lancedb_adapter.py +389 -0
  63. kiln_ai/adapters/vector_store/test_base_vector_store.py +160 -0
  64. kiln_ai/adapters/vector_store/test_lancedb_adapter.py +1841 -0
  65. kiln_ai/adapters/vector_store/test_vector_store_registry.py +199 -0
  66. kiln_ai/adapters/vector_store/vector_store_registry.py +33 -0
  67. kiln_ai/datamodel/__init__.py +16 -13
  68. kiln_ai/datamodel/basemodel.py +201 -4
  69. kiln_ai/datamodel/chunk.py +158 -0
  70. kiln_ai/datamodel/datamodel_enums.py +27 -0
  71. kiln_ai/datamodel/embedding.py +64 -0
  72. kiln_ai/datamodel/external_tool_server.py +206 -54
  73. kiln_ai/datamodel/extraction.py +317 -0
  74. kiln_ai/datamodel/project.py +33 -1
  75. kiln_ai/datamodel/rag.py +79 -0
  76. kiln_ai/datamodel/task.py +5 -0
  77. kiln_ai/datamodel/task_output.py +41 -11
  78. kiln_ai/datamodel/test_attachment.py +649 -0
  79. kiln_ai/datamodel/test_basemodel.py +270 -14
  80. kiln_ai/datamodel/test_chunk_models.py +317 -0
  81. kiln_ai/datamodel/test_dataset_split.py +1 -1
  82. kiln_ai/datamodel/test_datasource.py +50 -0
  83. kiln_ai/datamodel/test_embedding_models.py +448 -0
  84. kiln_ai/datamodel/test_eval_model.py +6 -6
  85. kiln_ai/datamodel/test_external_tool_server.py +534 -152
  86. kiln_ai/datamodel/test_extraction_chunk.py +206 -0
  87. kiln_ai/datamodel/test_extraction_model.py +501 -0
  88. kiln_ai/datamodel/test_rag.py +641 -0
  89. kiln_ai/datamodel/test_task.py +35 -1
  90. kiln_ai/datamodel/test_tool_id.py +187 -1
  91. kiln_ai/datamodel/test_vector_store.py +320 -0
  92. kiln_ai/datamodel/tool_id.py +58 -0
  93. kiln_ai/datamodel/vector_store.py +141 -0
  94. kiln_ai/tools/base_tool.py +12 -3
  95. kiln_ai/tools/built_in_tools/math_tools.py +12 -4
  96. kiln_ai/tools/kiln_task_tool.py +158 -0
  97. kiln_ai/tools/mcp_server_tool.py +2 -2
  98. kiln_ai/tools/mcp_session_manager.py +51 -22
  99. kiln_ai/tools/rag_tools.py +164 -0
  100. kiln_ai/tools/test_kiln_task_tool.py +527 -0
  101. kiln_ai/tools/test_mcp_server_tool.py +4 -15
  102. kiln_ai/tools/test_mcp_session_manager.py +187 -227
  103. kiln_ai/tools/test_rag_tools.py +929 -0
  104. kiln_ai/tools/test_tool_registry.py +290 -7
  105. kiln_ai/tools/tool_registry.py +69 -16
  106. kiln_ai/utils/__init__.py +3 -0
  107. kiln_ai/utils/async_job_runner.py +62 -17
  108. kiln_ai/utils/config.py +2 -2
  109. kiln_ai/utils/env.py +15 -0
  110. kiln_ai/utils/filesystem.py +14 -0
  111. kiln_ai/utils/filesystem_cache.py +60 -0
  112. kiln_ai/utils/litellm.py +94 -0
  113. kiln_ai/utils/lock.py +100 -0
  114. kiln_ai/utils/mime_type.py +38 -0
  115. kiln_ai/utils/open_ai_types.py +19 -2
  116. kiln_ai/utils/pdf_utils.py +59 -0
  117. kiln_ai/utils/test_async_job_runner.py +151 -35
  118. kiln_ai/utils/test_env.py +142 -0
  119. kiln_ai/utils/test_filesystem_cache.py +316 -0
  120. kiln_ai/utils/test_litellm.py +206 -0
  121. kiln_ai/utils/test_lock.py +185 -0
  122. kiln_ai/utils/test_mime_type.py +66 -0
  123. kiln_ai/utils/test_open_ai_types.py +88 -12
  124. kiln_ai/utils/test_pdf_utils.py +86 -0
  125. kiln_ai/utils/test_uuid.py +111 -0
  126. kiln_ai/utils/test_validation.py +524 -0
  127. kiln_ai/utils/uuid.py +9 -0
  128. kiln_ai/utils/validation.py +90 -0
  129. {kiln_ai-0.20.1.dist-info → kiln_ai-0.22.0.dist-info}/METADATA +9 -1
  130. kiln_ai-0.22.0.dist-info/RECORD +213 -0
  131. kiln_ai-0.20.1.dist-info/RECORD +0 -138
  132. {kiln_ai-0.20.1.dist-info → kiln_ai-0.22.0.dist-info}/WHEEL +0 -0
  133. {kiln_ai-0.20.1.dist-info → kiln_ai-0.22.0.dist-info}/licenses/LICENSE.txt +0 -0
kiln_ai/utils/filesystem.py ADDED
@@ -0,0 +1,14 @@
1
+ import os
2
+ import subprocess
3
+ import sys
4
+ from pathlib import Path
5
+
6
+
7
def open_folder(path: str | Path) -> None:
    """Open the directory containing ``path`` in the platform's file browser.

    The directory is ``os.path.dirname(path)``, i.e. the parent of the final
    path component. macOS uses ``open``, Windows uses ``os.startfile`` and
    every other platform uses ``xdg-open``.

    Args:
        path: A path whose parent directory should be shown.

    Raises:
        subprocess.CalledProcessError: If ``open`` / ``xdg-open`` exits with a
            non-zero status.
    """
    # A bare file name has an empty dirname, which means "current directory";
    # passing "" to the opener would just fail.
    folder = os.path.dirname(path) or os.curdir
    if sys.platform.startswith("darwin"):
        subprocess.run(["open", folder], check=True)
    elif sys.platform.startswith("win"):
        os.startfile(folder)  # type: ignore[attr-defined]
    else:
        subprocess.run(["xdg-open", folder], check=True)
kiln_ai/utils/filesystem_cache.py ADDED
@@ -0,0 +1,60 @@
1
+ import logging
2
+ import tempfile
3
+ from pathlib import Path
4
+
5
+ import anyio
6
+
7
+ from kiln_ai.datamodel.basemodel import name_validator
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
class FilesystemCache:
    """Stores byte values as files in one directory, one file per key."""

    def __init__(self, path: Path):
        # Directory that holds the cache files; it is used as given.
        self.cache_dir_path = path

    def validate_key(self, key: str) -> None:
        """Check that ``key`` is acceptable as a cache file name.

        Delegates to ``name_validator`` (1 to 120 characters), which throws if
        the key is invalid.
        """
        name_validator(min_length=1, max_length=120)(key)

    def get_path(self, key: str) -> Path:
        """Return the file path for ``key``; raises if the key is invalid."""
        self.validate_key(key)
        return self.cache_dir_path / key

    async def get(self, key: str) -> bytes | None:
        """Return the cached bytes for ``key``, or ``None`` when unavailable.

        An invalid key still raises (via ``get_path``). A missing file is a
        plain cache miss. Any other read failure is logged and reported as a
        miss, so internal cache corruption never propagates to the caller.
        """
        path = self.get_path(key)
        try:
            # Read directly instead of checking exists() first: that avoids a
            # blocking stat call and the window between the check and the read.
            return await anyio.Path(path).read_bytes()
        except FileNotFoundError:
            return None
        except Exception:
            logger.error(f"Error reading file {path}", exc_info=True)
            return None

    async def set(self, key: str, value: bytes) -> Path:
        """Write ``value`` under ``key`` and return the path of the file.

        Raises if the key is invalid or the file cannot be written.
        """
        # get_path validates the key, so resolve (and validate) exactly once.
        path = self.get_path(key)
        logger.debug(f"Caching {key} at {path}")
        await anyio.Path(path).write_bytes(value)
        return path
43
+
44
+
45
class TemporaryFilesystemCache:
    """Owns a FilesystemCache rooted in a freshly created temp directory.

    ``shared()`` lazily builds one instance per class and hands out its
    underlying FilesystemCache.
    """

    _shared_instance = None

    def __init__(self):
        temp_dir = tempfile.mkdtemp(prefix="kiln_cache_")
        self._cache_temp_dir = temp_dir
        self.filesystem_cache = FilesystemCache(path=Path(temp_dir))

        logger.debug(
            f"Created temporary filesystem cache directory: {self._cache_temp_dir}"
        )

    @classmethod
    def shared(cls) -> FilesystemCache:
        """Return the FilesystemCache of the lazily created shared instance."""
        instance = cls._shared_instance
        if instance is None:
            instance = cls._shared_instance = cls()
        return instance.filesystem_cache
kiln_ai/utils/litellm.py ADDED
@@ -0,0 +1,94 @@
1
+ from dataclasses import dataclass
2
+
3
+ from kiln_ai.adapters.ml_embedding_model_list import KilnEmbeddingModelProvider
4
+ from kiln_ai.adapters.ml_model_list import KilnModelProvider
5
+ from kiln_ai.datamodel.datamodel_enums import ModelProviderName
6
+ from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error
7
+
8
+
9
@dataclass
class LitellmProviderInfo:
    """Describes how a model provider is addressed through litellm.

    Attributes:
        provider_name: The name of the provider, as it appears in litellm.
        is_custom: Whether the provider is custom - e.g. custom models,
            ollama, fine tunes, and custom registry models.
        litellm_model_id: The model ID slug to use in litellm.
    """

    provider_name: str
    is_custom: bool
    litellm_model_id: str
17
+
18
+
19
+ def get_litellm_provider_info(
20
+ model_provider: KilnEmbeddingModelProvider | KilnModelProvider,
21
+ ) -> LitellmProviderInfo:
22
+ """
23
+ Maps a Kiln model provider to a litellm provider.
24
+
25
+ Args:
26
+ model_provider: The model provider to get litellm provider info for
27
+
28
+ Returns:
29
+ LitellmProviderInfo containing the provider name and whether it's custom
30
+ """
31
+ if not model_provider.model_id:
32
+ raise ValueError("Model ID is required for OpenAI compatible models")
33
+
34
+ litellm_provider_name: str | None = None
35
+ is_custom = False
36
+ match model_provider.name:
37
+ case ModelProviderName.openrouter:
38
+ litellm_provider_name = "openrouter"
39
+ case ModelProviderName.openai:
40
+ litellm_provider_name = "openai"
41
+ case ModelProviderName.groq:
42
+ litellm_provider_name = "groq"
43
+ case ModelProviderName.anthropic:
44
+ litellm_provider_name = "anthropic"
45
+ case ModelProviderName.ollama:
46
+ # We don't let litellm use the Ollama API and muck with our requests. We use Ollama's OpenAI compatible API.
47
+ # This is because we're setting detailed features like response_format=json_schema and want lower level control.
48
+ is_custom = True
49
+ case ModelProviderName.docker_model_runner:
50
+ # Docker Model Runner uses OpenAI-compatible API, similar to Ollama
51
+ # We want direct control over the requests for features like response_format=json_schema
52
+ is_custom = True
53
+ case ModelProviderName.gemini_api:
54
+ litellm_provider_name = "gemini"
55
+ case ModelProviderName.fireworks_ai:
56
+ litellm_provider_name = "fireworks_ai"
57
+ case ModelProviderName.amazon_bedrock:
58
+ litellm_provider_name = "bedrock"
59
+ case ModelProviderName.azure_openai:
60
+ litellm_provider_name = "azure"
61
+ case ModelProviderName.huggingface:
62
+ litellm_provider_name = "huggingface"
63
+ case ModelProviderName.vertex:
64
+ litellm_provider_name = "vertex_ai"
65
+ case ModelProviderName.together_ai:
66
+ litellm_provider_name = "together_ai"
67
+ case ModelProviderName.cerebras:
68
+ litellm_provider_name = "cerebras"
69
+ case ModelProviderName.siliconflow_cn:
70
+ is_custom = True
71
+ case ModelProviderName.openai_compatible:
72
+ is_custom = True
73
+ case ModelProviderName.kiln_custom_registry:
74
+ is_custom = True
75
+ case ModelProviderName.kiln_fine_tune:
76
+ is_custom = True
77
+ case _:
78
+ raise_exhaustive_enum_error(model_provider.name)
79
+
80
+ if is_custom:
81
+ # Use openai as it's only used for format, not url
82
+ litellm_provider_name = "openai"
83
+
84
+ # Shouldn't be possible but keep type checker happy
85
+ if litellm_provider_name is None:
86
+ raise ValueError(
87
+ f"Provider name could not lookup valid litellm provider ID {model_provider.model_id}"
88
+ )
89
+
90
+ return LitellmProviderInfo(
91
+ provider_name=litellm_provider_name,
92
+ is_custom=is_custom,
93
+ litellm_model_id=f"{litellm_provider_name}/{model_provider.model_id}",
94
+ )
kiln_ai/utils/lock.py ADDED
@@ -0,0 +1,100 @@
1
+ import asyncio
2
+ from contextlib import asynccontextmanager
3
+ from dataclasses import dataclass, field
4
+ from typing import Dict, Hashable
5
+
6
+
7
+ @dataclass
8
+ class _Entry:
9
+ lock: asyncio.Lock = field(default_factory=asyncio.Lock)
10
+ waiters: int = 0 # tasks waiting to acquire
11
+ holders: int = 0 # 0 or 1 for a mutex
12
+
13
+
14
+ class AsyncLockManager:
15
+ """
16
+ A per-key asyncio lock manager that automatically cleans up locks when they're no longer needed.
17
+
18
+ Usage:
19
+ locks = AsyncLockManager()
20
+
21
+ async with locks.acquire("user:123"):
22
+ # critical section for "user:123"
23
+ ...
24
+
25
+ The manager removes a key when there are no holders and no waiters.
26
+ """
27
+
28
+ def __init__(self) -> None:
29
+ # Protects the _locks dict and bookkeeping counters.
30
+ self._mu = asyncio.Lock()
31
+ self._locks: Dict[Hashable, _Entry] = {}
32
+
33
+ @asynccontextmanager
34
+ async def acquire(self, key: Hashable, *, timeout: float | None = None):
35
+ """
36
+ Acquire the lock for `key` as an async context manager.
37
+
38
+ - `timeout`: optional seconds to wait; raises TimeoutError on expiry.
39
+ """
40
+ # Phase 1: register as a waiter and get/create the entry (under manager mutex).
41
+ async with self._mu:
42
+ entry = self._locks.get(key)
43
+ if entry is None:
44
+ entry = self._locks[key] = _Entry()
45
+ entry.waiters += 1
46
+
47
+ # Phase 2: wait on the per-key lock (outside manager mutex).
48
+ try:
49
+ if timeout is None:
50
+ await entry.lock.acquire()
51
+ else:
52
+ # Manual timeout to keep compatibility across Python versions.
53
+ await asyncio.wait_for(entry.lock.acquire(), timeout=timeout)
54
+
55
+ # Phase 3: update counters: became a holder.
56
+ async with self._mu:
57
+ entry.waiters -= 1
58
+ entry.holders += 1
59
+
60
+ try:
61
+ yield # critical section
62
+ finally:
63
+ # Phase 4: release holder and maybe cleanup.
64
+ entry.lock.release()
65
+ async with self._mu:
66
+ entry.holders -= 1
67
+ # Remove the entry if fully idle.
68
+ if entry.waiters == 0 and entry.holders == 0:
69
+ # Double-check we still point to same object (paranoia/race safety).
70
+ if self._locks.get(key) is entry:
71
+ del self._locks[key]
72
+
73
+ except asyncio.TimeoutError:
74
+ # Timed out while waiting; undo waiter count and maybe cleanup.
75
+ async with self._mu:
76
+ entry.waiters -= 1
77
+ if entry.waiters == 0 and entry.holders == 0:
78
+ if self._locks.get(key) is entry:
79
+ del self._locks[key]
80
+ raise
81
+ except asyncio.CancelledError:
82
+ # Cancelled while waiting; same cleanup as timeout.
83
+ async with self._mu:
84
+ entry.waiters -= 1
85
+ if entry.waiters == 0 and entry.holders == 0:
86
+ if self._locks.get(key) is entry:
87
+ del self._locks[key]
88
+ raise
89
+
90
+ # Optional: expose a snapshot for metrics/debugging
91
+ async def snapshot(self) -> Dict[Hashable, dict]:
92
+ async with self._mu:
93
+ return {
94
+ k: {"waiters": e.waiters, "holders": e.holders}
95
+ for k, e in self._locks.items()
96
+ }
97
+
98
+
99
+ # callers should use this global instance instead of creating their own
100
+ shared_async_lock_manager = AsyncLockManager()
kiln_ai/utils/mime_type.py ADDED
@@ -0,0 +1,38 @@
1
+ import mimetypes
2
+
3
+
4
+ def guess_mime_type(filename: str) -> str | None:
5
+ filename_normalized = filename.lower()
6
+
7
+ # we override the mimetypes.guess_type for some common cases
8
+ # because it does not handle them correctly
9
+ if filename_normalized.endswith(".mov"):
10
+ return "video/quicktime"
11
+ elif filename_normalized.endswith(".mp3"):
12
+ return "audio/mpeg"
13
+ elif filename_normalized.endswith(".wav"):
14
+ return "audio/wav"
15
+ elif filename_normalized.endswith(".mp4"):
16
+ return "video/mp4"
17
+
18
+ mime_type, _ = mimetypes.guess_type(filename_normalized)
19
+ return mime_type
20
+
21
+
22
+ def guess_extension(mime_type: str) -> str | None:
23
+ mapping = {
24
+ "application/pdf": ".pdf",
25
+ "image/png": ".png",
26
+ "video/mp4": ".mp4",
27
+ "audio/ogg": ".ogg",
28
+ "text/markdown": ".md",
29
+ "text/plain": ".txt",
30
+ "text/html": ".html",
31
+ "text/csv": ".csv",
32
+ "image/jpeg": ".jpeg",
33
+ "image/jpg": ".jpeg",
34
+ "audio/mpeg": ".mp3",
35
+ "audio/wav": ".wav",
36
+ "video/quicktime": ".mov",
37
+ }
38
+ return mapping.get(mime_type)
kiln_ai/utils/open_ai_types.py CHANGED
@@ -17,11 +17,11 @@ from typing import (
17
17
  )
18
18
 
19
19
  from openai.types.chat import (
20
+ ChatCompletionContentPartTextParam,
20
21
  ChatCompletionDeveloperMessageParam,
21
22
  ChatCompletionFunctionMessageParam,
22
23
  ChatCompletionMessageToolCallParam,
23
24
  ChatCompletionSystemMessageParam,
24
- ChatCompletionToolMessageParam,
25
25
  ChatCompletionUserMessageParam,
26
26
  )
27
27
  from openai.types.chat.chat_completion_assistant_message_param import (
@@ -84,11 +84,28 @@ class ChatCompletionAssistantMessageParamWrapper(TypedDict, total=False):
84
84
  """The tool calls generated by the model, such as function calls."""
85
85
 
86
86
 
87
class ChatCompletionToolMessageParamWrapper(TypedDict, total=False):
    """Tool-role chat message with an extra, optional ``kiln_task_tool_data`` key.

    ``content``, ``role`` and ``tool_call_id`` are required; because the class
    is declared with ``total=False``, ``kiln_task_tool_data`` may be omitted.
    """

    content: Required[Union[str, Iterable[ChatCompletionContentPartTextParam]]]
    """The contents of the tool message."""

    role: Required[Literal["tool"]]
    """The role of the messages author, in this case `tool`."""

    tool_call_id: Required[str]
    """Tool call that this message is responding to."""

    kiln_task_tool_data: Optional[str]
    """The data for the Kiln task tool that this message is responding to.

    Formatted as `<project_id>:::<tool_id>:::<task_id>:::<run_id>`
    """
102
+
103
+
87
104
  ChatCompletionMessageParam: TypeAlias = Union[
88
105
  ChatCompletionDeveloperMessageParam,
89
106
  ChatCompletionSystemMessageParam,
90
107
  ChatCompletionUserMessageParam,
91
108
  ChatCompletionAssistantMessageParamWrapper,
92
- ChatCompletionToolMessageParam,
109
+ ChatCompletionToolMessageParamWrapper,
93
110
  ChatCompletionFunctionMessageParam,
94
111
  ]
kiln_ai/utils/pdf_utils.py ADDED
@@ -0,0 +1,59 @@
1
+ """
2
+ Utilities for working with PDF files.
3
+ """
4
+
5
+ import asyncio
6
+ import tempfile
7
+ from contextlib import asynccontextmanager
8
+ from pathlib import Path
9
+ from typing import AsyncGenerator
10
+
11
+ import pypdfium2
12
+ from pypdf import PdfReader, PdfWriter
13
+
14
+
15
@asynccontextmanager
async def split_pdf_into_pages(pdf_path: Path) -> AsyncGenerator[list[Path], None]:
    """Split a PDF into single-page PDF files inside a temporary directory.

    Used as an async context manager. It yields the page file paths in page
    order, named ``page_1.pdf``, ``page_2.pdf`` and so on. The temporary
    directory, and with it every page file, is removed when the context exits.
    """
    with tempfile.TemporaryDirectory(prefix="kiln_pdf_pages_") as temp_dir:
        pages_dir = Path(temp_dir)
        page_paths = []

        with open(pdf_path, "rb") as source:
            # Reader init can be heavy; offload to thread
            reader = await asyncio.to_thread(PdfReader, source)

            for page_number, page in enumerate(reader.pages, start=1):
                # Give other tasks a chance to run between pages.
                await asyncio.sleep(0)

                # One writer per page, holding just that page.
                single_page = PdfWriter()
                single_page.add_page(page)

                page_path = pages_dir / f"page_{page_number}.pdf"
                with open(page_path, "wb") as target:
                    # Writing/compression can be expensive; offload to thread
                    await asyncio.to_thread(single_page.write, target)

                page_paths.append(page_path)

        yield page_paths
40
+
41
+
42
async def convert_pdf_to_images(pdf_path: Path, output_dir: Path) -> list[Path]:
    """Render every page of a PDF to a PNG file in ``output_dir``.

    Files are named ``img-<pdf file name>-<zero-based page index>.png``. The
    returned list holds the written paths in page order. The PDF document is
    closed even if rendering fails part-way.
    """
    # note: doing this in a thread causes a segfault - but this is slow and blocking
    # so we should try to find a better way
    document = pypdfium2.PdfDocument(pdf_path)
    rendered_paths = []
    try:
        for page_index, page in enumerate(document):
            # Yield to the event loop between pages, since rendering blocks.
            await asyncio.sleep(0)
            # scale=2 is legible for ~A4 pages (research papers, etc.) - lower than this is blurry
            image = page.render(scale=2).to_pil()
            destination = output_dir / f"img-{pdf_path.name}-{page_index}.png"
            image.save(destination)
            rendered_paths.append(destination)
    finally:
        document.close()

    return rendered_paths