kiln-ai 0.21.0__tar.gz → 0.22.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (251)
  1. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/PKG-INFO +3 -1
  2. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/extractors/litellm_extractor.py +52 -32
  3. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/extractors/test_litellm_extractor.py +169 -71
  4. kiln_ai-0.22.0/kiln_ai/adapters/ml_embedding_model_list.py +494 -0
  5. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/ml_model_list.py +503 -23
  6. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/model_adapters/litellm_adapter.py +34 -7
  7. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/model_adapters/test_litellm_adapter.py +78 -0
  8. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/model_adapters/test_litellm_adapter_tools.py +119 -5
  9. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +9 -3
  10. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/model_adapters/test_structured_output.py +6 -9
  11. kiln_ai-0.22.0/kiln_ai/adapters/test_ml_embedding_model_list.py +239 -0
  12. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/test_ml_model_list.py +0 -10
  13. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/basemodel.py +31 -3
  14. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/external_tool_server.py +206 -54
  15. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/extraction.py +14 -0
  16. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/task.py +5 -0
  17. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/task_output.py +41 -11
  18. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/test_attachment.py +3 -3
  19. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/test_basemodel.py +269 -13
  20. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/test_datasource.py +50 -0
  21. kiln_ai-0.22.0/kiln_ai/datamodel/test_external_tool_server.py +1073 -0
  22. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/test_extraction_model.py +31 -0
  23. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/test_task.py +35 -1
  24. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/test_tool_id.py +106 -1
  25. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/tool_id.py +36 -0
  26. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/tools/base_tool.py +12 -3
  27. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/tools/built_in_tools/math_tools.py +12 -4
  28. kiln_ai-0.22.0/kiln_ai/tools/kiln_task_tool.py +158 -0
  29. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/tools/mcp_server_tool.py +2 -2
  30. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/tools/mcp_session_manager.py +50 -24
  31. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/tools/rag_tools.py +12 -5
  32. kiln_ai-0.22.0/kiln_ai/tools/test_kiln_task_tool.py +527 -0
  33. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/tools/test_mcp_server_tool.py +4 -15
  34. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/tools/test_mcp_session_manager.py +186 -226
  35. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/tools/test_rag_tools.py +86 -5
  36. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/tools/test_tool_registry.py +199 -5
  37. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/tools/tool_registry.py +49 -17
  38. kiln_ai-0.22.0/kiln_ai/utils/filesystem.py +14 -0
  39. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/open_ai_types.py +19 -2
  40. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/pdf_utils.py +21 -0
  41. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/test_open_ai_types.py +88 -12
  42. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/test_pdf_utils.py +14 -1
  43. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/pyproject.toml +3 -1
  44. kiln_ai-0.21.0/kiln_ai/adapters/ml_embedding_model_list.py +0 -192
  45. kiln_ai-0.21.0/kiln_ai/adapters/test_ml_embedding_model_list.py +0 -429
  46. kiln_ai-0.21.0/kiln_ai/datamodel/test_external_tool_server.py +0 -691
  47. kiln_ai-0.21.0/kiln_ai/utils/filesystem.py +0 -14
  48. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/.gitignore +0 -0
  49. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/.python-version +0 -0
  50. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/LICENSE.txt +0 -0
  51. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/README.md +0 -0
  52. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/docs/kiln_core_docs/index.html +0 -0
  53. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/docs/kiln_core_docs/kiln_ai/adapters/data_gen/data_gen_task.html +0 -0
  54. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/docs/kiln_core_docs/kiln_ai/adapters/data_gen.html +0 -0
  55. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/docs/kiln_core_docs/kiln_ai/adapters/eval/base_eval.html +0 -0
  56. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/docs/kiln_core_docs/kiln_ai/adapters/eval/eval_runner.html +0 -0
  57. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/docs/kiln_core_docs/kiln_ai/adapters/eval/g_eval.html +0 -0
  58. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/docs/kiln_core_docs/kiln_ai/adapters/eval/registry.html +0 -0
  59. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/docs/kiln_core_docs/kiln_ai/adapters/eval.html +0 -0
  60. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/docs/kiln_core_docs/kiln_ai/adapters/fine_tune/base_finetune.html +0 -0
  61. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/docs/kiln_core_docs/kiln_ai/adapters/fine_tune/dataset_formatter.html +0 -0
  62. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/docs/kiln_core_docs/kiln_ai/adapters/fine_tune/finetune_registry.html +0 -0
  63. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/docs/kiln_core_docs/kiln_ai/adapters/fine_tune/openai_finetune.html +0 -0
  64. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/docs/kiln_core_docs/kiln_ai/adapters/fine_tune.html +0 -0
  65. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/docs/kiln_core_docs/kiln_ai/adapters/ml_model_list.html +0 -0
  66. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/docs/kiln_core_docs/kiln_ai/adapters/model_adapters/base_adapter.html +0 -0
  67. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/docs/kiln_core_docs/kiln_ai/adapters/model_adapters/litellm_adapter.html +0 -0
  68. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/docs/kiln_core_docs/kiln_ai/adapters/model_adapters.html +0 -0
  69. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/docs/kiln_core_docs/kiln_ai/adapters/prompt_builders.html +0 -0
  70. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/docs/kiln_core_docs/kiln_ai/adapters/repair/repair_task.html +0 -0
  71. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/docs/kiln_core_docs/kiln_ai/adapters/repair.html +0 -0
  72. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/docs/kiln_core_docs/kiln_ai/adapters.html +0 -0
  73. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/docs/kiln_core_docs/kiln_ai/datamodel/dataset_split.html +0 -0
  74. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/docs/kiln_core_docs/kiln_ai/datamodel/eval.html +0 -0
  75. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/docs/kiln_core_docs/kiln_ai/datamodel/strict_mode.html +0 -0
  76. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/docs/kiln_core_docs/kiln_ai/datamodel.html +0 -0
  77. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/docs/kiln_core_docs/kiln_ai/utils/config.html +0 -0
  78. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/docs/kiln_core_docs/kiln_ai/utils/formatting.html +0 -0
  79. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/docs/kiln_core_docs/kiln_ai/utils.html +0 -0
  80. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/docs/kiln_core_docs/kiln_ai.html +0 -0
  81. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/docs/kiln_core_docs/search.js +0 -0
  82. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/__init__.py +0 -0
  83. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/__init__.py +0 -0
  84. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/adapter_registry.py +0 -0
  85. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/chat/__init__.py +0 -0
  86. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/chat/chat_formatter.py +0 -0
  87. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/chat/test_chat_formatter.py +0 -0
  88. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/chunkers/__init__.py +0 -0
  89. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/chunkers/base_chunker.py +0 -0
  90. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/chunkers/chunker_registry.py +0 -0
  91. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/chunkers/fixed_window_chunker.py +0 -0
  92. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/chunkers/helpers.py +0 -0
  93. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/chunkers/test_base_chunker.py +0 -0
  94. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/chunkers/test_chunker_registry.py +0 -0
  95. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/chunkers/test_fixed_window_chunker.py +0 -0
  96. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/chunkers/test_helpers.py +0 -0
  97. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/data_gen/__init__.py +0 -0
  98. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/data_gen/data_gen_prompts.py +0 -0
  99. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/data_gen/data_gen_task.py +0 -0
  100. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/data_gen/test_data_gen_task.py +0 -0
  101. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/docker_model_runner_tools.py +0 -0
  102. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/embedding/__init__.py +0 -0
  103. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/embedding/base_embedding_adapter.py +0 -0
  104. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/embedding/embedding_registry.py +0 -0
  105. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/embedding/litellm_embedding_adapter.py +0 -0
  106. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/embedding/test_base_embedding_adapter.py +0 -0
  107. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/embedding/test_embedding_registry.py +0 -0
  108. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/embedding/test_litellm_embedding_adapter.py +0 -0
  109. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/eval/__init__.py +0 -0
  110. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/eval/base_eval.py +0 -0
  111. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/eval/eval_runner.py +0 -0
  112. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/eval/g_eval.py +0 -0
  113. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/eval/registry.py +0 -0
  114. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/eval/test_base_eval.py +0 -0
  115. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/eval/test_eval_runner.py +0 -0
  116. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/eval/test_g_eval.py +0 -0
  117. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/eval/test_g_eval_data.py +0 -0
  118. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/extractors/__init__.py +0 -0
  119. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/extractors/base_extractor.py +0 -0
  120. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/extractors/encoding.py +0 -0
  121. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/extractors/extractor_registry.py +0 -0
  122. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/extractors/extractor_runner.py +0 -0
  123. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/extractors/test_base_extractor.py +0 -0
  124. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/extractors/test_encoding.py +0 -0
  125. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/extractors/test_extractor_registry.py +0 -0
  126. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/extractors/test_extractor_runner.py +0 -0
  127. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/fine_tune/__init__.py +0 -0
  128. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/fine_tune/base_finetune.py +0 -0
  129. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/fine_tune/dataset_formatter.py +0 -0
  130. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/fine_tune/finetune_registry.py +0 -0
  131. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/fine_tune/fireworks_finetune.py +0 -0
  132. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/fine_tune/openai_finetune.py +0 -0
  133. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/fine_tune/test_base_finetune.py +0 -0
  134. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/fine_tune/test_dataset_formatter.py +0 -0
  135. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/fine_tune/test_fireworks_tinetune.py +0 -0
  136. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/fine_tune/test_openai_finetune.py +0 -0
  137. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/fine_tune/test_together_finetune.py +0 -0
  138. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/fine_tune/test_vertex_finetune.py +0 -0
  139. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/fine_tune/together_finetune.py +0 -0
  140. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/fine_tune/vertex_finetune.py +0 -0
  141. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/model_adapters/__init__.py +0 -0
  142. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/model_adapters/base_adapter.py +0 -0
  143. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/model_adapters/litellm_config.py +0 -0
  144. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/model_adapters/test_base_adapter.py +0 -0
  145. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/ollama_tools.py +0 -0
  146. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/parsers/__init__.py +0 -0
  147. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/parsers/base_parser.py +0 -0
  148. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/parsers/json_parser.py +0 -0
  149. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/parsers/parser_registry.py +0 -0
  150. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/parsers/r1_parser.py +0 -0
  151. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/parsers/request_formatters.py +0 -0
  152. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/parsers/test_json_parser.py +0 -0
  153. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/parsers/test_parser_registry.py +0 -0
  154. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/parsers/test_r1_parser.py +0 -0
  155. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/parsers/test_request_formatters.py +0 -0
  156. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/prompt_builders.py +0 -0
  157. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/provider_tools.py +0 -0
  158. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/rag/deduplication.py +0 -0
  159. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/rag/progress.py +0 -0
  160. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/rag/rag_runners.py +0 -0
  161. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/rag/test_deduplication.py +0 -0
  162. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/rag/test_progress.py +0 -0
  163. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/rag/test_rag_runners.py +0 -0
  164. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/remote_config.py +0 -0
  165. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/repair/__init__.py +0 -0
  166. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/repair/repair_task.py +0 -0
  167. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/repair/test_repair_task.py +0 -0
  168. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/run_output.py +0 -0
  169. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/test_adapter_registry.py +0 -0
  170. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/test_docker_model_runner_tools.py +0 -0
  171. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/test_ollama_tools.py +0 -0
  172. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/test_prompt_adaptors.py +0 -0
  173. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/test_prompt_builders.py +0 -0
  174. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/test_provider_tools.py +0 -0
  175. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/test_remote_config.py +0 -0
  176. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/vector_store/__init__.py +0 -0
  177. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/vector_store/base_vector_store_adapter.py +0 -0
  178. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/vector_store/lancedb_adapter.py +0 -0
  179. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/vector_store/test_base_vector_store.py +0 -0
  180. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/vector_store/test_lancedb_adapter.py +0 -0
  181. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/vector_store/test_vector_store_registry.py +0 -0
  182. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/adapters/vector_store/vector_store_registry.py +0 -0
  183. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/__init__.py +0 -0
  184. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/chunk.py +0 -0
  185. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/datamodel_enums.py +0 -0
  186. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/dataset_filters.py +0 -0
  187. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/dataset_split.py +0 -0
  188. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/embedding.py +0 -0
  189. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/eval.py +0 -0
  190. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/finetune.py +0 -0
  191. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/json_schema.py +0 -0
  192. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/model_cache.py +0 -0
  193. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/project.py +0 -0
  194. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/prompt.py +0 -0
  195. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/prompt_id.py +0 -0
  196. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/rag.py +0 -0
  197. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/registry.py +0 -0
  198. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/run_config.py +0 -0
  199. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/strict_mode.py +0 -0
  200. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/task_run.py +0 -0
  201. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/test_chunk_models.py +0 -0
  202. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/test_dataset_filters.py +0 -0
  203. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/test_dataset_split.py +0 -0
  204. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/test_embedding_models.py +0 -0
  205. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/test_eval_model.py +0 -0
  206. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/test_example_models.py +0 -0
  207. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/test_extraction_chunk.py +0 -0
  208. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/test_json_schema.py +0 -0
  209. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/test_model_cache.py +0 -0
  210. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/test_model_perf.py +0 -0
  211. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/test_models.py +0 -0
  212. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/test_nested_save.py +0 -0
  213. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/test_output_rating.py +0 -0
  214. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/test_prompt_id.py +0 -0
  215. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/test_rag.py +0 -0
  216. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/test_registry.py +0 -0
  217. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/test_vector_store.py +0 -0
  218. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/datamodel/vector_store.py +0 -0
  219. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/tools/__init__.py +0 -0
  220. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/tools/built_in_tools/__init__.py +0 -0
  221. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/tools/built_in_tools/test_math_tools.py +0 -0
  222. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/tools/test_base_tools.py +0 -0
  223. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/__init__.py +0 -0
  224. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/async_job_runner.py +0 -0
  225. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/config.py +0 -0
  226. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/dataset_import.py +0 -0
  227. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/env.py +0 -0
  228. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/exhaustive_error.py +0 -0
  229. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/filesystem_cache.py +0 -0
  230. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/formatting.py +0 -0
  231. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/litellm.py +0 -0
  232. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/lock.py +0 -0
  233. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/logging.py +0 -0
  234. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/mime_type.py +0 -0
  235. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/name_generator.py +0 -0
  236. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/project_utils.py +0 -0
  237. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/test_async_job_runner.py +0 -0
  238. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/test_config.py +0 -0
  239. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/test_dataset_import.py +0 -0
  240. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/test_env.py +0 -0
  241. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/test_filesystem_cache.py +0 -0
  242. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/test_litellm.py +0 -0
  243. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/test_lock.py +0 -0
  244. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/test_mime_type.py +0 -0
  245. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/test_name_geneator.py +0 -0
  246. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/test_uuid.py +0 -0
  247. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/test_validation.py +0 -0
  248. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/uuid.py +0 -0
  249. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/kiln_ai/utils/validation.py +0 -0
  250. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/setup.cfg +0 -0
  251. {kiln_ai-0.21.0 → kiln_ai-0.22.0}/uv.lock +0 -0
--- kiln_ai-0.21.0/PKG-INFO
+++ kiln_ai-0.22.0/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: kiln-ai
-Version: 0.21.0
+Version: 0.22.0
 Summary: Kiln AI
 Project-URL: Homepage, https://kiln.tech
 Project-URL: Repository, https://github.com/Kiln-AI/kiln
@@ -28,8 +28,10 @@ Requires-Dist: llama-index-vector-stores-lancedb>=0.3.3
 Requires-Dist: llama-index>=0.13.3
 Requires-Dist: openai>=1.53.0
 Requires-Dist: pdoc>=15.0.0
+Requires-Dist: pillow>=11.1.0
 Requires-Dist: pydantic>=2.9.2
 Requires-Dist: pypdf>=6.0.0
+Requires-Dist: pypdfium2>=4.30.0
 Requires-Dist: pytest-benchmark>=5.1.0
 Requires-Dist: pytest-cov>=6.0.0
 Requires-Dist: pyyaml>=6.0.2
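
The two new runtime dependencies above (pillow and pypdfium2) line up with the convert_pdf_to_images helper imported in the extractor diff below and with the 21 lines added to kiln_ai/utils/pdf_utils.py, which are not shown in this diff. The snippet below is only a rough sketch of what such a helper could look like under the assumption that pypdfium2 renders each page and Pillow writes the PNGs; the name and signature are inferred from the call site convert_pdf_to_images(page_path, page_path.parent) seen later, and everything else is illustrative rather than the package's actual implementation.

# Hypothetical sketch only: the real helper lives in kiln_ai/utils/pdf_utils.py
# and is not part of this diff. Signature inferred from the call site
# convert_pdf_to_images(page_path, page_path.parent).
from pathlib import Path

import pypdfium2 as pdfium  # new dependency in 0.22.0; Pillow backs to_pil()


async def convert_pdf_to_images(pdf_path: Path, output_dir: Path) -> list[Path]:
    # async to match the awaited call site; rendering itself is synchronous here
    image_paths: list[Path] = []
    pdf = pdfium.PdfDocument(str(pdf_path))
    try:
        for index in range(len(pdf)):
            # render() returns a PdfBitmap; to_pil() hands back a Pillow image
            image = pdf[index].render(scale=2.0).to_pil()
            image_path = output_dir / f"{pdf_path.stem}-page-{index}.png"
            image.save(image_path)
            image_paths.append(image_path)
    finally:
        pdf.close()
    return image_paths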
--- kiln_ai-0.21.0/kiln_ai/adapters/extractors/litellm_extractor.py
+++ kiln_ai-0.22.0/kiln_ai/adapters/extractors/litellm_extractor.py
@@ -1,6 +1,7 @@
 import asyncio
 import hashlib
 import logging
+from functools import cached_property
 from pathlib import Path
 from typing import Any, List
 
@@ -13,23 +14,16 @@ from kiln_ai.adapters.extractors.base_extractor import (
     ExtractionOutput,
 )
 from kiln_ai.adapters.extractors.encoding import to_base64_url
-from kiln_ai.adapters.ml_model_list import built_in_models_from_provider
+from kiln_ai.adapters.ml_model_list import (
+    KilnModelProvider,
+    built_in_models_from_provider,
+)
 from kiln_ai.adapters.provider_tools import LiteLlmCoreConfig
 from kiln_ai.datamodel.datamodel_enums import ModelProviderName
 from kiln_ai.datamodel.extraction import ExtractorConfig, ExtractorType, Kind
 from kiln_ai.utils.filesystem_cache import FilesystemCache
 from kiln_ai.utils.litellm import get_litellm_provider_info
-from kiln_ai.utils.pdf_utils import split_pdf_into_pages
-
-
-def max_pdf_page_concurrency_for_model(model_name: str) -> int:
-    # we assume each batch takes ~5s to complete (likely more in practice)
-    # lowest rate limit is 150 RPM for Tier 1 accounts for gemini-2.5-pro
-    if model_name == "gemini/gemini-2.5-pro":
-        return 2
-    # other models support at least 500 RPM for lowest tier accounts
-    return 5
-
+from kiln_ai.utils.pdf_utils import convert_pdf_to_images, split_pdf_into_pages
 
 logger = logging.getLogger(__name__)
 
@@ -74,11 +68,11 @@ def encode_file_litellm_format(path: Path, mime_type: str) -> dict[str, Any]:
         "text/markdown",
         "text/plain",
     ] or any(mime_type.startswith(m) for m in ["video/", "audio/"]):
-        pdf_bytes = path.read_bytes()
+        file_bytes = path.read_bytes()
         return {
            "type": "file",
            "file": {
-                "file_data": to_base64_url(mime_type, pdf_bytes),
+                "file_data": to_base64_url(mime_type, file_bytes),
            },
        }
 
@@ -101,6 +95,7 @@ class LitellmExtractor(BaseExtractor):
         extractor_config: ExtractorConfig,
         litellm_core_config: LiteLlmCoreConfig,
         filesystem_cache: FilesystemCache | None = None,
+        default_max_parallel_requests: int = 5,
     ):
         if extractor_config.extractor_type != ExtractorType.LITELLM:
             raise ValueError(
@@ -133,6 +128,7 @@
         }
 
         self.litellm_core_config = litellm_core_config
+        self.default_max_parallel_requests = default_max_parallel_requests
 
     def pdf_page_cache_key(self, pdf_path: Path, page_number: int) -> str:
         """
@@ -171,13 +167,35 @@
             logger.debug(f"Cache miss for page {page_number} of {pdf_path}")
             return None
 
+    async def convert_pdf_page_to_image_input(
+        self, page_path: Path, page_number: int
+    ) -> ExtractionInput:
+        image_paths = await convert_pdf_to_images(page_path, page_path.parent)
+        if len(image_paths) != 1:
+            raise ValueError(
+                f"Expected 1 image, got {len(image_paths)} for page {page_number} in {page_path}"
+            )
+        image_path = image_paths[0]
+        page_input = ExtractionInput(path=str(image_path), mime_type="image/png")
+        return page_input
+
     async def _extract_single_pdf_page(
-        self, pdf_path: Path, page_path: Path, prompt: str, page_number: int
+        self,
+        pdf_path: Path,
+        page_path: Path,
+        prompt: str,
+        page_number: int,
     ) -> str:
         try:
-            page_input = ExtractionInput(
-                path=str(page_path), mime_type="application/pdf"
-            )
+            if self.model_provider.multimodal_requires_pdf_as_image:
+                page_input = await self.convert_pdf_page_to_image_input(
+                    page_path, page_number
+                )
+            else:
+                page_input = ExtractionInput(
+                    path=str(page_path), mime_type="application/pdf"
+                )
+
             completion_kwargs = self._build_completion_kwargs(prompt, page_input)
             response = await litellm.acompletion(**completion_kwargs)
         except Exception as e:
@@ -201,11 +219,6 @@
             )
 
         content = response.choices[0].message.content
-        if not content:
-            raise ValueError(
-                f"No text returned from extraction model when extracting page {page_number} for {page_path}"
-            )
-
         if self.filesystem_cache is not None:
             # we don't want to fail the whole extraction just because cache write fails
             # as that would block the whole flow
@@ -242,13 +255,14 @@
                 continue
 
             extract_page_jobs.append(
-                self._extract_single_pdf_page(pdf_path, page_path, prompt, i)
+                self._extract_single_pdf_page(
+                    pdf_path, page_path, prompt, page_number=i
+                )
             )
             page_indices_for_jobs.append(i)
 
             if (
-                len(extract_page_jobs)
-                >= max_pdf_page_concurrency_for_model(self.litellm_model_slug())
+                len(extract_page_jobs) >= self.max_parallel_requests_for_model
                 or i == len(page_paths) - 1
             ):
                 extraction_results = await asyncio.gather(
@@ -295,7 +309,7 @@
         self, prompt: str, extraction_input: ExtractionInput
     ) -> dict[str, Any]:
         completion_kwargs = {
-            "model": self.litellm_model_slug(),
+            "model": self.litellm_model_slug,
             "messages": [
                 {
                     "role": "user",
@@ -367,20 +381,26 @@
             content_format=self.extractor_config.output_format,
         )
 
-    def litellm_model_slug(self) -> str:
+    @cached_property
+    def model_provider(self) -> KilnModelProvider:
         kiln_model_provider = built_in_models_from_provider(
             ModelProviderName(self.extractor_config.model_provider_name),
             self.extractor_config.model_name,
         )
-
         if kiln_model_provider is None:
             raise ValueError(
                 f"Model provider {self.extractor_config.model_provider_name} not found in the list of built-in models"
             )
+        return kiln_model_provider
+
+    @cached_property
+    def max_parallel_requests_for_model(self) -> int:
+        value = self.model_provider.max_parallel_requests
+        return value if value is not None else self.default_max_parallel_requests
 
-        # need to translate into LiteLLM model slug
+    @cached_property
+    def litellm_model_slug(self) -> str:
         litellm_provider_name = get_litellm_provider_info(
-            kiln_model_provider,
+            self.model_provider,
         )
-
         return litellm_provider_name.litellm_model_id
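
The hunks above replace the hard-coded max_pdf_page_concurrency_for_model() helper with a per-provider limit: max_parallel_requests_for_model reads max_parallel_requests from the KilnModelProvider entry and falls back to the constructor's default_max_parallel_requests (5). Pages are still submitted in batches awaited with asyncio.gather. Below is a minimal standalone sketch of that batching pattern, with illustrative names (run_in_batches is not part of the package), not the extractor's actual code.

import asyncio
from typing import Awaitable, Callable, TypeVar

T = TypeVar("T")


async def run_in_batches(
    jobs: list[Callable[[], Awaitable[T]]], batch_size: int
) -> list[T]:
    # Await at most `batch_size` coroutines at a time, mirroring how the
    # extractor flushes extract_page_jobs once it reaches
    # max_parallel_requests_for_model (or on the last page).
    results: list[T] = []
    for start in range(0, len(jobs), batch_size):
        batch = jobs[start : start + batch_size]
        results.extend(await asyncio.gather(*(job() for job in batch)))
    return results

A caller would pass one zero-argument coroutine factory per PDF page and the provider's parallel-request limit, so a provider with max_parallel_requests = 2 never has more than two page extractions in flight.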
--- kiln_ai-0.21.0/kiln_ai/adapters/extractors/test_litellm_extractor.py
+++ kiln_ai-0.22.0/kiln_ai/adapters/extractors/test_litellm_extractor.py
@@ -1,5 +1,5 @@
 from pathlib import Path
-from unittest.mock import AsyncMock, patch
+from unittest.mock import AsyncMock, MagicMock, patch
 
 import pytest
 from litellm.types.utils import Choices, ModelResponse
@@ -7,13 +7,17 @@ from litellm.types.utils import Choices, ModelResponse
 from conftest import MockFileFactoryMimeType
 from kiln_ai.adapters.extractors.base_extractor import ExtractionInput, OutputFormat
 from kiln_ai.adapters.extractors.encoding import to_base64_url
+from kiln_ai.adapters.extractors.extractor_registry import extractor_adapter_from_type
 from kiln_ai.adapters.extractors.litellm_extractor import (
     ExtractorConfig,
     Kind,
     LitellmExtractor,
     encode_file_litellm_format,
 )
-from kiln_ai.adapters.ml_model_list import built_in_models
+from kiln_ai.adapters.ml_model_list import (
+    built_in_models,
+    built_in_models_from_provider,
+)
 from kiln_ai.adapters.provider_tools import LiteLlmCoreConfig
 from kiln_ai.datamodel.extraction import ExtractorType
 from kiln_ai.utils.filesystem_cache import FilesystemCache
@@ -405,7 +409,7 @@ def test_litellm_model_slug_success(mock_litellm_extractor):
             return_value=mock_provider_info,
         ) as mock_get_provider_info,
     ):
-        result = mock_litellm_extractor.litellm_model_slug()
+        result = mock_litellm_extractor.litellm_model_slug
 
     assert result == "test-provider/test-model"
 
@@ -414,6 +418,38 @@
     mock_get_provider_info.assert_called_once_with(mock_model_provider)
 
 
+@pytest.mark.parametrize(
+    "max_parallel_requests, expected_result",
+    [
+        (10, 10),
+        (0, 0),
+        # 5 is the current default, it may change in the future if we have
+        # a better modeling of rate limit constraints
+        (None, 5),
+    ],
+)
+def test_litellm_model_max_parallel_requests(
+    mock_litellm_extractor, max_parallel_requests, expected_result
+):
+    """Test that max_parallel_requests_for_model returns the provider's limit."""
+    # Mock the built_in_models_from_provider function to return a valid model provider
+    mock_model_provider = MagicMock()
+    mock_model_provider.name = "test-provider"
+    mock_model_provider.max_parallel_requests = max_parallel_requests
+
+    with (
+        patch(
+            "kiln_ai.adapters.extractors.litellm_extractor.built_in_models_from_provider",
+            return_value=mock_model_provider,
+        ) as mock_built_in_models,
+    ):
+        result = mock_litellm_extractor.max_parallel_requests_for_model
+
+    assert result == expected_result
+
+    mock_built_in_models.assert_called_once()
+
+
 def test_litellm_model_slug_model_provider_not_found(mock_litellm_extractor):
     """Test that litellm_model_slug raises ValueError when model provider is not found."""
     with patch(
@@ -424,7 +460,7 @@ def test_litellm_model_slug_model_provider_not_found(mock_litellm_extractor):
         ValueError,
         match="Model provider openai not found in the list of built-in models",
     ):
-        mock_litellm_extractor.litellm_model_slug()
+        mock_litellm_extractor.litellm_model_slug
 
 
 def test_litellm_model_slug_with_different_provider_names(mock_litellm_core_config):
@@ -468,35 +504,28 @@ def test_litellm_model_slug_with_different_provider_names(mock_litellm_core_conf
             return_value=mock_provider_info,
         ),
     ):
-        result = extractor.litellm_model_slug()
+        result = extractor.litellm_model_slug
         assert result == expected_slug
 
 
 def paid_litellm_extractor(model_name: str, provider_name: str):
-    return LitellmExtractor(
-        extractor_config=ExtractorConfig(
+    extractor = extractor_adapter_from_type(
+        ExtractorType.LITELLM,
+        ExtractorConfig(
             name="paid-litellm",
             extractor_type=ExtractorType.LITELLM,
             model_provider_name=provider_name,
             model_name=model_name,
             properties={
-                # in the paid tests, we can check which prompt is used by checking if the Kind shows up
-                # in the output - not ideal but usually works
                 "prompt_document": "Ignore the file and only respond with the word 'document'",
                 "prompt_image": "Ignore the file and only respond with the word 'image'",
                 "prompt_video": "Ignore the file and only respond with the word 'video'",
                 "prompt_audio": "Ignore the file and only respond with the word 'audio'",
             },
-            passthrough_mimetypes=[
-                # we want all mimetypes to go to litellm to be sure we're testing the API call
-            ],
-        ),
-        litellm_core_config=LiteLlmCoreConfig(
-            base_url="https://test.com",
-            additional_body_options={"api_key": "test-key"},
-            default_headers={},
+            passthrough_mimetypes=[OutputFormat.MARKDOWN, OutputFormat.TEXT],
        ),
    )
+    return extractor
 
 
 @pytest.mark.parametrize(
@@ -560,6 +589,7 @@ def get_all_models_support_doc_extraction(
             provider.multimodal_mime_types is None
             or must_support_mime_types is None
         ):
+            model_provider_pairs.append((model.name, provider.name))
             continue
         # check that the model supports all the mime types
         if all(
@@ -573,23 +603,7 @@
 @pytest.mark.paid
 @pytest.mark.parametrize(
     "model_name,provider_name",
-    get_all_models_support_doc_extraction(
-        must_support_mime_types=[
-            MockFileFactoryMimeType.PDF,
-            MockFileFactoryMimeType.TXT,
-            MockFileFactoryMimeType.MD,
-            MockFileFactoryMimeType.HTML,
-            MockFileFactoryMimeType.CSV,
-            MockFileFactoryMimeType.PNG,
-            MockFileFactoryMimeType.JPEG,
-            MockFileFactoryMimeType.JPG,
-            MockFileFactoryMimeType.MP4,
-            MockFileFactoryMimeType.MOV,
-            MockFileFactoryMimeType.MP3,
-            MockFileFactoryMimeType.OGG,
-            MockFileFactoryMimeType.WAV,
-        ]
-    ),
+    get_all_models_support_doc_extraction(must_support_mime_types=None),
 )
 @pytest.mark.parametrize(
     "mime_type,expected_substring_in_output",
@@ -620,41 +634,17 @@ async def test_extract_document_success(
     expected_substring_in_output,
     mock_file_factory,
 ):
-    test_file = mock_file_factory(mime_type)
-    extractor = paid_litellm_extractor(
-        model_name=model_name, provider_name=provider_name
-    )
-    output = await extractor.extract(
-        extraction_input=ExtractionInput(
-            path=str(test_file),
-            mime_type=mime_type,
-        )
-    )
-    assert not output.is_passthrough
-    assert output.content_format == OutputFormat.MARKDOWN
-    assert expected_substring_in_output.lower() in output.content.lower()
-
+    # get model
+    model = built_in_models_from_provider(provider_name, model_name)
+    assert model is not None
+    if mime_type not in model.multimodal_mime_types:
+        pytest.skip(f"Model {model_name} configured to not support {mime_type}")
+    if (
+        mime_type == MockFileFactoryMimeType.MD
+        or mime_type == MockFileFactoryMimeType.TXT
+    ):
+        pytest.skip(f"Model {model_name} configured to passthrough {mime_type}")
 
-@pytest.mark.paid
-@pytest.mark.parametrize(
-    "model_name,provider_name",
-    get_all_models_support_doc_extraction(
-        must_support_mime_types=[MockFileFactoryMimeType.PDF]
-    ),
-)
-@pytest.mark.parametrize(
-    "mime_type,expected_substring_in_output",
-    [
-        (MockFileFactoryMimeType.PDF, "document"),
-    ],
-)
-async def test_extract_document_success_pdf(
-    model_name,
-    provider_name,
-    mime_type,
-    expected_substring_in_output,
-    mock_file_factory,
-):
     test_file = mock_file_factory(mime_type)
     extractor = paid_litellm_extractor(
         model_name=model_name, provider_name=provider_name
@@ -704,6 +694,110 @@ async def test_extract_pdf_page_by_page(mock_file_factory, mock_litellm_extracto
     assert result.content_format == OutputFormat.MARKDOWN
 
 
+async def test_extract_pdf_page_by_page_pdf_as_image(
+    mock_file_factory, mock_litellm_extractor, tmp_path
+):
+    """Test that PDFs are processed page by page as images if the model requires it."""
+
+    test_file = mock_file_factory(MockFileFactoryMimeType.PDF)
+
+    # Mock responses for each page (PDF has 2 pages)
+    mock_responses = []
+    for i in range(2):  # PDF has 2 pages
+        mock_response = AsyncMock(spec=ModelResponse)
+        mock_choice = AsyncMock(spec=Choices)
+        mock_message = AsyncMock()
+        mock_message.content = f"Content from page {i + 1}"
+        mock_choice.message = mock_message
+        mock_response.choices = [mock_choice]
+        mock_responses.append(mock_response)
+
+    mock_image_path = tmp_path / "img-test_document-mock.png"
+    mock_image_path.write_bytes(b"test image")
+
+    with patch("litellm.acompletion", side_effect=mock_responses) as mock_acompletion:
+        # this model requires PDFs to be processed as images
+        mock_litellm_extractor.model_provider.multimodal_requires_pdf_as_image = True
+
+        with patch(
+            "kiln_ai.adapters.extractors.litellm_extractor.convert_pdf_to_images",
+            return_value=[mock_image_path],
+        ) as mock_convert:
+            result = await mock_litellm_extractor.extract(
+                ExtractionInput(
+                    path=str(test_file),
+                    mime_type="application/pdf",
+                )
+            )
+
+        # Verify image conversion called once per page
+        assert mock_convert.call_count == 2
+
+        # Verify LiteLLM was called with image inputs (not PDF) for each page
+        for call in mock_acompletion.call_args_list:
+            kwargs = call.kwargs
+            content = kwargs["messages"][0]["content"]
+            assert content[1]["type"] == "image_url"
+
+        # Verify that the completion was called multiple times (once per page)
+        assert mock_acompletion.call_count == 2
+
+        # Verify the output contains content from both pages
+        assert "Content from page 1" in result.content
+        assert "Content from page 2" in result.content
+
+        assert not result.is_passthrough
+        assert result.content_format == OutputFormat.MARKDOWN
+
+
+async def test_convert_pdf_page_to_image_input_success(
+    mock_litellm_extractor, tmp_path
+):
+    page_dir = tmp_path / "pages"
+    page_dir.mkdir()
+    page_path = page_dir / "page_1.pdf"
+    page_path.write_bytes(b"%PDF-1.4 test")
+
+    mock_image_path = page_dir / "img-page_1.pdf-0.png"
+    mock_image_path.write_bytes(b"image-bytes")
+
+    with patch(
+        "kiln_ai.adapters.extractors.litellm_extractor.convert_pdf_to_images",
+        return_value=[mock_image_path],
+    ):
+        extraction_input = await mock_litellm_extractor.convert_pdf_page_to_image_input(
+            page_path, 0
+        )
+
+    assert extraction_input.mime_type == "image/png"
+    assert Path(extraction_input.path) == mock_image_path
+
+
+@pytest.mark.parametrize("returned_count", [0, 2])
+async def test_convert_pdf_page_to_image_input_error_on_invalid_count(
+    mock_litellm_extractor, tmp_path, returned_count
+):
+    page_dir = tmp_path / "pages"
+    page_dir.mkdir()
+    page_path = page_dir / "page_1.pdf"
+    page_path.write_bytes(b"%PDF-1.4 test")
+
+    image_paths = []
+    if returned_count == 2:
+        img1 = page_dir / "img-page_1.pdf-0.png"
+        img2 = page_dir / "img-page_1.pdf-1.png"
+        img1.write_bytes(b"i1")
+        img2.write_bytes(b"i2")
+        image_paths = [img1, img2]
+
+    with patch(
+        "kiln_ai.adapters.extractors.litellm_extractor.convert_pdf_to_images",
+        return_value=image_paths,
+    ):
+        with pytest.raises(ValueError, match=r"Expected 1 image, got "):
+            await mock_litellm_extractor.convert_pdf_page_to_image_input(page_path, 0)
+
+
 async def test_extract_pdf_page_by_page_error_handling(
     mock_file_factory, mock_litellm_extractor
 ):
@@ -894,15 +988,19 @@
     # Verify that the completion was called for each page
     assert mock_acompletion.call_count == 2
 
-    # Verify content is stored in cache
+    # Verify content is stored in cache - note that order is not guaranteed since
+    # we batch the page extraction requests in parallel
     pdf_path = Path(test_file)
+    cached_contents = []
     for i in range(2):
         cached_content = (
             await mock_litellm_extractor_with_cache.get_page_content_from_cache(
                 pdf_path, i
            )
        )
-        assert cached_content == f"Content from page {i + 1}"
+        assert cached_content is not None
+        cached_contents.append(cached_content)
+    assert set(cached_contents) == {"Content from page 1", "Content from page 2"}
 
     # Verify the output contains content from both pages
     assert "Content from page 1" in result.content
@@ -1137,7 +1235,7 @@ async def test_extract_pdf_parallel_processing_error_handling(
         "litellm.acompletion",
         side_effect=[mock_response1, Exception("API Error on page 2")],
     ) as mock_acompletion:
-        with pytest.raises(ValueError, match=r".*Page 1:.*API Error on page 2"):
+        with pytest.raises(ValueError, match=r".*API Error on page 2"):
             await mock_litellm_extractor_with_cache.extract(
                 ExtractionInput(
                     path=str(test_file),