kiln-ai 0.20.1__py3-none-any.whl → 0.22.0__py3-none-any.whl

This diff shows the changes between package versions as published to a supported public registry. It is provided for informational purposes only and reflects the packages as they appear in that registry.

Potentially problematic release: this version of kiln-ai might be problematic.

Files changed (133)
  1. kiln_ai/adapters/__init__.py +6 -0
  2. kiln_ai/adapters/adapter_registry.py +43 -226
  3. kiln_ai/adapters/chunkers/__init__.py +13 -0
  4. kiln_ai/adapters/chunkers/base_chunker.py +42 -0
  5. kiln_ai/adapters/chunkers/chunker_registry.py +16 -0
  6. kiln_ai/adapters/chunkers/fixed_window_chunker.py +39 -0
  7. kiln_ai/adapters/chunkers/helpers.py +23 -0
  8. kiln_ai/adapters/chunkers/test_base_chunker.py +63 -0
  9. kiln_ai/adapters/chunkers/test_chunker_registry.py +28 -0
  10. kiln_ai/adapters/chunkers/test_fixed_window_chunker.py +346 -0
  11. kiln_ai/adapters/chunkers/test_helpers.py +75 -0
  12. kiln_ai/adapters/data_gen/test_data_gen_task.py +9 -3
  13. kiln_ai/adapters/embedding/__init__.py +0 -0
  14. kiln_ai/adapters/embedding/base_embedding_adapter.py +44 -0
  15. kiln_ai/adapters/embedding/embedding_registry.py +32 -0
  16. kiln_ai/adapters/embedding/litellm_embedding_adapter.py +199 -0
  17. kiln_ai/adapters/embedding/test_base_embedding_adapter.py +283 -0
  18. kiln_ai/adapters/embedding/test_embedding_registry.py +166 -0
  19. kiln_ai/adapters/embedding/test_litellm_embedding_adapter.py +1149 -0
  20. kiln_ai/adapters/eval/eval_runner.py +6 -2
  21. kiln_ai/adapters/eval/test_base_eval.py +1 -3
  22. kiln_ai/adapters/eval/test_g_eval.py +1 -1
  23. kiln_ai/adapters/extractors/__init__.py +18 -0
  24. kiln_ai/adapters/extractors/base_extractor.py +72 -0
  25. kiln_ai/adapters/extractors/encoding.py +20 -0
  26. kiln_ai/adapters/extractors/extractor_registry.py +44 -0
  27. kiln_ai/adapters/extractors/extractor_runner.py +112 -0
  28. kiln_ai/adapters/extractors/litellm_extractor.py +406 -0
  29. kiln_ai/adapters/extractors/test_base_extractor.py +244 -0
  30. kiln_ai/adapters/extractors/test_encoding.py +54 -0
  31. kiln_ai/adapters/extractors/test_extractor_registry.py +181 -0
  32. kiln_ai/adapters/extractors/test_extractor_runner.py +181 -0
  33. kiln_ai/adapters/extractors/test_litellm_extractor.py +1290 -0
  34. kiln_ai/adapters/fine_tune/test_dataset_formatter.py +2 -2
  35. kiln_ai/adapters/fine_tune/test_fireworks_finetune.py +2 -6
  36. kiln_ai/adapters/fine_tune/test_together_finetune.py +2 -6
  37. kiln_ai/adapters/ml_embedding_model_list.py +494 -0
  38. kiln_ai/adapters/ml_model_list.py +876 -18
  39. kiln_ai/adapters/model_adapters/litellm_adapter.py +40 -75
  40. kiln_ai/adapters/model_adapters/test_litellm_adapter.py +79 -1
  41. kiln_ai/adapters/model_adapters/test_litellm_adapter_tools.py +119 -5
  42. kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +9 -3
  43. kiln_ai/adapters/model_adapters/test_structured_output.py +9 -10
  44. kiln_ai/adapters/ollama_tools.py +69 -12
  45. kiln_ai/adapters/provider_tools.py +190 -46
  46. kiln_ai/adapters/rag/deduplication.py +49 -0
  47. kiln_ai/adapters/rag/progress.py +252 -0
  48. kiln_ai/adapters/rag/rag_runners.py +844 -0
  49. kiln_ai/adapters/rag/test_deduplication.py +195 -0
  50. kiln_ai/adapters/rag/test_progress.py +785 -0
  51. kiln_ai/adapters/rag/test_rag_runners.py +2376 -0
  52. kiln_ai/adapters/remote_config.py +80 -8
  53. kiln_ai/adapters/test_adapter_registry.py +579 -86
  54. kiln_ai/adapters/test_ml_embedding_model_list.py +239 -0
  55. kiln_ai/adapters/test_ml_model_list.py +202 -0
  56. kiln_ai/adapters/test_ollama_tools.py +340 -1
  57. kiln_ai/adapters/test_prompt_builders.py +1 -1
  58. kiln_ai/adapters/test_provider_tools.py +199 -8
  59. kiln_ai/adapters/test_remote_config.py +551 -56
  60. kiln_ai/adapters/vector_store/__init__.py +1 -0
  61. kiln_ai/adapters/vector_store/base_vector_store_adapter.py +83 -0
  62. kiln_ai/adapters/vector_store/lancedb_adapter.py +389 -0
  63. kiln_ai/adapters/vector_store/test_base_vector_store.py +160 -0
  64. kiln_ai/adapters/vector_store/test_lancedb_adapter.py +1841 -0
  65. kiln_ai/adapters/vector_store/test_vector_store_registry.py +199 -0
  66. kiln_ai/adapters/vector_store/vector_store_registry.py +33 -0
  67. kiln_ai/datamodel/__init__.py +16 -13
  68. kiln_ai/datamodel/basemodel.py +201 -4
  69. kiln_ai/datamodel/chunk.py +158 -0
  70. kiln_ai/datamodel/datamodel_enums.py +27 -0
  71. kiln_ai/datamodel/embedding.py +64 -0
  72. kiln_ai/datamodel/external_tool_server.py +206 -54
  73. kiln_ai/datamodel/extraction.py +317 -0
  74. kiln_ai/datamodel/project.py +33 -1
  75. kiln_ai/datamodel/rag.py +79 -0
  76. kiln_ai/datamodel/task.py +5 -0
  77. kiln_ai/datamodel/task_output.py +41 -11
  78. kiln_ai/datamodel/test_attachment.py +649 -0
  79. kiln_ai/datamodel/test_basemodel.py +270 -14
  80. kiln_ai/datamodel/test_chunk_models.py +317 -0
  81. kiln_ai/datamodel/test_dataset_split.py +1 -1
  82. kiln_ai/datamodel/test_datasource.py +50 -0
  83. kiln_ai/datamodel/test_embedding_models.py +448 -0
  84. kiln_ai/datamodel/test_eval_model.py +6 -6
  85. kiln_ai/datamodel/test_external_tool_server.py +534 -152
  86. kiln_ai/datamodel/test_extraction_chunk.py +206 -0
  87. kiln_ai/datamodel/test_extraction_model.py +501 -0
  88. kiln_ai/datamodel/test_rag.py +641 -0
  89. kiln_ai/datamodel/test_task.py +35 -1
  90. kiln_ai/datamodel/test_tool_id.py +187 -1
  91. kiln_ai/datamodel/test_vector_store.py +320 -0
  92. kiln_ai/datamodel/tool_id.py +58 -0
  93. kiln_ai/datamodel/vector_store.py +141 -0
  94. kiln_ai/tools/base_tool.py +12 -3
  95. kiln_ai/tools/built_in_tools/math_tools.py +12 -4
  96. kiln_ai/tools/kiln_task_tool.py +158 -0
  97. kiln_ai/tools/mcp_server_tool.py +2 -2
  98. kiln_ai/tools/mcp_session_manager.py +51 -22
  99. kiln_ai/tools/rag_tools.py +164 -0
  100. kiln_ai/tools/test_kiln_task_tool.py +527 -0
  101. kiln_ai/tools/test_mcp_server_tool.py +4 -15
  102. kiln_ai/tools/test_mcp_session_manager.py +187 -227
  103. kiln_ai/tools/test_rag_tools.py +929 -0
  104. kiln_ai/tools/test_tool_registry.py +290 -7
  105. kiln_ai/tools/tool_registry.py +69 -16
  106. kiln_ai/utils/__init__.py +3 -0
  107. kiln_ai/utils/async_job_runner.py +62 -17
  108. kiln_ai/utils/config.py +2 -2
  109. kiln_ai/utils/env.py +15 -0
  110. kiln_ai/utils/filesystem.py +14 -0
  111. kiln_ai/utils/filesystem_cache.py +60 -0
  112. kiln_ai/utils/litellm.py +94 -0
  113. kiln_ai/utils/lock.py +100 -0
  114. kiln_ai/utils/mime_type.py +38 -0
  115. kiln_ai/utils/open_ai_types.py +19 -2
  116. kiln_ai/utils/pdf_utils.py +59 -0
  117. kiln_ai/utils/test_async_job_runner.py +151 -35
  118. kiln_ai/utils/test_env.py +142 -0
  119. kiln_ai/utils/test_filesystem_cache.py +316 -0
  120. kiln_ai/utils/test_litellm.py +206 -0
  121. kiln_ai/utils/test_lock.py +185 -0
  122. kiln_ai/utils/test_mime_type.py +66 -0
  123. kiln_ai/utils/test_open_ai_types.py +88 -12
  124. kiln_ai/utils/test_pdf_utils.py +86 -0
  125. kiln_ai/utils/test_uuid.py +111 -0
  126. kiln_ai/utils/test_validation.py +524 -0
  127. kiln_ai/utils/uuid.py +9 -0
  128. kiln_ai/utils/validation.py +90 -0
  129. {kiln_ai-0.20.1.dist-info → kiln_ai-0.22.0.dist-info}/METADATA +9 -1
  130. kiln_ai-0.22.0.dist-info/RECORD +213 -0
  131. kiln_ai-0.20.1.dist-info/RECORD +0 -138
  132. {kiln_ai-0.20.1.dist-info → kiln_ai-0.22.0.dist-info}/WHEEL +0 -0
  133. {kiln_ai-0.20.1.dist-info → kiln_ai-0.22.0.dist-info}/licenses/LICENSE.txt +0 -0
@@ -1,5 +1,7 @@
  import datetime
  import json
+ import logging
+ import time
  import uuid
  from pathlib import Path
  from typing import Optional
@@ -342,12 +344,12 @@ def test_delete_no_path():
  ("Hello 👍", "Hello 👍"),
  # Invalid characters are replaced
  ("Hello@World!", "Hello@World!"),
- ("File.name.txt", "File_name_txt"),
- ("Special%%%Chars", "Special_Chars"),
- ("Special#$%Chars", "Special#$_Chars"),
+ ("File.name.txt", "File name txt"),
+ ("Special%%%Chars", "Special Chars"),
+ ("Special#$%Chars", "Special#$ Chars"),
  # Consecutive invalid characters are replaced
- ("Special%%%Chars", "Special_Chars"),
- ("path/to/file", "path_to_file"),
+ ("Special%%%Chars", "Special Chars"),
+ ("path/to/file", "path to file"),
  # Leading/trailing special characters are removed
  ("__test__", "test"),
  ("...test...", "test"),
@@ -360,14 +362,14 @@ def test_delete_no_path():
  ("你好_世界", "你好_世界"),
  ("你好_世界_你好", "你好_世界_你好"),
  # Newlines, tabs, and other control characters are replaced
- ("Hello\nworld", "Hello_world"),
- ("Hello\tworld", "Hello_world"),
- ("Hello\rworld", "Hello_world"),
- ("Hello\fworld", "Hello_world"),
- ("Hello\bworld", "Hello_world"),
- ("Hello\vworld", "Hello_world"),
- ("Hello\0world", "Hello_world"),
- ("Hello\x00world", "Hello_world"),
+ ("Hello\nworld", "Hello world"),
+ ("Hello\tworld", "Hello world"),
+ ("Hello\rworld", "Hello world"),
+ ("Hello\fworld", "Hello world"),
+ ("Hello\bworld", "Hello world"),
+ ("Hello\vworld", "Hello world"),
+ ("Hello\0world", "Hello world"),
+ ("Hello\x00world", "Hello world"),
  ],
  )
  def test_string_to_valid_name(tmp_path, name, expected):
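
The updated expectations above change the sanitizer's replacement character from an underscore to a space. A minimal sketch of a sanitizer that satisfies these parametrized cases (illustrative only; the real string_to_valid_name in kiln_ai may differ):

import re

def string_to_valid_name_sketch(name: str) -> str:
    # Runs of invalid characters (path separators, '%', '.', and control
    # characters) collapse to a single space; '@', '#', '$', '_' and
    # non-ASCII text are preserved, matching the cases above.
    cleaned = re.sub(r"[/\\.%\x00-\x1f\x7f]+", " ", name)
    # Leading/trailing separators are stripped: "__test__" -> "test".
    return cleaned.strip(" _.")

assert string_to_valid_name_sketch("File.name.txt") == "File name txt"
assert string_to_valid_name_sketch("path/to/file") == "path to file"
assert string_to_valid_name_sketch("Hello\tworld") == "Hello world"
assert string_to_valid_name_sketch("__test__") == "test"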
@@ -491,6 +493,7 @@ def test_from_id_and_parent_path(test_base_parented_file, tmp_model_cache):
  child3.save_to_file()
 
  # Test finding existing child by ID
+ assert child2.id is not None # Type safety
  found_child = DefaultParentedModel.from_id_and_parent_path(
  child2.id, test_base_parented_file
  )
@@ -513,6 +516,7 @@ def test_from_id_and_parent_path_with_cache(test_base_parented_file, tmp_model_c
  child.save_to_file()
 
  # First load to populate cache
+ assert child.id is not None # Type safety
  _ = DefaultParentedModel.from_id_and_parent_path(child.id, test_base_parented_file)
 
  # Mock cache to verify it's used
@@ -534,6 +538,258 @@ def test_from_id_and_parent_path_without_parent():
  assert not_found is None
 
 
+ def test_from_ids_and_parent_path_basic(test_base_parented_file, tmp_model_cache):
+ """Test basic functionality of from_ids_and_parent_path method"""
+ # Set up parent and children models
+ parent = BaseParentExample.load_from_file(test_base_parented_file)
+
+ child1 = DefaultParentedModel(parent=parent, name="Child1")
+ child2 = DefaultParentedModel(parent=parent, name="Child2")
+ child3 = DefaultParentedModel(parent=parent, name="Child3")
+
+ # Save all children
+ child1.save_to_file()
+ child2.save_to_file()
+ child3.save_to_file()
+
+ # Test finding multiple children by IDs
+ assert child1.id is not None and child2.id is not None # Type safety
+ target_ids = {child1.id, child3.id}
+ found_children = DefaultParentedModel.from_ids_and_parent_path(
+ target_ids, test_base_parented_file
+ )
+
+ # Verify correct children were found
+ assert len(found_children) == 2
+ assert child1.id in found_children
+ assert child3.id in found_children
+ assert child2.id not in found_children
+
+ # Verify the returned models have correct data
+ assert found_children[child1.id].name == "Child1"
+ assert found_children[child3.id].name == "Child3"
+
+ # Verify they are not the same instances (deep copies)
+ assert found_children[child1.id] is not child1
+ assert found_children[child3.id] is not child3
+
+
+ def test_from_ids_and_parent_path_empty_list(test_base_parented_file):
+ """Test from_ids_and_parent_path with empty ID list"""
+ found_children = DefaultParentedModel.from_ids_and_parent_path(
+ set(), test_base_parented_file
+ )
+ assert found_children == {}
+
+
+ def test_from_ids_and_parent_path_none_parent():
+ """Test from_ids_and_parent_path with None parent_path"""
+ found_children = DefaultParentedModel.from_ids_and_parent_path({"any-id"}, None)
+ assert found_children == {}
+
+
+ def test_from_ids_and_parent_path_no_matches(test_base_parented_file, tmp_model_cache):
+ """Test from_ids_and_parent_path when no IDs match existing children"""
+ # Set up parent and children models
+ parent = BaseParentExample.load_from_file(test_base_parented_file)
+
+ child1 = DefaultParentedModel(parent=parent, name="Child1")
+ child1.save_to_file()
+
+ # Test with non-existent IDs
+ found_children = DefaultParentedModel.from_ids_and_parent_path(
+ {"nonexistent1", "nonexistent2"}, test_base_parented_file
+ )
+ assert found_children == {}
+
+
+ def test_from_ids_and_parent_path_partial_matches(
+ test_base_parented_file, tmp_model_cache
+ ):
+ """Test from_ids_and_parent_path when only some IDs match"""
+ # Set up parent and children models
+ parent = BaseParentExample.load_from_file(test_base_parented_file)
+
+ child1 = DefaultParentedModel(parent=parent, name="Child1")
+ child2 = DefaultParentedModel(parent=parent, name="Child2")
+
+ # Save children
+ child1.save_to_file()
+ child2.save_to_file()
+
+ # Test with mix of existing and non-existent IDs
+ assert child1.id is not None and child2.id is not None # Type safety
+ target_ids = {child1.id, "nonexistent", child2.id, "another_nonexistent"}
+ found_children = DefaultParentedModel.from_ids_and_parent_path(
+ target_ids, test_base_parented_file
+ )
+
+ # Should only find the existing children
+ assert len(found_children) == 2
+ assert child1.id in found_children
+ assert child2.id in found_children
+ assert "nonexistent" not in found_children
+ assert "another_nonexistent" not in found_children
+
+
+ def test_from_ids_and_parent_path_with_cache_fallback(
+ test_base_parented_file, tmp_model_cache
+ ):
+ """Test from_ids_and_parent_path when cache returns None and needs to load file"""
+ # Set up parent and child
+ parent = BaseParentExample.load_from_file(test_base_parented_file)
+ child = DefaultParentedModel(parent=parent, name="Child")
+ child.save_to_file()
+
+ # Mock cache to return None for get_model_id, forcing file load
+ tmp_model_cache.get_model_id = MagicMock(return_value=None)
+
+ # Test should still work by loading the file
+ assert child.id is not None # Type safety
+ found_children = DefaultParentedModel.from_ids_and_parent_path(
+ {child.id}, test_base_parented_file
+ )
+
+ assert len(found_children) == 1
+ assert child.id in found_children
+ assert found_children[child.id].name == "Child"
+
+ # Verify cache was checked
+ tmp_model_cache.get_model_id.assert_called()
+
+
+ def test_from_ids_and_parent_path_equivalent_to_individual_lookups(
+ test_base_parented_file, tmp_model_cache
+ ):
+ """Test that from_ids_and_parent_path returns the same results as individual lookups"""
+ # Set up parent and multiple children
+ parent = BaseParentExample.load_from_file(test_base_parented_file)
+
+ children = []
+ for i in range(10):
+ child = DefaultParentedModel(parent=parent, name=f"Child{i}")
+ child.save_to_file()
+ children.append(child)
+
+ # Select 5 children to lookup
+ target_ids = {
+ child.id for child in children[::2] if child.id is not None
+ } # Every other child
+ assert len(target_ids) == 5 # Ensure we have 5 children to test
+
+ # Test bulk method
+ bulk_results = DefaultParentedModel.from_ids_and_parent_path(
+ target_ids, test_base_parented_file
+ )
+
+ # Test individual method
+ individual_results = {}
+ for target_id in target_ids:
+ result = DefaultParentedModel.from_id_and_parent_path(
+ target_id, test_base_parented_file
+ )
+ if result:
+ individual_results[target_id] = result
+
+ # Results should be equivalent
+ assert len(bulk_results) == len(individual_results) == 5
+
+ for target_id in target_ids:
+ assert target_id in bulk_results
+ assert target_id in individual_results
+
+ # Compare the key attributes
+ bulk_child = bulk_results[target_id]
+ individual_child = individual_results[target_id]
+
+ assert bulk_child.id == individual_child.id
+ assert bulk_child.name == individual_child.name
+ assert bulk_child.model_type == individual_child.model_type
+
+
+ # Not actually paid, but we want the "must be run manually" feature of the paid marker as this is very slow
+ @pytest.mark.paid
+ @pytest.mark.parametrize("num_children", [100, 1000, 2500, 5000])
+ def test_from_ids_and_parent_path_benchmark(
+ test_base_parented_file, tmp_model_cache, num_children
+ ):
+ """Benchmark test for from_ids_and_parent_path method performance at scale"""
+ # Set up parent and many children
+ parent = BaseParentExample.load_from_file(test_base_parented_file)
+
+ children = []
+ for i in range(num_children):
+ child = DefaultParentedModel(parent=parent, name=f"Child{i:05d}")
+ child.save_to_file()
+ children.append(child)
+
+ # look up all children
+ lookup_count = num_children
+ target_ids = {child.id for child in children[:lookup_count] if child.id is not None}
+ assert len(target_ids) == lookup_count
+
+ # Benchmark the bulk method using manual timing
+ def bulk_lookup():
+ return DefaultParentedModel.from_ids_and_parent_path(
+ target_ids, test_base_parented_file
+ )
+
+ # Run bulk method once and time it
+ start_time = time.perf_counter()
+ bulk_result = bulk_lookup()
+ end_time = time.perf_counter()
+ bulk_time = end_time - start_time
+
+ # Verify we got the expected results
+ assert len(bulk_result) == lookup_count
+
+ # Calculate bulk method stats
+ bulk_ops_per_second = lookup_count / bulk_time
+
+ # Benchmark the individual lookup method using manual timing
+ def individual_lookups():
+ results = {}
+ for target_id in target_ids:
+ result = DefaultParentedModel.from_id_and_parent_path(
+ target_id, test_base_parented_file
+ )
+ if result:
+ results[target_id] = result
+ return results
+
+ # Run individual lookup method
+ start_time = time.perf_counter()
+ individual_result = individual_lookups()
+ end_time = time.perf_counter()
+ individual_time = end_time - start_time
+
+ assert len(individual_result) == lookup_count
+ individual_ops_per_second = lookup_count / individual_time
+
+ # Calculate performance comparison
+ speedup = individual_time / bulk_time
+ time_savings_pct = (individual_time - bulk_time) / individual_time * 100
+
+ # Use logging to display results (will show with -s flag or --log-cli-level=INFO)
+ logger = logging.getLogger(__name__)
+ logger.info(
+ f"Benchmark results for {num_children} children, {lookup_count} lookups:"
+ )
+ logger.info(f" Bulk method: {bulk_time:.4f}s ({bulk_ops_per_second:.2f} ops/sec)")
+ logger.info(
+ f" Individual method: {individual_time:.4f}s ({individual_ops_per_second:.2f} ops/sec)"
+ )
+ logger.info(
+ f" Speedup: {speedup:.2f}x faster, {time_savings_pct:.1f}% time savings"
+ )
+
+ assert bulk_time > 0, "Bulk method should complete successfully"
+ assert individual_time > 0, "Individual method should complete successfully"
+ assert speedup >= 1.0, (
+ f"Expected bulk method to be faster, but got {speedup:.2f}x speedup"
+ )
+
+
  class MockAdapter(BaseAdapter):
  """Implementation of BaseAdapter for testing"""
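
The tests above introduce a bulk lookup, from_ids_and_parent_path, alongside the existing per-ID from_id_and_parent_path. Judging from those tests, it takes a set of IDs plus the parent's path and returns a dict mapping only the found IDs to freshly loaded copies; missing IDs are silently omitted. A hedged usage sketch (the helper name and wrapper below are illustrative, not part of kiln_ai):

from pathlib import Path

def load_children_by_id(model_cls, ids: set[str], parent_file: Path):
    # model_cls is any parented Kiln model class (DefaultParentedModel in the
    # tests above). One bulk call replaces a per-ID scan of the parent's
    # children, which is what the benchmark test measures.
    found = model_cls.from_ids_and_parent_path(ids, parent_file)
    missing = ids - set(found.keys())
    return found, missing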
 
@@ -605,7 +861,7 @@ async def test_invoke_parsing_flow(adapter):
  mock_provider.reasoning_capable = True
  with pytest.raises(
  RuntimeError,
- match="Reasoning is required for this model, but no reasoning was returned.",
+ match=r"^Reasoning is required for this model, but no reasoning was returned.$",
  ):
  await adapter.invoke("test input")
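
The change above tightens the expected-error assertion: pytest.raises(match=...) applies re.search to the exception text, so an unanchored pattern passes whenever the message merely contains it, while the new ^...$ raw-string pattern requires the full message. A minimal illustration (not from the kiln test suite):

import pytest

# Anchors make the assertion fail if the message has unexpected prefix/suffix text.
with pytest.raises(ValueError, match=r"^exact message$"):
    raise ValueError("exact message")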
 
@@ -0,0 +1,317 @@
+ import tempfile
+ import uuid
+ from enum import Enum
+ from pathlib import Path
+
+ import pytest
+
+ from kiln_ai.datamodel.basemodel import KilnAttachmentModel
+ from kiln_ai.datamodel.chunk import Chunk, ChunkedDocument, ChunkerConfig, ChunkerType
+ from kiln_ai.datamodel.project import Project
+
+
+ @pytest.fixture
+ def mock_project(tmp_path):
+ project_root = tmp_path / str(uuid.uuid4())
+ project_root.mkdir()
+ project = Project(
+ name="Test Project",
+ description="Test description",
+ path=project_root / "project.kiln",
+ )
+ project.save_to_file()
+ return project
+
+
+ class TestFixedWindowChunkerProperties:
+ """Test the FixedWindowChunkerProperties class."""
+
+ def test_required_fields(self):
+ """Test that required fields are set correctly."""
+ with pytest.raises(ValueError):
+ ChunkerConfig(
+ name="test-chunker",
+ chunker_type=ChunkerType.FIXED_WINDOW,
+ properties={},
+ )
+
+ def test_custom_values(self):
+ """Test that custom values can be set."""
+ config = ChunkerConfig(
+ name="test-chunker",
+ chunker_type=ChunkerType.FIXED_WINDOW,
+ properties={"chunk_size": 512, "chunk_overlap": 20},
+ )
+ assert config.properties == {
+ "chunk_size": 512,
+ "chunk_overlap": 20,
+ }
+
+ assert config.chunk_size() == 512
+ assert config.chunk_overlap() == 20
+
+ def test_validation_positive_values(self):
+ """Test that positive values are accepted."""
+ config = ChunkerConfig(
+ name="test-chunker",
+ chunker_type=ChunkerType.FIXED_WINDOW,
+ properties={"chunk_size": 1, "chunk_overlap": 0},
+ )
+ assert config.properties == {
+ "chunk_size": 1,
+ "chunk_overlap": 0,
+ }
+
+ assert config.chunk_size() == 1
+ assert config.chunk_overlap() == 0
+
+ def test_validation_negative_values(self):
+ """Test that negative values are rejected."""
+ with pytest.raises(ValueError):
+ ChunkerConfig(
+ name="test-chunker",
+ chunker_type=ChunkerType.FIXED_WINDOW,
+ properties={"chunk_size": -1, "chunk_overlap": -1},
+ )
+
+ def test_validation_zero_chunk_size(self):
+ """Test that zero chunk size is rejected."""
+ with pytest.raises(ValueError):
+ ChunkerConfig(
+ name="test-chunker",
+ chunker_type=ChunkerType.FIXED_WINDOW,
+ properties={"chunk_size": 0, "chunk_overlap": 0},
+ )
+
+ def test_validation_overlap_greater_than_chunk_size(self):
+ """Test that overlap is greater than chunk size."""
+ with pytest.raises(ValueError):
+ ChunkerConfig(
+ name="test-chunker",
+ chunker_type=ChunkerType.FIXED_WINDOW,
+ properties={"chunk_size": 100, "chunk_overlap": 101},
+ )
+
+ def test_validation_overlap_less_than_zero(self):
+ """Test that overlap is less than zero."""
+ with pytest.raises(ValueError):
+ ChunkerConfig(
+ name="test-chunker",
+ chunker_type=ChunkerType.FIXED_WINDOW,
+ properties={"chunk_size": 100, "chunk_overlap": -1},
+ )
+
+ def test_validation_overlap_without_chunk_size(self):
+ """Test that overlap without chunk size is rejected."""
+ with pytest.raises(ValueError):
+ ChunkerConfig(
+ name="test-chunker",
+ chunker_type=ChunkerType.FIXED_WINDOW,
+ properties={"chunk_overlap": 10},
+ )
+
+ def test_validation_chunk_size_without_overlap(self):
+ """Test that chunk size without overlap will raise an error."""
+ with pytest.raises(ValueError, match=r"Chunk overlap is required."):
+ ChunkerConfig(
+ name="test-chunker",
+ chunker_type=ChunkerType.FIXED_WINDOW,
+ properties={"chunk_size": 100},
+ )
+
+ def test_validation_wrong_type(self):
+ """Test that wrong type is rejected."""
+ with pytest.raises(ValueError):
+ ChunkerConfig(
+ name="test-chunker",
+ chunker_type=ChunkerType.FIXED_WINDOW,
+ properties={"chunk_size": "100", "chunk_overlap": 10},
+ )
+
+ def test_validation_none_values(self):
+ """Reject none values - we prefer not to have the properties rather than a None."""
+ with pytest.raises(ValueError):
+ ChunkerConfig(
+ name="test-chunker",
+ chunker_type=ChunkerType.FIXED_WINDOW,
+ properties={"chunk_size": None, "chunk_overlap": 15},
+ )
+
+
+ class TestChunkerType:
+ """Test the ChunkerType enum."""
+
+ def test_enum_values(self):
+ """Test that enum has the expected values."""
+ assert ChunkerType.FIXED_WINDOW == "fixed_window"
+
+ def test_enum_inheritance(self):
+ """Test that ChunkerType inherits from str and Enum."""
+ assert issubclass(ChunkerType, str)
+ assert issubclass(ChunkerType, Enum)
+
+ def test_enum_comparison(self):
+ """Test enum comparison operations."""
+ assert ChunkerType.FIXED_WINDOW == "fixed_window"
+ assert ChunkerType.FIXED_WINDOW.value == "fixed_window"
+
+
+ class TestChunkerConfig:
+ """Test the ChunkerConfig class."""
+
+ def test_optional_description(self):
+ """Test that description is optional."""
+ config = ChunkerConfig(
+ name="test-chunker",
+ chunker_type=ChunkerType.FIXED_WINDOW,
+ properties={
+ "chunk_size": 100,
+ "chunk_overlap": 10,
+ },
+ )
+ assert config.description is None
+
+ config_with_desc = ChunkerConfig(
+ name="test-chunker",
+ description="A test chunker",
+ chunker_type=ChunkerType.FIXED_WINDOW,
+ properties={
+ "chunk_size": 100,
+ "chunk_overlap": 10,
+ },
+ )
+ assert config_with_desc.description == "A test chunker"
+
+ def test_name_validation(self):
+ """Test name field validation."""
+ # Test valid name
+ config = ChunkerConfig(
+ name="valid-name_123",
+ chunker_type=ChunkerType.FIXED_WINDOW,
+ properties={
+ "chunk_size": 100,
+ "chunk_overlap": 10,
+ },
+ )
+ assert config.name == "valid-name_123"
+
+ # Test invalid name (contains special characters)
+ with pytest.raises(ValueError):
+ ChunkerConfig(
+ name="invalid@name",
+ chunker_type=ChunkerType.FIXED_WINDOW,
+ properties={},
+ )
+
+ # Test empty name
+ with pytest.raises(ValueError):
+ ChunkerConfig(
+ name="",
+ chunker_type=ChunkerType.FIXED_WINDOW,
+ properties={},
+ )
+
+ def test_parent_project_method_no_parent(self):
+ """Test parent_project method when no parent is set."""
+ config = ChunkerConfig(
+ name="test-chunker",
+ chunker_type=ChunkerType.FIXED_WINDOW,
+ properties={
+ "chunk_size": 100,
+ "chunk_overlap": 10,
+ },
+ )
+ assert config.parent_project() is None
+
+
+ class TestChunk:
+ """Test the Chunk class."""
+
+ def test_required_fields(self):
+ """Test that required fields are properly validated."""
+ # Create a temporary file for the content
+ with tempfile.NamedTemporaryFile(delete=True) as tmp_file:
+ tmp_file.write(b"test content")
+ tmp_path = Path(tmp_file.name)
+
+ attachment = KilnAttachmentModel.from_file(tmp_path)
+ chunk = Chunk(content=attachment)
+ assert chunk.content == attachment
+
+ def test_content_validation(self):
+ """Test that content field is properly validated."""
+ # Create a temporary file for the attachment
+ with tempfile.NamedTemporaryFile(delete=True) as tmp_file:
+ tmp_file.write(b"test content")
+ tmp_path = Path(tmp_file.name)
+
+ # Test with valid attachment
+ attachment = KilnAttachmentModel.from_file(tmp_path)
+ chunk = Chunk(content=attachment)
+ assert chunk.content == attachment
+
+ # Test that attachment is required
+ with pytest.raises(ValueError):
+ Chunk(content=None)
+
+
+ class TestChunkedDocument:
+ """Test the ChunkedDocument class."""
+
+ def test_required_fields(self):
+ """Test that required fields are properly validated."""
+ chunks = []
+ doc = ChunkedDocument(chunks=chunks, chunker_config_id="fake-id")
+ assert doc.chunks == chunks
+
+ def test_with_chunks(self):
+ """Test with actual chunks."""
+ # Create a temporary file for the attachment
+ with tempfile.NamedTemporaryFile(delete=True) as tmp_file:
+ tmp_file.write(b"test content")
+ tmp_path = Path(tmp_file.name)
+
+ attachment = KilnAttachmentModel.from_file(tmp_path)
+ chunk1 = Chunk(content=attachment)
+ chunk2 = Chunk(content=attachment)
+
+ chunks = [chunk1, chunk2]
+ doc = ChunkedDocument(chunks=chunks, chunker_config_id="fake-id")
+ assert doc.chunks == chunks
+ assert len(doc.chunks) == 2
+
+ def test_parent_extraction_method_no_parent(self):
+ """Test parent_extraction method when no parent is set."""
+ doc = ChunkedDocument(chunks=[], chunker_config_id="fake-id")
+ assert doc.parent_extraction() is None
+
+ def test_empty_chunks_list(self):
+ """Test that empty chunks list is valid."""
+ doc = ChunkedDocument(chunks=[], chunker_config_id="fake-id")
+ assert doc.chunks == []
+ assert len(doc.chunks) == 0
+
+ def test_chunks_validation(self):
+ """Test that chunks field validation works correctly."""
+ # Create a temporary file for the attachment
+ with tempfile.NamedTemporaryFile(delete=True) as tmp_file:
+ tmp_file.write(b"test content")
+ tmp_path = Path(tmp_file.name)
+
+ # Test with valid list of chunks
+ attachment = KilnAttachmentModel.from_file(tmp_path)
+ chunk = Chunk(content=attachment)
+ chunks = [chunk]
+
+ doc = ChunkedDocument(
+ chunks=chunks,
+ chunker_config_id="fake-id",
+ )
+ assert doc.chunks == chunks
+
+ # Test that chunks must be a list
+ with pytest.raises(ValueError):
+ ChunkedDocument(
+ chunks=chunk,
+ chunker_config_id="fake-id",
+ )
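
Taken together, the validation tests above pin down the fixed-window chunker's property rules: both properties are required, chunk_size must be at least 1, and chunk_overlap must satisfy 0 <= chunk_overlap < chunk_size. A minimal valid configuration, assuming the import path shown in the new test file:

from kiln_ai.datamodel.chunk import ChunkerConfig, ChunkerType

# Values chosen to satisfy the constraints exercised above; the name follows
# the "valid-name_123" style accepted by the name-validation test.
config = ChunkerConfig(
    name="docs-fixed-window",
    chunker_type=ChunkerType.FIXED_WINDOW,
    properties={"chunk_size": 512, "chunk_overlap": 64},
)
assert config.chunk_size() == 512
assert config.chunk_overlap() == 64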
@@ -120,7 +120,7 @@ def test_dataset_split_validation():
  DatasetSplitDefinition(name="train", percentage=0.8),
  DatasetSplitDefinition(name="test", percentage=0.3),
  ]
- with pytest.raises(ValueError, match="sum of split percentages must be 1.0"):
+ with pytest.raises(ValueError, match=r"sum of split percentages must be 1.0"):
  DatasetSplit(
  name="test_split",
  splits=invalid_splits,