cognee 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cognee/__init__.py +1 -0
- cognee/api/client.py +9 -5
- cognee/api/v1/add/add.py +2 -1
- cognee/api/v1/add/routers/get_add_router.py +3 -1
- cognee/api/v1/cognify/cognify.py +24 -16
- cognee/api/v1/cognify/routers/__init__.py +0 -1
- cognee/api/v1/cognify/routers/get_cognify_router.py +30 -1
- cognee/api/v1/datasets/routers/get_datasets_router.py +3 -3
- cognee/api/v1/ontologies/__init__.py +4 -0
- cognee/api/v1/ontologies/ontologies.py +158 -0
- cognee/api/v1/ontologies/routers/__init__.py +0 -0
- cognee/api/v1/ontologies/routers/get_ontology_router.py +109 -0
- cognee/api/v1/permissions/routers/get_permissions_router.py +41 -1
- cognee/api/v1/search/search.py +4 -0
- cognee/api/v1/ui/node_setup.py +360 -0
- cognee/api/v1/ui/npm_utils.py +50 -0
- cognee/api/v1/ui/ui.py +38 -68
- cognee/cli/commands/cognify_command.py +8 -1
- cognee/cli/config.py +1 -1
- cognee/context_global_variables.py +86 -9
- cognee/eval_framework/Dockerfile +29 -0
- cognee/eval_framework/answer_generation/answer_generation_executor.py +10 -0
- cognee/eval_framework/answer_generation/run_question_answering_module.py +1 -1
- cognee/eval_framework/corpus_builder/task_getters/get_cascade_graph_tasks.py +0 -2
- cognee/eval_framework/corpus_builder/task_getters/get_default_tasks_by_indices.py +4 -4
- cognee/eval_framework/eval_config.py +2 -2
- cognee/eval_framework/modal_run_eval.py +16 -28
- cognee/infrastructure/databases/cache/config.py +3 -1
- cognee/infrastructure/databases/cache/fscache/FsCacheAdapter.py +151 -0
- cognee/infrastructure/databases/cache/get_cache_engine.py +20 -10
- cognee/infrastructure/databases/dataset_database_handler/__init__.py +3 -0
- cognee/infrastructure/databases/dataset_database_handler/dataset_database_handler_interface.py +80 -0
- cognee/infrastructure/databases/dataset_database_handler/supported_dataset_database_handlers.py +18 -0
- cognee/infrastructure/databases/dataset_database_handler/use_dataset_database_handler.py +10 -0
- cognee/infrastructure/databases/exceptions/exceptions.py +16 -0
- cognee/infrastructure/databases/graph/config.py +7 -0
- cognee/infrastructure/databases/graph/get_graph_engine.py +3 -0
- cognee/infrastructure/databases/graph/graph_db_interface.py +15 -0
- cognee/infrastructure/databases/graph/kuzu/KuzuDatasetDatabaseHandler.py +81 -0
- cognee/infrastructure/databases/graph/kuzu/adapter.py +228 -0
- cognee/infrastructure/databases/graph/neo4j_driver/Neo4jAuraDevDatasetDatabaseHandler.py +168 -0
- cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +80 -1
- cognee/infrastructure/databases/hybrid/neptune_analytics/NeptuneAnalyticsAdapter.py +9 -0
- cognee/infrastructure/databases/utils/__init__.py +3 -0
- cognee/infrastructure/databases/utils/get_graph_dataset_database_handler.py +10 -0
- cognee/infrastructure/databases/utils/get_or_create_dataset_database.py +66 -18
- cognee/infrastructure/databases/utils/get_vector_dataset_database_handler.py +10 -0
- cognee/infrastructure/databases/utils/resolve_dataset_database_connection_info.py +30 -0
- cognee/infrastructure/databases/vector/config.py +5 -0
- cognee/infrastructure/databases/vector/create_vector_engine.py +6 -1
- cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +8 -6
- cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +9 -7
- cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +11 -10
- cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +2 -0
- cognee/infrastructure/databases/vector/lancedb/LanceDBDatasetDatabaseHandler.py +50 -0
- cognee/infrastructure/databases/vector/vector_db_interface.py +35 -0
- cognee/infrastructure/engine/models/Edge.py +13 -1
- cognee/infrastructure/files/storage/s3_config.py +2 -0
- cognee/infrastructure/files/utils/guess_file_type.py +4 -0
- cognee/infrastructure/llm/LLMGateway.py +5 -2
- cognee/infrastructure/llm/config.py +37 -0
- cognee/infrastructure/llm/extraction/knowledge_graph/extract_content_graph.py +2 -2
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/acreate_structured_output.py +23 -8
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py +22 -18
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/bedrock/__init__.py +5 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/bedrock/adapter.py +153 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py +47 -38
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py +46 -37
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py +20 -10
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/adapter.py +23 -11
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py +36 -23
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +47 -36
- cognee/infrastructure/loaders/LoaderEngine.py +1 -0
- cognee/infrastructure/loaders/core/__init__.py +2 -1
- cognee/infrastructure/loaders/core/csv_loader.py +93 -0
- cognee/infrastructure/loaders/core/text_loader.py +1 -2
- cognee/infrastructure/loaders/external/advanced_pdf_loader.py +0 -9
- cognee/infrastructure/loaders/supported_loaders.py +2 -1
- cognee/memify_pipelines/create_triplet_embeddings.py +53 -0
- cognee/memify_pipelines/persist_sessions_in_knowledge_graph.py +55 -0
- cognee/modules/chunking/CsvChunker.py +35 -0
- cognee/modules/chunking/models/DocumentChunk.py +2 -1
- cognee/modules/chunking/text_chunker_with_overlap.py +124 -0
- cognee/modules/cognify/config.py +2 -0
- cognee/modules/data/deletion/prune_system.py +52 -2
- cognee/modules/data/methods/__init__.py +1 -0
- cognee/modules/data/methods/create_dataset.py +4 -2
- cognee/modules/data/methods/delete_dataset.py +26 -0
- cognee/modules/data/methods/get_dataset_ids.py +5 -1
- cognee/modules/data/methods/get_unique_data_id.py +68 -0
- cognee/modules/data/methods/get_unique_dataset_id.py +66 -4
- cognee/modules/data/models/Dataset.py +2 -0
- cognee/modules/data/processing/document_types/CsvDocument.py +33 -0
- cognee/modules/data/processing/document_types/__init__.py +1 -0
- cognee/modules/engine/models/Triplet.py +9 -0
- cognee/modules/engine/models/__init__.py +1 -0
- cognee/modules/graph/cognee_graph/CogneeGraph.py +89 -39
- cognee/modules/graph/cognee_graph/CogneeGraphElements.py +8 -3
- cognee/modules/graph/utils/expand_with_nodes_and_edges.py +19 -2
- cognee/modules/graph/utils/resolve_edges_to_text.py +48 -49
- cognee/modules/ingestion/identify.py +4 -4
- cognee/modules/memify/memify.py +1 -7
- cognee/modules/notebooks/operations/run_in_local_sandbox.py +3 -0
- cognee/modules/ontology/rdf_xml/RDFLibOntologyResolver.py +55 -23
- cognee/modules/pipelines/operations/pipeline.py +18 -2
- cognee/modules/pipelines/operations/run_tasks_data_item.py +1 -1
- cognee/modules/retrieval/EntityCompletionRetriever.py +10 -3
- cognee/modules/retrieval/__init__.py +1 -1
- cognee/modules/retrieval/base_graph_retriever.py +7 -3
- cognee/modules/retrieval/base_retriever.py +7 -3
- cognee/modules/retrieval/completion_retriever.py +11 -4
- cognee/modules/retrieval/graph_completion_context_extension_retriever.py +10 -2
- cognee/modules/retrieval/graph_completion_cot_retriever.py +18 -51
- cognee/modules/retrieval/graph_completion_retriever.py +14 -1
- cognee/modules/retrieval/graph_summary_completion_retriever.py +4 -0
- cognee/modules/retrieval/register_retriever.py +10 -0
- cognee/modules/retrieval/registered_community_retrievers.py +1 -0
- cognee/modules/retrieval/temporal_retriever.py +13 -2
- cognee/modules/retrieval/triplet_retriever.py +182 -0
- cognee/modules/retrieval/utils/brute_force_triplet_search.py +43 -11
- cognee/modules/retrieval/utils/completion.py +2 -22
- cognee/modules/run_custom_pipeline/__init__.py +1 -0
- cognee/modules/run_custom_pipeline/run_custom_pipeline.py +76 -0
- cognee/modules/search/methods/get_search_type_tools.py +54 -8
- cognee/modules/search/methods/no_access_control_search.py +4 -0
- cognee/modules/search/methods/search.py +26 -3
- cognee/modules/search/types/SearchType.py +1 -1
- cognee/modules/settings/get_settings.py +19 -0
- cognee/modules/users/methods/create_user.py +12 -27
- cognee/modules/users/methods/get_authenticated_user.py +3 -2
- cognee/modules/users/methods/get_default_user.py +4 -2
- cognee/modules/users/methods/get_user.py +1 -1
- cognee/modules/users/methods/get_user_by_email.py +1 -1
- cognee/modules/users/models/DatasetDatabase.py +24 -3
- cognee/modules/users/models/Tenant.py +6 -7
- cognee/modules/users/models/User.py +6 -5
- cognee/modules/users/models/UserTenant.py +12 -0
- cognee/modules/users/models/__init__.py +1 -0
- cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py +13 -13
- cognee/modules/users/roles/methods/add_user_to_role.py +3 -1
- cognee/modules/users/tenants/methods/__init__.py +1 -0
- cognee/modules/users/tenants/methods/add_user_to_tenant.py +21 -12
- cognee/modules/users/tenants/methods/create_tenant.py +22 -8
- cognee/modules/users/tenants/methods/select_tenant.py +62 -0
- cognee/shared/logging_utils.py +6 -0
- cognee/shared/rate_limiting.py +30 -0
- cognee/tasks/chunks/__init__.py +1 -0
- cognee/tasks/chunks/chunk_by_row.py +94 -0
- cognee/tasks/documents/__init__.py +0 -1
- cognee/tasks/documents/classify_documents.py +2 -0
- cognee/tasks/feedback/generate_improved_answers.py +3 -3
- cognee/tasks/graph/extract_graph_from_data.py +9 -10
- cognee/tasks/ingestion/ingest_data.py +1 -1
- cognee/tasks/memify/__init__.py +2 -0
- cognee/tasks/memify/cognify_session.py +41 -0
- cognee/tasks/memify/extract_user_sessions.py +73 -0
- cognee/tasks/memify/get_triplet_datapoints.py +289 -0
- cognee/tasks/storage/add_data_points.py +142 -2
- cognee/tasks/storage/index_data_points.py +33 -22
- cognee/tasks/storage/index_graph_edges.py +37 -57
- cognee/tests/integration/documents/CsvDocument_test.py +70 -0
- cognee/tests/integration/retrieval/test_triplet_retriever.py +84 -0
- cognee/tests/integration/tasks/test_add_data_points.py +139 -0
- cognee/tests/integration/tasks/test_get_triplet_datapoints.py +69 -0
- cognee/tests/integration/web_url_crawler/test_default_url_crawler.py +1 -1
- cognee/tests/integration/web_url_crawler/test_tavily_crawler.py +1 -1
- cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py +13 -27
- cognee/tests/tasks/entity_extraction/entity_extraction_test.py +1 -1
- cognee/tests/test_add_docling_document.py +2 -2
- cognee/tests/test_cognee_server_start.py +84 -3
- cognee/tests/test_conversation_history.py +68 -5
- cognee/tests/test_data/example_with_header.csv +3 -0
- cognee/tests/test_dataset_database_handler.py +137 -0
- cognee/tests/test_dataset_delete.py +76 -0
- cognee/tests/test_edge_centered_payload.py +170 -0
- cognee/tests/test_edge_ingestion.py +27 -0
- cognee/tests/test_feedback_enrichment.py +1 -1
- cognee/tests/test_library.py +6 -4
- cognee/tests/test_load.py +62 -0
- cognee/tests/test_multi_tenancy.py +165 -0
- cognee/tests/test_parallel_databases.py +2 -0
- cognee/tests/test_pipeline_cache.py +164 -0
- cognee/tests/test_relational_db_migration.py +54 -2
- cognee/tests/test_search_db.py +44 -2
- cognee/tests/unit/api/test_conditional_authentication_endpoints.py +12 -3
- cognee/tests/unit/api/test_ontology_endpoint.py +252 -0
- cognee/tests/unit/infrastructure/databases/cache/test_cache_config.py +5 -0
- cognee/tests/unit/infrastructure/databases/test_index_data_points.py +27 -0
- cognee/tests/unit/infrastructure/databases/test_index_graph_edges.py +14 -16
- cognee/tests/unit/infrastructure/llm/test_llm_config.py +46 -0
- cognee/tests/unit/infrastructure/mock_embedding_engine.py +3 -7
- cognee/tests/unit/infrastructure/test_embedding_rate_limiting_realistic.py +0 -5
- cognee/tests/unit/modules/chunking/test_text_chunker.py +248 -0
- cognee/tests/unit/modules/chunking/test_text_chunker_with_overlap.py +324 -0
- cognee/tests/unit/modules/graph/cognee_graph_elements_test.py +2 -2
- cognee/tests/unit/modules/graph/cognee_graph_test.py +406 -0
- cognee/tests/unit/modules/memify_tasks/test_cognify_session.py +111 -0
- cognee/tests/unit/modules/memify_tasks/test_extract_user_sessions.py +175 -0
- cognee/tests/unit/modules/memify_tasks/test_get_triplet_datapoints.py +214 -0
- cognee/tests/unit/modules/retrieval/graph_completion_retriever_cot_test.py +0 -51
- cognee/tests/unit/modules/retrieval/rag_completion_retriever_test.py +1 -0
- cognee/tests/unit/modules/retrieval/structured_output_test.py +204 -0
- cognee/tests/unit/modules/retrieval/summaries_retriever_test.py +1 -1
- cognee/tests/unit/modules/retrieval/temporal_retriever_test.py +0 -1
- cognee/tests/unit/modules/retrieval/test_brute_force_triplet_search.py +608 -0
- cognee/tests/unit/modules/retrieval/triplet_retriever_test.py +83 -0
- cognee/tests/unit/modules/users/test_conditional_authentication.py +0 -63
- cognee/tests/unit/processing/chunks/chunk_by_row_test.py +52 -0
- cognee/tests/unit/tasks/storage/test_add_data_points.py +288 -0
- {cognee-0.4.0.dist-info → cognee-0.5.0.dist-info}/METADATA +11 -6
- {cognee-0.4.0.dist-info → cognee-0.5.0.dist-info}/RECORD +215 -163
- {cognee-0.4.0.dist-info → cognee-0.5.0.dist-info}/WHEEL +1 -1
- {cognee-0.4.0.dist-info → cognee-0.5.0.dist-info}/entry_points.txt +0 -1
- cognee/api/v1/cognify/code_graph_pipeline.py +0 -119
- cognee/api/v1/cognify/routers/get_code_pipeline_router.py +0 -90
- cognee/infrastructure/databases/vector/embeddings/embedding_rate_limiter.py +0 -544
- cognee/modules/retrieval/code_retriever.py +0 -232
- cognee/tasks/code/enrich_dependency_graph_checker.py +0 -35
- cognee/tasks/code/get_local_dependencies_checker.py +0 -20
- cognee/tasks/code/get_repo_dependency_graph_checker.py +0 -35
- cognee/tasks/documents/check_permissions_on_dataset.py +0 -26
- cognee/tasks/repo_processor/__init__.py +0 -2
- cognee/tasks/repo_processor/get_local_dependencies.py +0 -335
- cognee/tasks/repo_processor/get_non_code_files.py +0 -158
- cognee/tasks/repo_processor/get_repo_file_dependencies.py +0 -243
- {cognee-0.4.0.dist-info → cognee-0.5.0.dist-info}/licenses/LICENSE +0 -0
- {cognee-0.4.0.dist-info → cognee-0.5.0.dist-info}/licenses/NOTICE.md +0 -0
cognee/tests/unit/infrastructure/llm/test_llm_config.py (new file, +46)

```diff
@@ -0,0 +1,46 @@
+import pytest
+
+from cognee.infrastructure.llm.config import LLMConfig
+
+
+def test_strip_quotes_from_strings():
+    """
+    Test if the LLMConfig.strip_quotes_from_strings model validator behaves as expected.
+    """
+    config = LLMConfig(
+        # Strings with surrounding double quotes ("value" → value)
+        llm_api_key='"double_value"',
+        # Strings with surrounding single quotes ('value' → value)
+        llm_endpoint="'single_value'",
+        # Strings without quotes (value → value)
+        llm_api_version="no_quotes_value",
+        # Empty quoted strings ("" → empty string)
+        fallback_model='""',
+        # None values (should remain None)
+        baml_llm_api_key=None,
+        # Mixed quotes ("value' → unchanged)
+        fallback_endpoint="\"mixed_quote'",
+        # Strings with internal quotes ("internal\"quotes" → internal"quotes")
+        baml_llm_model='"internal"quotes"',
+    )
+
+    # Strings with surrounding double quotes ("value" → value)
+    assert config.llm_api_key == "double_value"
+
+    # Strings with surrounding single quotes ('value' → value)
+    assert config.llm_endpoint == "single_value"
+
+    # Strings without quotes (value → value)
+    assert config.llm_api_version == "no_quotes_value"
+
+    # Empty quoted strings ("" → empty string)
+    assert config.fallback_model == ""
+
+    # None values (should remain None)
+    assert config.baml_llm_api_key is None
+
+    # Mixed quotes ("value' → unchanged)
+    assert config.fallback_endpoint == "\"mixed_quote'"
+
+    # Strings with internal quotes ("internal\"quotes" → internal"quotes")
+    assert config.baml_llm_model == 'internal"quotes'
```
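The `strip_quotes_from_strings` validator this test exercises lands in `cognee/infrastructure/llm/config.py` (+37 above) but is not itself shown in this excerpt. A minimal sketch consistent with the tested cases, assuming a Pydantic v2 after-validator and using only the field names visible in the test:

```python
# Hypothetical sketch only -- the real validator ships in
# cognee/infrastructure/llm/config.py and may differ in detail.
from typing import Optional

from pydantic import BaseModel, model_validator


class LLMConfigSketch(BaseModel):
    llm_api_key: Optional[str] = None
    llm_endpoint: Optional[str] = None
    llm_api_version: Optional[str] = None
    fallback_model: Optional[str] = None
    fallback_endpoint: Optional[str] = None
    baml_llm_api_key: Optional[str] = None
    baml_llm_model: Optional[str] = None

    @model_validator(mode="after")
    def strip_quotes_from_strings(self):
        # Strip exactly one pair of *matching* surrounding quotes; leave None,
        # unquoted, and mismatched-quote values untouched.
        for name, value in self.__dict__.items():
            if isinstance(value, str) and len(value) >= 2:
                if value[0] == value[-1] and value[0] in ('"', "'"):
                    setattr(self, name, value[1:-1])
        return self
```

This reproduces every assertion above: `'"double_value"'` and `"'single_value'"` lose their quotes, `'""'` becomes the empty string, and the mixed-quote value is left as-is because its first and last characters differ.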
cognee/tests/unit/infrastructure/mock_embedding_engine.py (+3 -7)

```diff
@@ -4,10 +4,7 @@ from typing import List
 from cognee.infrastructure.databases.vector.embeddings.LiteLLMEmbeddingEngine import (
     LiteLLMEmbeddingEngine,
 )
-from cognee.infrastructure.databases.vector.embeddings.embedding_rate_limiter import (
-    embedding_rate_limit_async,
-    embedding_sleep_and_retry_async,
-)
+from cognee.shared.rate_limiting import embedding_rate_limiter_context_manager
 
 
 class MockEmbeddingEngine(LiteLLMEmbeddingEngine):
@@ -34,8 +31,6 @@ class MockEmbeddingEngine(LiteLLMEmbeddingEngine):
         self.fail_every_n_requests = fail_every_n_requests
         self.add_delay = add_delay
 
-    @embedding_sleep_and_retry_async()
-    @embedding_rate_limit_async
     async def embed_text(self, text: List[str]) -> List[List[float]]:
         """
         Mock implementation that returns fixed embeddings and can
@@ -52,4 +47,5 @@ class MockEmbeddingEngine(LiteLLMEmbeddingEngine):
             raise Exception(f"Mock failure on request #{self.request_count}")
 
         # Return mock embeddings of the correct dimension
-        return [[0.1] * self.dimensions for _ in text]
+        async with embedding_rate_limiter_context_manager():
+            return [[0.1] * self.dimensions for _ in text]
```
cognee/tests/unit/infrastructure/test_embedding_rate_limiting_realistic.py (+0 -5)

```diff
@@ -6,9 +6,6 @@ import logging
 from cognee.infrastructure.llm.config import (
     get_llm_config,
 )
-from cognee.infrastructure.databases.vector.embeddings.embedding_rate_limiter import (
-    EmbeddingRateLimiter,
-)
 from cognee.tests.unit.infrastructure.mock_embedding_engine import MockEmbeddingEngine
 
 # Configure logging
@@ -33,7 +30,6 @@ async def test_embedding_rate_limiting_realistic():
 
     # Clear the config and rate limiter caches to ensure our settings are applied
     get_llm_config.cache_clear()
-    EmbeddingRateLimiter.reset_instance()
 
     # Create a fresh config instance and verify settings
     config = get_llm_config()
@@ -170,7 +166,6 @@ async def test_with_mock_failures():
 
     # Clear caches
     get_llm_config.cache_clear()
-    EmbeddingRateLimiter.reset_instance()
 
     # Create a mock engine configured to fail every 3rd request
     engine = MockEmbeddingEngine()
```
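Both test files above drop the deleted decorator/singleton-based `embedding_rate_limiter` module (−544 lines) in favor of the new `cognee/shared/rate_limiting.py` (+30). That module's implementation is not part of this excerpt; a minimal sketch of an async context-manager rate limiter, assuming a simple semaphore cap (the function name comes from the diff, the body is a guess):

```python
# Hypothetical sketch -- the real implementation lives in the new
# cognee/shared/rate_limiting.py and is not shown in this diff.
import asyncio
from contextlib import asynccontextmanager

# Assumed: a process-wide cap on concurrent embedding calls; the real module
# likely derives its limit from the LLM/embedding config instead.
_embedding_semaphore = asyncio.Semaphore(10)


@asynccontextmanager
async def embedding_rate_limiter_context_manager():
    # Callers wrap only the rate-limited section, as MockEmbeddingEngine.embed_text
    # does above, instead of decorating whole methods.
    async with _embedding_semaphore:
        yield
```

A context manager scopes the limiting to the exact await that needs it, which also explains why `reset_instance()` calls disappear from the tests: there is no longer a singleton to reset between runs.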
cognee/tests/unit/modules/chunking/test_text_chunker.py (new file, +248)

```diff
@@ -0,0 +1,248 @@
+"""Unit tests for TextChunker and TextChunkerWithOverlap behavioral equivalence."""
+
+import pytest
+from uuid import uuid4
+
+from cognee.modules.chunking.TextChunker import TextChunker
+from cognee.modules.chunking.text_chunker_with_overlap import TextChunkerWithOverlap
+from cognee.modules.data.processing.document_types import Document
+
+
+@pytest.fixture(params=["TextChunker", "TextChunkerWithOverlap"])
+def chunker_class(request):
+    """Parametrize tests to run against both implementations."""
+    return TextChunker if request.param == "TextChunker" else TextChunkerWithOverlap
+
+
+@pytest.fixture
+def make_text_generator():
+    """Factory for async text generators."""
+
+    def _factory(*texts):
+        async def gen():
+            for text in texts:
+                yield text
+
+        return gen
+
+    return _factory
+
+
+async def collect_chunks(chunker):
+    """Consume async generator and return list of chunks."""
+    chunks = []
+    async for chunk in chunker.read():
+        chunks.append(chunk)
+    return chunks
+
+
+@pytest.mark.asyncio
+async def test_empty_input_produces_no_chunks(chunker_class, make_text_generator):
+    """Empty input should yield no chunks."""
+    document = Document(
+        id=uuid4(),
+        name="test_document",
+        raw_data_location="/test/path",
+        external_metadata=None,
+        mime_type="text/plain",
+    )
+    get_text = make_text_generator("")
+    chunker = chunker_class(document, get_text, max_chunk_size=512)
+    chunks = await collect_chunks(chunker)
+
+    assert len(chunks) == 0, "Empty input should produce no chunks"
+
+
+@pytest.mark.asyncio
+async def test_whitespace_only_input_emits_single_chunk(chunker_class, make_text_generator):
+    """Whitespace-only input should produce exactly one chunk with unchanged text."""
+    whitespace_text = " \n\t \r\n "
+    document = Document(
+        id=uuid4(),
+        name="test_document",
+        raw_data_location="/test/path",
+        external_metadata=None,
+        mime_type="text/plain",
+    )
+    get_text = make_text_generator(whitespace_text)
+    chunker = chunker_class(document, get_text, max_chunk_size=512)
+    chunks = await collect_chunks(chunker)
+
+    assert len(chunks) == 1, "Whitespace-only input should produce exactly one chunk"
+    assert chunks[0].text == whitespace_text, "Chunk text should equal input (whitespace preserved)"
+    assert chunks[0].chunk_index == 0, "First chunk should have index 0"
+
+
+@pytest.mark.asyncio
+async def test_single_paragraph_below_limit_emits_one_chunk(chunker_class, make_text_generator):
+    """Single paragraph below limit should emit exactly one chunk."""
+    text = "This is a short paragraph."
+    document = Document(
+        id=uuid4(),
+        name="test_document",
+        raw_data_location="/test/path",
+        external_metadata=None,
+        mime_type="text/plain",
+    )
+    get_text = make_text_generator(text)
+    chunker = chunker_class(document, get_text, max_chunk_size=512)
+    chunks = await collect_chunks(chunker)
+
+    assert len(chunks) == 1, "Single short paragraph should produce exactly one chunk"
+    assert chunks[0].text == text, "Chunk text should match input"
+    assert chunks[0].chunk_index == 0, "First chunk should have index 0"
+    assert chunks[0].chunk_size > 0, "Chunk should have positive size"
+
+
+@pytest.mark.asyncio
+async def test_oversized_paragraph_gets_emitted_as_a_single_chunk(
+    chunker_class, make_text_generator
+):
+    """Oversized paragraph from chunk_by_paragraph should be emitted as single chunk."""
+    text = ("A" * 1500) + ". Next sentence."
+    document = Document(
+        id=uuid4(),
+        name="test_document",
+        raw_data_location="/test/path",
+        external_metadata=None,
+        mime_type="text/plain",
+    )
+    get_text = make_text_generator(text)
+    chunker = chunker_class(document, get_text, max_chunk_size=50)
+    chunks = await collect_chunks(chunker)
+
+    assert len(chunks) == 2, "Should produce 2 chunks (oversized paragraph + next sentence)"
+    assert chunks[0].chunk_size > 50, "First chunk should be oversized"
+    assert chunks[0].chunk_index == 0, "First chunk should have index 0"
+    assert chunks[1].chunk_index == 1, "Second chunk should have index 1"
+
+
+@pytest.mark.asyncio
+async def test_overflow_on_next_paragraph_emits_separate_chunk(chunker_class, make_text_generator):
+    """First paragraph near limit plus small paragraph should produce two separate chunks."""
+    first_para = " ".join(["word"] * 5)
+    second_para = "Short text."
+    text = first_para + " " + second_para
+    document = Document(
+        id=uuid4(),
+        name="test_document",
+        raw_data_location="/test/path",
+        external_metadata=None,
+        mime_type="text/plain",
+    )
+    get_text = make_text_generator(text)
+    chunker = chunker_class(document, get_text, max_chunk_size=10)
+    chunks = await collect_chunks(chunker)
+
+    assert len(chunks) == 2, "Should produce 2 chunks due to overflow"
+    assert chunks[0].text.strip() == first_para, "First chunk should contain only first paragraph"
+    assert chunks[1].text.strip() == second_para, (
+        "Second chunk should contain only second paragraph"
+    )
+    assert chunks[0].chunk_index == 0, "First chunk should have index 0"
+    assert chunks[1].chunk_index == 1, "Second chunk should have index 1"
+
+
+@pytest.mark.asyncio
+async def test_small_paragraphs_batch_correctly(chunker_class, make_text_generator):
+    """Multiple small paragraphs should batch together with joiner spaces counted."""
+    paragraphs = [" ".join(["word"] * 12) for _ in range(40)]
+    text = " ".join(paragraphs)
+    document = Document(
+        id=uuid4(),
+        name="test_document",
+        raw_data_location="/test/path",
+        external_metadata=None,
+        mime_type="text/plain",
+    )
+    get_text = make_text_generator(text)
+    chunker = chunker_class(document, get_text, max_chunk_size=49)
+    chunks = await collect_chunks(chunker)
+
+    assert len(chunks) == 20, (
+        "Should batch paragraphs (2 per chunk: 12 words × 2 tokens = 24, 24 + 1 joiner + 24 = 49)"
+    )
+    assert all(c.chunk_index == i for i, c in enumerate(chunks)), (
+        "Chunk indices should be sequential"
+    )
+    all_text = " ".join(chunk.text.strip() for chunk in chunks)
+    expected_text = " ".join(paragraphs)
+    assert all_text == expected_text, "All paragraph text should be preserved with correct spacing"
+
+
+@pytest.mark.asyncio
+async def test_alternating_large_and_small_paragraphs_dont_batch(
+    chunker_class, make_text_generator
+):
+    """Alternating near-max and small paragraphs should each become separate chunks."""
+    large1 = "word" * 15 + "."
+    small1 = "Short."
+    large2 = "word" * 15 + "."
+    small2 = "Tiny."
+    text = large1 + " " + small1 + " " + large2 + " " + small2
+    document = Document(
+        id=uuid4(),
+        name="test_document",
+        raw_data_location="/test/path",
+        external_metadata=None,
+        mime_type="text/plain",
+    )
+    max_chunk_size = 10
+    get_text = make_text_generator(text)
+    chunker = chunker_class(document, get_text, max_chunk_size=max_chunk_size)
+    chunks = await collect_chunks(chunker)
+
+    assert len(chunks) == 4, "Should produce multiple chunks"
+    assert all(c.chunk_index == i for i, c in enumerate(chunks)), (
+        "Chunk indices should be sequential"
+    )
+    assert chunks[0].chunk_size > max_chunk_size, (
+        "First chunk should be oversized (large paragraph)"
+    )
+    assert chunks[1].chunk_size <= max_chunk_size, "Second chunk should be small (small paragraph)"
+    assert chunks[2].chunk_size > max_chunk_size, (
+        "Third chunk should be oversized (large paragraph)"
+    )
+    assert chunks[3].chunk_size <= max_chunk_size, "Fourth chunk should be small (small paragraph)"
+
+
+@pytest.mark.asyncio
+async def test_chunk_indices_and_ids_are_deterministic(chunker_class, make_text_generator):
+    """Running chunker twice on identical input should produce identical indices and IDs."""
+    sentence1 = "one " * 4 + ". "
+    sentence2 = "two " * 4 + ". "
+    sentence3 = "one " * 4 + ". "
+    sentence4 = "two " * 4 + ". "
+    text = sentence1 + sentence2 + sentence3 + sentence4
+    doc_id = uuid4()
+    max_chunk_size = 20
+
+    document1 = Document(
+        id=doc_id,
+        name="test_document",
+        raw_data_location="/test/path",
+        external_metadata=None,
+        mime_type="text/plain",
+    )
+    get_text1 = make_text_generator(text)
+    chunker1 = chunker_class(document1, get_text1, max_chunk_size=max_chunk_size)
+    chunks1 = await collect_chunks(chunker1)
+
+    document2 = Document(
+        id=doc_id,
+        name="test_document",
+        raw_data_location="/test/path",
+        external_metadata=None,
+        mime_type="text/plain",
+    )
+    get_text2 = make_text_generator(text)
+    chunker2 = chunker_class(document2, get_text2, max_chunk_size=max_chunk_size)
+    chunks2 = await collect_chunks(chunker2)
+
+    assert len(chunks1) == 2, "Should produce exactly 2 chunks (4 sentences, 2 per chunk)"
+    assert len(chunks2) == 2, "Should produce exactly 2 chunks (4 sentences, 2 per chunk)"
+    assert [c.chunk_index for c in chunks1] == [0, 1], "First run indices should be [0, 1]"
+    assert [c.chunk_index for c in chunks2] == [0, 1], "Second run indices should be [0, 1]"
+    assert chunks1[0].id == chunks2[0].id, "First chunk ID should be deterministic"
+    assert chunks1[1].id == chunks2[1].id, "Second chunk ID should be deterministic"
+    assert chunks1[0].id != chunks1[1].id, "Chunk IDs should be unique within a run"
```
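The expected chunk count in `test_small_paragraphs_batch_correctly` follows from the token accounting stated in the test's own comment; spelled out as a quick check (illustration, not part of the diff):

```python
# Worked numbers behind test_small_paragraphs_batch_correctly above.
tokens_per_paragraph = 12 * 2          # 12 words, counted as 2 tokens each = 24
joiner = 1                             # one space joins two batched paragraphs
chunk_tokens = tokens_per_paragraph + joiner + tokens_per_paragraph

assert chunk_tokens == 49              # exactly max_chunk_size=49, so 2 paragraphs per chunk
assert 40 // 2 == 20                   # 40 paragraphs -> the asserted 20 chunks
```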
cognee/tests/unit/modules/chunking/test_text_chunker_with_overlap.py (new file, +324)

```diff
@@ -0,0 +1,324 @@
+"""Unit tests for TextChunkerWithOverlap overlap behavior."""
+
+import sys
+import pytest
+from uuid import uuid4
+from unittest.mock import patch
+
+from cognee.modules.chunking.text_chunker_with_overlap import TextChunkerWithOverlap
+from cognee.modules.data.processing.document_types import Document
+from cognee.tasks.chunks import chunk_by_paragraph
+
+
+@pytest.fixture
+def make_text_generator():
+    """Factory for async text generators."""
+
+    def _factory(*texts):
+        async def gen():
+            for text in texts:
+                yield text
+
+        return gen
+
+    return _factory
+
+
+@pytest.fixture
+def make_controlled_chunk_data():
+    """Factory for controlled chunk_data generators."""
+
+    def _factory(*sentences, chunk_size_per_sentence=10):
+        def _chunk_data(text):
+            return [
+                {
+                    "text": sentence,
+                    "chunk_size": chunk_size_per_sentence,
+                    "cut_type": "sentence",
+                    "chunk_id": uuid4(),
+                }
+                for sentence in sentences
+            ]
+
+        return _chunk_data
+
+    return _factory
+
+
+@pytest.mark.asyncio
+async def test_half_overlap_preserves_content_across_chunks(
+    make_text_generator, make_controlled_chunk_data
+):
+    """With 50% overlap, consecutive chunks should share half their content."""
+    s1 = "one"
+    s2 = "two"
+    s3 = "three"
+    s4 = "four"
+    text = "dummy"
+    document = Document(
+        id=uuid4(),
+        name="test_document",
+        raw_data_location="/test/path",
+        external_metadata=None,
+        mime_type="text/plain",
+    )
+    get_text = make_text_generator(text)
+    get_chunk_data = make_controlled_chunk_data(s1, s2, s3, s4, chunk_size_per_sentence=10)
+    chunker = TextChunkerWithOverlap(
+        document,
+        get_text,
+        max_chunk_size=20,
+        chunk_overlap_ratio=0.5,
+        get_chunk_data=get_chunk_data,
+    )
+    chunks = [chunk async for chunk in chunker.read()]
+
+    assert len(chunks) == 3, "Should produce exactly 3 chunks (s1+s2, s2+s3, s3+s4)"
+    assert [c.chunk_index for c in chunks] == [0, 1, 2], "Chunk indices should be [0, 1, 2]"
+    assert "one" in chunks[0].text and "two" in chunks[0].text, "Chunk 0 should contain s1 and s2"
+    assert "two" in chunks[1].text and "three" in chunks[1].text, (
+        "Chunk 1 should contain s2 (overlap) and s3"
+    )
+    assert "three" in chunks[2].text and "four" in chunks[2].text, (
+        "Chunk 2 should contain s3 (overlap) and s4"
+    )
+
+
+@pytest.mark.asyncio
+async def test_zero_overlap_produces_no_duplicate_content(
+    make_text_generator, make_controlled_chunk_data
+):
+    """With 0% overlap, no content should appear in multiple chunks."""
+    s1 = "one"
+    s2 = "two"
+    s3 = "three"
+    s4 = "four"
+    text = "dummy"
+    document = Document(
+        id=uuid4(),
+        name="test_document",
+        raw_data_location="/test/path",
+        external_metadata=None,
+        mime_type="text/plain",
+    )
+    get_text = make_text_generator(text)
+    get_chunk_data = make_controlled_chunk_data(s1, s2, s3, s4, chunk_size_per_sentence=10)
+    chunker = TextChunkerWithOverlap(
+        document,
+        get_text,
+        max_chunk_size=20,
+        chunk_overlap_ratio=0.0,
+        get_chunk_data=get_chunk_data,
+    )
+    chunks = [chunk async for chunk in chunker.read()]
+
+    assert len(chunks) == 2, "Should produce exactly 2 chunks (s1+s2, s3+s4)"
+    assert "one" in chunks[0].text and "two" in chunks[0].text, (
+        "First chunk should contain s1 and s2"
+    )
+    assert "three" in chunks[1].text and "four" in chunks[1].text, (
+        "Second chunk should contain s3 and s4"
+    )
+    assert "two" not in chunks[1].text and "three" not in chunks[0].text, (
+        "No overlap: end of chunk 0 should not appear in chunk 1"
+    )
+
+
+@pytest.mark.asyncio
+async def test_small_overlap_ratio_creates_minimal_overlap(
+    make_text_generator, make_controlled_chunk_data
+):
+    """With 25% overlap ratio, chunks should have minimal overlap."""
+    s1 = "alpha"
+    s2 = "beta"
+    s3 = "gamma"
+    s4 = "delta"
+    s5 = "epsilon"
+    text = "dummy"
+    document = Document(
+        id=uuid4(),
+        name="test_document",
+        raw_data_location="/test/path",
+        external_metadata=None,
+        mime_type="text/plain",
+    )
+    get_text = make_text_generator(text)
+    get_chunk_data = make_controlled_chunk_data(s1, s2, s3, s4, s5, chunk_size_per_sentence=10)
+    chunker = TextChunkerWithOverlap(
+        document,
+        get_text,
+        max_chunk_size=40,
+        chunk_overlap_ratio=0.25,
+        get_chunk_data=get_chunk_data,
+    )
+    chunks = [chunk async for chunk in chunker.read()]
+
+    assert len(chunks) == 2, "Should produce exactly 2 chunks"
+    assert [c.chunk_index for c in chunks] == [0, 1], "Chunk indices should be [0, 1]"
+    assert all(token in chunks[0].text for token in [s1, s2, s3, s4]), (
+        "Chunk 0 should contain s1 through s4"
+    )
+    assert s4 in chunks[1].text and s5 in chunks[1].text, (
+        "Chunk 1 should contain overlap s4 and new content s5"
+    )
+
+
+@pytest.mark.asyncio
+async def test_high_overlap_ratio_creates_significant_overlap(
+    make_text_generator, make_controlled_chunk_data
+):
+    """With 75% overlap ratio, consecutive chunks should share most content."""
+    s1 = "red"
+    s2 = "blue"
+    s3 = "green"
+    s4 = "yellow"
+    s5 = "purple"
+    text = "dummy"
+    document = Document(
+        id=uuid4(),
+        name="test_document",
+        raw_data_location="/test/path",
+        external_metadata=None,
+        mime_type="text/plain",
+    )
+    get_text = make_text_generator(text)
+    get_chunk_data = make_controlled_chunk_data(s1, s2, s3, s4, s5, chunk_size_per_sentence=5)
+    chunker = TextChunkerWithOverlap(
+        document,
+        get_text,
+        max_chunk_size=20,
+        chunk_overlap_ratio=0.75,
+        get_chunk_data=get_chunk_data,
+    )
+    chunks = [chunk async for chunk in chunker.read()]
+
+    assert len(chunks) == 2, "Should produce exactly 2 chunks with 75% overlap"
+    assert [c.chunk_index for c in chunks] == [0, 1], "Chunk indices should be [0, 1]"
+    assert all(token in chunks[0].text for token in [s1, s2, s3, s4]), (
+        "Chunk 0 should contain s1, s2, s3, s4"
+    )
+    assert all(token in chunks[1].text for token in [s2, s3, s4, s5]), (
+        "Chunk 1 should contain s2, s3, s4 (overlap) and s5"
+    )
+
+
+@pytest.mark.asyncio
+async def test_single_chunk_no_dangling_overlap(make_text_generator, make_controlled_chunk_data):
+    """Text that fits in one chunk should produce exactly one chunk, no overlap artifact."""
+    s1 = "alpha"
+    s2 = "beta"
+    text = "dummy"
+    document = Document(
+        id=uuid4(),
+        name="test_document",
+        raw_data_location="/test/path",
+        external_metadata=None,
+        mime_type="text/plain",
+    )
+    get_text = make_text_generator(text)
+    get_chunk_data = make_controlled_chunk_data(s1, s2, chunk_size_per_sentence=10)
+    chunker = TextChunkerWithOverlap(
+        document,
+        get_text,
+        max_chunk_size=20,
+        chunk_overlap_ratio=0.5,
+        get_chunk_data=get_chunk_data,
+    )
+    chunks = [chunk async for chunk in chunker.read()]
+
+    assert len(chunks) == 1, (
+        "Should produce exactly 1 chunk when content fits within max_chunk_size"
+    )
+    assert chunks[0].chunk_index == 0, "Single chunk should have index 0"
+    assert "alpha" in chunks[0].text and "beta" in chunks[0].text, (
+        "Single chunk should contain all content"
+    )
+
+
+@pytest.mark.asyncio
+async def test_paragraph_chunking_with_overlap(make_text_generator):
+    """Test that chunk_by_paragraph integration produces 25% overlap between chunks."""
+
+    def mock_get_embedding_engine():
+        class MockEngine:
+            tokenizer = None
+
+        return MockEngine()
+
+    chunk_by_sentence_module = sys.modules.get("cognee.tasks.chunks.chunk_by_sentence")
+
+    max_chunk_size = 20
+    overlap_ratio = 0.25  # 5 token overlap
+    paragraph_max_size = int(0.5 * overlap_ratio * max_chunk_size)  # = 2
+
+    text = (
+        "A0 A1. A2 A3. A4 A5. A6 A7. A8 A9. "  # 10 tokens (0-9)
+        "B0 B1. B2 B3. B4 B5. B6 B7. B8 B9. "  # 10 tokens (10-19)
+        "C0 C1. C2 C3. C4 C5. C6 C7. C8 C9. "  # 10 tokens (20-29)
+        "D0 D1. D2 D3. D4 D5. D6 D7. D8 D9. "  # 10 tokens (30-39)
+        "E0 E1. E2 E3. E4 E5. E6 E7. E8 E9."  # 10 tokens (40-49)
+    )
+
+    document = Document(
+        id=uuid4(),
+        name="test_document",
+        raw_data_location="/test/path",
+        external_metadata=None,
+        mime_type="text/plain",
+    )
+
+    get_text = make_text_generator(text)
+
+    def get_chunk_data(text_input):
+        return chunk_by_paragraph(
+            text_input, max_chunk_size=paragraph_max_size, batch_paragraphs=True
+        )
+
+    with patch.object(
+        chunk_by_sentence_module, "get_embedding_engine", side_effect=mock_get_embedding_engine
+    ):
+        chunker = TextChunkerWithOverlap(
+            document,
+            get_text,
+            max_chunk_size=max_chunk_size,
+            chunk_overlap_ratio=overlap_ratio,
+            get_chunk_data=get_chunk_data,
+        )
+        chunks = [chunk async for chunk in chunker.read()]
+
+    assert len(chunks) == 3, f"Should produce exactly 3 chunks, got {len(chunks)}"
+
+    assert chunks[0].chunk_index == 0, "First chunk should have index 0"
+    assert chunks[1].chunk_index == 1, "Second chunk should have index 1"
+    assert chunks[2].chunk_index == 2, "Third chunk should have index 2"
+
+    assert "A0" in chunks[0].text, "Chunk 0 should start with A0"
+    assert "A9" in chunks[0].text, "Chunk 0 should contain A9"
+    assert "B0" in chunks[0].text, "Chunk 0 should contain B0"
+    assert "B9" in chunks[0].text, "Chunk 0 should contain up to B9 (20 tokens)"
+
+    assert "B" in chunks[1].text, "Chunk 1 should have overlap from B section"
+    assert "C" in chunks[1].text, "Chunk 1 should contain C section"
+    assert "D" in chunks[1].text, "Chunk 1 should contain D section"
+
+    assert "D" in chunks[2].text, "Chunk 2 should have overlap from D section"
+    assert "E0" in chunks[2].text, "Chunk 2 should contain E0"
+    assert "E9" in chunks[2].text, "Chunk 2 should end with E9"
+
+    chunk_0_end_words = chunks[0].text.split()[-4:]
+    chunk_1_words = chunks[1].text.split()
+    overlap_0_1 = any(word in chunk_1_words for word in chunk_0_end_words)
+    assert overlap_0_1, (
+        f"No overlap detected between chunks 0 and 1. "
+        f"Chunk 0 ends with: {chunk_0_end_words}, "
+        f"Chunk 1 starts with: {chunk_1_words[:6]}"
+    )
+
+    chunk_1_end_words = chunks[1].text.split()[-4:]
+    chunk_2_words = chunks[2].text.split()
+    overlap_1_2 = any(word in chunk_2_words for word in chunk_1_end_words)
+    assert overlap_1_2, (
+        f"No overlap detected between chunks 1 and 2. "
+        f"Chunk 1 ends with: {chunk_1_end_words}, "
+        f"Chunk 2 starts with: {chunk_2_words[:6]}"
+    )
```
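As a quick check on the arithmetic in `test_paragraph_chunking_with_overlap` above (illustration, not part of the diff):

```python
# Worked numbers behind test_paragraph_chunking_with_overlap.
max_chunk_size = 20
overlap_ratio = 0.25

overlap_tokens = int(overlap_ratio * max_chunk_size)            # 5-token carry-over
paragraph_max_size = int(0.5 * overlap_ratio * max_chunk_size)  # 2

assert overlap_tokens == 5
assert paragraph_max_size == 2
# The first chunk holds 20 tokens; each later chunk re-reads 5 and adds 15 new,
# so 50 tokens of input -> 20 + 15 + 15 -> exactly the asserted 3 chunks.
```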
cognee/tests/unit/modules/graph/cognee_graph_elements_test.py (+2 -2; the old expected values are truncated in this excerpt)

```diff
@@ -9,7 +9,7 @@ def test_node_initialization():
     """Test that a Node is initialized correctly."""
     node = Node("node1", {"attr1": "value1"}, dimension=2)
     assert node.id == "node1"
-    assert node.attributes == {"attr1": "value1", "vector_distance":
+    assert node.attributes == {"attr1": "value1", "vector_distance": 3.5}
     assert len(node.status) == 2
     assert np.all(node.status == 1)
 
@@ -96,7 +96,7 @@ def test_edge_initialization():
     edge = Edge(node1, node2, {"weight": 10}, directed=False, dimension=2)
     assert edge.node1 == node1
     assert edge.node2 == node2
-    assert edge.attributes == {"vector_distance":
+    assert edge.attributes == {"vector_distance": 3.5, "weight": 10}
     assert edge.directed is False
     assert len(edge.status) == 2
     assert np.all(edge.status == 1)
```