cognee 0.5.1__py3-none-any.whl → 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cognee/__init__.py +2 -0
- cognee/alembic/README +1 -0
- cognee/alembic/env.py +107 -0
- cognee/alembic/script.py.mako +26 -0
- cognee/alembic/versions/1a58b986e6e1_enable_delete_for_old_tutorial_notebooks.py +52 -0
- cognee/alembic/versions/1d0bb7fede17_add_pipeline_run_status.py +33 -0
- cognee/alembic/versions/1daae0df1866_incremental_loading.py +48 -0
- cognee/alembic/versions/211ab850ef3d_add_sync_operations_table.py +118 -0
- cognee/alembic/versions/45957f0a9849_add_notebook_table.py +46 -0
- cognee/alembic/versions/46a6ce2bd2b2_expand_dataset_database_with_json_.py +333 -0
- cognee/alembic/versions/482cd6517ce4_add_default_user.py +30 -0
- cognee/alembic/versions/76625596c5c3_expand_dataset_database_for_multi_user.py +98 -0
- cognee/alembic/versions/8057ae7329c2_initial_migration.py +25 -0
- cognee/alembic/versions/9e7a3cb85175_loader_separation.py +104 -0
- cognee/alembic/versions/a1b2c3d4e5f6_add_label_column_to_data.py +38 -0
- cognee/alembic/versions/ab7e313804ae_permission_system_rework.py +236 -0
- cognee/alembic/versions/b9274c27a25a_kuzu_11_migration.py +75 -0
- cognee/alembic/versions/c946955da633_multi_tenant_support.py +137 -0
- cognee/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py +51 -0
- cognee/alembic/versions/e4ebee1091e7_expand_data_model_info.py +140 -0
- cognee/alembic.ini +117 -0
- cognee/api/v1/add/add.py +2 -1
- cognee/api/v1/add/routers/get_add_router.py +2 -0
- cognee/api/v1/cognify/cognify.py +11 -6
- cognee/api/v1/cognify/routers/get_cognify_router.py +8 -0
- cognee/api/v1/config/config.py +60 -0
- cognee/api/v1/datasets/routers/get_datasets_router.py +46 -3
- cognee/api/v1/memify/routers/get_memify_router.py +3 -0
- cognee/api/v1/search/routers/get_search_router.py +21 -6
- cognee/api/v1/search/search.py +21 -5
- cognee/api/v1/sync/routers/get_sync_router.py +3 -3
- cognee/cli/commands/add_command.py +1 -1
- cognee/cli/commands/cognify_command.py +6 -0
- cognee/cli/commands/config_command.py +1 -1
- cognee/context_global_variables.py +5 -1
- cognee/eval_framework/answer_generation/answer_generation_executor.py +7 -8
- cognee/infrastructure/databases/cache/cache_db_interface.py +38 -1
- cognee/infrastructure/databases/cache/config.py +6 -0
- cognee/infrastructure/databases/cache/fscache/FsCacheAdapter.py +21 -0
- cognee/infrastructure/databases/cache/get_cache_engine.py +9 -3
- cognee/infrastructure/databases/cache/redis/RedisAdapter.py +60 -1
- cognee/infrastructure/databases/dataset_database_handler/supported_dataset_database_handlers.py +7 -0
- cognee/infrastructure/databases/graph/get_graph_engine.py +29 -1
- cognee/infrastructure/databases/graph/neo4j_driver/Neo4jAuraDevDatasetDatabaseHandler.py +62 -27
- cognee/infrastructure/databases/hybrid/neptune_analytics/NeptuneAnalyticsAdapter.py +17 -4
- cognee/infrastructure/databases/relational/config.py +16 -1
- cognee/infrastructure/databases/relational/create_relational_engine.py +13 -3
- cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +26 -3
- cognee/infrastructure/databases/vector/chromadb/ChromaDBAdapter.py +2 -0
- cognee/infrastructure/databases/vector/config.py +6 -0
- cognee/infrastructure/databases/vector/create_vector_engine.py +70 -16
- cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +64 -9
- cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +13 -2
- cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +16 -3
- cognee/infrastructure/databases/vector/models/ScoredResult.py +3 -3
- cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py +16 -3
- cognee/infrastructure/databases/vector/pgvector/PGVectorDatasetDatabaseHandler.py +86 -0
- cognee/infrastructure/databases/vector/pgvector/create_db_and_tables.py +81 -2
- cognee/infrastructure/databases/vector/vector_db_interface.py +8 -0
- cognee/infrastructure/files/utils/get_data_file_path.py +33 -27
- cognee/infrastructure/llm/LLMGateway.py +0 -13
- cognee/infrastructure/llm/prompts/extract_query_time.txt +1 -1
- cognee/infrastructure/llm/prompts/generate_event_entity_prompt.txt +1 -1
- cognee/infrastructure/llm/prompts/generate_event_graph_prompt.txt +1 -1
- cognee/infrastructure/llm/prompts/generate_graph_prompt.txt +2 -2
- cognee/infrastructure/llm/prompts/generate_graph_prompt_guided.txt +1 -1
- cognee/infrastructure/llm/prompts/generate_graph_prompt_oneshot.txt +2 -2
- cognee/infrastructure/llm/prompts/generate_graph_prompt_simple.txt +1 -1
- cognee/infrastructure/llm/prompts/generate_graph_prompt_strict.txt +1 -1
- cognee/infrastructure/llm/prompts/search_type_selector_prompt.txt +6 -6
- cognee/infrastructure/llm/prompts/test.txt +1 -1
- cognee/infrastructure/llm/prompts/translate_content.txt +19 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py +17 -12
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py +31 -25
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py +132 -7
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py +29 -5
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/llama_cpp/adapter.py +191 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/llm_interface.py +2 -6
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/adapter.py +58 -13
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py +0 -1
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +25 -131
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/types.py +10 -0
- cognee/modules/chunking/models/DocumentChunk.py +0 -1
- cognee/modules/cognify/config.py +2 -0
- cognee/modules/data/models/Data.py +3 -1
- cognee/modules/engine/models/Entity.py +0 -1
- cognee/modules/engine/operations/setup.py +6 -0
- cognee/modules/graph/cognee_graph/CogneeGraph.py +150 -37
- cognee/modules/graph/cognee_graph/CogneeGraphElements.py +48 -2
- cognee/modules/graph/utils/__init__.py +1 -0
- cognee/modules/graph/utils/get_entity_nodes_from_triplets.py +12 -0
- cognee/modules/notebooks/methods/__init__.py +1 -0
- cognee/modules/notebooks/methods/create_notebook.py +0 -34
- cognee/modules/notebooks/methods/create_tutorial_notebooks.py +191 -0
- cognee/modules/notebooks/methods/get_notebooks.py +12 -8
- cognee/modules/notebooks/tutorials/cognee-basics/cell-1.md +3 -0
- cognee/modules/notebooks/tutorials/cognee-basics/cell-2.md +10 -0
- cognee/modules/notebooks/tutorials/cognee-basics/cell-3.md +7 -0
- cognee/modules/notebooks/tutorials/cognee-basics/cell-4.py +28 -0
- cognee/modules/notebooks/tutorials/cognee-basics/cell-5.py +3 -0
- cognee/modules/notebooks/tutorials/cognee-basics/cell-6.py +9 -0
- cognee/modules/notebooks/tutorials/cognee-basics/cell-7.py +17 -0
- cognee/modules/notebooks/tutorials/cognee-basics/config.json +4 -0
- cognee/modules/notebooks/tutorials/python-development-with-cognee/cell-1.md +3 -0
- cognee/modules/notebooks/tutorials/python-development-with-cognee/cell-10.md +3 -0
- cognee/modules/notebooks/tutorials/python-development-with-cognee/cell-11.md +3 -0
- cognee/modules/notebooks/tutorials/python-development-with-cognee/cell-12.py +3 -0
- cognee/modules/notebooks/tutorials/python-development-with-cognee/cell-13.md +7 -0
- cognee/modules/notebooks/tutorials/python-development-with-cognee/cell-14.py +6 -0
- cognee/modules/notebooks/tutorials/python-development-with-cognee/cell-15.md +3 -0
- cognee/modules/notebooks/tutorials/python-development-with-cognee/cell-16.py +7 -0
- cognee/modules/notebooks/tutorials/python-development-with-cognee/cell-2.md +9 -0
- cognee/modules/notebooks/tutorials/python-development-with-cognee/cell-3.md +7 -0
- cognee/modules/notebooks/tutorials/python-development-with-cognee/cell-4.md +9 -0
- cognee/modules/notebooks/tutorials/python-development-with-cognee/cell-5.md +5 -0
- cognee/modules/notebooks/tutorials/python-development-with-cognee/cell-6.py +13 -0
- cognee/modules/notebooks/tutorials/python-development-with-cognee/cell-7.md +3 -0
- cognee/modules/notebooks/tutorials/python-development-with-cognee/cell-8.md +3 -0
- cognee/modules/notebooks/tutorials/python-development-with-cognee/cell-9.py +31 -0
- cognee/modules/notebooks/tutorials/python-development-with-cognee/config.json +4 -0
- cognee/modules/notebooks/tutorials/python-development-with-cognee/data/copilot_conversations.json +107 -0
- cognee/modules/notebooks/tutorials/python-development-with-cognee/data/guido_contributions.json +976 -0
- cognee/modules/notebooks/tutorials/python-development-with-cognee/data/my_developer_rules.md +79 -0
- cognee/modules/notebooks/tutorials/python-development-with-cognee/data/pep_style_guide.md +74 -0
- cognee/modules/notebooks/tutorials/python-development-with-cognee/data/zen_principles.md +74 -0
- cognee/modules/retrieval/EntityCompletionRetriever.py +51 -38
- cognee/modules/retrieval/__init__.py +0 -1
- cognee/modules/retrieval/base_retriever.py +66 -10
- cognee/modules/retrieval/chunks_retriever.py +57 -49
- cognee/modules/retrieval/coding_rules_retriever.py +12 -5
- cognee/modules/retrieval/completion_retriever.py +29 -28
- cognee/modules/retrieval/cypher_search_retriever.py +25 -20
- cognee/modules/retrieval/graph_completion_context_extension_retriever.py +42 -46
- cognee/modules/retrieval/graph_completion_cot_retriever.py +68 -51
- cognee/modules/retrieval/graph_completion_retriever.py +78 -63
- cognee/modules/retrieval/graph_summary_completion_retriever.py +2 -0
- cognee/modules/retrieval/lexical_retriever.py +34 -12
- cognee/modules/retrieval/natural_language_retriever.py +18 -15
- cognee/modules/retrieval/summaries_retriever.py +51 -34
- cognee/modules/retrieval/temporal_retriever.py +59 -49
- cognee/modules/retrieval/triplet_retriever.py +32 -33
- cognee/modules/retrieval/utils/access_tracking.py +88 -0
- cognee/modules/retrieval/utils/brute_force_triplet_search.py +99 -103
- cognee/modules/retrieval/utils/node_edge_vector_search.py +174 -0
- cognee/modules/search/methods/__init__.py +1 -0
- cognee/modules/search/methods/get_retriever_output.py +53 -0
- cognee/modules/search/methods/get_search_type_retriever_instance.py +252 -0
- cognee/modules/search/methods/search.py +90 -222
- cognee/modules/search/models/SearchResultPayload.py +67 -0
- cognee/modules/search/types/SearchResult.py +1 -8
- cognee/modules/search/types/SearchType.py +1 -2
- cognee/modules/search/types/__init__.py +1 -1
- cognee/modules/search/utils/__init__.py +1 -2
- cognee/modules/search/utils/transform_insights_to_graph.py +2 -2
- cognee/modules/search/utils/{transform_context_to_graph.py → transform_triplets_to_graph.py} +2 -2
- cognee/modules/users/authentication/default/default_transport.py +11 -1
- cognee/modules/users/authentication/get_api_auth_backend.py +2 -1
- cognee/modules/users/authentication/get_client_auth_backend.py +2 -1
- cognee/modules/users/methods/create_user.py +0 -9
- cognee/modules/users/permissions/methods/has_user_management_permission.py +29 -0
- cognee/modules/visualization/cognee_network_visualization.py +1 -1
- cognee/run_migrations.py +48 -0
- cognee/shared/exceptions/__init__.py +1 -3
- cognee/shared/exceptions/exceptions.py +11 -1
- cognee/shared/usage_logger.py +332 -0
- cognee/shared/utils.py +12 -5
- cognee/tasks/chunks/__init__.py +9 -0
- cognee/tasks/cleanup/cleanup_unused_data.py +172 -0
- cognee/tasks/graph/__init__.py +7 -0
- cognee/tasks/ingestion/data_item.py +8 -0
- cognee/tasks/ingestion/ingest_data.py +12 -1
- cognee/tasks/ingestion/save_data_item_to_storage.py +5 -0
- cognee/tasks/memify/__init__.py +8 -0
- cognee/tasks/memify/extract_usage_frequency.py +613 -0
- cognee/tasks/summarization/models.py +0 -2
- cognee/tasks/temporal_graph/__init__.py +0 -1
- cognee/tasks/translation/__init__.py +96 -0
- cognee/tasks/translation/config.py +110 -0
- cognee/tasks/translation/detect_language.py +190 -0
- cognee/tasks/translation/exceptions.py +62 -0
- cognee/tasks/translation/models.py +72 -0
- cognee/tasks/translation/providers/__init__.py +44 -0
- cognee/tasks/translation/providers/azure_provider.py +192 -0
- cognee/tasks/translation/providers/base.py +85 -0
- cognee/tasks/translation/providers/google_provider.py +158 -0
- cognee/tasks/translation/providers/llm_provider.py +143 -0
- cognee/tasks/translation/translate_content.py +282 -0
- cognee/tasks/web_scraper/default_url_crawler.py +6 -2
- cognee/tests/cli_tests/cli_unit_tests/test_cli_commands.py +1 -0
- cognee/tests/cli_tests/cli_unit_tests/test_cli_edge_cases.py +3 -0
- cognee/tests/integration/retrieval/test_brute_force_triplet_search_with_cognify.py +62 -0
- cognee/tests/integration/retrieval/test_chunks_retriever.py +351 -0
- cognee/tests/integration/retrieval/test_graph_completion_retriever.py +276 -0
- cognee/tests/integration/retrieval/test_graph_completion_retriever_context_extension.py +228 -0
- cognee/tests/integration/retrieval/test_graph_completion_retriever_cot.py +217 -0
- cognee/tests/integration/retrieval/test_rag_completion_retriever.py +319 -0
- cognee/tests/integration/retrieval/test_structured_output.py +258 -0
- cognee/tests/integration/retrieval/test_summaries_retriever.py +195 -0
- cognee/tests/integration/retrieval/test_temporal_retriever.py +336 -0
- cognee/tests/integration/retrieval/test_triplet_retriever.py +45 -1
- cognee/tests/integration/shared/test_usage_logger_integration.py +255 -0
- cognee/tests/tasks/translation/README.md +147 -0
- cognee/tests/tasks/translation/__init__.py +1 -0
- cognee/tests/tasks/translation/config_test.py +93 -0
- cognee/tests/tasks/translation/detect_language_test.py +118 -0
- cognee/tests/tasks/translation/providers_test.py +151 -0
- cognee/tests/tasks/translation/translate_content_test.py +213 -0
- cognee/tests/test_chromadb.py +1 -1
- cognee/tests/test_cleanup_unused_data.py +165 -0
- cognee/tests/test_custom_data_label.py +68 -0
- cognee/tests/test_delete_by_id.py +6 -6
- cognee/tests/test_extract_usage_frequency.py +308 -0
- cognee/tests/test_kuzu.py +17 -7
- cognee/tests/test_lancedb.py +3 -1
- cognee/tests/test_library.py +1 -1
- cognee/tests/test_neo4j.py +17 -7
- cognee/tests/test_neptune_analytics_vector.py +3 -1
- cognee/tests/test_permissions.py +172 -187
- cognee/tests/test_pgvector.py +3 -1
- cognee/tests/test_relational_db_migration.py +15 -1
- cognee/tests/test_remote_kuzu.py +3 -1
- cognee/tests/test_s3_file_storage.py +1 -1
- cognee/tests/test_search_db.py +345 -205
- cognee/tests/test_usage_logger_e2e.py +268 -0
- cognee/tests/unit/api/test_get_raw_data_endpoint.py +206 -0
- cognee/tests/unit/eval_framework/answer_generation_test.py +4 -3
- cognee/tests/unit/eval_framework/benchmark_adapters_test.py +25 -0
- cognee/tests/unit/eval_framework/corpus_builder_test.py +33 -4
- cognee/tests/unit/infrastructure/databases/cache/test_cache_config.py +2 -0
- cognee/tests/unit/infrastructure/databases/relational/test_RelationalConfig.py +69 -0
- cognee/tests/unit/modules/graph/cognee_graph_elements_test.py +42 -2
- cognee/tests/unit/modules/graph/cognee_graph_test.py +329 -31
- cognee/tests/unit/modules/retrieval/chunks_retriever_test.py +122 -168
- cognee/tests/unit/modules/retrieval/conversation_history_test.py +338 -0
- cognee/tests/unit/modules/retrieval/graph_completion_retriever_context_extension_test.py +486 -157
- cognee/tests/unit/modules/retrieval/graph_completion_retriever_cot_test.py +693 -155
- cognee/tests/unit/modules/retrieval/graph_completion_retriever_test.py +619 -200
- cognee/tests/unit/modules/retrieval/rag_completion_retriever_test.py +300 -171
- cognee/tests/unit/modules/retrieval/summaries_retriever_test.py +184 -155
- cognee/tests/unit/modules/retrieval/temporal_retriever_test.py +544 -79
- cognee/tests/unit/modules/retrieval/test_brute_force_triplet_search.py +476 -28
- cognee/tests/unit/modules/retrieval/test_completion.py +343 -0
- cognee/tests/unit/modules/retrieval/test_graph_summary_completion_retriever.py +157 -0
- cognee/tests/unit/modules/retrieval/test_node_edge_vector_search.py +273 -0
- cognee/tests/unit/modules/retrieval/test_user_qa_feedback.py +312 -0
- cognee/tests/unit/modules/retrieval/triplet_retriever_test.py +267 -7
- cognee/tests/unit/modules/search/test_get_search_type_retriever_instance.py +125 -0
- cognee/tests/unit/modules/search/test_search.py +96 -20
- cognee/tests/unit/modules/search/test_search_prepare_search_result_contract.py +190 -0
- cognee/tests/unit/modules/users/test_tutorial_notebook_creation.py +511 -297
- cognee/tests/unit/shared/test_usage_logger.py +241 -0
- cognee/tests/unit/users/permissions/test_has_user_management_permission.py +46 -0
- {cognee-0.5.1.dist-info → cognee-0.5.2.dist-info}/METADATA +22 -17
- {cognee-0.5.1.dist-info → cognee-0.5.2.dist-info}/RECORD +258 -157
- cognee/api/.env.example +0 -5
- cognee/modules/retrieval/base_graph_retriever.py +0 -24
- cognee/modules/search/methods/get_search_type_tools.py +0 -223
- cognee/modules/search/methods/no_access_control_search.py +0 -62
- cognee/modules/search/utils/prepare_search_result.py +0 -63
- cognee/tests/test_feedback_enrichment.py +0 -174
- cognee/tests/unit/modules/retrieval/structured_output_test.py +0 -204
- {cognee-0.5.1.dist-info → cognee-0.5.2.dist-info}/WHEEL +0 -0
- {cognee-0.5.1.dist-info → cognee-0.5.2.dist-info}/entry_points.txt +0 -0
- {cognee-0.5.1.dist-info → cognee-0.5.2.dist-info}/licenses/LICENSE +0 -0
- {cognee-0.5.1.dist-info → cognee-0.5.2.dist-info}/licenses/NOTICE.md +0 -0
|
@@ -0,0 +1,282 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
from typing import List, Optional
|
|
3
|
+
from uuid import uuid5
|
|
4
|
+
|
|
5
|
+
from cognee.modules.chunking.models import DocumentChunk
|
|
6
|
+
from cognee.shared.logging_utils import get_logger
|
|
7
|
+
|
|
8
|
+
from .config import get_translation_config, TranslationProviderType
|
|
9
|
+
from .detect_language import detect_language_async, LanguageDetectionResult
|
|
10
|
+
from .exceptions import TranslationError, LanguageDetectionError
|
|
11
|
+
from .models import TranslatedContent, LanguageMetadata
|
|
12
|
+
from .providers import get_translation_provider, TranslationResult
|
|
13
|
+
|
|
14
|
+
logger = get_logger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
async def translate_content(
    data_chunks: List[DocumentChunk],
    target_language: str = None,
    translation_provider: TranslationProviderType = None,
    confidence_threshold: float = None,
    skip_if_target_language: bool = True,
    preserve_original: bool = True,
) -> List[DocumentChunk]:
    """
    Detect the language of each chunk and translate it to the target language.

    Every chunk with non-empty text is passed through language detection.
    Chunks that need translation are sent to the selected provider; their
    ``text`` is overwritten with the translation and a ``TranslatedContent``
    record is attached to the chunk.

    Args:
        data_chunks: List of DocumentChunk objects to process.
        target_language: Target language code. Falls back to the translation
            config when not provided (or empty).
        translation_provider: Translation service to use ("llm", "google",
            "azure"). Falls back to the translation config when not provided.
        confidence_threshold: Minimum confidence for language detection
            (0.0 to 1.0). Falls back to the translation config only when it
            is ``None``; an explicit ``0.0`` is honoured.
        skip_if_target_language: If True, chunks that detection reports as not
            requiring translation are returned untranslated (with language
            metadata attached). If False, such chunks are still sent to the
            provider.
        preserve_original: If True, the pre-translation text is stored in
            ``TranslatedContent.original_text``; otherwise an empty string is
            stored there.

    Returns:
        A list with the same chunks, in the same order. An empty input list is
        returned as-is.

    Raises:
        TranslationError: If ``data_chunks`` is not a list.

    Note:
        Chunks are mutated in place:
        - ``chunk.text`` is replaced with the translated text
        - ``chunk.contains`` receives LanguageMetadata and TranslatedContent
        Per-chunk failures (detection, translation or anything unexpected) are
        logged and the chunk is returned unchanged, so one bad chunk does not
        abort the whole batch.

    Example:
        ```python
        from cognee.tasks.translation import translate_content

        # Translate chunks using default settings
        translated_chunks = await translate_content(chunks)

        # Translate with specific provider
        translated_chunks = await translate_content(
            chunks,
            translation_provider="llm",
            confidence_threshold=0.9
        )
        ```
    """
    if not isinstance(data_chunks, list):
        raise TranslationError("data_chunks must be a list")

    if not data_chunks:
        return data_chunks

    # Resolve effective settings: explicit arguments win over configuration.
    config = get_translation_config()
    provider_name = translation_provider or config.translation_provider
    target_lang = target_language or config.target_language
    # Compare against None rather than relying on truthiness: 0.0 is a valid
    # threshold and must not be silently replaced by the configured default.
    threshold = (
        config.confidence_threshold if confidence_threshold is None else confidence_threshold
    )

    logger.info(
        f"Starting translation task for {len(data_chunks)} chunks "
        f"using {provider_name} provider, target language: {target_lang}"
    )

    provider = get_translation_provider(provider_name)

    processed_chunks = []
    total_chunks = len(data_chunks)

    for chunk_index, chunk in enumerate(data_chunks):
        # Log progress for large batches
        if chunk_index > 0 and chunk_index % 100 == 0:
            logger.info(f"Translation progress: {chunk_index}/{total_chunks} chunks processed")

        # Nothing to detect or translate for chunks without text.
        if not hasattr(chunk, "text") or not chunk.text:
            processed_chunks.append(chunk)
            continue

        try:
            detection = await detect_language_async(chunk.text, target_lang, threshold)

            language_metadata = LanguageMetadata(
                id=uuid5(chunk.id, "LanguageMetadata"),
                content_id=chunk.id,
                detected_language=detection.language_code,
                language_confidence=detection.confidence,
                requires_translation=detection.requires_translation,
                character_count=detection.character_count,
                language_name=detection.language_name,
            )

            # Already in the target language: record the metadata and move on.
            if not detection.requires_translation and skip_if_target_language:
                logger.debug(
                    f"Skipping chunk {chunk.id}: already in target language "
                    f"({detection.language_code})"
                )
                _add_to_chunk_contains(chunk, language_metadata)
                processed_chunks.append(chunk)
                continue

            logger.debug(
                f"Translating chunk {chunk.id} from {detection.language_code} to {target_lang}"
            )

            translation_result = await provider.translate(
                text=chunk.text,
                target_language=target_lang,
                source_language=detection.language_code,
            )

            # Built before chunk.text is overwritten so original_text still
            # refers to the untranslated content.
            translated_content = TranslatedContent(
                id=uuid5(chunk.id, "TranslatedContent"),
                original_chunk_id=chunk.id,
                original_text=chunk.text if preserve_original else "",
                translated_text=translation_result.translated_text,
                source_language=translation_result.source_language,
                target_language=translation_result.target_language,
                translation_provider=translation_result.provider,
                confidence_score=translation_result.confidence_score,
                translated_from=chunk,
            )

            chunk.text = translation_result.translated_text

            _add_to_chunk_contains(chunk, language_metadata)
            _add_to_chunk_contains(chunk, translated_content)

            processed_chunks.append(chunk)

            logger.debug(
                f"Successfully translated chunk {chunk.id}: "
                f"{detection.language_code} -> {target_lang}"
            )

        except LanguageDetectionError as e:
            logger.warning(f"Language detection failed for chunk {chunk.id}: {e}")
            processed_chunks.append(chunk)
        except TranslationError as e:
            logger.error(f"Translation failed for chunk {chunk.id}: {e}")
            processed_chunks.append(chunk)
        except Exception as e:
            # Deliberately broad: a single failing chunk must not abort the batch.
            logger.error(f"Unexpected error processing chunk {chunk.id}: {e}")
            processed_chunks.append(chunk)

    logger.info(f"Translation task completed for {len(processed_chunks)} chunks")
    return processed_chunks
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def _add_to_chunk_contains(chunk: DocumentChunk, item) -> None:
    """Append ``item`` to ``chunk.contains``, creating the list when it is None."""
    if chunk.contains is not None:
        chunk.contains.append(item)
        return

    # No list yet: start one, then append through the attribute again so the
    # item lands in whatever object the chunk actually stored.
    chunk.contains = []
    chunk.contains.append(item)
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
async def translate_text(
    text: str,
    target_language: str = None,
    translation_provider: TranslationProviderType = None,
    source_language: Optional[str] = None,
) -> TranslationResult:
    """
    Translate one string without wrapping it in a DocumentChunk.

    Args:
        text: The text to translate.
        target_language: Target language code; the translation config value is
            used when this is not provided.
        translation_provider: Translation service to use; the translation
            config value is used when this is not provided.
        source_language: Source language code, forwarded unchanged to the
            provider (``None`` presumably leaves detection to the provider).

    Returns:
        The TranslationResult produced by the provider's ``translate`` call.

    Example:
        ```python
        from cognee.tasks.translation import translate_text

        result = await translate_text(
            "Bonjour le monde!",
            target_language="en"
        )
        print(result.translated_text)  # "Hello world!"
        print(result.source_language)  # "fr"
        ```
    """
    settings = get_translation_config()

    # Explicit arguments take precedence; falsy values fall back to config.
    chosen_provider = (
        translation_provider if translation_provider else settings.translation_provider
    )
    chosen_target = target_language if target_language else settings.target_language

    translator = get_translation_provider(chosen_provider)
    outcome = await translator.translate(
        text=text,
        target_language=chosen_target,
        source_language=source_language,
    )
    return outcome
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
async def batch_translate_texts(
    texts: List[str],
    target_language: str = None,
    translation_provider: TranslationProviderType = None,
    source_language: Optional[str] = None,
) -> List[TranslationResult]:
    """
    Translate several strings with a single provider ``translate_batch`` call.

    The whole list is handed to the provider at once, so providers with a
    native batch operation can make use of it.

    Args:
        texts: List of texts to translate.
        target_language: Target language code; the translation config value is
            used when this is not provided.
        translation_provider: Translation service to use; the translation
            config value is used when this is not provided.
        source_language: Source language code (optional), forwarded unchanged
            to the provider.

    Returns:
        The list of TranslationResult objects returned by the provider.

    Example:
        ```python
        from cognee.tasks.translation import batch_translate_texts

        results = await batch_translate_texts(
            ["Hola", "¿Cómo estás?", "Adiós"],
            target_language="en"
        )
        for result in results:
            print(f"{result.source_language}: {result.translated_text}")
        ```
    """
    settings = get_translation_config()

    # Explicit arguments take precedence; falsy values fall back to config.
    chosen_provider = (
        translation_provider if translation_provider else settings.translation_provider
    )
    chosen_target = target_language if target_language else settings.target_language

    translator = get_translation_provider(chosen_provider)
    outcomes = await translator.translate_batch(
        texts=texts,
        target_language=chosen_target,
        source_language=source_language,
    )
    return outcomes
|
|
@@ -73,7 +73,11 @@ class DefaultUrlCrawler:
|
|
|
73
73
|
self.timeout = timeout
|
|
74
74
|
self.max_retries = max_retries
|
|
75
75
|
self.retry_delay_factor = retry_delay_factor
|
|
76
|
-
self.headers = headers or {
|
|
76
|
+
self.headers = headers or {
|
|
77
|
+
"User-Agent": "Cognee-Scraper/1.0 (hello@cognee.ai)",
|
|
78
|
+
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
|
79
|
+
"Accept-Language": "en-US,en;q=0.9",
|
|
80
|
+
}
|
|
77
81
|
self.robots_cache_ttl = robots_cache_ttl
|
|
78
82
|
self._last_request_time_per_domain: Dict[str, float] = {}
|
|
79
83
|
self._robots_cache: Dict[str, RobotsTxtCache] = {}
|
|
@@ -288,7 +292,7 @@ class DefaultUrlCrawler:
|
|
|
288
292
|
while True:
|
|
289
293
|
try:
|
|
290
294
|
await self._respect_rate_limit(url, crawl_delay)
|
|
291
|
-
resp = await self._client.get(url)
|
|
295
|
+
resp = await self._client.get(url, headers=self.headers)
|
|
292
296
|
resp.raise_for_status()
|
|
293
297
|
logger.info(
|
|
294
298
|
f"Successfully fetched {url} (status={resp.status_code}, size={len(resp.text)} bytes)"
|
|
@@ -262,6 +262,7 @@ class TestCognifyCommandEdgeCases:
|
|
|
262
262
|
ontology_file_path=None,
|
|
263
263
|
chunker=TextChunker,
|
|
264
264
|
run_in_background=False,
|
|
265
|
+
chunks_per_batch=None,
|
|
265
266
|
)
|
|
266
267
|
|
|
267
268
|
@patch("cognee.cli.commands.cognify_command.asyncio.run", side_effect=_mock_run)
|
|
@@ -295,6 +296,7 @@ class TestCognifyCommandEdgeCases:
|
|
|
295
296
|
ontology_file_path="/nonexistent/path/ontology.owl",
|
|
296
297
|
chunker=TextChunker,
|
|
297
298
|
run_in_background=False,
|
|
299
|
+
chunks_per_batch=None,
|
|
298
300
|
)
|
|
299
301
|
|
|
300
302
|
@patch("cognee.cli.commands.cognify_command.asyncio.run")
|
|
@@ -373,6 +375,7 @@ class TestCognifyCommandEdgeCases:
|
|
|
373
375
|
ontology_file_path=None,
|
|
374
376
|
chunker=TextChunker,
|
|
375
377
|
run_in_background=False,
|
|
378
|
+
chunks_per_batch=None,
|
|
376
379
|
)
|
|
377
380
|
|
|
378
381
|
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
import pathlib
|
|
2
|
+
|
|
3
|
+
import pytest
|
|
4
|
+
import pytest_asyncio
|
|
5
|
+
import cognee
|
|
6
|
+
|
|
7
|
+
from cognee.modules.graph.cognee_graph.CogneeGraphElements import Edge
|
|
8
|
+
from cognee.modules.retrieval.utils.brute_force_triplet_search import brute_force_triplet_search
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@pytest_asyncio.fixture
async def clean_environment():
    """Point cognee at test-specific storage and prune it before and after the test.

    Setup failures propagate so the test does not run against stale state.
    Teardown stays best-effort, but a failed cleanup is logged instead of
    being silently discarded, so leftover state can be traced.
    """
    base_dir = pathlib.Path(__file__).parent.parent.parent.parent
    system_directory_path = str(base_dir / ".cognee_system/test_brute_force_triplet_search_e2e")
    data_directory_path = str(base_dir / ".data_storage/test_brute_force_triplet_search_e2e")

    cognee.config.system_root_directory(system_directory_path)
    cognee.config.data_root_directory(data_directory_path)

    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)

    yield

    try:
        await cognee.prune.prune_data()
        await cognee.prune.prune_system(metadata=True)
    except Exception:
        # Cleanup must never fail the test itself, but hiding the error makes
        # polluted follow-up runs hard to diagnose.
        import logging

        logging.getLogger(__name__).warning(
            "Cleanup after test_brute_force_triplet_search_e2e failed", exc_info=True
        )
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@pytest.mark.asyncio
async def test_brute_force_triplet_search_end_to_end(clean_environment):
    """Ingest a short text, cognify it, then run a single-query and a batch triplet search."""

    # Literal content is kept flush-left so the ingested text stays unchanged.
    text = """
Cognee is an open-source AI memory engine that structures data into searchable formats for use with AI agents.
The company focuses on persistent memory systems using knowledge graphs and vector search.
It is a Berlin-based startup building infrastructure for context-aware AI applications.
NLP systems can use Cognee to store and retrieve structured information.
"""

    await cognee.add(text)
    await cognee.cognify()

    # Single query: expect a non-empty list of Edge objects.
    edges = await brute_force_triplet_search(
        query="What can NLP systems use Cognee for?",
        top_k=1,
    )
    assert isinstance(edges, list)
    assert edges
    for found in edges:
        assert isinstance(found, Edge)

    # Batch query: expect one non-empty list of Edge objects per query.
    questions = ["What is Cognee?", "What is the company's focus?"]
    per_question_edges = await brute_force_triplet_search(query_batch=questions, top_k=1)

    assert isinstance(per_question_edges, list)
    assert len(per_question_edges) == len(questions)
    for answer in per_question_edges:
        assert isinstance(answer, list)
    for answer in per_question_edges:
        assert answer
    for answer in per_question_edges:
        for found in answer:
            assert isinstance(found, Edge)
|