cognee 0.5.1.dev0__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (241) hide show
  1. cognee/__init__.py +2 -0
  2. cognee/alembic/README +1 -0
  3. cognee/alembic/env.py +107 -0
  4. cognee/alembic/script.py.mako +26 -0
  5. cognee/alembic/versions/1a58b986e6e1_enable_delete_for_old_tutorial_notebooks.py +52 -0
  6. cognee/alembic/versions/1d0bb7fede17_add_pipeline_run_status.py +33 -0
  7. cognee/alembic/versions/1daae0df1866_incremental_loading.py +48 -0
  8. cognee/alembic/versions/211ab850ef3d_add_sync_operations_table.py +118 -0
  9. cognee/alembic/versions/45957f0a9849_add_notebook_table.py +46 -0
  10. cognee/alembic/versions/46a6ce2bd2b2_expand_dataset_database_with_json_.py +333 -0
  11. cognee/alembic/versions/482cd6517ce4_add_default_user.py +30 -0
  12. cognee/alembic/versions/76625596c5c3_expand_dataset_database_for_multi_user.py +98 -0
  13. cognee/alembic/versions/8057ae7329c2_initial_migration.py +25 -0
  14. cognee/alembic/versions/9e7a3cb85175_loader_separation.py +104 -0
  15. cognee/alembic/versions/a1b2c3d4e5f6_add_label_column_to_data.py +38 -0
  16. cognee/alembic/versions/ab7e313804ae_permission_system_rework.py +236 -0
  17. cognee/alembic/versions/b9274c27a25a_kuzu_11_migration.py +75 -0
  18. cognee/alembic/versions/c946955da633_multi_tenant_support.py +137 -0
  19. cognee/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py +51 -0
  20. cognee/alembic/versions/e4ebee1091e7_expand_data_model_info.py +140 -0
  21. cognee/alembic.ini +117 -0
  22. cognee/api/v1/add/routers/get_add_router.py +2 -0
  23. cognee/api/v1/cognify/cognify.py +11 -6
  24. cognee/api/v1/cognify/routers/get_cognify_router.py +8 -0
  25. cognee/api/v1/config/config.py +60 -0
  26. cognee/api/v1/datasets/routers/get_datasets_router.py +45 -3
  27. cognee/api/v1/memify/routers/get_memify_router.py +2 -0
  28. cognee/api/v1/search/routers/get_search_router.py +21 -6
  29. cognee/api/v1/search/search.py +25 -5
  30. cognee/api/v1/sync/routers/get_sync_router.py +3 -3
  31. cognee/cli/commands/add_command.py +1 -1
  32. cognee/cli/commands/cognify_command.py +6 -0
  33. cognee/cli/commands/config_command.py +1 -1
  34. cognee/context_global_variables.py +5 -1
  35. cognee/eval_framework/answer_generation/answer_generation_executor.py +7 -8
  36. cognee/infrastructure/databases/cache/cache_db_interface.py +38 -1
  37. cognee/infrastructure/databases/cache/config.py +6 -0
  38. cognee/infrastructure/databases/cache/fscache/FsCacheAdapter.py +21 -0
  39. cognee/infrastructure/databases/cache/get_cache_engine.py +9 -3
  40. cognee/infrastructure/databases/cache/redis/RedisAdapter.py +60 -1
  41. cognee/infrastructure/databases/dataset_database_handler/supported_dataset_database_handlers.py +7 -0
  42. cognee/infrastructure/databases/graph/get_graph_engine.py +29 -1
  43. cognee/infrastructure/databases/graph/neo4j_driver/Neo4jAuraDevDatasetDatabaseHandler.py +62 -27
  44. cognee/infrastructure/databases/hybrid/neptune_analytics/NeptuneAnalyticsAdapter.py +17 -4
  45. cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +2 -1
  46. cognee/infrastructure/databases/vector/chromadb/ChromaDBAdapter.py +2 -0
  47. cognee/infrastructure/databases/vector/config.py +6 -0
  48. cognee/infrastructure/databases/vector/create_vector_engine.py +69 -22
  49. cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +64 -9
  50. cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +13 -2
  51. cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +16 -3
  52. cognee/infrastructure/databases/vector/models/ScoredResult.py +3 -3
  53. cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py +16 -3
  54. cognee/infrastructure/databases/vector/pgvector/PGVectorDatasetDatabaseHandler.py +86 -0
  55. cognee/infrastructure/databases/vector/pgvector/create_db_and_tables.py +81 -2
  56. cognee/infrastructure/databases/vector/vector_db_interface.py +8 -0
  57. cognee/infrastructure/files/utils/get_data_file_path.py +33 -27
  58. cognee/infrastructure/llm/prompts/extract_query_time.txt +1 -1
  59. cognee/infrastructure/llm/prompts/generate_event_entity_prompt.txt +1 -1
  60. cognee/infrastructure/llm/prompts/generate_event_graph_prompt.txt +1 -1
  61. cognee/infrastructure/llm/prompts/generate_graph_prompt.txt +2 -2
  62. cognee/infrastructure/llm/prompts/generate_graph_prompt_guided.txt +1 -1
  63. cognee/infrastructure/llm/prompts/generate_graph_prompt_oneshot.txt +2 -2
  64. cognee/infrastructure/llm/prompts/generate_graph_prompt_simple.txt +1 -1
  65. cognee/infrastructure/llm/prompts/generate_graph_prompt_strict.txt +1 -1
  66. cognee/infrastructure/llm/prompts/search_type_selector_prompt.txt +6 -6
  67. cognee/infrastructure/llm/prompts/test.txt +1 -1
  68. cognee/infrastructure/llm/prompts/translate_content.txt +19 -0
  69. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py +24 -0
  70. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/llama_cpp/adapter.py +191 -0
  71. cognee/modules/chunking/models/DocumentChunk.py +0 -1
  72. cognee/modules/cognify/config.py +2 -0
  73. cognee/modules/data/models/Data.py +1 -0
  74. cognee/modules/engine/models/Entity.py +0 -1
  75. cognee/modules/engine/operations/setup.py +6 -0
  76. cognee/modules/graph/cognee_graph/CogneeGraph.py +150 -37
  77. cognee/modules/graph/cognee_graph/CogneeGraphElements.py +48 -2
  78. cognee/modules/graph/utils/__init__.py +1 -0
  79. cognee/modules/graph/utils/get_entity_nodes_from_triplets.py +12 -0
  80. cognee/modules/notebooks/methods/__init__.py +1 -0
  81. cognee/modules/notebooks/methods/create_notebook.py +0 -34
  82. cognee/modules/notebooks/methods/create_tutorial_notebooks.py +191 -0
  83. cognee/modules/notebooks/methods/get_notebooks.py +12 -8
  84. cognee/modules/notebooks/tutorials/cognee-basics/cell-1.md +3 -0
  85. cognee/modules/notebooks/tutorials/cognee-basics/cell-2.md +10 -0
  86. cognee/modules/notebooks/tutorials/cognee-basics/cell-3.md +7 -0
  87. cognee/modules/notebooks/tutorials/cognee-basics/cell-4.py +28 -0
  88. cognee/modules/notebooks/tutorials/cognee-basics/cell-5.py +3 -0
  89. cognee/modules/notebooks/tutorials/cognee-basics/cell-6.py +9 -0
  90. cognee/modules/notebooks/tutorials/cognee-basics/cell-7.py +17 -0
  91. cognee/modules/notebooks/tutorials/cognee-basics/config.json +4 -0
  92. cognee/modules/notebooks/tutorials/python-development-with-cognee/cell-1.md +3 -0
  93. cognee/modules/notebooks/tutorials/python-development-with-cognee/cell-10.md +3 -0
  94. cognee/modules/notebooks/tutorials/python-development-with-cognee/cell-11.md +3 -0
  95. cognee/modules/notebooks/tutorials/python-development-with-cognee/cell-12.py +3 -0
  96. cognee/modules/notebooks/tutorials/python-development-with-cognee/cell-13.md +7 -0
  97. cognee/modules/notebooks/tutorials/python-development-with-cognee/cell-14.py +6 -0
  98. cognee/modules/notebooks/tutorials/python-development-with-cognee/cell-15.md +3 -0
  99. cognee/modules/notebooks/tutorials/python-development-with-cognee/cell-16.py +7 -0
  100. cognee/modules/notebooks/tutorials/python-development-with-cognee/cell-2.md +9 -0
  101. cognee/modules/notebooks/tutorials/python-development-with-cognee/cell-3.md +7 -0
  102. cognee/modules/notebooks/tutorials/python-development-with-cognee/cell-4.md +9 -0
  103. cognee/modules/notebooks/tutorials/python-development-with-cognee/cell-5.md +5 -0
  104. cognee/modules/notebooks/tutorials/python-development-with-cognee/cell-6.py +13 -0
  105. cognee/modules/notebooks/tutorials/python-development-with-cognee/cell-7.md +3 -0
  106. cognee/modules/notebooks/tutorials/python-development-with-cognee/cell-8.md +3 -0
  107. cognee/modules/notebooks/tutorials/python-development-with-cognee/cell-9.py +31 -0
  108. cognee/modules/notebooks/tutorials/python-development-with-cognee/config.json +4 -0
  109. cognee/modules/notebooks/tutorials/python-development-with-cognee/data/copilot_conversations.json +107 -0
  110. cognee/modules/notebooks/tutorials/python-development-with-cognee/data/guido_contributions.json +976 -0
  111. cognee/modules/notebooks/tutorials/python-development-with-cognee/data/my_developer_rules.md +79 -0
  112. cognee/modules/notebooks/tutorials/python-development-with-cognee/data/pep_style_guide.md +74 -0
  113. cognee/modules/notebooks/tutorials/python-development-with-cognee/data/zen_principles.md +74 -0
  114. cognee/modules/retrieval/EntityCompletionRetriever.py +51 -38
  115. cognee/modules/retrieval/__init__.py +0 -1
  116. cognee/modules/retrieval/base_retriever.py +66 -10
  117. cognee/modules/retrieval/chunks_retriever.py +57 -49
  118. cognee/modules/retrieval/coding_rules_retriever.py +12 -5
  119. cognee/modules/retrieval/completion_retriever.py +29 -28
  120. cognee/modules/retrieval/cypher_search_retriever.py +25 -20
  121. cognee/modules/retrieval/graph_completion_context_extension_retriever.py +42 -46
  122. cognee/modules/retrieval/graph_completion_cot_retriever.py +68 -51
  123. cognee/modules/retrieval/graph_completion_retriever.py +78 -63
  124. cognee/modules/retrieval/graph_summary_completion_retriever.py +2 -0
  125. cognee/modules/retrieval/lexical_retriever.py +34 -12
  126. cognee/modules/retrieval/natural_language_retriever.py +18 -15
  127. cognee/modules/retrieval/summaries_retriever.py +51 -34
  128. cognee/modules/retrieval/temporal_retriever.py +59 -49
  129. cognee/modules/retrieval/triplet_retriever.py +31 -32
  130. cognee/modules/retrieval/utils/access_tracking.py +88 -0
  131. cognee/modules/retrieval/utils/brute_force_triplet_search.py +99 -85
  132. cognee/modules/retrieval/utils/node_edge_vector_search.py +174 -0
  133. cognee/modules/search/methods/__init__.py +1 -0
  134. cognee/modules/search/methods/get_retriever_output.py +53 -0
  135. cognee/modules/search/methods/get_search_type_retriever_instance.py +252 -0
  136. cognee/modules/search/methods/search.py +90 -215
  137. cognee/modules/search/models/SearchResultPayload.py +67 -0
  138. cognee/modules/search/types/SearchResult.py +1 -8
  139. cognee/modules/search/types/SearchType.py +1 -2
  140. cognee/modules/search/types/__init__.py +1 -1
  141. cognee/modules/search/utils/__init__.py +1 -2
  142. cognee/modules/search/utils/transform_insights_to_graph.py +2 -2
  143. cognee/modules/search/utils/{transform_context_to_graph.py → transform_triplets_to_graph.py} +2 -2
  144. cognee/modules/users/authentication/default/default_transport.py +11 -1
  145. cognee/modules/users/authentication/get_api_auth_backend.py +2 -1
  146. cognee/modules/users/authentication/get_client_auth_backend.py +2 -1
  147. cognee/modules/users/methods/create_user.py +0 -9
  148. cognee/modules/users/permissions/methods/has_user_management_permission.py +29 -0
  149. cognee/modules/visualization/cognee_network_visualization.py +1 -1
  150. cognee/run_migrations.py +48 -0
  151. cognee/shared/exceptions/__init__.py +1 -3
  152. cognee/shared/exceptions/exceptions.py +11 -1
  153. cognee/shared/usage_logger.py +332 -0
  154. cognee/shared/utils.py +12 -5
  155. cognee/tasks/chunks/__init__.py +9 -0
  156. cognee/tasks/cleanup/cleanup_unused_data.py +172 -0
  157. cognee/tasks/graph/__init__.py +7 -0
  158. cognee/tasks/memify/__init__.py +8 -0
  159. cognee/tasks/memify/extract_usage_frequency.py +613 -0
  160. cognee/tasks/summarization/models.py +0 -2
  161. cognee/tasks/temporal_graph/__init__.py +0 -1
  162. cognee/tasks/translation/__init__.py +96 -0
  163. cognee/tasks/translation/config.py +110 -0
  164. cognee/tasks/translation/detect_language.py +190 -0
  165. cognee/tasks/translation/exceptions.py +62 -0
  166. cognee/tasks/translation/models.py +72 -0
  167. cognee/tasks/translation/providers/__init__.py +44 -0
  168. cognee/tasks/translation/providers/azure_provider.py +192 -0
  169. cognee/tasks/translation/providers/base.py +85 -0
  170. cognee/tasks/translation/providers/google_provider.py +158 -0
  171. cognee/tasks/translation/providers/llm_provider.py +143 -0
  172. cognee/tasks/translation/translate_content.py +282 -0
  173. cognee/tasks/web_scraper/default_url_crawler.py +6 -2
  174. cognee/tests/cli_tests/cli_unit_tests/test_cli_commands.py +1 -0
  175. cognee/tests/cli_tests/cli_unit_tests/test_cli_edge_cases.py +3 -0
  176. cognee/tests/integration/retrieval/test_brute_force_triplet_search_with_cognify.py +62 -0
  177. cognee/tests/integration/retrieval/test_chunks_retriever.py +115 -16
  178. cognee/tests/integration/retrieval/test_graph_completion_retriever.py +13 -5
  179. cognee/tests/integration/retrieval/test_graph_completion_retriever_context_extension.py +22 -20
  180. cognee/tests/integration/retrieval/test_graph_completion_retriever_cot.py +23 -24
  181. cognee/tests/integration/retrieval/test_rag_completion_retriever.py +70 -5
  182. cognee/tests/integration/retrieval/test_structured_output.py +62 -18
  183. cognee/tests/integration/retrieval/test_summaries_retriever.py +20 -9
  184. cognee/tests/integration/retrieval/test_temporal_retriever.py +38 -8
  185. cognee/tests/integration/retrieval/test_triplet_retriever.py +13 -4
  186. cognee/tests/integration/shared/test_usage_logger_integration.py +255 -0
  187. cognee/tests/tasks/translation/README.md +147 -0
  188. cognee/tests/tasks/translation/__init__.py +1 -0
  189. cognee/tests/tasks/translation/config_test.py +93 -0
  190. cognee/tests/tasks/translation/detect_language_test.py +118 -0
  191. cognee/tests/tasks/translation/providers_test.py +151 -0
  192. cognee/tests/tasks/translation/translate_content_test.py +213 -0
  193. cognee/tests/test_chromadb.py +1 -1
  194. cognee/tests/test_cleanup_unused_data.py +165 -0
  195. cognee/tests/test_delete_by_id.py +6 -6
  196. cognee/tests/test_extract_usage_frequency.py +308 -0
  197. cognee/tests/test_kuzu.py +17 -7
  198. cognee/tests/test_lancedb.py +3 -1
  199. cognee/tests/test_library.py +1 -1
  200. cognee/tests/test_neo4j.py +17 -7
  201. cognee/tests/test_neptune_analytics_vector.py +3 -1
  202. cognee/tests/test_permissions.py +172 -187
  203. cognee/tests/test_pgvector.py +3 -1
  204. cognee/tests/test_relational_db_migration.py +15 -1
  205. cognee/tests/test_remote_kuzu.py +3 -1
  206. cognee/tests/test_s3_file_storage.py +1 -1
  207. cognee/tests/test_search_db.py +97 -110
  208. cognee/tests/test_usage_logger_e2e.py +268 -0
  209. cognee/tests/unit/api/test_get_raw_data_endpoint.py +206 -0
  210. cognee/tests/unit/eval_framework/answer_generation_test.py +4 -3
  211. cognee/tests/unit/infrastructure/databases/cache/test_cache_config.py +2 -0
  212. cognee/tests/unit/modules/graph/cognee_graph_elements_test.py +42 -2
  213. cognee/tests/unit/modules/graph/cognee_graph_test.py +329 -31
  214. cognee/tests/unit/modules/retrieval/chunks_retriever_test.py +31 -59
  215. cognee/tests/unit/modules/retrieval/graph_completion_retriever_context_extension_test.py +70 -33
  216. cognee/tests/unit/modules/retrieval/graph_completion_retriever_cot_test.py +72 -52
  217. cognee/tests/unit/modules/retrieval/graph_completion_retriever_test.py +27 -33
  218. cognee/tests/unit/modules/retrieval/rag_completion_retriever_test.py +28 -15
  219. cognee/tests/unit/modules/retrieval/summaries_retriever_test.py +37 -42
  220. cognee/tests/unit/modules/retrieval/temporal_retriever_test.py +48 -64
  221. cognee/tests/unit/modules/retrieval/test_brute_force_triplet_search.py +263 -24
  222. cognee/tests/unit/modules/retrieval/test_node_edge_vector_search.py +273 -0
  223. cognee/tests/unit/modules/retrieval/triplet_retriever_test.py +30 -16
  224. cognee/tests/unit/modules/search/test_get_search_type_retriever_instance.py +125 -0
  225. cognee/tests/unit/modules/search/test_search.py +176 -0
  226. cognee/tests/unit/modules/search/test_search_prepare_search_result_contract.py +190 -0
  227. cognee/tests/unit/modules/users/test_tutorial_notebook_creation.py +511 -297
  228. cognee/tests/unit/shared/test_usage_logger.py +241 -0
  229. cognee/tests/unit/users/permissions/test_has_user_management_permission.py +46 -0
  230. {cognee-0.5.1.dev0.dist-info → cognee-0.5.2.dist-info}/METADATA +22 -17
  231. {cognee-0.5.1.dev0.dist-info → cognee-0.5.2.dist-info}/RECORD +235 -147
  232. cognee/api/.env.example +0 -5
  233. cognee/modules/retrieval/base_graph_retriever.py +0 -24
  234. cognee/modules/search/methods/get_search_type_tools.py +0 -223
  235. cognee/modules/search/methods/no_access_control_search.py +0 -62
  236. cognee/modules/search/utils/prepare_search_result.py +0 -63
  237. cognee/tests/test_feedback_enrichment.py +0 -174
  238. {cognee-0.5.1.dev0.dist-info → cognee-0.5.2.dist-info}/WHEEL +0 -0
  239. {cognee-0.5.1.dev0.dist-info → cognee-0.5.2.dist-info}/entry_points.txt +0 -0
  240. {cognee-0.5.1.dev0.dist-info → cognee-0.5.2.dist-info}/licenses/LICENSE +0 -0
  241. {cognee-0.5.1.dev0.dist-info → cognee-0.5.2.dist-info}/licenses/NOTICE.md +0 -0
@@ -0,0 +1,96 @@
1
+ """
2
+ Translation task for Cognee.
3
+
4
+ This module provides multilingual content translation capabilities,
5
+ allowing automatic detection and translation of non-English content
6
+ to a target language while preserving original text and metadata.
7
+
8
+ Main Components:
9
+ - translate_content: Main task function for translating document chunks
10
+ - translate_text: Convenience function for translating single texts
11
+ - batch_translate_texts: Batch translation for multiple texts
12
+ - detect_language: Language detection utility
13
+ - TranslatedContent: DataPoint model for translated content
14
+ - LanguageMetadata: DataPoint model for language information
15
+
16
+ Supported Translation Providers:
17
+ - LLM (default): Uses the configured LLM via existing infrastructure
18
+ - Google Translate: Requires google-cloud-translate package
19
+ - Azure Translator: Requires Azure Translator API key
20
+
21
+ Example Usage:
22
+ ```python
23
+ from cognee.tasks.translation import translate_content, translate_text
24
+
25
+ # Translate document chunks in a pipeline
26
+ translated_chunks = await translate_content(
27
+ chunks,
28
+ target_language="en",
29
+ translation_provider="llm"
30
+ )
31
+
32
+ # Translate a single text
33
+ result = await translate_text("Bonjour le monde!")
34
+ print(result.translated_text) # "Hello world!"
35
+ ```
36
+ """
37
+
38
+ from .config import get_translation_config, TranslationConfig
39
+ from .detect_language import (
40
+ detect_language,
41
+ detect_language_async,
42
+ LanguageDetectionResult,
43
+ get_language_name,
44
+ )
45
+ from .exceptions import (
46
+ TranslationError,
47
+ LanguageDetectionError,
48
+ TranslationProviderError,
49
+ UnsupportedLanguageError,
50
+ TranslationConfigError,
51
+ )
52
+ from .models import TranslatedContent, LanguageMetadata
53
+ from .providers import (
54
+ TranslationProvider,
55
+ TranslationResult,
56
+ get_translation_provider,
57
+ LLMTranslationProvider,
58
+ GoogleTranslationProvider,
59
+ AzureTranslationProvider,
60
+ )
61
+ from .translate_content import (
62
+ translate_content,
63
+ translate_text,
64
+ batch_translate_texts,
65
+ )
66
+
67
+ __all__ = [
68
+ # Main task functions
69
+ "translate_content",
70
+ "translate_text",
71
+ "batch_translate_texts",
72
+ # Language detection
73
+ "detect_language",
74
+ "detect_language_async",
75
+ "LanguageDetectionResult",
76
+ "get_language_name",
77
+ # Models
78
+ "TranslatedContent",
79
+ "LanguageMetadata",
80
+ # Configuration
81
+ "get_translation_config",
82
+ "TranslationConfig",
83
+ # Providers
84
+ "TranslationProvider",
85
+ "TranslationResult",
86
+ "get_translation_provider",
87
+ "LLMTranslationProvider",
88
+ "GoogleTranslationProvider",
89
+ "AzureTranslationProvider",
90
+ # Exceptions
91
+ "TranslationError",
92
+ "LanguageDetectionError",
93
+ "TranslationProviderError",
94
+ "UnsupportedLanguageError",
95
+ "TranslationConfigError",
96
+ ]
@@ -0,0 +1,110 @@
1
+ from functools import lru_cache
2
+ from typing import Literal, Optional
3
+
4
+ from pydantic import AliasChoices, Field
5
+ from pydantic_settings import BaseSettings, SettingsConfigDict
6
+
7
+
8
+ TranslationProviderType = Literal["llm", "google", "azure"]
9
+
10
+
11
class TranslationConfig(BaseSettings):
    """
    Settings for the translation task, read from the environment or a `.env` file.

    Every aliased field accepts either its upper-case name or its field name:
    - TRANSLATION_PROVIDER: The translation service to use ("llm", "google", "azure")
    - TARGET_LANGUAGE: Default target language (ISO 639-1 code, e.g., "en", "es", "fr")
    - CONFIDENCE_THRESHOLD: Minimum confidence for language detection (0.0 to 1.0)
    - GOOGLE_TRANSLATE_API_KEY: API key for Google Translate
    - GOOGLE_PROJECT_ID: Google Cloud project ID
    - AZURE_TRANSLATOR_KEY: API key for Azure Translator
    - AZURE_TRANSLATOR_REGION: Region for Azure Translator
    - AZURE_TRANSLATOR_ENDPOINT: Endpoint URL for Azure Translator
    - TRANSLATION_BATCH_SIZE: Number of texts to translate per batch
    - TRANSLATION_MAX_RETRIES: Maximum retry attempts on failure
    - TRANSLATION_TIMEOUT_SECONDS: Request timeout in seconds
    """

    # --- Provider selection and language defaults ---
    translation_provider: TranslationProviderType = Field(
        default="llm",
        validation_alias=AliasChoices("TRANSLATION_PROVIDER", "translation_provider"),
    )
    target_language: str = Field(
        default="en",
        validation_alias=AliasChoices("TARGET_LANGUAGE", "target_language"),
    )
    # Bounded to the closed interval [0.0, 1.0] by the ge/le constraints.
    confidence_threshold: float = Field(
        default=0.8,
        ge=0.0,
        le=1.0,
        validation_alias=AliasChoices("CONFIDENCE_THRESHOLD", "confidence_threshold"),
    )

    # --- Google Translate credentials ---
    google_translate_api_key: Optional[str] = Field(
        default=None,
        validation_alias=AliasChoices("GOOGLE_TRANSLATE_API_KEY", "google_translate_api_key"),
    )
    google_project_id: Optional[str] = Field(
        default=None,
        validation_alias=AliasChoices("GOOGLE_PROJECT_ID", "google_project_id"),
    )

    # --- Azure Translator credentials and endpoint ---
    azure_translator_key: Optional[str] = Field(
        default=None,
        validation_alias=AliasChoices("AZURE_TRANSLATOR_KEY", "azure_translator_key"),
    )
    azure_translator_region: Optional[str] = Field(
        default=None,
        validation_alias=AliasChoices("AZURE_TRANSLATOR_REGION", "azure_translator_region"),
    )
    azure_translator_endpoint: str = Field(
        default="https://api.cognitive.microsofttranslator.com",
        validation_alias=AliasChoices("AZURE_TRANSLATOR_ENDPOINT", "azure_translator_endpoint"),
    )

    # The LLM provider has no fields here; it uses the existing LLM configuration.

    # --- Performance knobs (environment names carry a TRANSLATION_ prefix) ---
    batch_size: int = Field(
        default=10,
        validation_alias=AliasChoices("TRANSLATION_BATCH_SIZE", "batch_size"),
    )
    max_retries: int = Field(
        default=3,
        validation_alias=AliasChoices("TRANSLATION_MAX_RETRIES", "max_retries"),
    )
    timeout_seconds: int = Field(
        default=30,
        validation_alias=AliasChoices("TRANSLATION_TIMEOUT_SECONDS", "timeout_seconds"),
    )

    # --- Language detection behaviour (no explicit alias declared) ---
    min_text_length_for_detection: int = 10
    skip_detection_for_short_text: bool = True

    model_config = SettingsConfigDict(env_file=".env", extra="allow")

    def to_dict(self) -> dict:
        """Return the provider, language, threshold and performance settings as a dict.

        Credentials, the Azure endpoint and the detection settings are not included.
        """
        exported_fields = (
            "translation_provider",
            "target_language",
            "confidence_threshold",
            "batch_size",
            "max_retries",
            "timeout_seconds",
        )
        return {field_name: getattr(self, field_name) for field_name in exported_fields}
100
+
101
+
102
@lru_cache(maxsize=128)
def get_translation_config() -> TranslationConfig:
    """Return the shared TranslationConfig, constructing it only on the first call."""
    settings = TranslationConfig()
    return settings
106
+
107
+
108
def clear_translation_config_cache() -> None:
    """Drop the memoized configuration so the next lookup builds a fresh one (for tests)."""
    cached_getter = get_translation_config
    cached_getter.cache_clear()
@@ -0,0 +1,190 @@
1
+ from dataclasses import dataclass
2
+ from typing import Optional
3
+
4
+ from cognee.shared.logging_utils import get_logger
5
+
6
+ from .config import get_translation_config
7
+ from .exceptions import LanguageDetectionError
8
+
9
+ logger = get_logger(__name__)
10
+
11
+
12
+ # ISO 639-1 language code to name mapping
13
+ LANGUAGE_NAMES = {
14
+ "af": "Afrikaans",
15
+ "ar": "Arabic",
16
+ "bg": "Bulgarian",
17
+ "bn": "Bengali",
18
+ "ca": "Catalan",
19
+ "cs": "Czech",
20
+ "cy": "Welsh",
21
+ "da": "Danish",
22
+ "de": "German",
23
+ "el": "Greek",
24
+ "en": "English",
25
+ "es": "Spanish",
26
+ "et": "Estonian",
27
+ "fa": "Persian",
28
+ "fi": "Finnish",
29
+ "fr": "French",
30
+ "gu": "Gujarati",
31
+ "he": "Hebrew",
32
+ "hi": "Hindi",
33
+ "hr": "Croatian",
34
+ "hu": "Hungarian",
35
+ "id": "Indonesian",
36
+ "it": "Italian",
37
+ "ja": "Japanese",
38
+ "kn": "Kannada",
39
+ "ko": "Korean",
40
+ "lt": "Lithuanian",
41
+ "lv": "Latvian",
42
+ "mk": "Macedonian",
43
+ "ml": "Malayalam",
44
+ "mr": "Marathi",
45
+ "ne": "Nepali",
46
+ "nl": "Dutch",
47
+ "no": "Norwegian",
48
+ "pa": "Punjabi",
49
+ "pl": "Polish",
50
+ "pt": "Portuguese",
51
+ "ro": "Romanian",
52
+ "ru": "Russian",
53
+ "sk": "Slovak",
54
+ "sl": "Slovenian",
55
+ "so": "Somali",
56
+ "sq": "Albanian",
57
+ "sv": "Swedish",
58
+ "sw": "Swahili",
59
+ "ta": "Tamil",
60
+ "te": "Telugu",
61
+ "th": "Thai",
62
+ "tl": "Tagalog",
63
+ "tr": "Turkish",
64
+ "uk": "Ukrainian",
65
+ "ur": "Urdu",
66
+ "vi": "Vietnamese",
67
+ "zh-cn": "Chinese (Simplified)",
68
+ "zh-tw": "Chinese (Traditional)",
69
+ }
70
+
71
+
72
@dataclass
class LanguageDetectionResult:
    """Outcome of one language-detection call."""

    # Code reported by the detector; "unknown" when detection was skipped.
    language_code: str
    # Display name for the code; falls back to the code itself when unmapped.
    language_name: str
    # Probability attached to the best candidate; 0.0 when detection was skipped.
    confidence: float
    # True when the detected language differs from the target with enough confidence.
    requires_translation: bool
    # Length of the analysed text in characters.
    character_count: int
81
+
82
+
83
def get_language_name(language_code: str) -> str:
    """Return the display name for a language code, or the code unchanged if unmapped."""
    normalized_code = language_code.lower()
    if normalized_code in LANGUAGE_NAMES:
        return LANGUAGE_NAMES[normalized_code]
    # Unknown codes are echoed back exactly as given (original casing preserved).
    return language_code
86
+
87
+
88
def detect_language(
    text: str,
    target_language: str = "en",
    confidence_threshold: Optional[float] = None,
) -> LanguageDetectionResult:
    """
    Detect the language of the given text.

    Uses the langdetect library, which is imported lazily inside this function.

    Args:
        text: The text to analyze
        target_language: The target language for translation comparison
        confidence_threshold: Minimum confidence to consider detection reliable.
            When None, the configured `confidence_threshold` is used; an
            explicit 0.0 is honoured.

    Returns:
        LanguageDetectionResult with language info and translation requirement.
        For empty or too-short text (when skipping is enabled in the config) the
        result has language_code "unknown" and requires_translation False.

    Raises:
        LanguageDetectionError: If the text is too short and skipping is
            disabled, if langdetect is not installed, or if detection fails
    """
    config = get_translation_config()
    # Compare against None explicitly: 0.0 is a valid caller-supplied threshold
    # and must not be silently replaced by the configured default.
    if confidence_threshold is None:
        threshold = config.confidence_threshold
    else:
        threshold = confidence_threshold

    # Computed once so the None/empty case never reaches len(None).
    character_count = len(text) if text else 0

    # Handle empty or very short text
    if not text or len(text.strip()) < config.min_text_length_for_detection:
        if config.skip_detection_for_short_text:
            return LanguageDetectionResult(
                language_code="unknown",
                language_name="Unknown",
                confidence=0.0,
                requires_translation=False,
                character_count=character_count,
            )
        raise LanguageDetectionError(
            f"Text too short for reliable language detection: {character_count} characters"
        )

    try:
        from langdetect import detect_langs, LangDetectException
    except ImportError as import_error:
        raise LanguageDetectionError(
            "langdetect is required for language detection. Install it with: pip install langdetect",
            original_error=import_error,
        )

    try:
        # Get detection results with probabilities
        detections = detect_langs(text)

        if not detections:
            raise LanguageDetectionError("No language detected")

        # Get the most likely language (first entry of the detector's result)
        best_detection = detections[0]
        language_code = best_detection.lang
        confidence = best_detection.prob

        # Translation is needed only for a confident, non-target detection
        requires_translation = (
            language_code.lower() != target_language.lower() and confidence >= threshold
        )

        return LanguageDetectionResult(
            language_code=language_code,
            language_name=get_language_name(language_code),
            confidence=confidence,
            requires_translation=requires_translation,
            character_count=len(text),
        )

    except LanguageDetectionError:
        # Already the right error type (e.g. "No language detected"); do not
        # re-wrap it as an "unexpected" error in the generic handler below.
        raise
    except LangDetectException as e:
        logger.warning(f"Language detection failed: {e}")
        raise LanguageDetectionError(f"Language detection failed: {e}", original_error=e)
    except Exception as e:
        logger.error(f"Unexpected error during language detection: {e}")
        raise LanguageDetectionError(
            f"Unexpected error during language detection: {e}", original_error=e
        )
167
+
168
+
169
async def detect_language_async(
    text: str,
    target_language: str = "en",
    confidence_threshold: Optional[float] = None,
) -> LanguageDetectionResult:
    """
    Run `detect_language` in the running loop's default executor and await it.

    Args:
        text: The text to analyze
        target_language: The target language for translation comparison
        confidence_threshold: Minimum confidence to consider detection reliable

    Returns:
        LanguageDetectionResult with language info and translation requirement
    """
    import asyncio

    detection_args = (text, target_language, confidence_threshold)
    event_loop = asyncio.get_running_loop()
    # Passing None as the executor selects the loop's default executor.
    pending_result = event_loop.run_in_executor(None, detect_language, *detection_args)
    return await pending_result
@@ -0,0 +1,62 @@
1
class TranslationError(Exception):
    """Base exception for translation errors.

    Stores the message on `message` and the optional underlying exception on
    `original_error`; when one is given it is also set as `__cause__`.
    """

    def __init__(self, message: str, original_error: Exception = None):
        """
        Args:
            message: Human-readable description; also passed to `Exception`.
            original_error: Underlying exception that triggered this error, if any.
        """
        self.message = message
        self.original_error = original_error
        super().__init__(self.message)
        # Identity check rather than truthiness: an exception instance whose
        # class defines __len__ or __bool__ can be falsy and would otherwise
        # lose its place in the cause chain.
        if original_error is not None:
            self.__cause__ = original_error
10
+
11
+
12
class LanguageDetectionError(TranslationError):
    """Raised when the language of a text cannot be determined."""

    def __init__(
        self,
        message: str = "Failed to detect language",
        original_error: Exception = None,
    ):
        # Only supplies a default message; storage is handled by the base class.
        super().__init__(message=message, original_error=original_error)
19
+
20
+
21
class TranslationProviderError(TranslationError):
    """Raised when a translation provider reports or causes a failure."""

    def __init__(
        self,
        provider: str,
        message: str = "Translation provider error",
        original_error: Exception = None,
    ):
        # Keep the provider name available to handlers, and prefix it to the
        # message so it is visible in logs and tracebacks.
        self.provider = provider
        super().__init__(f"[{provider}] {message}", original_error)
33
+
34
+
35
class UnsupportedLanguageError(TranslationError):
    """Raised when a language is not supported."""

    def __init__(
        self,
        language: str,
        provider: str = None,
        message: str = None,
        original_error: Exception = None,
    ):
        self.language = language
        self.provider = provider
        if message is None:
            # Build the default message only when the caller supplied none.
            # NOTE(review): assumes the provider suffix applies to the default
            # message only, not to a caller-supplied one; confirm.
            provider_suffix = f" by {provider}" if provider else ""
            message = f"Language '{language}' is not supported" + provider_suffix
        super().__init__(message, original_error)
52
+
53
+
54
class TranslationConfigError(TranslationError):
    """Raised when the translation configuration is invalid."""

    def __init__(
        self,
        message: str = "Invalid translation configuration",
        original_error: Exception = None,
    ):
        # Only supplies a default message; storage is handled by the base class.
        super().__init__(message=message, original_error=original_error)
@@ -0,0 +1,72 @@
1
+ from datetime import datetime, timezone
2
+ from typing import Optional
3
+ from uuid import UUID
4
+
5
+ from cognee.infrastructure.engine import DataPoint
6
+ from cognee.modules.chunking.models import DocumentChunk
7
+
8
+
9
class TranslatedContent(DataPoint):
    """
    Represents translated content with quality metrics.

    This class stores both the original and translated versions of content,
    along with metadata about the translation process including source and
    target languages, translation provider used, and confidence scores.

    Instance variables include:

    - original_chunk_id: UUID of the original document chunk
    - original_text: The original text before translation
    - translated_text: The translated text content
    - source_language: Detected or specified source language code (e.g., "es", "fr", "de")
    - target_language: Target language code for translation (default: "en")
    - translation_provider: Name of the translation service used
    - confidence_score: Translation quality/confidence score (expected 0.0 to 1.0;
      no range constraint is declared on the field)
    - translation_timestamp: When the translation was performed; filled with the
      current UTC time by `__init__` when missing or None
    - translated_from: Reference to the original DocumentChunk
    """

    original_chunk_id: UUID
    original_text: str
    translated_text: str
    source_language: str
    target_language: str = "en"
    translation_provider: str
    confidence_score: float
    # Optional so the annotation agrees with the None default; __init__ replaces
    # None with a timestamp for instances built through the constructor.
    translation_timestamp: Optional[datetime] = None
    translated_from: Optional[DocumentChunk] = None

    metadata: dict = {"index_fields": ["source_language", "translated_text"]}

    def __init__(self, **data):
        """Create the data point, stamping it with the current UTC time if none was given."""
        if data.get("translation_timestamp") is None:
            data["translation_timestamp"] = datetime.now(timezone.utc)
        super().__init__(**data)
46
+
47
+
48
class LanguageMetadata(DataPoint):
    """
    Detected-language information attached to a piece of content.

    Fields:

    - content_id: UUID of the associated content
    - detected_language: ISO 639-1 language code (e.g., "en", "es", "fr")
    - language_confidence: Confidence score for language detection (0.0 to 1.0)
    - requires_translation: Whether the content needs translation
    - character_count: Number of characters in the content
    - language_name: Human-readable language name (e.g., "English", "Spanish");
      optional, defaults to None
    """

    # Identity of the content this record describes.
    content_id: UUID

    # Detection outcome.
    detected_language: str
    language_confidence: float
    requires_translation: bool

    # Size of the analysed content and optional display name.
    character_count: int
    language_name: Optional[str] = None

    metadata: dict = {"index_fields": ["detected_language"]}
@@ -0,0 +1,44 @@
1
+ from .base import TranslationProvider, TranslationResult
2
+ from .llm_provider import LLMTranslationProvider
3
+ from .google_provider import GoogleTranslationProvider
4
+ from .azure_provider import AzureTranslationProvider
5
+
6
+ __all__ = [
7
+ "TranslationProvider",
8
+ "TranslationResult",
9
+ "LLMTranslationProvider",
10
+ "GoogleTranslationProvider",
11
+ "AzureTranslationProvider",
12
+ "get_translation_provider",
13
+ ]
14
+
15
+
16
def get_translation_provider(provider_name: str) -> TranslationProvider:
    """
    Build a translation provider instance from its name.

    Args:
        provider_name: Case-insensitive provider name, one of:
            - "llm": LLMTranslationProvider
            - "google": GoogleTranslationProvider
            - "azure": AzureTranslationProvider

    Returns:
        A new instance of the matching provider class, constructed with no arguments

    Raises:
        ValueError: If the provider name is not recognized
    """
    providers = {
        "llm": LLMTranslationProvider,
        "google": GoogleTranslationProvider,
        "azure": AzureTranslationProvider,
    }

    # Normalize once and resolve with a single lookup.
    provider_class = providers.get(provider_name.lower())
    if provider_class is None:
        raise ValueError(
            f"Unknown translation provider: {provider_name}. "
            f"Available providers: {list(providers.keys())}"
        )

    return provider_class()