cognee 0.3.4.dev4__py3-none-any.whl → 0.3.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (184)
  1. cognee/api/client.py +16 -7
  2. cognee/api/health.py +5 -9
  3. cognee/api/v1/add/add.py +3 -1
  4. cognee/api/v1/cognify/cognify.py +44 -7
  5. cognee/api/v1/permissions/routers/get_permissions_router.py +8 -4
  6. cognee/api/v1/search/search.py +3 -0
  7. cognee/api/v1/ui/__init__.py +1 -1
  8. cognee/api/v1/ui/ui.py +215 -150
  9. cognee/api/v1/update/__init__.py +1 -0
  10. cognee/api/v1/update/routers/__init__.py +1 -0
  11. cognee/api/v1/update/routers/get_update_router.py +90 -0
  12. cognee/api/v1/update/update.py +100 -0
  13. cognee/base_config.py +5 -2
  14. cognee/cli/_cognee.py +28 -10
  15. cognee/cli/commands/delete_command.py +34 -2
  16. cognee/eval_framework/corpus_builder/task_getters/get_default_tasks_by_indices.py +2 -2
  17. cognee/eval_framework/evaluation/direct_llm_eval_adapter.py +3 -2
  18. cognee/eval_framework/modal_eval_dashboard.py +9 -1
  19. cognee/infrastructure/databases/graph/config.py +9 -9
  20. cognee/infrastructure/databases/graph/get_graph_engine.py +4 -21
  21. cognee/infrastructure/databases/graph/kuzu/adapter.py +60 -9
  22. cognee/infrastructure/databases/hybrid/neptune_analytics/NeptuneAnalyticsAdapter.py +3 -3
  23. cognee/infrastructure/databases/relational/config.py +4 -4
  24. cognee/infrastructure/databases/relational/create_relational_engine.py +11 -3
  25. cognee/infrastructure/databases/vector/chromadb/ChromaDBAdapter.py +7 -3
  26. cognee/infrastructure/databases/vector/config.py +7 -7
  27. cognee/infrastructure/databases/vector/create_vector_engine.py +7 -15
  28. cognee/infrastructure/databases/vector/embeddings/EmbeddingEngine.py +9 -0
  29. cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +11 -0
  30. cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +19 -2
  31. cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +11 -0
  32. cognee/infrastructure/databases/vector/embeddings/config.py +8 -0
  33. cognee/infrastructure/databases/vector/embeddings/get_embedding_engine.py +5 -0
  34. cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +11 -10
  35. cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py +48 -38
  36. cognee/infrastructure/databases/vector/vector_db_interface.py +8 -4
  37. cognee/infrastructure/files/storage/S3FileStorage.py +15 -5
  38. cognee/infrastructure/files/storage/s3_config.py +1 -0
  39. cognee/infrastructure/files/utils/open_data_file.py +7 -14
  40. cognee/infrastructure/llm/LLMGateway.py +19 -117
  41. cognee/infrastructure/llm/config.py +28 -13
  42. cognee/infrastructure/llm/{structured_output_framework/litellm_instructor/extraction → extraction}/extract_categories.py +2 -1
  43. cognee/infrastructure/llm/{structured_output_framework/litellm_instructor/extraction → extraction}/extract_event_entities.py +3 -2
  44. cognee/infrastructure/llm/{structured_output_framework/litellm_instructor/extraction → extraction}/extract_summary.py +3 -2
  45. cognee/infrastructure/llm/{structured_output_framework/litellm_instructor/extraction → extraction}/knowledge_graph/extract_content_graph.py +2 -1
  46. cognee/infrastructure/llm/{structured_output_framework/litellm_instructor/extraction → extraction}/knowledge_graph/extract_event_graph.py +3 -2
  47. cognee/infrastructure/llm/prompts/read_query_prompt.py +3 -2
  48. cognee/infrastructure/llm/prompts/show_prompt.py +35 -0
  49. cognee/infrastructure/llm/prompts/test.txt +1 -0
  50. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/__init__.py +2 -2
  51. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/async_client.py +50 -397
  52. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/inlinedbaml.py +2 -3
  53. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/parser.py +8 -88
  54. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/runtime.py +78 -0
  55. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/stream_types.py +2 -99
  56. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/sync_client.py +49 -401
  57. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/type_builder.py +19 -882
  58. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/type_map.py +2 -34
  59. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/types.py +2 -107
  60. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/acreate_structured_output.baml +26 -0
  61. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/__init__.py +1 -2
  62. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/acreate_structured_output.py +76 -0
  63. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/create_dynamic_baml_type.py +122 -0
  64. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/generators.baml +3 -3
  65. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py +0 -32
  66. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py +107 -98
  67. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py +5 -6
  68. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py +5 -6
  69. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/llm_interface.py +0 -26
  70. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +17 -67
  71. cognee/infrastructure/llm/tokenizer/Gemini/adapter.py +8 -7
  72. cognee/infrastructure/llm/utils.py +4 -4
  73. cognee/infrastructure/loaders/LoaderEngine.py +5 -2
  74. cognee/infrastructure/loaders/external/__init__.py +7 -0
  75. cognee/infrastructure/loaders/external/advanced_pdf_loader.py +244 -0
  76. cognee/infrastructure/loaders/supported_loaders.py +7 -0
  77. cognee/modules/data/methods/create_authorized_dataset.py +9 -0
  78. cognee/modules/data/methods/get_authorized_dataset.py +1 -1
  79. cognee/modules/data/methods/get_authorized_dataset_by_name.py +11 -0
  80. cognee/modules/data/methods/get_deletion_counts.py +92 -0
  81. cognee/modules/graph/cognee_graph/CogneeGraph.py +1 -1
  82. cognee/modules/graph/utils/expand_with_nodes_and_edges.py +22 -8
  83. cognee/modules/graph/utils/retrieve_existing_edges.py +0 -2
  84. cognee/modules/ingestion/data_types/TextData.py +0 -1
  85. cognee/modules/observability/get_observe.py +14 -0
  86. cognee/modules/observability/observers.py +1 -0
  87. cognee/modules/ontology/base_ontology_resolver.py +42 -0
  88. cognee/modules/ontology/get_default_ontology_resolver.py +41 -0
  89. cognee/modules/ontology/matching_strategies.py +53 -0
  90. cognee/modules/ontology/models.py +20 -0
  91. cognee/modules/ontology/ontology_config.py +24 -0
  92. cognee/modules/ontology/ontology_env_config.py +45 -0
  93. cognee/modules/ontology/rdf_xml/{OntologyResolver.py → RDFLibOntologyResolver.py} +20 -28
  94. cognee/modules/pipelines/layers/resolve_authorized_user_dataset.py +21 -24
  95. cognee/modules/pipelines/layers/resolve_authorized_user_datasets.py +3 -3
  96. cognee/modules/retrieval/code_retriever.py +2 -1
  97. cognee/modules/retrieval/context_providers/TripletSearchContextProvider.py +1 -4
  98. cognee/modules/retrieval/graph_completion_cot_retriever.py +6 -5
  99. cognee/modules/retrieval/graph_completion_retriever.py +0 -3
  100. cognee/modules/retrieval/insights_retriever.py +1 -1
  101. cognee/modules/retrieval/jaccard_retrival.py +60 -0
  102. cognee/modules/retrieval/lexical_retriever.py +123 -0
  103. cognee/modules/retrieval/natural_language_retriever.py +2 -1
  104. cognee/modules/retrieval/temporal_retriever.py +3 -2
  105. cognee/modules/retrieval/utils/brute_force_triplet_search.py +2 -12
  106. cognee/modules/retrieval/utils/completion.py +4 -7
  107. cognee/modules/search/methods/get_search_type_tools.py +7 -0
  108. cognee/modules/search/methods/no_access_control_search.py +1 -1
  109. cognee/modules/search/methods/search.py +32 -13
  110. cognee/modules/search/types/SearchType.py +1 -0
  111. cognee/modules/users/permissions/methods/authorized_give_permission_on_datasets.py +12 -0
  112. cognee/modules/users/permissions/methods/check_permission_on_dataset.py +11 -0
  113. cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py +10 -0
  114. cognee/modules/users/permissions/methods/get_document_ids_for_user.py +10 -0
  115. cognee/modules/users/permissions/methods/get_principal.py +9 -0
  116. cognee/modules/users/permissions/methods/get_principal_datasets.py +11 -0
  117. cognee/modules/users/permissions/methods/get_role.py +10 -0
  118. cognee/modules/users/permissions/methods/get_specific_user_permission_datasets.py +3 -3
  119. cognee/modules/users/permissions/methods/get_tenant.py +9 -0
  120. cognee/modules/users/permissions/methods/give_default_permission_to_role.py +9 -0
  121. cognee/modules/users/permissions/methods/give_default_permission_to_tenant.py +9 -0
  122. cognee/modules/users/permissions/methods/give_default_permission_to_user.py +9 -0
  123. cognee/modules/users/permissions/methods/give_permission_on_dataset.py +10 -0
  124. cognee/modules/users/roles/methods/add_user_to_role.py +11 -0
  125. cognee/modules/users/roles/methods/create_role.py +12 -1
  126. cognee/modules/users/tenants/methods/add_user_to_tenant.py +12 -0
  127. cognee/modules/users/tenants/methods/create_tenant.py +12 -1
  128. cognee/modules/visualization/cognee_network_visualization.py +13 -9
  129. cognee/shared/data_models.py +0 -1
  130. cognee/shared/utils.py +0 -32
  131. cognee/tasks/chunk_naive_llm_classifier/chunk_naive_llm_classifier.py +2 -2
  132. cognee/tasks/codingagents/coding_rule_associations.py +3 -2
  133. cognee/tasks/entity_completion/entity_extractors/llm_entity_extractor.py +3 -2
  134. cognee/tasks/graph/cascade_extract/utils/extract_content_nodes_and_relationship_names.py +3 -2
  135. cognee/tasks/graph/cascade_extract/utils/extract_edge_triplets.py +3 -2
  136. cognee/tasks/graph/cascade_extract/utils/extract_nodes.py +3 -2
  137. cognee/tasks/graph/extract_graph_from_code.py +2 -2
  138. cognee/tasks/graph/extract_graph_from_data.py +55 -12
  139. cognee/tasks/graph/extract_graph_from_data_v2.py +16 -4
  140. cognee/tasks/ingestion/migrate_relational_database.py +132 -41
  141. cognee/tasks/ingestion/resolve_data_directories.py +4 -1
  142. cognee/tasks/schema/ingest_database_schema.py +134 -0
  143. cognee/tasks/schema/models.py +40 -0
  144. cognee/tasks/storage/index_data_points.py +1 -1
  145. cognee/tasks/storage/index_graph_edges.py +3 -1
  146. cognee/tasks/summarization/summarize_code.py +2 -2
  147. cognee/tasks/summarization/summarize_text.py +2 -2
  148. cognee/tasks/temporal_graph/enrich_events.py +2 -2
  149. cognee/tasks/temporal_graph/extract_events_and_entities.py +2 -2
  150. cognee/tests/cli_tests/cli_unit_tests/test_cli_commands.py +13 -4
  151. cognee/tests/cli_tests/cli_unit_tests/test_cli_edge_cases.py +13 -3
  152. cognee/tests/test_advanced_pdf_loader.py +141 -0
  153. cognee/tests/test_chromadb.py +40 -0
  154. cognee/tests/test_cognee_server_start.py +6 -1
  155. cognee/tests/test_data/Quantum_computers.txt +9 -0
  156. cognee/tests/test_lancedb.py +211 -0
  157. cognee/tests/test_pgvector.py +40 -0
  158. cognee/tests/test_relational_db_migration.py +76 -0
  159. cognee/tests/unit/infrastructure/databases/test_index_graph_edges.py +2 -1
  160. cognee/tests/unit/modules/ontology/test_ontology_adapter.py +330 -13
  161. cognee/tests/unit/modules/retrieval/graph_completion_retriever_context_extension_test.py +0 -4
  162. cognee/tests/unit/modules/retrieval/graph_completion_retriever_cot_test.py +0 -4
  163. cognee/tests/unit/modules/retrieval/graph_completion_retriever_test.py +0 -4
  164. {cognee-0.3.4.dev4.dist-info → cognee-0.3.6.dist-info}/METADATA +92 -96
  165. {cognee-0.3.4.dev4.dist-info → cognee-0.3.6.dist-info}/RECORD +172 -160
  166. cognee/infrastructure/data/utils/extract_keywords.py +0 -48
  167. cognee/infrastructure/databases/hybrid/falkordb/FalkorDBAdapter.py +0 -1227
  168. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extract_categories.baml +0 -109
  169. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extract_content_graph.baml +0 -343
  170. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/extract_categories.py +0 -0
  171. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/extract_summary.py +0 -89
  172. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/knowledge_graph/__init__.py +0 -0
  173. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/knowledge_graph/extract_content_graph.py +0 -44
  174. cognee/tasks/graph/infer_data_ontology.py +0 -309
  175. cognee/tests/test_falkordb.py +0 -174
  176. distributed/poetry.lock +0 -12238
  177. distributed/pyproject.toml +0 -186
  178. /cognee/infrastructure/llm/{structured_output_framework/litellm_instructor/extraction → extraction}/__init__.py +0 -0
  179. /cognee/infrastructure/llm/{structured_output_framework/litellm_instructor/extraction → extraction}/knowledge_graph/__init__.py +0 -0
  180. /cognee/infrastructure/llm/{structured_output_framework/litellm_instructor/extraction → extraction}/texts.json +0 -0
  181. {cognee-0.3.4.dev4.dist-info → cognee-0.3.6.dist-info}/WHEEL +0 -0
  182. {cognee-0.3.4.dev4.dist-info → cognee-0.3.6.dist-info}/entry_points.txt +0 -0
  183. {cognee-0.3.4.dev4.dist-info → cognee-0.3.6.dist-info}/licenses/LICENSE +0 -0
  184. {cognee-0.3.4.dev4.dist-info → cognee-0.3.6.dist-info}/licenses/NOTICE.md +0 -0

cognee/modules/ontology/models.py
@@ -0,0 +1,20 @@
+ from typing import Any
+
+
+ class AttachedOntologyNode:
+     """Lightweight wrapper to be able to parse any ontology solution and generalize cognee interface."""
+
+     def __init__(self, uri: Any, category: str):
+         self.uri = uri
+         self.name = self._extract_name(uri)
+         self.category = category
+
+     @staticmethod
+     def _extract_name(uri: Any) -> str:
+         uri_str = str(uri)
+         if "#" in uri_str:
+             return uri_str.split("#")[-1]
+         return uri_str.rstrip("/").split("/")[-1]
+
+     def __repr__(self):
+         return f"AttachedOntologyNode(name={self.name}, category={self.category})"

cognee/modules/ontology/ontology_config.py
@@ -0,0 +1,24 @@
+ from typing import TypedDict, Optional
+
+ from cognee.modules.ontology.base_ontology_resolver import BaseOntologyResolver
+ from cognee.modules.ontology.matching_strategies import MatchingStrategy
+
+
+ class OntologyConfig(TypedDict, total=False):
+     """Configuration containing ontology resolver.
+
+     Attributes:
+         ontology_resolver: The ontology resolver instance to use
+     """
+
+     ontology_resolver: Optional[BaseOntologyResolver]
+
+
+ class Config(TypedDict, total=False):
+     """Top-level configuration dictionary.
+
+     Attributes:
+         ontology_config: Configuration containing ontology resolver
+     """
+
+     ontology_config: Optional[OntologyConfig]
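
A minimal sketch of how the two TypedDicts compose, using the RDFLibOntologyResolver from the renamed module further down; the ontology path is a placeholder. Because both classes declare total=False, every key is optional and {} is also a valid Config:

from cognee.modules.ontology.ontology_config import Config
from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver

config: Config = {
    "ontology_config": {
        # Placeholder path; any BaseOntologyResolver implementation fits here.
        "ontology_resolver": RDFLibOntologyResolver(ontology_file="ontology.owl"),
    },
}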

cognee/modules/ontology/ontology_env_config.py
@@ -0,0 +1,45 @@
+ """This module contains the configuration for ontology handling."""
+
+ from functools import lru_cache
+ from pydantic_settings import BaseSettings, SettingsConfigDict
+
+
+ class OntologyEnvConfig(BaseSettings):
+     """
+     Represents the configuration for ontology handling, including parameters for
+     ontology file storage and resolution/matching strategies.
+
+     Public methods:
+     - to_dict
+
+     Instance variables:
+     - ontology_resolver
+     - ontology_matching
+     - ontology_file_path
+     - model_config
+     """
+
+     ontology_resolver: str = "rdflib"
+     matching_strategy: str = "fuzzy"
+     ontology_file_path: str = ""
+
+     model_config = SettingsConfigDict(env_file=".env", extra="allow", populate_by_name=True)
+
+     def to_dict(self) -> dict:
+         """
+         Return the configuration as a dictionary.
+         """
+         return {
+             "ontology_resolver": self.ontology_resolver,
+             "matching_strategy": self.matching_strategy,
+             "ontology_file_path": self.ontology_file_path,
+         }
+
+
+ @lru_cache
+ def get_ontology_env_config():
+     """
+     Retrieve the ontology configuration. This function utilizes caching to return a
+     singleton instance of the OntologyConfig class for efficiency.
+     """
+     return OntologyEnvConfig()
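
Note the docstring still advertises an ontology_matching instance variable while the field is actually named matching_strategy. Usage sketch: under pydantic-settings defaults the field names map to environment variables case-insensitively, and because of @lru_cache the variables must be set before the first call; the file path below is a placeholder:

import os

os.environ["MATCHING_STRATEGY"] = "fuzzy"
os.environ["ONTOLOGY_FILE_PATH"] = "/data/ontology.owl"

from cognee.modules.ontology.ontology_env_config import get_ontology_env_config

config = get_ontology_env_config()  # cached; later calls return the same instance
print(config.to_dict())
# {'ontology_resolver': 'rdflib', 'matching_strategy': 'fuzzy', 'ontology_file_path': '/data/ontology.owl'}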

cognee/modules/ontology/rdf_xml/{OntologyResolver.py → RDFLibOntologyResolver.py}
@@ -10,31 +10,26 @@ from cognee.modules.ontology.exceptions import (
      FindClosestMatchError,
      GetSubgraphError,
  )
+ from cognee.modules.ontology.base_ontology_resolver import BaseOntologyResolver
+ from cognee.modules.ontology.models import AttachedOntologyNode
+ from cognee.modules.ontology.matching_strategies import MatchingStrategy, FuzzyMatchingStrategy

  logger = get_logger("OntologyAdapter")


- class AttachedOntologyNode:
-     """Lightweight wrapper to be able to parse any ontology solution and generalize cognee interface."""
+ class RDFLibOntologyResolver(BaseOntologyResolver):
+     """RDFLib-based ontology resolver implementation.

-     def __init__(self, uri: URIRef, category: str):
-         self.uri = uri
-         self.name = self._extract_name(uri)
-         self.category = category
+     This implementation uses RDFLib to parse and work with RDF/OWL ontology files.
+     It provides fuzzy matching and subgraph extraction capabilities for ontology entities.
+     """

-     @staticmethod
-     def _extract_name(uri: URIRef) -> str:
-         uri_str = str(uri)
-         if "#" in uri_str:
-             return uri_str.split("#")[-1]
-         return uri_str.rstrip("/").split("/")[-1]
-
-     def __repr__(self):
-         return f"AttachedOntologyNode(name={self.name}, category={self.category})"
-
-
- class OntologyResolver:
-     def __init__(self, ontology_file: Optional[str] = None):
+     def __init__(
+         self,
+         ontology_file: Optional[str] = None,
+         matching_strategy: Optional[MatchingStrategy] = None,
+     ) -> None:
+         super().__init__(matching_strategy)
          self.ontology_file = ontology_file
          try:
              if ontology_file and os.path.exists(ontology_file):
@@ -60,7 +55,7 @@ class OntologyResolver:
          name = uri_str.rstrip("/").split("/")[-1]
          return name.lower().replace(" ", "_").strip()

-     def build_lookup(self):
+     def build_lookup(self) -> None:
          try:
              classes: Dict[str, URIRef] = {}
              individuals: Dict[str, URIRef] = {}
@@ -97,7 +92,7 @@
              logger.error("Failed to build lookup dictionary: %s", str(e))
              raise RuntimeError("Lookup build failed") from e

-     def refresh_lookup(self):
+     def refresh_lookup(self) -> None:
          self.build_lookup()
          logger.info("Ontology lookup refreshed.")

@@ -105,13 +100,8 @@
          try:
              normalized_name = name.lower().replace(" ", "_").strip()
              possible_matches = list(self.lookup.get(category, {}).keys())
-             if normalized_name in possible_matches:
-                 return normalized_name

-             best_match = difflib.get_close_matches(
-                 normalized_name, possible_matches, n=1, cutoff=0.8
-             )
-             return best_match[0] if best_match else None
+             return self.matching_strategy.find_match(normalized_name, possible_matches)
          except Exception as e:
              logger.error("Error in find_closest_match: %s", str(e))
              raise FindClosestMatchError() from e
@@ -125,7 +115,9 @@

      def get_subgraph(
          self, node_name: str, node_type: str = "individuals", directed: bool = True
-     ) -> Tuple[List[Any], List[Tuple[str, str, str]], Optional[Any]]:
+     ) -> Tuple[
+         List[AttachedOntologyNode], List[Tuple[str, str, str]], Optional[AttachedOntologyNode]
+     ]:
          nodes_set = set()
          edges: List[Tuple[str, str, str]] = []
          visited = set()
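
The refactor moves the old inline difflib logic (exact match first, then get_close_matches with cutoff 0.8) behind a pluggable MatchingStrategy. A usage sketch, assuming FuzzyMatchingStrategy takes no constructor arguments and that the resolver is pointed at an existing RDF/XML file:

from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver
from cognee.modules.ontology.matching_strategies import FuzzyMatchingStrategy

resolver = RDFLibOntologyResolver(
    ontology_file="ontology.owl",  # placeholder; must exist to be parsed
    matching_strategy=FuzzyMatchingStrategy(),  # assumed no-arg constructor
)
nodes, relationships, start_node = resolver.get_subgraph("person", node_type="individuals")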

cognee/modules/pipelines/layers/resolve_authorized_user_dataset.py
@@ -1,34 +1,31 @@
  from uuid import UUID
+ from typing import Optional

- from cognee.api.v1.exceptions import DatasetNotFoundError
  from cognee.modules.users.models import User
- from cognee.modules.users.methods import get_default_user
- from cognee.modules.data.methods import (
-     create_authorized_dataset,
-     get_authorized_dataset,
-     get_authorized_dataset_by_name,
+ from cognee.modules.pipelines.layers.resolve_authorized_user_datasets import (
+     resolve_authorized_user_datasets,
  )


- async def resolve_authorized_user_dataset(dataset_id: UUID, dataset_name: str, user: User):
-     if not user:
-         user = await get_default_user()
+ async def resolve_authorized_user_dataset(
+     dataset_name: str, dataset_id: Optional[UUID] = None, user: Optional[User] = None
+ ):
+     """
+     Function handles creation and dataset authorization if dataset already exist for Cognee.
+     Verifies that provided user has necessary permission for provided Dataset.
+     If Dataset does not exist creates the Dataset and gives permission for the user creating the dataset.

-     if dataset_id:
-         authorized_dataset = await get_authorized_dataset(user, dataset_id, "write")
-     elif dataset_name:
-         authorized_dataset = await get_authorized_dataset_by_name(dataset_name, user, "write")
+     Args:
+         dataset_name: Name of the dataset.
+         dataset_id: Id of the dataset.
+         user: Cognee User request is being processed for, if None default user will be used.

-         if not authorized_dataset:
-             authorized_dataset = await create_authorized_dataset(
-                 dataset_name=dataset_name, user=user
-             )
-     else:
-         raise ValueError("Either dataset_id or dataset_name must be provided.")
+     Returns:
+         Tuple[User, Dataset]: A tuple containing the user and the authorized dataset.
+     """

-     if not authorized_dataset:
-         raise DatasetNotFoundError(
-             message=f"Dataset ({str(dataset_id) or dataset_name}) not found."
-         )
+     user, authorized_datasets = await resolve_authorized_user_datasets(
+         datasets=dataset_id if dataset_id else dataset_name, user=user
+     )

-     return user, authorized_dataset
+     return user, authorized_datasets[0]
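
The single-dataset layer is now a thin wrapper over resolve_authorized_user_datasets, returning the first (only) authorized dataset. A usage sketch, run in an async context against a configured cognee instance; the dataset name is a placeholder and dataset.id assumes the usual Dataset model field:

import asyncio
from cognee.modules.pipelines.layers.resolve_authorized_user_dataset import (
    resolve_authorized_user_dataset,
)

async def main():
    # Name-only call: creates the dataset and grants permission if it is new.
    user, dataset = await resolve_authorized_user_dataset("my_dataset")
    # Or target an existing dataset by id; user=None falls back to the default user.
    user, dataset = await resolve_authorized_user_dataset("my_dataset", dataset_id=dataset.id)

asyncio.run(main())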

cognee/modules/pipelines/layers/resolve_authorized_user_datasets.py
@@ -1,5 +1,5 @@
  from uuid import UUID
- from typing import Union, Tuple, List
+ from typing import Union, Tuple, List, Optional

  from cognee.modules.users.methods import get_default_user
  from cognee.modules.users.models import User
@@ -13,7 +13,7 @@ from cognee.modules.data.methods import (
  )

  async def resolve_authorized_user_datasets(
-     datasets: Union[str, UUID, list[str], list[UUID]], user: User = None
+     datasets: Union[str, UUID, list[str], list[UUID]], user: Optional[User] = None
  ) -> Tuple[User, List[Dataset]]:
      """
      Function handles creation and dataset authorization if datasets already exist for Cognee.
@@ -25,7 +25,7 @@ async def resolve_authorized_user_datasets(
          datasets: Dataset names or Dataset UUID (in case Datasets already exist)

      Returns:
-
+         Tuple[User, List[Dataset]]: A tuple containing the user and the list of authorized datasets.
      """
      # If no user is provided use default user
      if user is None:

cognee/modules/retrieval/code_retriever.py
@@ -7,6 +7,7 @@ from cognee.shared.logging_utils import get_logger
  from cognee.modules.retrieval.base_retriever import BaseRetriever
  from cognee.infrastructure.databases.graph import get_graph_engine
  from cognee.infrastructure.databases.vector import get_vector_engine
+ from cognee.infrastructure.llm.prompts import read_query_prompt
  from cognee.infrastructure.llm.LLMGateway import LLMGateway

  logger = get_logger("CodeRetriever")
@@ -41,7 +42,7 @@ class CodeRetriever(BaseRetriever):
              f"Processing query with LLM: '{query[:100]}{'...' if len(query) > 100 else ''}'"
          )

-         system_prompt = LLMGateway.read_query_prompt("codegraph_retriever_system.txt")
+         system_prompt = read_query_prompt("codegraph_retriever_system.txt")

          try:
              result = await LLMGateway.acreate_structured_output(
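
This is part of a pattern repeated across the retrievers below: prompt helpers are no longer called through LLMGateway but imported directly from cognee.infrastructure.llm.prompts (compare the -117 lines in LLMGateway.py in the file list). A sketch of the new call sites; the template file in the second call is hypothetical:

from cognee.infrastructure.llm.prompts import read_query_prompt, render_prompt

system_prompt = read_query_prompt("codegraph_retriever_system.txt")
# render_prompt fills a template with a context dict (hypothetical template name):
user_prompt = render_prompt("my_user_prompt.txt", context={"query": "find the parser"})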

cognee/modules/retrieval/context_providers/TripletSearchContextProvider.py
@@ -42,14 +42,12 @@ class TripletSearchContextProvider(BaseContextProvider):
          self,
          entities: List[DataPoint],
          query: str,
-         user: User,
          memory_fragment: CogneeGraph,
      ) -> List:
          """Creates search tasks for valid entities."""
          tasks = [
              brute_force_triplet_search(
                  query=f"{entity_text} {query}",
-                 user=user,
                  top_k=self.top_k,
                  collections=self.collections,
                  properties_to_project=self.properties_to_project,
@@ -84,9 +82,8 @@
          if not entities:
              return "No entities provided for context search."

-         user = await get_default_user()
          memory_fragment = await get_memory_fragment(self.properties_to_project)
-         search_tasks = self._get_search_tasks(entities, query, user, memory_fragment)
+         search_tasks = self._get_search_tasks(entities, query, memory_fragment)

          if not search_tasks:
              return "No valid entities found for context search."

cognee/modules/retrieval/graph_completion_cot_retriever.py
@@ -1,10 +1,11 @@
- from typing import Optional, List, Type
+ from typing import Optional, List, Type, Any
  from cognee.modules.graph.cognee_graph.CogneeGraphElements import Edge
  from cognee.shared.logging_utils import get_logger

  from cognee.modules.retrieval.graph_completion_retriever import GraphCompletionRetriever
  from cognee.modules.retrieval.utils.completion import generate_completion
  from cognee.infrastructure.llm.LLMGateway import LLMGateway
+ from cognee.infrastructure.llm.prompts import render_prompt, read_query_prompt

  logger = get_logger()

@@ -106,10 +107,10 @@ class GraphCompletionCotRetriever(GraphCompletionRetriever):
              logger.info(f"Chain-of-thought: round {round_idx} - answer: {completion}")
              if round_idx < max_iter:
                  valid_args = {"query": query, "answer": completion, "context": context_text}
-                 valid_user_prompt = LLMGateway.render_prompt(
+                 valid_user_prompt = render_prompt(
                      filename=self.validation_user_prompt_path, context=valid_args
                  )
-                 valid_system_prompt = LLMGateway.read_query_prompt(
+                 valid_system_prompt = read_query_prompt(
                      prompt_file_name=self.validation_system_prompt_path
                  )

@@ -119,10 +120,10 @@
                      response_model=str,
                  )
                  followup_args = {"query": query, "answer": completion, "reasoning": reasoning}
-                 followup_prompt = LLMGateway.render_prompt(
+                 followup_prompt = render_prompt(
                      filename=self.followup_user_prompt_path, context=followup_args
                  )
-                 followup_system = LLMGateway.read_query_prompt(
+                 followup_system = read_query_prompt(
                      prompt_file_name=self.followup_system_prompt_path
                  )


cognee/modules/retrieval/graph_completion_retriever.py
@@ -93,11 +93,8 @@ class GraphCompletionRetriever(BaseGraphRetriever):
          for field_name in index_fields:
              vector_index_collections.append(f"{subclass.__name__}_{field_name}")

-         user = await get_default_user()
-
          found_triplets = await brute_force_triplet_search(
              query,
-             user=user,
              top_k=self.top_k,
              collections=vector_index_collections or None,
              node_type=self.node_type,

cognee/modules/retrieval/insights_retriever.py
@@ -25,7 +25,7 @@ class InsightsRetriever(BaseGraphRetriever):
      - top_k
      """

-     def __init__(self, exploration_levels: int = 1, top_k: int = 5):
+     def __init__(self, exploration_levels: int = 1, top_k: Optional[int] = 5):
          """Initialize retriever with exploration levels and search parameters."""
          self.exploration_levels = exploration_levels
          self.top_k = top_k

cognee/modules/retrieval/jaccard_retrival.py
@@ -0,0 +1,60 @@
+ from cognee.modules.retrieval.lexical_retriever import LexicalRetriever
+ import re
+ from collections import Counter
+ from typing import Optional
+
+
+ class JaccardChunksRetriever(LexicalRetriever):
+     """
+     Retriever that specializes LexicalRetriever to use Jaccard similarity.
+     """
+
+     def __init__(
+         self,
+         top_k: int = 10,
+         with_scores: bool = False,
+         stop_words: Optional[list[str]] = None,
+         multiset_jaccard: bool = False,
+     ):
+         """
+         Parameters
+         ----------
+         top_k : int
+             Number of top results to return.
+         with_scores : bool
+             If True, return (payload, score) pairs. Otherwise, only payloads.
+         stop_words : list[str], optional
+             List of tokens to filter out.
+         multiset_jaccard : bool
+             If True, use multiset Jaccard (frequency aware).
+         """
+         self.stop_words = {t.lower() for t in stop_words} if stop_words else set()
+         self.multiset_jaccard = multiset_jaccard
+
+         super().__init__(
+             tokenizer=self._tokenizer, scorer=self._scorer, top_k=top_k, with_scores=with_scores
+         )
+
+     def _tokenizer(self, text: str) -> list[str]:
+         """
+         Tokenizer: lowercases, splits on word characters (\w+), filters stopwords.
+         """
+         tokens = re.findall(r"\w+", text.lower())
+         return [t for t in tokens if t not in self.stop_words]
+
+     def _scorer(self, query_tokens: list[str], chunk_tokens: list[str]) -> float:
+         """
+         Jaccard similarity scorer.
+         - If multiset_jaccard=True, uses frequency-aware Jaccard.
+         - Otherwise, normal set Jaccard.
+         """
+         if self.multiset_jaccard:
+             q_counts, c_counts = Counter(query_tokens), Counter(chunk_tokens)
+             numerator = sum(min(q_counts[t], c_counts[t]) for t in set(q_counts) | set(c_counts))
+             denominator = sum(max(q_counts[t], c_counts[t]) for t in set(q_counts) | set(c_counts))
+             return numerator / denominator if denominator else 0.0
+         else:
+             q_set, c_set = set(query_tokens), set(chunk_tokens)
+             if not q_set or not c_set:
+                 return 0.0
+             return len(q_set & c_set) / len(q_set | c_set)
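
A worked example of the two scoring modes, using only the pure helper methods above (no graph store needed; constructing the retriever does not touch the database until initialize() runs):

from cognee.modules.retrieval.jaccard_retrival import JaccardChunksRetriever

set_based = JaccardChunksRetriever(top_k=5, stop_words=["the", "a"])
freq_aware = JaccardChunksRetriever(top_k=5, multiset_jaccard=True)

query = set_based._tokenizer("Quantum computers")                     # ['quantum', 'computers']
chunk = set_based._tokenizer("Quantum quantum computers use qubits")  # 5 tokens, 4 distinct

print(set_based._scorer(query, chunk))   # 0.5 = 2 shared / 4 distinct tokens in the union
print(freq_aware._scorer(query, chunk))  # 0.4 = 2 (sum of min counts) / 5 (sum of max counts)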

cognee/modules/retrieval/lexical_retriever.py
@@ -0,0 +1,123 @@
+ import asyncio
+ from typing import Any, Callable, Optional
+ from heapq import nlargest
+
+ from cognee.infrastructure.databases.graph import get_graph_engine
+ from cognee.modules.retrieval.base_retriever import BaseRetriever
+ from cognee.modules.retrieval.exceptions.exceptions import NoDataError
+ from cognee.shared.logging_utils import get_logger
+
+
+ logger = get_logger("LexicalRetriever")
+
+
+ class LexicalRetriever(BaseRetriever):
+     def __init__(
+         self, tokenizer: Callable, scorer: Callable, top_k: int = 10, with_scores: bool = False
+     ):
+         if not callable(tokenizer) or not callable(scorer):
+             raise TypeError("tokenizer and scorer must be callables")
+         if not isinstance(top_k, int) or top_k <= 0:
+             raise ValueError("top_k must be a positive integer")
+
+         self.tokenizer = tokenizer
+         self.scorer = scorer
+         self.top_k = top_k
+         self.with_scores = bool(with_scores)
+
+         # Cache keyed by dataset context
+         self.chunks: dict[str, Any] = {}  # {chunk_id: tokens}
+         self.payloads: dict[str, Any] = {}  # {chunk_id: original_document}
+         self._initialized = False
+         self._init_lock = asyncio.Lock()
+
+     async def initialize(self):
+         """Initialize retriever by reading all DocumentChunks from graph_engine."""
+         async with self._init_lock:
+             if self._initialized:
+                 return
+
+             logger.info("Initializing LexicalRetriever by loading DocumentChunks from graph engine")
+
+             try:
+                 graph_engine = await get_graph_engine()
+                 nodes, _ = await graph_engine.get_filtered_graph_data([{"type": ["DocumentChunk"]}])
+             except Exception as e:
+                 logger.error("Graph engine initialization failed")
+                 raise NoDataError("Graph engine initialization failed") from e
+
+             chunk_count = 0
+             for node in nodes:
+                 try:
+                     chunk_id, document = node
+                 except Exception:
+                     logger.warning("Skipping node with unexpected shape: %r", node)
+                     continue
+
+                 if document.get("type") == "DocumentChunk" and document.get("text"):
+                     try:
+                         tokens = self.tokenizer(document["text"])
+                         if not tokens:
+                             continue
+                         self.chunks[str(document.get("id", chunk_id))] = tokens
+                         self.payloads[str(document.get("id", chunk_id))] = document
+                         chunk_count += 1
+                     except Exception as e:
+                         logger.error("Tokenizer failed for chunk %s: %s", chunk_id, str(e))
+
+             if chunk_count == 0:
+                 logger.error("Initialization completed but no valid chunks were loaded.")
+                 raise NoDataError("No valid chunks loaded during initialization.")
+
+             self._initialized = True
+             logger.info("Initialized with %d document chunks", len(self.chunks))
+
+     async def get_context(self, query: str) -> Any:
+         """Retrieves relevant chunks for the given query."""
+         if not self._initialized:
+             await self.initialize()
+
+         if not self.chunks:
+             logger.warning("No chunks available in retriever")
+             return []
+
+         try:
+             query_tokens = self.tokenizer(query)
+         except Exception as e:
+             logger.error("Failed to tokenize query: %s", str(e))
+             return []
+
+         if not query_tokens:
+             logger.warning("Query produced no tokens")
+             return []
+
+         results = []
+         for chunk_id, chunk_tokens in self.chunks.items():
+             try:
+                 score = self.scorer(query_tokens, chunk_tokens)
+                 if not isinstance(score, (int, float)):
+                     logger.warning("Non-numeric score for chunk %s → treated as 0.0", chunk_id)
+                     score = 0.0
+             except Exception as e:
+                 logger.error("Scorer failed for chunk %s: %s", chunk_id, str(e))
+                 score = 0.0
+             results.append((chunk_id, score))
+
+         top_results = nlargest(self.top_k, results, key=lambda x: x[1])
+         logger.info(
+             "Retrieved %d/%d chunks for query (len=%d)",
+             len(top_results),
+             len(results),
+             len(query_tokens),
+         )
+
+         if self.with_scores:
+             return [(self.payloads[chunk_id], score) for chunk_id, score in top_results]
+         else:
+             return [self.payloads[chunk_id] for chunk_id, _ in top_results]
+
+     async def get_completion(self, query: str, context: Optional[Any] = None) -> Any:
+         """Returns context for the given query (retrieves if not provided)."""
+         if context is None:
+             context = await self.get_context(query)
+         return context
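
A usage sketch with a custom tokenizer and scorer (the retriever accepts any callables); it assumes a cognee instance whose graph store already holds DocumentChunk nodes, e.g. after cognify has run:

import asyncio
from cognee.modules.retrieval.lexical_retriever import LexicalRetriever

def tokenize(text: str) -> list[str]:
    return text.lower().split()

def overlap(query_tokens: list[str], chunk_tokens: list[str]) -> float:
    # Fraction of distinct query tokens that appear in the chunk.
    return len(set(query_tokens) & set(chunk_tokens)) / max(len(set(query_tokens)), 1)

async def main():
    retriever = LexicalRetriever(tokenizer=tokenize, scorer=overlap, top_k=3, with_scores=True)
    for payload, score in await retriever.get_completion("quantum computers"):
        print(score, payload["text"][:80])

asyncio.run(main())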

cognee/modules/retrieval/natural_language_retriever.py
@@ -2,6 +2,7 @@ from typing import Any, Optional
  from cognee.shared.logging_utils import get_logger
  from cognee.infrastructure.databases.graph import get_graph_engine
  from cognee.infrastructure.llm.LLMGateway import LLMGateway
+ from cognee.infrastructure.llm.prompts import render_prompt
  from cognee.modules.retrieval.base_retriever import BaseRetriever
  from cognee.modules.retrieval.exceptions import SearchTypeNotSupported
  from cognee.infrastructure.databases.graph.graph_db_interface import GraphDBInterface
@@ -49,7 +50,7 @@ class NaturalLanguageRetriever(BaseRetriever):

      async def _generate_cypher_query(self, query: str, edge_schemas, previous_attempts=None) -> str:
          """Generate a Cypher query using LLM based on natural language query and schema information."""
-         system_prompt = LLMGateway.render_prompt(
+         system_prompt = render_prompt(
              self.system_prompt_path,
              context={
                  "edge_schemas": edge_schemas,

cognee/modules/retrieval/temporal_retriever.py
@@ -6,6 +6,7 @@ from operator import itemgetter
  from cognee.infrastructure.databases.vector import get_vector_engine
  from cognee.modules.retrieval.utils.completion import generate_completion
  from cognee.infrastructure.databases.graph import get_graph_engine
+ from cognee.infrastructure.llm.prompts import render_prompt
  from cognee.infrastructure.llm import LLMGateway
  from cognee.modules.retrieval.graph_completion_retriever import GraphCompletionRetriever
  from cognee.shared.logging_utils import get_logger
@@ -72,7 +73,7 @@ class TemporalRetriever(GraphCompletionRetriever):
          else:
              base_directory = None

-         system_prompt = LLMGateway.render_prompt(prompt_path, {}, base_directory=base_directory)
+         system_prompt = render_prompt(prompt_path, {}, base_directory=base_directory)

          interval = await LLMGateway.acreate_structured_output(query, system_prompt, QueryInterval)

@@ -129,7 +130,7 @@
          query_vector = (await vector_engine.embedding_engine.embed_text([query]))[0]

          vector_search_results = await vector_engine.search(
-             collection_name="Event_name", query_vector=query_vector, limit=0
+             collection_name="Event_name", query_vector=query_vector, limit=None
          )

          top_k_events = await self.filter_top_k_events(relevant_events, vector_search_results)

cognee/modules/retrieval/utils/brute_force_triplet_search.py
@@ -89,7 +89,6 @@ async def get_memory_fragment(

  async def brute_force_triplet_search(
      query: str,
-     user: User,
      top_k: int = 5,
      collections: Optional[List[str]] = None,
      properties_to_project: Optional[List[str]] = None,
@@ -102,7 +101,6 @@

      Args:
          query (str): The search query.
-         user (User): The user performing the search.
          top_k (int): The number of top results to retrieve.
          collections (Optional[List[str]]): List of collections to query.
          properties_to_project (Optional[List[str]]): List of properties to project.
@@ -139,12 +137,10 @@

      query_vector = (await vector_engine.embedding_engine.embed_text([query]))[0]

-     send_telemetry("cognee.brute_force_triplet_search EXECUTION STARTED", user.id)
-
      async def search_in_collection(collection_name: str):
          try:
              return await vector_engine.search(
-                 collection_name=collection_name, query_vector=query_vector, limit=0
+                 collection_name=collection_name, query_vector=query_vector, limit=None
              )
          except CollectionNotFoundError:
              return []
@@ -176,20 +172,14 @@

          results = await memory_fragment.calculate_top_triplet_importances(k=top_k)

-         send_telemetry("cognee.brute_force_triplet_search EXECUTION COMPLETED", user.id)
-
          return results

      except CollectionNotFoundError:
          return []
      except Exception as error:
          logger.error(
-             "Error during brute force search for user: %s, query: %s. Error: %s",
-             user.id,
+             "Error during brute force search for query: %s. Error: %s",
              query,
              error,
          )
-         send_telemetry(
-             "cognee.brute_force_triplet_search EXECUTION FAILED", user.id, {"error": str(error)}
-         )
          raise error
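
Call sites migrating from 0.3.4 should note two changes visible here: the user parameter (and its per-user telemetry) is gone, and vector searches now pass limit=None rather than the limit=0 sentinel to request all results. A migration sketch, against a configured cognee instance with data already cognified:

import asyncio
from cognee.modules.retrieval.utils.brute_force_triplet_search import (
    brute_force_triplet_search,
)

async def main():
    # 0.3.4.dev4 call site: await brute_force_triplet_search(query, user=user, top_k=5)
    triplets = await brute_force_triplet_search("quantum computers", top_k=5)
    print(triplets)

asyncio.run(main())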