cognee 0.3.4.dev4__py3-none-any.whl → 0.3.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (184)
  1. cognee/api/client.py +16 -7
  2. cognee/api/health.py +5 -9
  3. cognee/api/v1/add/add.py +3 -1
  4. cognee/api/v1/cognify/cognify.py +44 -7
  5. cognee/api/v1/permissions/routers/get_permissions_router.py +8 -4
  6. cognee/api/v1/search/search.py +3 -0
  7. cognee/api/v1/ui/__init__.py +1 -1
  8. cognee/api/v1/ui/ui.py +215 -150
  9. cognee/api/v1/update/__init__.py +1 -0
  10. cognee/api/v1/update/routers/__init__.py +1 -0
  11. cognee/api/v1/update/routers/get_update_router.py +90 -0
  12. cognee/api/v1/update/update.py +100 -0
  13. cognee/base_config.py +5 -2
  14. cognee/cli/_cognee.py +28 -10
  15. cognee/cli/commands/delete_command.py +34 -2
  16. cognee/eval_framework/corpus_builder/task_getters/get_default_tasks_by_indices.py +2 -2
  17. cognee/eval_framework/evaluation/direct_llm_eval_adapter.py +3 -2
  18. cognee/eval_framework/modal_eval_dashboard.py +9 -1
  19. cognee/infrastructure/databases/graph/config.py +9 -9
  20. cognee/infrastructure/databases/graph/get_graph_engine.py +4 -21
  21. cognee/infrastructure/databases/graph/kuzu/adapter.py +60 -9
  22. cognee/infrastructure/databases/hybrid/neptune_analytics/NeptuneAnalyticsAdapter.py +3 -3
  23. cognee/infrastructure/databases/relational/config.py +4 -4
  24. cognee/infrastructure/databases/relational/create_relational_engine.py +11 -3
  25. cognee/infrastructure/databases/vector/chromadb/ChromaDBAdapter.py +7 -3
  26. cognee/infrastructure/databases/vector/config.py +7 -7
  27. cognee/infrastructure/databases/vector/create_vector_engine.py +7 -15
  28. cognee/infrastructure/databases/vector/embeddings/EmbeddingEngine.py +9 -0
  29. cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +11 -0
  30. cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +19 -2
  31. cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +11 -0
  32. cognee/infrastructure/databases/vector/embeddings/config.py +8 -0
  33. cognee/infrastructure/databases/vector/embeddings/get_embedding_engine.py +5 -0
  34. cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +11 -10
  35. cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py +48 -38
  36. cognee/infrastructure/databases/vector/vector_db_interface.py +8 -4
  37. cognee/infrastructure/files/storage/S3FileStorage.py +15 -5
  38. cognee/infrastructure/files/storage/s3_config.py +1 -0
  39. cognee/infrastructure/files/utils/open_data_file.py +7 -14
  40. cognee/infrastructure/llm/LLMGateway.py +19 -117
  41. cognee/infrastructure/llm/config.py +28 -13
  42. cognee/infrastructure/llm/{structured_output_framework/litellm_instructor/extraction → extraction}/extract_categories.py +2 -1
  43. cognee/infrastructure/llm/{structured_output_framework/litellm_instructor/extraction → extraction}/extract_event_entities.py +3 -2
  44. cognee/infrastructure/llm/{structured_output_framework/litellm_instructor/extraction → extraction}/extract_summary.py +3 -2
  45. cognee/infrastructure/llm/{structured_output_framework/litellm_instructor/extraction → extraction}/knowledge_graph/extract_content_graph.py +2 -1
  46. cognee/infrastructure/llm/{structured_output_framework/litellm_instructor/extraction → extraction}/knowledge_graph/extract_event_graph.py +3 -2
  47. cognee/infrastructure/llm/prompts/read_query_prompt.py +3 -2
  48. cognee/infrastructure/llm/prompts/show_prompt.py +35 -0
  49. cognee/infrastructure/llm/prompts/test.txt +1 -0
  50. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/__init__.py +2 -2
  51. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/async_client.py +50 -397
  52. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/inlinedbaml.py +2 -3
  53. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/parser.py +8 -88
  54. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/runtime.py +78 -0
  55. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/stream_types.py +2 -99
  56. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/sync_client.py +49 -401
  57. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/type_builder.py +19 -882
  58. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/type_map.py +2 -34
  59. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/types.py +2 -107
  60. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/acreate_structured_output.baml +26 -0
  61. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/__init__.py +1 -2
  62. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/acreate_structured_output.py +76 -0
  63. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/create_dynamic_baml_type.py +122 -0
  64. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/generators.baml +3 -3
  65. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py +0 -32
  66. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py +107 -98
  67. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py +5 -6
  68. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py +5 -6
  69. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/llm_interface.py +0 -26
  70. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +17 -67
  71. cognee/infrastructure/llm/tokenizer/Gemini/adapter.py +8 -7
  72. cognee/infrastructure/llm/utils.py +4 -4
  73. cognee/infrastructure/loaders/LoaderEngine.py +5 -2
  74. cognee/infrastructure/loaders/external/__init__.py +7 -0
  75. cognee/infrastructure/loaders/external/advanced_pdf_loader.py +244 -0
  76. cognee/infrastructure/loaders/supported_loaders.py +7 -0
  77. cognee/modules/data/methods/create_authorized_dataset.py +9 -0
  78. cognee/modules/data/methods/get_authorized_dataset.py +1 -1
  79. cognee/modules/data/methods/get_authorized_dataset_by_name.py +11 -0
  80. cognee/modules/data/methods/get_deletion_counts.py +92 -0
  81. cognee/modules/graph/cognee_graph/CogneeGraph.py +1 -1
  82. cognee/modules/graph/utils/expand_with_nodes_and_edges.py +22 -8
  83. cognee/modules/graph/utils/retrieve_existing_edges.py +0 -2
  84. cognee/modules/ingestion/data_types/TextData.py +0 -1
  85. cognee/modules/observability/get_observe.py +14 -0
  86. cognee/modules/observability/observers.py +1 -0
  87. cognee/modules/ontology/base_ontology_resolver.py +42 -0
  88. cognee/modules/ontology/get_default_ontology_resolver.py +41 -0
  89. cognee/modules/ontology/matching_strategies.py +53 -0
  90. cognee/modules/ontology/models.py +20 -0
  91. cognee/modules/ontology/ontology_config.py +24 -0
  92. cognee/modules/ontology/ontology_env_config.py +45 -0
  93. cognee/modules/ontology/rdf_xml/{OntologyResolver.py → RDFLibOntologyResolver.py} +20 -28
  94. cognee/modules/pipelines/layers/resolve_authorized_user_dataset.py +21 -24
  95. cognee/modules/pipelines/layers/resolve_authorized_user_datasets.py +3 -3
  96. cognee/modules/retrieval/code_retriever.py +2 -1
  97. cognee/modules/retrieval/context_providers/TripletSearchContextProvider.py +1 -4
  98. cognee/modules/retrieval/graph_completion_cot_retriever.py +6 -5
  99. cognee/modules/retrieval/graph_completion_retriever.py +0 -3
  100. cognee/modules/retrieval/insights_retriever.py +1 -1
  101. cognee/modules/retrieval/jaccard_retrival.py +60 -0
  102. cognee/modules/retrieval/lexical_retriever.py +123 -0
  103. cognee/modules/retrieval/natural_language_retriever.py +2 -1
  104. cognee/modules/retrieval/temporal_retriever.py +3 -2
  105. cognee/modules/retrieval/utils/brute_force_triplet_search.py +2 -12
  106. cognee/modules/retrieval/utils/completion.py +4 -7
  107. cognee/modules/search/methods/get_search_type_tools.py +7 -0
  108. cognee/modules/search/methods/no_access_control_search.py +1 -1
  109. cognee/modules/search/methods/search.py +32 -13
  110. cognee/modules/search/types/SearchType.py +1 -0
  111. cognee/modules/users/permissions/methods/authorized_give_permission_on_datasets.py +12 -0
  112. cognee/modules/users/permissions/methods/check_permission_on_dataset.py +11 -0
  113. cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py +10 -0
  114. cognee/modules/users/permissions/methods/get_document_ids_for_user.py +10 -0
  115. cognee/modules/users/permissions/methods/get_principal.py +9 -0
  116. cognee/modules/users/permissions/methods/get_principal_datasets.py +11 -0
  117. cognee/modules/users/permissions/methods/get_role.py +10 -0
  118. cognee/modules/users/permissions/methods/get_specific_user_permission_datasets.py +3 -3
  119. cognee/modules/users/permissions/methods/get_tenant.py +9 -0
  120. cognee/modules/users/permissions/methods/give_default_permission_to_role.py +9 -0
  121. cognee/modules/users/permissions/methods/give_default_permission_to_tenant.py +9 -0
  122. cognee/modules/users/permissions/methods/give_default_permission_to_user.py +9 -0
  123. cognee/modules/users/permissions/methods/give_permission_on_dataset.py +10 -0
  124. cognee/modules/users/roles/methods/add_user_to_role.py +11 -0
  125. cognee/modules/users/roles/methods/create_role.py +12 -1
  126. cognee/modules/users/tenants/methods/add_user_to_tenant.py +12 -0
  127. cognee/modules/users/tenants/methods/create_tenant.py +12 -1
  128. cognee/modules/visualization/cognee_network_visualization.py +13 -9
  129. cognee/shared/data_models.py +0 -1
  130. cognee/shared/utils.py +0 -32
  131. cognee/tasks/chunk_naive_llm_classifier/chunk_naive_llm_classifier.py +2 -2
  132. cognee/tasks/codingagents/coding_rule_associations.py +3 -2
  133. cognee/tasks/entity_completion/entity_extractors/llm_entity_extractor.py +3 -2
  134. cognee/tasks/graph/cascade_extract/utils/extract_content_nodes_and_relationship_names.py +3 -2
  135. cognee/tasks/graph/cascade_extract/utils/extract_edge_triplets.py +3 -2
  136. cognee/tasks/graph/cascade_extract/utils/extract_nodes.py +3 -2
  137. cognee/tasks/graph/extract_graph_from_code.py +2 -2
  138. cognee/tasks/graph/extract_graph_from_data.py +55 -12
  139. cognee/tasks/graph/extract_graph_from_data_v2.py +16 -4
  140. cognee/tasks/ingestion/migrate_relational_database.py +132 -41
  141. cognee/tasks/ingestion/resolve_data_directories.py +4 -1
  142. cognee/tasks/schema/ingest_database_schema.py +134 -0
  143. cognee/tasks/schema/models.py +40 -0
  144. cognee/tasks/storage/index_data_points.py +1 -1
  145. cognee/tasks/storage/index_graph_edges.py +3 -1
  146. cognee/tasks/summarization/summarize_code.py +2 -2
  147. cognee/tasks/summarization/summarize_text.py +2 -2
  148. cognee/tasks/temporal_graph/enrich_events.py +2 -2
  149. cognee/tasks/temporal_graph/extract_events_and_entities.py +2 -2
  150. cognee/tests/cli_tests/cli_unit_tests/test_cli_commands.py +13 -4
  151. cognee/tests/cli_tests/cli_unit_tests/test_cli_edge_cases.py +13 -3
  152. cognee/tests/test_advanced_pdf_loader.py +141 -0
  153. cognee/tests/test_chromadb.py +40 -0
  154. cognee/tests/test_cognee_server_start.py +6 -1
  155. cognee/tests/test_data/Quantum_computers.txt +9 -0
  156. cognee/tests/test_lancedb.py +211 -0
  157. cognee/tests/test_pgvector.py +40 -0
  158. cognee/tests/test_relational_db_migration.py +76 -0
  159. cognee/tests/unit/infrastructure/databases/test_index_graph_edges.py +2 -1
  160. cognee/tests/unit/modules/ontology/test_ontology_adapter.py +330 -13
  161. cognee/tests/unit/modules/retrieval/graph_completion_retriever_context_extension_test.py +0 -4
  162. cognee/tests/unit/modules/retrieval/graph_completion_retriever_cot_test.py +0 -4
  163. cognee/tests/unit/modules/retrieval/graph_completion_retriever_test.py +0 -4
  164. {cognee-0.3.4.dev4.dist-info → cognee-0.3.6.dist-info}/METADATA +92 -96
  165. {cognee-0.3.4.dev4.dist-info → cognee-0.3.6.dist-info}/RECORD +172 -160
  166. cognee/infrastructure/data/utils/extract_keywords.py +0 -48
  167. cognee/infrastructure/databases/hybrid/falkordb/FalkorDBAdapter.py +0 -1227
  168. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extract_categories.baml +0 -109
  169. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extract_content_graph.baml +0 -343
  170. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/extract_categories.py +0 -0
  171. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/extract_summary.py +0 -89
  172. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/knowledge_graph/__init__.py +0 -0
  173. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/knowledge_graph/extract_content_graph.py +0 -44
  174. cognee/tasks/graph/infer_data_ontology.py +0 -309
  175. cognee/tests/test_falkordb.py +0 -174
  176. distributed/poetry.lock +0 -12238
  177. distributed/pyproject.toml +0 -186
  178. /cognee/infrastructure/llm/{structured_output_framework/litellm_instructor/extraction → extraction}/__init__.py +0 -0
  179. /cognee/infrastructure/llm/{structured_output_framework/litellm_instructor/extraction → extraction}/knowledge_graph/__init__.py +0 -0
  180. /cognee/infrastructure/llm/{structured_output_framework/litellm_instructor/extraction → extraction}/texts.json +0 -0
  181. {cognee-0.3.4.dev4.dist-info → cognee-0.3.6.dist-info}/WHEEL +0 -0
  182. {cognee-0.3.4.dev4.dist-info → cognee-0.3.6.dist-info}/entry_points.txt +0 -0
  183. {cognee-0.3.4.dev4.dist-info → cognee-0.3.6.dist-info}/licenses/LICENSE +0 -0
  184. {cognee-0.3.4.dev4.dist-info → cognee-0.3.6.dist-info}/licenses/NOTICE.md +0 -0
cognee/infrastructure/loaders/external/advanced_pdf_loader.py
@@ -0,0 +1,244 @@
+"""Advanced PDF loader leveraging unstructured for layout-aware extraction."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional
+import asyncio
+from cognee.infrastructure.files.storage import get_file_storage, get_storage_config
+from cognee.infrastructure.files.utils.get_file_metadata import get_file_metadata
+from cognee.infrastructure.loaders.LoaderInterface import LoaderInterface
+from cognee.shared.logging_utils import get_logger
+
+from cognee.infrastructure.loaders.external.pypdf_loader import PyPdfLoader
+
+logger = get_logger(__name__)
+
+try:
+    from unstructured.partition.pdf import partition_pdf
+except ImportError as e:
+    logger.info(
+        "unstructured[pdf] not installed, can't use AdvancedPdfLoader, will use PyPdfLoader instead."
+    )
+    raise ImportError from e
+
+
+@dataclass
+class _PageBuffer:
+    page_num: Optional[int]
+    segments: List[str]
+
+
+class AdvancedPdfLoader(LoaderInterface):
+    """
+    PDF loader using unstructured library.
+
+    Extracts text content, images, tables from PDF files page by page, providing
+    structured page information and handling PDF-specific errors.
+    """
+
+    @property
+    def supported_extensions(self) -> List[str]:
+        return ["pdf"]
+
+    @property
+    def supported_mime_types(self) -> List[str]:
+        return ["application/pdf"]
+
+    @property
+    def loader_name(self) -> str:
+        return "advanced_pdf_loader"
+
+    def can_handle(self, extension: str, mime_type: str) -> bool:
+        """Check if file can be handled by this loader."""
+        # Check file extension
+        if extension in self.supported_extensions and mime_type in self.supported_mime_types:
+            return True
+
+        return False
+
+    async def load(self, file_path: str, strategy: str = "auto", **kwargs: Any) -> str:
+        """Load PDF file using unstructured library. If Exception occurs, fallback to PyPDFLoader.
+
+        Args:
+            file_path: Path to the document file
+            strategy: Partitioning strategy ("auto", "fast", "hi_res", "ocr_only")
+            **kwargs: Additional arguments passed to unstructured partition
+
+        Returns:
+            LoaderResult with extracted text content and metadata
+
+        """
+        try:
+            logger.info(f"Processing PDF: {file_path}")
+
+            with open(file_path, "rb") as f:
+                file_metadata = await get_file_metadata(f)
+
+            # Name ingested file of current loader based on original file content hash
+            storage_file_name = "text_" + file_metadata["content_hash"] + ".txt"
+
+            # Set partitioning parameters
+            partition_kwargs: Dict[str, Any] = {
+                "filename": file_path,
+                "strategy": strategy,
+                "infer_table_structure": True,
+                "include_page_breaks": False,
+                "include_metadata": True,
+                **kwargs,
+            }
+            # Use partition to extract elements
+            elements = partition_pdf(**partition_kwargs)
+
+            # Process elements into text content
+            page_contents = self._format_elements_by_page(elements)
+
+            # Check if there is any content
+            if not page_contents:
+                logger.warning(
+                    "AdvancedPdfLoader returned no content. Falling back to PyPDF loader."
+                )
+                return await self._fallback(file_path, **kwargs)
+
+            # Combine all page outputs
+            full_content = "\n".join(page_contents)
+
+            # Store the content
+            storage_config = get_storage_config()
+            data_root_directory = storage_config["data_root_directory"]
+            storage = get_file_storage(data_root_directory)
+
+            full_file_path = await storage.store(storage_file_name, full_content)
+
+            return full_file_path
+
+        except Exception as exc:
+            logger.warning("Failed to process PDF with AdvancedPdfLoader: %s", exc)
+            return await self._fallback(file_path, **kwargs)
+
+    async def _fallback(self, file_path: str, **kwargs: Any) -> str:
+        logger.info("Falling back to PyPDF loader for %s", file_path)
+        fallback_loader = PyPdfLoader()
+        return await fallback_loader.load(file_path, **kwargs)
+
+    def _format_elements_by_page(self, elements: List[Any]) -> List[str]:
+        """Format elements by page."""
+        page_buffers: List[_PageBuffer] = []
+        current_buffer = _PageBuffer(page_num=None, segments=[])
+
+        for element in elements:
+            element_dict = self._safe_to_dict(element)
+            metadata = element_dict.get("metadata", {})
+            page_num = metadata.get("page_number")
+
+            if current_buffer.page_num != page_num:
+                if current_buffer.segments:
+                    page_buffers.append(current_buffer)
+                current_buffer = _PageBuffer(page_num=page_num, segments=[])
+
+            formatted = self._format_element(element_dict)
+
+            if formatted:
+                current_buffer.segments.append(formatted)
+
+        if current_buffer.segments:
+            page_buffers.append(current_buffer)
+
+        page_contents: List[str] = []
+        for buffer in page_buffers:
+            header = f"Page {buffer.page_num}:\n" if buffer.page_num is not None else "Page:"
+            content = header + "\n\n".join(buffer.segments) + "\n"
+            page_contents.append(str(content))
+        return page_contents
+
+    def _format_element(
+        self,
+        element: Dict[str, Any],
+    ) -> str:
+        """Format element."""
+        element_type = element.get("type")
+        text = self._clean_text(element.get("text", ""))
+        metadata = element.get("metadata", {})
+
+        if element_type.lower() == "table":
+            return self._format_table_element(element) or text
+
+        if element_type.lower() == "image":
+            description = text or self._format_image_element(metadata)
+            return description
+
+        # Ignore header and footer
+        if element_type.lower() in ["header", "footer"]:
+            pass
+
+        return text
+
+    def _format_table_element(self, element: Dict[str, Any]) -> str:
+        """Format table element."""
+        metadata = element.get("metadata", {})
+        text = self._clean_text(element.get("text", ""))
+        table_html = metadata.get("text_as_html")
+
+        if table_html:
+            return table_html.strip()
+
+        return text
+
+    def _format_image_element(self, metadata: Dict[str, Any]) -> str:
+        """Format image."""
+        placeholder = "[Image omitted]"
+        image_text = placeholder
+        coordinates = metadata.get("coordinates", {})
+        points = coordinates.get("points") if isinstance(coordinates, dict) else None
+        if points and isinstance(points, tuple) and len(points) == 4:
+            leftup = points[0]
+            rightdown = points[3]
+            if (
+                isinstance(leftup, tuple)
+                and isinstance(rightdown, tuple)
+                and len(leftup) == 2
+                and len(rightdown) == 2
+            ):
+                image_text = f"{placeholder} (bbox=({leftup[0]}, {leftup[1]}, {rightdown[0]}, {rightdown[1]}))"
+
+        layout_width = coordinates.get("layout_width")
+        layout_height = coordinates.get("layout_height")
+        system = coordinates.get("system")
+        if layout_width and layout_height and system:
+            image_text = (
+                image_text
+                + f", system={system}, layout_width={layout_width}, layout_height={layout_height}))"
+            )
+
+        return image_text
+
+    def _safe_to_dict(self, element: Any) -> Dict[str, Any]:
+        """Safe to dict."""
+        try:
+            if hasattr(element, "to_dict"):
+                return element.to_dict()
+        except Exception:
+            pass
+        fallback_type = getattr(element, "category", None)
+        if not fallback_type:
+            fallback_type = getattr(element, "__class__", type("", (), {})).__name__
+
+        return {
+            "type": fallback_type,
+            "text": getattr(element, "text", ""),
+            "metadata": getattr(element, "metadata", {}),
+        }
+
+    def _clean_text(self, value: Any) -> str:
+        if value is None:
+            return ""
+        return str(value).replace("\xa0", " ").strip()
+
+
+if __name__ == "__main__":
+    loader = AdvancedPdfLoader()
+    asyncio.run(
+        loader.load(
+            "/Users/xiaotao/work/cognee/cognee/infrastructure/loaders/external/attention_is_all_you_need.pdf"
+        )
+    )
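
For orientation, a minimal usage sketch of the loader added above (illustrative, not part of the diff): the PDF path is a placeholder, and a configured cognee storage backend plus the unstructured[pdf] extra are assumed.

import asyncio

from cognee.infrastructure.loaders.external import AdvancedPdfLoader


async def main() -> None:
    loader = AdvancedPdfLoader()
    if loader.can_handle("pdf", "application/pdf"):
        # load() returns the storage path of the extracted text; it falls back to
        # PyPdfLoader if unstructured raises or produces no content.
        stored_path = await loader.load("/path/to/example.pdf", strategy="fast")
        print(stored_path)


asyncio.run(main())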
cognee/infrastructure/loaders/supported_loaders.py
@@ -16,3 +16,10 @@ try:
     supported_loaders[UnstructuredLoader.loader_name] = UnstructuredLoader
 except ImportError:
     pass
+
+try:
+    from cognee.infrastructure.loaders.external import AdvancedPdfLoader
+
+    supported_loaders[AdvancedPdfLoader.loader_name] = AdvancedPdfLoader
+except ImportError:
+    pass
cognee/modules/data/methods/create_authorized_dataset.py
@@ -6,6 +6,15 @@ from .create_dataset import create_dataset
 
 
 async def create_authorized_dataset(dataset_name: str, user: User) -> Dataset:
+    """
+    Create a new dataset and give all permissions on this dataset to the given user.
+    Args:
+        dataset_name: Name of the dataset.
+        user: The user object.
+
+    Returns:
+        Dataset: The new authorized dataset.
+    """
     db_engine = get_relational_engine()
 
     async with db_engine.get_async_session() as session:
cognee/modules/data/methods/get_authorized_dataset.py
@@ -15,7 +15,7 @@ async def get_authorized_dataset(
     Get a specific dataset with permissions for a user.
 
     Args:
-        user_id (UUID): user id
+        user: User object
         dataset_id (UUID): dataset id
         permission_type (str): permission type(read, write, delete, share), default is read
 
cognee/modules/data/methods/get_authorized_dataset_by_name.py
@@ -11,6 +11,17 @@ from ..models import Dataset
 async def get_authorized_dataset_by_name(
     dataset_name: str, user: User, permission_type: str
 ) -> Optional[Dataset]:
+    """
+    Get a specific dataset with the given name, with permissions for a given user.
+
+    Args:
+        dataset_name: Name of the dataset.
+        user: User object.
+        permission_type (str): permission type(read, write, delete, share), default is read
+
+    Returns:
+        Optional[Dataset]: dataset with permissions
+    """
     authorized_datasets = await get_authorized_existing_datasets([], permission_type, user)
 
     return next((dataset for dataset in authorized_datasets if dataset.name == dataset_name), None)
cognee/modules/data/methods/get_deletion_counts.py
@@ -0,0 +1,92 @@
+from uuid import UUID
+from cognee.cli.exceptions import CliCommandException
+from cognee.infrastructure.databases.exceptions.exceptions import EntityNotFoundError
+from sqlalchemy import select
+from sqlalchemy.sql import func
+from cognee.infrastructure.databases.relational import get_relational_engine
+from cognee.modules.data.models import Dataset, Data, DatasetData
+from cognee.modules.users.models import User
+from cognee.modules.users.methods import get_user
+from dataclasses import dataclass
+
+
+@dataclass
+class DeletionCountsPreview:
+    datasets: int = 0
+    data_entries: int = 0
+    users: int = 0
+
+
+async def get_deletion_counts(
+    dataset_name: str = None, user_id: str = None, all_data: bool = False
+) -> DeletionCountsPreview:
+    """
+    Calculates the number of items that will be deleted based on the provided arguments.
+    """
+    counts = DeletionCountsPreview()
+    relational_engine = get_relational_engine()
+    async with relational_engine.get_async_session() as session:
+        if dataset_name:
+            # Find the dataset by name
+            dataset_result = await session.execute(
+                select(Dataset).where(Dataset.name == dataset_name)
+            )
+            dataset = dataset_result.scalar_one_or_none()
+
+            if dataset is None:
+                raise CliCommandException(
+                    f"No Dataset exists with the name {dataset_name}", error_code=1
+                )
+
+            # Count data entries linked to this dataset
+            count_query = (
+                select(func.count())
+                .select_from(DatasetData)
+                .where(DatasetData.dataset_id == dataset.id)
+            )
+            data_entry_count = (await session.execute(count_query)).scalar_one()
+            counts.users = 1
+            counts.datasets = 1
+            counts.entries = data_entry_count
+            return counts
+
+        elif all_data:
+            # Simplified logic: Get total counts directly from the tables.
+            counts.datasets = (
+                await session.execute(select(func.count()).select_from(Dataset))
+            ).scalar_one()
+            counts.entries = (
+                await session.execute(select(func.count()).select_from(Data))
+            ).scalar_one()
+            counts.users = (
+                await session.execute(select(func.count()).select_from(User))
+            ).scalar_one()
+            return counts
+
+        # Placeholder for user_id logic
+        elif user_id:
+            user = None
+            try:
+                user_uuid = UUID(user_id)
+                user = await get_user(user_uuid)
+            except (ValueError, EntityNotFoundError):
+                raise CliCommandException(f"No User exists with ID {user_id}", error_code=1)
+            counts.users = 1
+            # Find all datasets owned by this user
+            datasets_query = select(Dataset).where(Dataset.owner_id == user.id)
+            user_datasets = (await session.execute(datasets_query)).scalars().all()
+            dataset_count = len(user_datasets)
+            counts.datasets = dataset_count
+            if dataset_count > 0:
+                dataset_ids = [d.id for d in user_datasets]
+                # Count all data entries across all of the user's datasets
+                data_count_query = (
+                    select(func.count())
+                    .select_from(DatasetData)
+                    .where(DatasetData.dataset_id.in_(dataset_ids))
+                )
+                data_entry_count = (await session.execute(data_count_query)).scalar_one()
+                counts.entries = data_entry_count
+            else:
+                counts.entries = 0
+            return counts
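
A rough sketch of how this preview helper might be called (illustrative, not part of the diff): it assumes an initialized cognee relational database, and the dataset name is a placeholder. Note that the branches above assign counts.entries although the dataclass declares data_entries, so the entry total lands on a dynamically added attribute.

import asyncio

from cognee.modules.data.methods.get_deletion_counts import get_deletion_counts


async def preview_deletion() -> None:
    counts = await get_deletion_counts(dataset_name="my_dataset")  # placeholder name
    print(f"datasets={counts.datasets}, users={counts.users}, entries={getattr(counts, 'entries', 0)}")


asyncio.run(preview_deletion())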
cognee/modules/graph/cognee_graph/CogneeGraph.py
@@ -161,7 +161,7 @@ class CogneeGraph(CogneeAbstractGraph):
         edge_distances = await vector_engine.search(
             collection_name="EdgeType_relationship_name",
             query_vector=query_vector,
-            limit=0,
+            limit=None,
         )
         projection_time = time.time() - start_time
         logger.info(
cognee/modules/graph/utils/expand_with_nodes_and_edges.py
@@ -7,8 +7,14 @@ from cognee.modules.engine.utils import (
     generate_node_id,
     generate_node_name,
 )
+from cognee.modules.ontology.base_ontology_resolver import BaseOntologyResolver
+from cognee.modules.ontology.ontology_env_config import get_ontology_env_config
 from cognee.shared.data_models import KnowledgeGraph
-from cognee.modules.ontology.rdf_xml.OntologyResolver import OntologyResolver
+from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver
+from cognee.modules.ontology.get_default_ontology_resolver import (
+    get_default_ontology_resolver,
+    get_ontology_resolver_from_env,
+)
 
 
 def _create_node_key(node_id: str, category: str) -> str:
@@ -83,7 +89,7 @@ def _process_ontology_edges(
 
 def _create_type_node(
     node_type: str,
-    ontology_resolver: OntologyResolver,
+    ontology_resolver: RDFLibOntologyResolver,
     added_nodes_map: dict,
     added_ontology_nodes_map: dict,
     name_mapping: dict,
@@ -141,7 +147,7 @@ def _create_entity_node(
     node_name: str,
     node_description: str,
     type_node: EntityType,
-    ontology_resolver: OntologyResolver,
+    ontology_resolver: RDFLibOntologyResolver,
     added_nodes_map: dict,
     added_ontology_nodes_map: dict,
     name_mapping: dict,
@@ -198,7 +204,7 @@ def _create_entity_node(
 def _process_graph_nodes(
     data_chunk: DocumentChunk,
     graph: KnowledgeGraph,
-    ontology_resolver: OntologyResolver,
+    ontology_resolver: RDFLibOntologyResolver,
     added_nodes_map: dict,
     added_ontology_nodes_map: dict,
     name_mapping: dict,
@@ -277,7 +283,7 @@ def _process_graph_edges(
 def expand_with_nodes_and_edges(
     data_chunks: list[DocumentChunk],
     chunk_graphs: list[KnowledgeGraph],
-    ontology_resolver: OntologyResolver = None,
+    ontology_resolver: BaseOntologyResolver = None,
     existing_edges_map: Optional[dict[str, bool]] = None,
 ):
     """
@@ -296,8 +302,8 @@
         chunk_graphs (list[KnowledgeGraph]): List of knowledge graphs corresponding to each
             data chunk. Each graph contains nodes (entities) and edges (relationships) extracted
             from the chunk content.
-        ontology_resolver (OntologyResolver, optional): Resolver for validating entities and
-            types against an ontology. If None, a default OntologyResolver is created.
+        ontology_resolver (BaseOntologyResolver, optional): Resolver for validating entities and
+            types against an ontology. If None, a default RDFLibOntologyResolver is created.
             Defaults to None.
         existing_edges_map (dict[str, bool], optional): Mapping of existing edge keys to prevent
             duplicate edge creation. Keys are formatted as "{source_id}_{target_id}_{relation}".
@@ -320,7 +326,15 @@
     existing_edges_map = {}
 
     if ontology_resolver is None:
-        ontology_resolver = OntologyResolver()
+        ontology_config = get_ontology_env_config()
+        if (
+            ontology_config.ontology_file_path
+            and ontology_config.ontology_resolver
+            and ontology_config.matching_strategy
+        ):
+            ontology_resolver = get_ontology_resolver_from_env(**ontology_config.to_dict())
+        else:
+            ontology_resolver = get_default_ontology_resolver()
 
     added_nodes_map = {}
     added_ontology_nodes_map = {}
cognee/modules/graph/utils/retrieve_existing_edges.py
@@ -23,8 +23,6 @@ async def retrieve_existing_edges(
         chunk_graphs (list[KnowledgeGraph]): List of knowledge graphs corresponding to each
             data chunk. Each graph contains nodes (entities) and edges (relationships) that
            were extracted from the chunk content.
-        graph_engine (GraphDBInterface): Interface to the graph database that will be queried
-            to check for existing edges. Must implement the has_edges() method.
 
     Returns:
         dict[str, bool]: A mapping of edge keys to boolean values indicating existence.
cognee/modules/ingestion/data_types/TextData.py
@@ -1,7 +1,6 @@
 from typing import BinaryIO
 from contextlib import asynccontextmanager
 import hashlib
-from cognee.infrastructure.data.utils.extract_keywords import extract_keywords
 from .IngestionData import IngestionData
 
 
cognee/modules/observability/get_observe.py
@@ -9,3 +9,17 @@ def get_observe():
         from langfuse.decorators import observe
 
         return observe
+    elif monitoring == Observer.NONE:
+        # Return a no-op decorator that handles keyword arguments
+        def no_op_decorator(*args, **kwargs):
+            if len(args) == 1 and callable(args[0]) and not kwargs:
+                # Direct decoration: @observe
+                return args[0]
+            else:
+                # Parameterized decoration: @observe(as_type="generation")
+                def decorator(func):
+                    return func
+
+                return decorator
+
+        return no_op_decorator
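
The Observer.NONE branch above boils down to a decorator that works both bare and parameterized; a standalone sketch of that pattern, with names local to this example:

def no_op_observe(*args, **kwargs):
    if len(args) == 1 and callable(args[0]) and not kwargs:
        # Bare usage: @no_op_observe
        return args[0]

    # Parameterized usage: @no_op_observe(as_type="generation")
    def decorator(func):
        return func

    return decorator


@no_op_observe
def plain():
    return "plain"


@no_op_observe(as_type="generation")
def parameterized():
    return "parameterized"


assert plain() == "plain" and parameterized() == "parameterized"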
cognee/modules/observability/observers.py
@@ -4,6 +4,7 @@ from enum import Enum
 class Observer(str, Enum):
     """Monitoring tools"""
 
+    NONE = "none"
     LANGFUSE = "langfuse"
     LLMLITE = "llmlite"
     LANGSMITH = "langsmith"
cognee/modules/ontology/base_ontology_resolver.py
@@ -0,0 +1,42 @@
+from abc import ABC, abstractmethod
+from typing import List, Tuple, Optional
+
+from cognee.modules.ontology.models import AttachedOntologyNode
+from cognee.modules.ontology.matching_strategies import MatchingStrategy, FuzzyMatchingStrategy
+
+
+class BaseOntologyResolver(ABC):
+    """Abstract base class for ontology resolvers."""
+
+    def __init__(self, matching_strategy: Optional[MatchingStrategy] = None):
+        """Initialize the ontology resolver with a matching strategy.
+
+        Args:
+            matching_strategy: The strategy to use for entity matching.
+                Defaults to FuzzyMatchingStrategy if None.
+        """
+        self.matching_strategy = matching_strategy or FuzzyMatchingStrategy()
+
+    @abstractmethod
+    def build_lookup(self) -> None:
+        """Build the lookup dictionary for ontology entities."""
+        pass
+
+    @abstractmethod
+    def refresh_lookup(self) -> None:
+        """Refresh the lookup dictionary."""
+        pass
+
+    @abstractmethod
+    def find_closest_match(self, name: str, category: str) -> Optional[str]:
+        """Find the closest match for a given name in the specified category."""
+        pass
+
+    @abstractmethod
+    def get_subgraph(
+        self, node_name: str, node_type: str = "individuals", directed: bool = True
+    ) -> Tuple[
+        List[AttachedOntologyNode], List[Tuple[str, str, str]], Optional[AttachedOntologyNode]
+    ]:
+        """Get a subgraph for the given node."""
+        pass
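
To show how the abstract interface above can be satisfied, a minimal sketch of a custom resolver; the class name and in-memory behaviour are invented for illustration and are not part of the package.

from typing import List, Optional

from cognee.modules.ontology.base_ontology_resolver import BaseOntologyResolver


class InMemoryOntologyResolver(BaseOntologyResolver):
    """Toy resolver that matches against a fixed list of individuals."""

    def __init__(self, individuals: List[str], matching_strategy=None):
        super().__init__(matching_strategy)
        self.individuals = individuals
        self.lookup: List[str] = []

    def build_lookup(self) -> None:
        self.lookup = [name.lower() for name in self.individuals]

    def refresh_lookup(self) -> None:
        self.build_lookup()

    def find_closest_match(self, name: str, category: str) -> Optional[str]:
        # Delegates to the configured MatchingStrategy (FuzzyMatchingStrategy by default).
        return self.matching_strategy.find_match(name.lower(), self.lookup)

    def get_subgraph(self, node_name: str, node_type: str = "individuals", directed: bool = True):
        # No backing ontology graph in this toy example, so return an empty subgraph.
        return [], [], None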
cognee/modules/ontology/get_default_ontology_resolver.py
@@ -0,0 +1,41 @@
+from cognee.modules.ontology.base_ontology_resolver import BaseOntologyResolver
+from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver
+from cognee.modules.ontology.matching_strategies import FuzzyMatchingStrategy
+
+
+def get_default_ontology_resolver() -> BaseOntologyResolver:
+    return RDFLibOntologyResolver(ontology_file=None, matching_strategy=FuzzyMatchingStrategy())
+
+
+def get_ontology_resolver_from_env(
+    ontology_resolver: str = "", matching_strategy: str = "", ontology_file_path: str = ""
+) -> BaseOntologyResolver:
+    """
+    Create and return an ontology resolver instance based on environment parameters.
+
+    Currently, this function supports only the RDFLib-based ontology resolver
+    with a fuzzy matching strategy.
+
+    Args:
+        ontology_resolver (str): The ontology resolver type to use.
+            Supported value: "rdflib".
+        matching_strategy (str): The matching strategy to apply.
+            Supported value: "fuzzy".
+        ontology_file_path (str): Path to the ontology file required for the resolver.
+
+    Returns:
+        BaseOntologyResolver: An instance of the requested ontology resolver.
+
+    Raises:
+        EnvironmentError: If the provided resolver or strategy is unsupported,
+            or if required parameters are missing.
+    """
+    if ontology_resolver == "rdflib" and matching_strategy == "fuzzy" and ontology_file_path:
+        return RDFLibOntologyResolver(
+            matching_strategy=FuzzyMatchingStrategy(), ontology_file=ontology_file_path
+        )
+    else:
+        raise EnvironmentError(
+            f"Unsupported ontology resolver: {ontology_resolver}. "
+            f"Supported resolvers are: RdfLib with FuzzyMatchingStrategy."
+        )
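
A small sketch of how these factories might be used (illustrative, not part of the diff; the ontology file path is a placeholder):

from cognee.modules.ontology.get_default_ontology_resolver import (
    get_default_ontology_resolver,
    get_ontology_resolver_from_env,
)

# Default: RDFLib resolver with fuzzy matching and no ontology file attached.
resolver = get_default_ontology_resolver()

# Explicit selection, mirroring what expand_with_nodes_and_edges derives from the env config:
env_resolver = get_ontology_resolver_from_env(
    ontology_resolver="rdflib",
    matching_strategy="fuzzy",
    ontology_file_path="ontology.owl",  # placeholder path
)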
cognee/modules/ontology/matching_strategies.py
@@ -0,0 +1,53 @@
+import difflib
+from abc import ABC, abstractmethod
+from typing import List, Optional
+
+
+class MatchingStrategy(ABC):
+    """Abstract base class for ontology entity matching strategies."""
+
+    @abstractmethod
+    def find_match(self, name: str, candidates: List[str]) -> Optional[str]:
+        """Find the best match for a given name from a list of candidates.
+
+        Args:
+            name: The name to match
+            candidates: List of candidate names to match against
+
+        Returns:
+            The best matching candidate name, or None if no match found
+        """
+        pass
+
+
+class FuzzyMatchingStrategy(MatchingStrategy):
+    """Fuzzy matching strategy using difflib for approximate string matching."""
+
+    def __init__(self, cutoff: float = 0.8):
+        """Initialize fuzzy matching strategy.
+
+        Args:
+            cutoff: Minimum similarity score (0.0 to 1.0) for a match to be considered valid
+        """
+        self.cutoff = cutoff
+
+    def find_match(self, name: str, candidates: List[str]) -> Optional[str]:
+        """Find the closest fuzzy match for a given name.
+
+        Args:
+            name: The normalized name to match
+            candidates: List of normalized candidate names
+
+        Returns:
+            The best matching candidate name, or None if no match meets the cutoff
+        """
+        if not candidates:
+            return None
+
+        # Check for exact match first
+        if name in candidates:
+            return name
+
+        # Find fuzzy match
+        best_match = difflib.get_close_matches(name, candidates, n=1, cutoff=self.cutoff)
+        return best_match[0] if best_match else None
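
A quick usage sketch of the fuzzy strategy above (illustrative; the candidate list is made up for the example):

from cognee.modules.ontology.matching_strategies import FuzzyMatchingStrategy

strategy = FuzzyMatchingStrategy(cutoff=0.8)
# Exact matches win outright; otherwise difflib.get_close_matches returns the closest
# candidate at or above the cutoff, or None when nothing qualifies.
print(strategy.find_match("quantum_computer", ["quantum_computers", "classical_computer"]))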