cognee 0.2.1.dev7__py3-none-any.whl → 0.2.2.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (223)
  1. cognee/api/client.py +44 -4
  2. cognee/api/health.py +332 -0
  3. cognee/api/v1/add/add.py +5 -2
  4. cognee/api/v1/add/routers/get_add_router.py +3 -0
  5. cognee/api/v1/cognify/code_graph_pipeline.py +3 -1
  6. cognee/api/v1/cognify/cognify.py +8 -0
  7. cognee/api/v1/cognify/routers/get_cognify_router.py +8 -1
  8. cognee/api/v1/config/config.py +3 -1
  9. cognee/api/v1/datasets/routers/get_datasets_router.py +2 -8
  10. cognee/api/v1/delete/delete.py +16 -12
  11. cognee/api/v1/responses/routers/get_responses_router.py +3 -1
  12. cognee/api/v1/search/search.py +10 -0
  13. cognee/api/v1/settings/routers/get_settings_router.py +0 -2
  14. cognee/base_config.py +1 -0
  15. cognee/eval_framework/evaluation/direct_llm_eval_adapter.py +5 -6
  16. cognee/infrastructure/databases/graph/config.py +2 -0
  17. cognee/infrastructure/databases/graph/get_graph_engine.py +58 -12
  18. cognee/infrastructure/databases/graph/graph_db_interface.py +15 -10
  19. cognee/infrastructure/databases/graph/kuzu/adapter.py +43 -16
  20. cognee/infrastructure/databases/graph/kuzu/kuzu_migrate.py +281 -0
  21. cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +151 -77
  22. cognee/infrastructure/databases/graph/neptune_driver/__init__.py +15 -0
  23. cognee/infrastructure/databases/graph/neptune_driver/adapter.py +1427 -0
  24. cognee/infrastructure/databases/graph/neptune_driver/exceptions.py +115 -0
  25. cognee/infrastructure/databases/graph/neptune_driver/neptune_utils.py +224 -0
  26. cognee/infrastructure/databases/graph/networkx/adapter.py +3 -3
  27. cognee/infrastructure/databases/hybrid/neptune_analytics/NeptuneAnalyticsAdapter.py +449 -0
  28. cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +11 -3
  29. cognee/infrastructure/databases/vector/chromadb/ChromaDBAdapter.py +8 -3
  30. cognee/infrastructure/databases/vector/create_vector_engine.py +31 -23
  31. cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +3 -1
  32. cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +21 -6
  33. cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +4 -3
  34. cognee/infrastructure/databases/vector/embeddings/get_embedding_engine.py +3 -1
  35. cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +22 -16
  36. cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py +36 -34
  37. cognee/infrastructure/databases/vector/vector_db_interface.py +78 -7
  38. cognee/infrastructure/files/utils/get_data_file_path.py +39 -0
  39. cognee/infrastructure/files/utils/guess_file_type.py +2 -2
  40. cognee/infrastructure/files/utils/open_data_file.py +4 -23
  41. cognee/infrastructure/llm/LLMGateway.py +137 -0
  42. cognee/infrastructure/llm/__init__.py +14 -4
  43. cognee/infrastructure/llm/config.py +29 -1
  44. cognee/infrastructure/llm/prompts/answer_hotpot_question.txt +1 -1
  45. cognee/infrastructure/llm/prompts/answer_hotpot_using_cognee_search.txt +1 -1
  46. cognee/infrastructure/llm/prompts/answer_simple_question.txt +1 -1
  47. cognee/infrastructure/llm/prompts/answer_simple_question_restricted.txt +1 -1
  48. cognee/infrastructure/llm/prompts/categorize_categories.txt +1 -1
  49. cognee/infrastructure/llm/prompts/classify_content.txt +1 -1
  50. cognee/infrastructure/llm/prompts/context_for_question.txt +1 -1
  51. cognee/infrastructure/llm/prompts/graph_context_for_question.txt +1 -1
  52. cognee/infrastructure/llm/prompts/natural_language_retriever_system.txt +1 -1
  53. cognee/infrastructure/llm/prompts/patch_gen_instructions.txt +1 -1
  54. cognee/infrastructure/llm/prompts/search_type_selector_prompt.txt +130 -0
  55. cognee/infrastructure/llm/prompts/summarize_code.txt +2 -2
  56. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/__init__.py +57 -0
  57. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/async_client.py +533 -0
  58. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/config.py +94 -0
  59. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/globals.py +37 -0
  60. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/inlinedbaml.py +21 -0
  61. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/parser.py +131 -0
  62. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/runtime.py +266 -0
  63. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/stream_types.py +137 -0
  64. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/sync_client.py +550 -0
  65. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/tracing.py +26 -0
  66. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/type_builder.py +962 -0
  67. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/type_map.py +52 -0
  68. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/types.py +166 -0
  69. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extract_categories.baml +109 -0
  70. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extract_content_graph.baml +343 -0
  71. cognee/{modules/data → infrastructure/llm/structured_output_framework/baml/baml_src}/extraction/__init__.py +1 -0
  72. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/extract_summary.py +89 -0
  73. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/knowledge_graph/extract_content_graph.py +33 -0
  74. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/generators.baml +18 -0
  75. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/__init__.py +3 -0
  76. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/extract_categories.py +12 -0
  77. cognee/{modules/data → infrastructure/llm/structured_output_framework/litellm_instructor}/extraction/extract_summary.py +16 -7
  78. cognee/{modules/data → infrastructure/llm/structured_output_framework/litellm_instructor}/extraction/knowledge_graph/extract_content_graph.py +7 -6
  79. cognee/infrastructure/llm/{anthropic → structured_output_framework/litellm_instructor/llm/anthropic}/adapter.py +10 -4
  80. cognee/infrastructure/llm/{gemini → structured_output_framework/litellm_instructor/llm/gemini}/adapter.py +6 -5
  81. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/__init__.py +0 -0
  82. cognee/infrastructure/llm/{generic_llm_api → structured_output_framework/litellm_instructor/llm/generic_llm_api}/adapter.py +7 -3
  83. cognee/infrastructure/llm/{get_llm_client.py → structured_output_framework/litellm_instructor/llm/get_llm_client.py} +18 -6
  84. cognee/infrastructure/llm/{llm_interface.py → structured_output_framework/litellm_instructor/llm/llm_interface.py} +2 -2
  85. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/__init__.py +0 -0
  86. cognee/infrastructure/llm/{ollama → structured_output_framework/litellm_instructor/llm/ollama}/adapter.py +4 -2
  87. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/__init__.py +0 -0
  88. cognee/infrastructure/llm/{openai → structured_output_framework/litellm_instructor/llm/openai}/adapter.py +6 -4
  89. cognee/infrastructure/llm/{rate_limiter.py → structured_output_framework/litellm_instructor/llm/rate_limiter.py} +0 -5
  90. cognee/infrastructure/llm/tokenizer/Gemini/adapter.py +4 -2
  91. cognee/infrastructure/llm/tokenizer/TikToken/adapter.py +7 -3
  92. cognee/infrastructure/llm/tokenizer/__init__.py +4 -0
  93. cognee/infrastructure/llm/utils.py +3 -1
  94. cognee/infrastructure/loaders/LoaderEngine.py +156 -0
  95. cognee/infrastructure/loaders/LoaderInterface.py +73 -0
  96. cognee/infrastructure/loaders/__init__.py +18 -0
  97. cognee/infrastructure/loaders/core/__init__.py +7 -0
  98. cognee/infrastructure/loaders/core/audio_loader.py +98 -0
  99. cognee/infrastructure/loaders/core/image_loader.py +114 -0
  100. cognee/infrastructure/loaders/core/text_loader.py +90 -0
  101. cognee/infrastructure/loaders/create_loader_engine.py +32 -0
  102. cognee/infrastructure/loaders/external/__init__.py +22 -0
  103. cognee/infrastructure/loaders/external/pypdf_loader.py +96 -0
  104. cognee/infrastructure/loaders/external/unstructured_loader.py +127 -0
  105. cognee/infrastructure/loaders/get_loader_engine.py +18 -0
  106. cognee/infrastructure/loaders/supported_loaders.py +18 -0
  107. cognee/infrastructure/loaders/use_loader.py +21 -0
  108. cognee/infrastructure/loaders/utils/__init__.py +0 -0
  109. cognee/modules/data/methods/__init__.py +1 -0
  110. cognee/modules/data/methods/get_authorized_dataset.py +23 -0
  111. cognee/modules/data/models/Data.py +13 -3
  112. cognee/modules/data/processing/document_types/AudioDocument.py +2 -2
  113. cognee/modules/data/processing/document_types/ImageDocument.py +2 -2
  114. cognee/modules/data/processing/document_types/PdfDocument.py +4 -11
  115. cognee/modules/data/processing/document_types/UnstructuredDocument.py +2 -5
  116. cognee/modules/engine/utils/generate_edge_id.py +5 -0
  117. cognee/modules/graph/cognee_graph/CogneeGraph.py +45 -35
  118. cognee/modules/graph/methods/get_formatted_graph_data.py +8 -2
  119. cognee/modules/graph/utils/get_graph_from_model.py +93 -101
  120. cognee/modules/ingestion/data_types/TextData.py +8 -2
  121. cognee/modules/ingestion/save_data_to_file.py +1 -1
  122. cognee/modules/pipelines/exceptions/__init__.py +1 -0
  123. cognee/modules/pipelines/exceptions/exceptions.py +12 -0
  124. cognee/modules/pipelines/models/DataItemStatus.py +5 -0
  125. cognee/modules/pipelines/models/PipelineRunInfo.py +6 -0
  126. cognee/modules/pipelines/models/__init__.py +1 -0
  127. cognee/modules/pipelines/operations/pipeline.py +10 -2
  128. cognee/modules/pipelines/operations/run_tasks.py +252 -20
  129. cognee/modules/pipelines/operations/run_tasks_distributed.py +1 -1
  130. cognee/modules/retrieval/chunks_retriever.py +23 -1
  131. cognee/modules/retrieval/code_retriever.py +66 -9
  132. cognee/modules/retrieval/completion_retriever.py +11 -9
  133. cognee/modules/retrieval/context_providers/TripletSearchContextProvider.py +0 -2
  134. cognee/modules/retrieval/graph_completion_context_extension_retriever.py +0 -2
  135. cognee/modules/retrieval/graph_completion_cot_retriever.py +8 -9
  136. cognee/modules/retrieval/graph_completion_retriever.py +1 -1
  137. cognee/modules/retrieval/insights_retriever.py +4 -0
  138. cognee/modules/retrieval/natural_language_retriever.py +9 -15
  139. cognee/modules/retrieval/summaries_retriever.py +23 -1
  140. cognee/modules/retrieval/utils/brute_force_triplet_search.py +23 -4
  141. cognee/modules/retrieval/utils/completion.py +6 -9
  142. cognee/modules/retrieval/utils/description_to_codepart_search.py +2 -3
  143. cognee/modules/search/methods/search.py +5 -1
  144. cognee/modules/search/operations/__init__.py +1 -0
  145. cognee/modules/search/operations/select_search_type.py +42 -0
  146. cognee/modules/search/types/SearchType.py +1 -0
  147. cognee/modules/settings/get_settings.py +0 -8
  148. cognee/modules/settings/save_vector_db_config.py +1 -1
  149. cognee/shared/data_models.py +3 -1
  150. cognee/shared/logging_utils.py +0 -5
  151. cognee/tasks/chunk_naive_llm_classifier/chunk_naive_llm_classifier.py +2 -2
  152. cognee/tasks/documents/extract_chunks_from_documents.py +10 -12
  153. cognee/tasks/entity_completion/entity_extractors/llm_entity_extractor.py +4 -6
  154. cognee/tasks/graph/cascade_extract/utils/extract_content_nodes_and_relationship_names.py +4 -6
  155. cognee/tasks/graph/cascade_extract/utils/extract_edge_triplets.py +6 -7
  156. cognee/tasks/graph/cascade_extract/utils/extract_nodes.py +4 -7
  157. cognee/tasks/graph/extract_graph_from_code.py +3 -2
  158. cognee/tasks/graph/extract_graph_from_data.py +4 -3
  159. cognee/tasks/graph/infer_data_ontology.py +5 -6
  160. cognee/tasks/ingestion/data_item_to_text_file.py +79 -0
  161. cognee/tasks/ingestion/ingest_data.py +91 -61
  162. cognee/tasks/ingestion/resolve_data_directories.py +3 -0
  163. cognee/tasks/repo_processor/get_repo_file_dependencies.py +3 -0
  164. cognee/tasks/storage/index_data_points.py +1 -1
  165. cognee/tasks/storage/index_graph_edges.py +4 -1
  166. cognee/tasks/summarization/summarize_code.py +2 -3
  167. cognee/tasks/summarization/summarize_text.py +3 -2
  168. cognee/tests/test_cognee_server_start.py +12 -7
  169. cognee/tests/test_deduplication.py +2 -2
  170. cognee/tests/test_deletion.py +58 -17
  171. cognee/tests/test_graph_visualization_permissions.py +161 -0
  172. cognee/tests/test_neptune_analytics_graph.py +309 -0
  173. cognee/tests/test_neptune_analytics_hybrid.py +176 -0
  174. cognee/tests/{test_weaviate.py → test_neptune_analytics_vector.py} +86 -11
  175. cognee/tests/test_pgvector.py +5 -5
  176. cognee/tests/test_s3.py +1 -6
  177. cognee/tests/unit/infrastructure/databases/test_rate_limiter.py +11 -10
  178. cognee/tests/unit/infrastructure/databases/vector/__init__.py +0 -0
  179. cognee/tests/unit/infrastructure/mock_embedding_engine.py +1 -1
  180. cognee/tests/unit/infrastructure/test_embedding_rate_limiting_realistic.py +5 -5
  181. cognee/tests/unit/infrastructure/test_rate_limiting_realistic.py +6 -4
  182. cognee/tests/unit/infrastructure/test_rate_limiting_retry.py +1 -1
  183. cognee/tests/unit/interfaces/graph/get_graph_from_model_unit_test.py +61 -3
  184. cognee/tests/unit/modules/retrieval/graph_completion_retriever_test.py +84 -9
  185. cognee/tests/unit/modules/search/search_methods_test.py +55 -0
  186. {cognee-0.2.1.dev7.dist-info → cognee-0.2.2.dev1.dist-info}/METADATA +13 -9
  187. {cognee-0.2.1.dev7.dist-info → cognee-0.2.2.dev1.dist-info}/RECORD +203 -164
  188. cognee/infrastructure/databases/vector/pinecone/adapter.py +0 -8
  189. cognee/infrastructure/databases/vector/qdrant/QDrantAdapter.py +0 -514
  190. cognee/infrastructure/databases/vector/qdrant/__init__.py +0 -2
  191. cognee/infrastructure/databases/vector/weaviate_db/WeaviateAdapter.py +0 -527
  192. cognee/infrastructure/databases/vector/weaviate_db/__init__.py +0 -1
  193. cognee/modules/data/extraction/extract_categories.py +0 -14
  194. cognee/tests/test_qdrant.py +0 -99
  195. distributed/Dockerfile +0 -34
  196. distributed/app.py +0 -4
  197. distributed/entrypoint.py +0 -71
  198. distributed/entrypoint.sh +0 -5
  199. distributed/modal_image.py +0 -11
  200. distributed/queues.py +0 -5
  201. distributed/tasks/queued_add_data_points.py +0 -13
  202. distributed/tasks/queued_add_edges.py +0 -13
  203. distributed/tasks/queued_add_nodes.py +0 -13
  204. distributed/test.py +0 -28
  205. distributed/utils.py +0 -19
  206. distributed/workers/data_point_saving_worker.py +0 -93
  207. distributed/workers/graph_saving_worker.py +0 -104
  208. /cognee/infrastructure/databases/{graph/memgraph → hybrid/neptune_analytics}/__init__.py +0 -0
  209. /cognee/infrastructure/{llm → databases/vector/embeddings}/embedding_rate_limiter.py +0 -0
  210. /cognee/infrastructure/{databases/vector/pinecone → llm/structured_output_framework}/__init__.py +0 -0
  211. /cognee/infrastructure/llm/{anthropic → structured_output_framework/baml/baml_src}/__init__.py +0 -0
  212. /cognee/infrastructure/llm/{gemini/__init__.py → structured_output_framework/baml/baml_src/extraction/extract_categories.py} +0 -0
  213. /cognee/infrastructure/llm/{generic_llm_api → structured_output_framework/baml/baml_src/extraction/knowledge_graph}/__init__.py +0 -0
  214. /cognee/infrastructure/llm/{ollama → structured_output_framework/litellm_instructor}/__init__.py +0 -0
  215. /cognee/{modules/data → infrastructure/llm/structured_output_framework/litellm_instructor}/extraction/knowledge_graph/__init__.py +0 -0
  216. /cognee/{modules/data → infrastructure/llm/structured_output_framework/litellm_instructor}/extraction/texts.json +0 -0
  217. /cognee/infrastructure/llm/{openai → structured_output_framework/litellm_instructor/llm}/__init__.py +0 -0
  218. {distributed → cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic}/__init__.py +0 -0
  219. {distributed/tasks → cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini}/__init__.py +0 -0
  220. /cognee/modules/data/{extraction/knowledge_graph → methods}/add_model_class_to_graph.py +0 -0
  221. {cognee-0.2.1.dev7.dist-info → cognee-0.2.2.dev1.dist-info}/WHEEL +0 -0
  222. {cognee-0.2.1.dev7.dist-info → cognee-0.2.2.dev1.dist-info}/licenses/LICENSE +0 -0
  223. {cognee-0.2.1.dev7.dist-info → cognee-0.2.2.dev1.dist-info}/licenses/NOTICE.md +0 -0
cognee/infrastructure/loaders/external/pypdf_loader.py
@@ -0,0 +1,96 @@
+ from typing import List
+ from cognee.infrastructure.loaders.LoaderInterface import LoaderInterface
+ from cognee.shared.logging_utils import get_logger
+ from cognee.infrastructure.files.storage import get_file_storage, get_storage_config
+ from cognee.infrastructure.files.utils.get_file_metadata import get_file_metadata
+
+ logger = get_logger(__name__)
+
+
+ class PyPdfLoader(LoaderInterface):
+     """
+     PDF loader using pypdf library.
+
+     Extracts text content from PDF files page by page, providing
+     structured page information and handling PDF-specific errors.
+     """
+
+     @property
+     def supported_extensions(self) -> List[str]:
+         return ["pdf"]
+
+     @property
+     def supported_mime_types(self) -> List[str]:
+         return ["application/pdf"]
+
+     @property
+     def loader_name(self) -> str:
+         return "pypdf_loader"
+
+     def can_handle(self, extension: str, mime_type: str) -> bool:
+         """Check if file can be handled by this loader."""
+         # Check file extension
+         if extension in self.supported_extensions and mime_type in self.supported_mime_types:
+             return True
+
+         return False
+
+     async def load(self, file_path: str, strict: bool = False, **kwargs) -> str:
+         """
+         Load PDF file and extract text content.
+
+         Args:
+             file_path: Path to the PDF file
+             strict: Whether to use strict mode for PDF reading
+             **kwargs: Additional arguments
+
+         Returns:
+             LoaderResult with extracted text content and metadata
+
+         Raises:
+             ImportError: If pypdf is not installed
+             Exception: If PDF processing fails
+         """
+         try:
+             from pypdf import PdfReader
+         except ImportError as e:
+             raise ImportError(
+                 "pypdf is required for PDF processing. Install with: pip install pypdf"
+             ) from e
+
+         try:
+             with open(file_path, "rb") as file:
+                 file_metadata = await get_file_metadata(file)
+                 # Name ingested file of current loader based on original file content hash
+                 storage_file_name = "text_" + file_metadata["content_hash"] + ".txt"
+
+                 logger.info(f"Reading PDF: {file_path}")
+                 reader = PdfReader(file, strict=strict)
+
+                 content_parts = []
+                 page_texts = []
+
+                 for page_num, page in enumerate(reader.pages, 1):
+                     try:
+                         page_text = page.extract_text()
+                         if page_text.strip():  # Only add non-empty pages
+                             page_texts.append(page_text)
+                             content_parts.append(f"Page {page_num}:\n{page_text}\n")
+                     except Exception as e:
+                         logger.warning(f"Failed to extract text from page {page_num}: {e}")
+                         continue
+
+                 # Combine all content
+                 full_content = "\n".join(content_parts)
+
+                 storage_config = get_storage_config()
+                 data_root_directory = storage_config["data_root_directory"]
+                 storage = get_file_storage(data_root_directory)
+
+                 full_file_path = await storage.store(storage_file_name, full_content)
+
+                 return full_file_path
+
+         except Exception as e:
+             logger.error(f"Failed to process PDF {file_path}: {e}")
+             raise Exception(f"PDF processing failed: {e}") from e
cognee/infrastructure/loaders/external/unstructured_loader.py
@@ -0,0 +1,127 @@
+ from typing import List
+ from cognee.infrastructure.loaders.LoaderInterface import LoaderInterface
+ from cognee.shared.logging_utils import get_logger
+ from cognee.infrastructure.files.storage import get_file_storage, get_storage_config
+ from cognee.infrastructure.files.utils.get_file_metadata import get_file_metadata
+
+ logger = get_logger(__name__)
+
+
+ class UnstructuredLoader(LoaderInterface):
+     """
+     Document loader using the unstructured library.
+
+     Handles various document formats including docx, pptx, xlsx, odt, etc.
+     Uses the unstructured library's auto-partition functionality.
+     """
+
+     @property
+     def supported_extensions(self) -> List[str]:
+         return [
+             "docx",
+             "doc",
+             "odt",  # Word documents
+             "xlsx",
+             "xls",
+             "ods",  # Spreadsheets
+             "pptx",
+             "ppt",
+             "odp",  # Presentations
+             "rtf",
+             "html",
+             "htm",  # Rich text and HTML
+             "eml",
+             "msg",  # Email formats
+             "epub",  # eBooks
+         ]
+
+     @property
+     def supported_mime_types(self) -> List[str]:
+         return [
+             "application/vnd.openxmlformats-officedocument.wordprocessingml.document",  # docx
+             "application/msword",  # doc
+             "application/vnd.oasis.opendocument.text",  # odt
+             "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",  # xlsx
+             "application/vnd.ms-excel",  # xls
+             "application/vnd.oasis.opendocument.spreadsheet",  # ods
+             "application/vnd.openxmlformats-officedocument.presentationml.presentation",  # pptx
+             "application/vnd.ms-powerpoint",  # ppt
+             "application/vnd.oasis.opendocument.presentation",  # odp
+             "application/rtf",  # rtf
+             "text/html",  # html
+             "message/rfc822",  # eml
+             "application/epub+zip",  # epub
+         ]
+
+     @property
+     def loader_name(self) -> str:
+         return "unstructured_loader"
+
+     def can_handle(self, extension: str, mime_type: str) -> bool:
+         """Check if file can be handled by this loader."""
+         # Check file extension
+         if extension in self.supported_extensions and mime_type in self.supported_mime_types:
+             return True
+
+         return False
+
+     async def load(self, file_path: str, strategy: str = "auto", **kwargs):
+         """
+         Load document using unstructured library.
+
+         Args:
+             file_path: Path to the document file
+             strategy: Partitioning strategy ("auto", "fast", "hi_res", "ocr_only")
+             **kwargs: Additional arguments passed to unstructured partition
+
+         Returns:
+             LoaderResult with extracted text content and metadata
+
+         Raises:
+             ImportError: If unstructured is not installed
+             Exception: If document processing fails
+         """
+         try:
+             from unstructured.partition.auto import partition
+         except ImportError as e:
+             raise ImportError(
+                 "unstructured is required for document processing. "
+                 "Install with: pip install unstructured"
+             ) from e
+
+         try:
+             logger.info(f"Processing document: {file_path}")
+
+             with open(file_path, "rb") as f:
+                 file_metadata = await get_file_metadata(f)
+                 # Name ingested file of current loader based on original file content hash
+                 storage_file_name = "text_" + file_metadata["content_hash"] + ".txt"
+
+             # Set partitioning parameters
+             partition_kwargs = {"filename": file_path, "strategy": strategy, **kwargs}
+
+             # Use partition to extract elements
+             elements = partition(**partition_kwargs)
+
+             # Process elements into text content
+             text_parts = []
+
+             for element in elements:
+                 element_text = str(element).strip()
+                 if element_text:
+                     text_parts.append(element_text)
+
+             # Combine all text content
+             full_content = "\n\n".join(text_parts)
+
+             storage_config = get_storage_config()
+             data_root_directory = storage_config["data_root_directory"]
+             storage = get_file_storage(data_root_directory)
+
+             full_file_path = await storage.store(storage_file_name, full_content)
+
+             return full_file_path
+
+         except Exception as e:
+             logger.error(f"Failed to process document {file_path}: {e}")
+             raise Exception(f"Document processing failed: {e}") from e
cognee/infrastructure/loaders/get_loader_engine.py
@@ -0,0 +1,18 @@
+ from functools import lru_cache
+ from .LoaderEngine import LoaderEngine
+ from .create_loader_engine import create_loader_engine
+
+
+ @lru_cache
+ def get_loader_engine() -> LoaderEngine:
+     """
+     Factory function to get loader engine.
+
+     Follows cognee's pattern with @lru_cache for efficient reuse
+     of engine instances. Configuration is loaded from environment
+     variables and settings.
+
+     Returns:
+         Cached LoaderEngine instance configured with current settings
+     """
+     return create_loader_engine()
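Note: the @lru_cache decorator on a zero-argument factory is what makes every call return the same engine object. A standalone illustration of that caching behavior, using only the standard library and generic names (not cognee APIs):

    from functools import lru_cache

    @lru_cache
    def get_engine():
        # Built once on the first call; every later call returns the cached object
        return object()

    assert get_engine() is get_engine()
    # get_engine.cache_clear() would force the next call to rebuild the instance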
cognee/infrastructure/loaders/supported_loaders.py
@@ -0,0 +1,18 @@
+ from cognee.infrastructure.loaders.external import PyPdfLoader
+ from cognee.infrastructure.loaders.core import TextLoader, AudioLoader, ImageLoader
+
+ # Registry for loader implementations
+ supported_loaders = {
+     PyPdfLoader.loader_name: PyPdfLoader,
+     TextLoader.loader_name: TextLoader,
+     ImageLoader.loader_name: ImageLoader,
+     AudioLoader.loader_name: AudioLoader,
+ }
+
+ # Try adding optional loaders
+ try:
+     from cognee.infrastructure.loaders.external import UnstructuredLoader
+
+     supported_loaders[UnstructuredLoader.loader_name] = UnstructuredLoader
+ except ImportError:
+     pass
cognee/infrastructure/loaders/use_loader.py
@@ -0,0 +1,21 @@
+ from .supported_loaders import supported_loaders
+
+
+ def use_loader(loader_name: str, loader_class):
+     """
+     Register a loader at runtime.
+
+     This allows external packages and custom loaders to be registered
+     into the loader system.
+
+     Args:
+         loader_name: Unique name for the loader
+         loader_class: Loader class implementing LoaderInterface
+
+     Example:
+         from cognee.infrastructure.loaders import use_loader
+         from my_package import MyCustomLoader
+
+         use_loader("my_custom_loader", MyCustomLoader)
+     """
+     supported_loaders[loader_name] = loader_class
cognee/infrastructure/loaders/utils/__init__.py (file without changes)
cognee/modules/data/methods/__init__.py
@@ -6,6 +6,7 @@ from .get_dataset import get_dataset
  from .get_datasets import get_datasets
  from .get_datasets_by_name import get_datasets_by_name
  from .get_dataset_data import get_dataset_data
+ from .get_authorized_dataset import get_authorized_dataset
  from .get_data import get_data
  from .get_unique_dataset_id import get_unique_dataset_id
  from .get_authorized_existing_datasets import get_authorized_existing_datasets
cognee/modules/data/methods/get_authorized_dataset.py
@@ -0,0 +1,23 @@
+ from typing import Optional
+ from uuid import UUID
+ from cognee.modules.users.permissions.methods import get_specific_user_permission_datasets
+ from ..models import Dataset
+
+
+ async def get_authorized_dataset(
+     user_id: UUID, dataset_id: UUID, permission_type="read"
+ ) -> Optional[Dataset]:
+     """
+     Get a specific dataset with permissions for a user.
+
+     Args:
+         user_id (UUID): user id
+         dataset_id (UUID): dataset id
+         permission_type (str): permission type(read, write, delete, share), default is read
+
+     Returns:
+         Optional[Dataset]: dataset with permissions
+     """
+     datasets = await get_specific_user_permission_datasets(user_id, permission_type, [dataset_id])
+
+     return datasets[0] if datasets else None
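Note: a hedged sketch of how a caller might gate an operation on this helper, mirroring the pattern used later in get_formatted_graph_data (which raises DatasetNotFoundError when the helper returns None). The wrapper function, its name, and the PermissionError are illustrative, not taken from the package:

    from uuid import UUID
    from cognee.modules.data.methods import get_authorized_dataset

    async def ensure_write_access(user_id: UUID, dataset_id: UUID):
        # Returns the Dataset only if the user holds the requested permission,
        # otherwise None, so callers decide how to fail
        dataset = await get_authorized_dataset(user_id, dataset_id, permission_type="write")
        if dataset is None:
            raise PermissionError(f"User {user_id} cannot write to dataset {dataset_id}")
        return dataset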
cognee/modules/data/models/Data.py
@@ -1,6 +1,7 @@
  from datetime import datetime, timezone
  from uuid import uuid4
  from sqlalchemy import UUID, Column, DateTime, String, JSON, Integer
+ from sqlalchemy.ext.mutable import MutableDict
  from sqlalchemy.orm import relationship

  from cognee.infrastructure.databases.relational import Base
@@ -16,14 +17,23 @@ class Data(Base):
      name = Column(String)
      extension = Column(String)
      mime_type = Column(String)
+     original_extension = Column(String, nullable=True)
+     original_mime_type = Column(String, nullable=True)
+     loader_engine = Column(String)
      raw_data_location = Column(String)
+     original_data_location = Column(String)
      owner_id = Column(UUID, index=True)
-     tenant_id = Column(UUID, index=True, default=None)
+     tenant_id = Column(UUID, index=True, nullable=True)
      content_hash = Column(String)
+     raw_content_hash = Column(String)
      external_metadata = Column(JSON)
-     node_set = Column(JSON, nullable=True)  # Store NodeSet as JSON list of strings
+     # Store NodeSet as JSON list of strings
+     node_set = Column(JSON, nullable=True)
+     # MutableDict allows SQLAlchemy to notice key-value pair changes, without it changing a value for a key
+     # wouldn't be noticed when commiting a database session
+     pipeline_status = Column(MutableDict.as_mutable(JSON))
      token_count = Column(Integer)
-     data_size = Column(Integer)  # File size in bytes
+     data_size = Column(Integer, nullable=True)  # File size in bytes
      created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc))
      updated_at = Column(DateTime(timezone=True), onupdate=lambda: datetime.now(timezone.utc))

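Note: the MutableDict wrapper matters because a plain JSON column only detects reassignment of the whole attribute, not in-place key changes. A standalone sketch of the behavior the new pipeline_status column relies on; the Row model, the sqlite engine, and the status key are illustrative only, not cognee's actual schema or pipeline identifiers:

    from sqlalchemy import JSON, Column, Integer, create_engine
    from sqlalchemy.ext.mutable import MutableDict
    from sqlalchemy.orm import Session, declarative_base

    Base = declarative_base()

    class Row(Base):
        __tablename__ = "rows"
        id = Column(Integer, primary_key=True)
        # Without MutableDict, in-place mutations of this dict go unnoticed at commit time
        status = Column(MutableDict.as_mutable(JSON), default=dict)

    engine = create_engine("sqlite://")
    Base.metadata.create_all(engine)

    with Session(engine) as session:
        session.add(Row(id=1, status={}))
        session.commit()

        row = session.get(Row, 1)
        # In-place mutation: MutableDict marks the row dirty, so commit persists it
        row.status["some_pipeline"] = "completed"
        session.commit()

        print(session.get(Row, 1).status)  # {'some_pipeline': 'completed'}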
cognee/modules/data/processing/document_types/AudioDocument.py
@@ -1,5 +1,5 @@
- from cognee.infrastructure.llm.get_llm_client import get_llm_client
  from cognee.modules.chunking.Chunker import Chunker
+ from cognee.infrastructure.llm.LLMGateway import LLMGateway

  from .Document import Document

@@ -8,7 +8,7 @@ class AudioDocument(Document):
      type: str = "audio"

      async def create_transcript(self):
-         result = await get_llm_client().create_transcript(self.raw_data_location)
+         result = await LLMGateway.create_transcript(self.raw_data_location)
          return result.text

      async def read(self, chunker_cls: Chunker, max_chunk_size: int):
cognee/modules/data/processing/document_types/ImageDocument.py
@@ -1,4 +1,4 @@
- from cognee.infrastructure.llm.get_llm_client import get_llm_client
+ from cognee.infrastructure.llm.LLMGateway import LLMGateway
  from cognee.modules.chunking.Chunker import Chunker

  from .Document import Document
@@ -8,7 +8,7 @@ class ImageDocument(Document):
      type: str = "image"

      async def transcribe_image(self):
-         result = await get_llm_client().transcribe_image(self.raw_data_location)
+         result = await LLMGateway.transcribe_image(self.raw_data_location)
          return result.choices[0].message.content

      async def read(self, chunker_cls: Chunker, max_chunk_size: int):
cognee/modules/data/processing/document_types/PdfDocument.py
@@ -5,7 +5,6 @@ from cognee.modules.chunking.Chunker import Chunker
  from cognee.infrastructure.files.utils.open_data_file import open_data_file

  from .Document import Document
- from .exceptions.exceptions import PyPdfInternalError

  logger = get_logger("PDFDocument")

@@ -17,18 +16,12 @@ class PdfDocument(Document):
          async with open_data_file(self.raw_data_location, mode="rb") as stream:
              logger.info(f"Reading PDF: {self.raw_data_location}")

-             try:
-                 file = PdfReader(stream, strict=False)
-             except Exception:
-                 raise PyPdfInternalError()
+             file = PdfReader(stream, strict=False)

              async def get_text():
-                 try:
-                     for page in file.pages:
-                         page_text = page.extract_text()
-                         yield page_text
-                 except Exception:
-                     raise PyPdfInternalError()
+                 for page in file.pages:
+                     page_text = page.extract_text()
+                     yield page_text

              chunker = chunker_cls(self, get_text=get_text, max_chunk_size=max_chunk_size)

cognee/modules/data/processing/document_types/UnstructuredDocument.py
@@ -18,11 +18,8 @@ class UnstructuredDocument(Document):
          except ModuleNotFoundError:
              raise UnstructuredLibraryImportError

-         if self.raw_data_location.startswith("s3://"):
-             async with open_data_file(self.raw_data_location, mode="rb") as f:
-                 elements = partition(file=f, content_type=self.mime_type)
-         else:
-             elements = partition(self.raw_data_location, content_type=self.mime_type)
+         async with open_data_file(self.raw_data_location, mode="rb") as f:
+             elements = partition(file=f, content_type=self.mime_type)

          in_memory_file = StringIO("\n\n".join([str(el) for el in elements]))
          in_memory_file.seek(0)
cognee/modules/engine/utils/generate_edge_id.py
@@ -0,0 +1,5 @@
+ from uuid import NAMESPACE_OID, uuid5
+
+
+ def generate_edge_id(edge_id: str) -> str:
+     return uuid5(NAMESPACE_OID, edge_id.lower().replace(" ", "_").replace("'", ""))
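Note: the helper normalizes the edge string and hashes it with uuid5, so the same relationship always maps to the same identifier regardless of casing, spaces, or apostrophes (uuid5 actually returns a uuid.UUID object despite the str annotation). A small illustration with made-up edge names, reproducing the same normalization:

    from uuid import NAMESPACE_OID, uuid5

    def generate_edge_id(edge_id: str):
        # Lowercase, spaces to underscores, apostrophes stripped, then a deterministic uuid5 hash
        return uuid5(NAMESPACE_OID, edge_id.lower().replace(" ", "_").replace("'", ""))

    a = generate_edge_id("Alice IS_FRIENDS_WITH Bob")
    b = generate_edge_id("alice is_friends_with bob")
    print(a == b)  # True: casing differences collapse to a single edge id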
cognee/modules/graph/cognee_graph/CogneeGraph.py
@@ -1,3 +1,4 @@
+ import time
  from cognee.shared.logging_utils import get_logger
  from typing import List, Dict, Union, Optional, Type

@@ -8,7 +9,7 @@ from cognee.modules.graph.cognee_graph.CogneeGraphElements import Node, Edge
  from cognee.modules.graph.cognee_graph.CogneeAbstractGraph import CogneeAbstractGraph
  import heapq

- logger = get_logger()
+ logger = get_logger("CogneeGraph")


  class CogneeGraph(CogneeAbstractGraph):
@@ -66,7 +67,13 @@ class CogneeGraph(CogneeAbstractGraph):
      ) -> None:
          if node_dimension < 1 or edge_dimension < 1:
              raise InvalidValueError(message="Dimensions must be positive integers")
+
          try:
+             import time
+
+             start_time = time.time()
+
+             # Determine projection strategy
              if node_type is not None and node_name is not None:
                  nodes_data, edges_data = await adapter.get_nodeset_subgraph(
                      node_type=node_type, node_name=node_name
@@ -83,16 +90,17 @@ class CogneeGraph(CogneeAbstractGraph):
                  nodes_data, edges_data = await adapter.get_filtered_graph_data(
                      attribute_filters=memory_fragment_filter
                  )
-
              if not nodes_data or not edges_data:
                  raise EntityNotFoundError(
                      message="Empty filtered graph projected from the database."
                  )

+             # Process nodes
              for node_id, properties in nodes_data:
                  node_attributes = {key: properties.get(key) for key in node_properties_to_project}
                  self.add_node(Node(str(node_id), node_attributes, dimension=node_dimension))

+             # Process edges
              for source_id, target_id, relationship_type, properties in edges_data:
                  source_node = self.get_node(str(source_id))
                  target_node = self.get_node(str(target_id))
@@ -113,17 +121,23 @@ class CogneeGraph(CogneeAbstractGraph):

                      source_node.add_skeleton_edge(edge)
                      target_node.add_skeleton_edge(edge)
-
                  else:
                      raise EntityNotFoundError(
                          message=f"Edge references nonexistent nodes: {source_id} -> {target_id}"
                      )

-         except (ValueError, TypeError) as e:
-             print(f"Error projecting graph: {e}")
-             raise e
+             # Final statistics
+             projection_time = time.time() - start_time
+             logger.info(
+                 f"Graph projection completed: {len(self.nodes)} nodes, {len(self.edges)} edges in {projection_time:.2f}s"
+             )
+
+         except Exception as e:
+             logger.error(f"Error during graph projection: {str(e)}")
+             raise

      async def map_vector_distances_to_graph_nodes(self, node_distances) -> None:
+         mapped_nodes = 0
          for category, scored_results in node_distances.items():
              for scored_result in scored_results:
                  node_id = str(scored_result.id)
@@ -131,48 +145,44 @@ class CogneeGraph(CogneeAbstractGraph):
                  node = self.get_node(node_id)
                  if node:
                      node.add_attribute("vector_distance", score)
+                     mapped_nodes += 1

-     async def map_vector_distances_to_graph_edges(self, vector_engine, query) -> None:
+     async def map_vector_distances_to_graph_edges(
+         self, vector_engine, query_vector, edge_distances
+     ) -> None:
          try:
-             query_vector = await vector_engine.embed_data([query])
-             query_vector = query_vector[0]
              if query_vector is None or len(query_vector) == 0:
                  raise ValueError("Failed to generate query embedding.")

-             edge_distances = await vector_engine.search(
-                 collection_name="EdgeType_relationship_name",
-                 query_text=query,
-                 limit=0,
-             )
+             if edge_distances is None:
+                 start_time = time.time()
+                 edge_distances = await vector_engine.search(
+                     collection_name="EdgeType_relationship_name",
+                     query_vector=query_vector,
+                     limit=0,
+                 )
+                 projection_time = time.time() - start_time
+                 logger.info(
+                     f"Edge collection distances were calculated separately from nodes in {projection_time:.2f}s"
+                 )

              embedding_map = {result.payload["text"]: result.score for result in edge_distances}

              for edge in self.edges:
                  relationship_type = edge.attributes.get("relationship_type")
-                 if not relationship_type or relationship_type not in embedding_map:
-                     print(f"Edge {edge} has an unknown or missing relationship type.")
-                     continue
-
-                 edge.attributes["vector_distance"] = embedding_map[relationship_type]
+                 distance = embedding_map.get(relationship_type, None)
+                 if distance is not None:
+                     edge.attributes["vector_distance"] = distance

          except Exception as ex:
-             print(f"Error mapping vector distances to edges: {ex}")
+             logger.error(f"Error mapping vector distances to edges: {str(ex)}")
              raise ex

      async def calculate_top_triplet_importances(self, k: int) -> List:
-         min_heap = []
-         for i, edge in enumerate(self.edges):
-             source_node = self.get_node(edge.node1.id)
-             target_node = self.get_node(edge.node2.id)
-
-             source_distance = source_node.attributes.get("vector_distance", 1) if source_node else 1
-             target_distance = target_node.attributes.get("vector_distance", 1) if target_node else 1
-             edge_distance = edge.attributes.get("vector_distance", 1)
-
-             total_distance = source_distance + target_distance + edge_distance
-
-             heapq.heappush(min_heap, (-total_distance, i, edge))
-             if len(min_heap) > k:
-                 heapq.heappop(min_heap)
+         def score(edge):
+             n1 = edge.node1.attributes.get("vector_distance", 1)
+             n2 = edge.node2.attributes.get("vector_distance", 1)
+             e = edge.attributes.get("vector_distance", 1)
+             return n1 + n2 + e

-         return [edge for _, _, edge in sorted(min_heap)]
+         return heapq.nsmallest(k, self.edges, key=score)
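Note: the rewritten calculate_top_triplet_importances replaces the manual bounded heap with heapq.nsmallest, still keeping the k triplets whose combined node and edge vector distances are smallest. A standalone sketch of that selection pattern with made-up data, standard library only:

    import heapq

    # Each tuple stands in for (triplet, combined vector distance); lower means closer to the query
    triplets = [("a", 1.9), ("b", 0.4), ("c", 2.7), ("d", 0.9)]

    top_two = heapq.nsmallest(2, triplets, key=lambda item: item[1])
    print(top_two)  # [('b', 0.4), ('d', 0.9)]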
cognee/modules/graph/methods/get_formatted_graph_data.py
@@ -1,10 +1,16 @@
  from uuid import UUID
  from cognee.infrastructure.databases.graph import get_graph_engine
  from cognee.context_global_variables import set_database_global_context_variables
+ from cognee.modules.data.exceptions.exceptions import DatasetNotFoundError
+ from cognee.modules.data.methods import get_authorized_dataset


  async def get_formatted_graph_data(dataset_id: UUID, user_id: UUID):
-     await set_database_global_context_variables(dataset_id, user_id)
+     dataset = await get_authorized_dataset(user_id, dataset_id)
+     if not dataset:
+         raise DatasetNotFoundError(message="Dataset not found.")
+
+     await set_database_global_context_variables(dataset_id, dataset.owner_id)

      graph_client = await get_graph_engine()
      (nodes, edges) = await graph_client.get_graph_data()
@@ -33,7 +39,7 @@ async def get_formatted_graph_data(dataset_id: UUID, user_id: UUID):
          lambda edge: {
              "source": str(edge[0]),
              "target": str(edge[1]),
-             "label": edge[2],
+             "label": str(edge[2]),
          },
          edges,
      )