cognee 0.2.2.dev0__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (214) hide show
  1. cognee/api/client.py +41 -3
  2. cognee/api/health.py +332 -0
  3. cognee/api/v1/add/add.py +5 -2
  4. cognee/api/v1/add/routers/get_add_router.py +3 -0
  5. cognee/api/v1/cognify/code_graph_pipeline.py +3 -1
  6. cognee/api/v1/cognify/cognify.py +8 -0
  7. cognee/api/v1/cognify/routers/get_cognify_router.py +8 -1
  8. cognee/api/v1/config/config.py +3 -1
  9. cognee/api/v1/datasets/routers/get_datasets_router.py +1 -7
  10. cognee/api/v1/delete/delete.py +16 -12
  11. cognee/api/v1/responses/routers/get_responses_router.py +3 -1
  12. cognee/api/v1/search/search.py +10 -0
  13. cognee/api/v1/settings/routers/get_settings_router.py +0 -2
  14. cognee/base_config.py +1 -0
  15. cognee/eval_framework/evaluation/direct_llm_eval_adapter.py +5 -6
  16. cognee/infrastructure/databases/graph/config.py +2 -0
  17. cognee/infrastructure/databases/graph/get_graph_engine.py +58 -12
  18. cognee/infrastructure/databases/graph/graph_db_interface.py +15 -10
  19. cognee/infrastructure/databases/graph/kuzu/adapter.py +12 -7
  20. cognee/infrastructure/databases/graph/kuzu/kuzu_migrate.py +1 -1
  21. cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +48 -13
  22. cognee/infrastructure/databases/graph/neptune_driver/__init__.py +15 -0
  23. cognee/infrastructure/databases/graph/neptune_driver/adapter.py +1427 -0
  24. cognee/infrastructure/databases/graph/neptune_driver/exceptions.py +115 -0
  25. cognee/infrastructure/databases/graph/neptune_driver/neptune_utils.py +224 -0
  26. cognee/infrastructure/databases/graph/networkx/adapter.py +3 -3
  27. cognee/infrastructure/databases/hybrid/neptune_analytics/NeptuneAnalyticsAdapter.py +449 -0
  28. cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +1 -0
  29. cognee/infrastructure/databases/vector/chromadb/ChromaDBAdapter.py +8 -3
  30. cognee/infrastructure/databases/vector/create_vector_engine.py +31 -15
  31. cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +3 -1
  32. cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +21 -6
  33. cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +4 -3
  34. cognee/infrastructure/databases/vector/embeddings/get_embedding_engine.py +3 -1
  35. cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +22 -16
  36. cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py +36 -34
  37. cognee/infrastructure/databases/vector/vector_db_interface.py +78 -7
  38. cognee/infrastructure/files/utils/get_data_file_path.py +39 -0
  39. cognee/infrastructure/files/utils/guess_file_type.py +2 -2
  40. cognee/infrastructure/files/utils/open_data_file.py +4 -23
  41. cognee/infrastructure/llm/LLMGateway.py +137 -0
  42. cognee/infrastructure/llm/__init__.py +14 -4
  43. cognee/infrastructure/llm/config.py +29 -1
  44. cognee/infrastructure/llm/prompts/answer_hotpot_question.txt +1 -1
  45. cognee/infrastructure/llm/prompts/answer_hotpot_using_cognee_search.txt +1 -1
  46. cognee/infrastructure/llm/prompts/answer_simple_question.txt +1 -1
  47. cognee/infrastructure/llm/prompts/answer_simple_question_restricted.txt +1 -1
  48. cognee/infrastructure/llm/prompts/categorize_categories.txt +1 -1
  49. cognee/infrastructure/llm/prompts/classify_content.txt +1 -1
  50. cognee/infrastructure/llm/prompts/context_for_question.txt +1 -1
  51. cognee/infrastructure/llm/prompts/graph_context_for_question.txt +1 -1
  52. cognee/infrastructure/llm/prompts/natural_language_retriever_system.txt +1 -1
  53. cognee/infrastructure/llm/prompts/patch_gen_instructions.txt +1 -1
  54. cognee/infrastructure/llm/prompts/search_type_selector_prompt.txt +130 -0
  55. cognee/infrastructure/llm/prompts/summarize_code.txt +2 -2
  56. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/__init__.py +57 -0
  57. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/async_client.py +533 -0
  58. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/config.py +94 -0
  59. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/globals.py +37 -0
  60. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/inlinedbaml.py +21 -0
  61. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/parser.py +131 -0
  62. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/runtime.py +266 -0
  63. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/stream_types.py +137 -0
  64. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/sync_client.py +550 -0
  65. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/tracing.py +26 -0
  66. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/type_builder.py +962 -0
  67. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/type_map.py +52 -0
  68. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/types.py +166 -0
  69. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extract_categories.baml +109 -0
  70. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extract_content_graph.baml +343 -0
  71. cognee/{modules/data → infrastructure/llm/structured_output_framework/baml/baml_src}/extraction/__init__.py +1 -0
  72. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/extract_summary.py +89 -0
  73. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/knowledge_graph/extract_content_graph.py +33 -0
  74. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/generators.baml +18 -0
  75. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/__init__.py +3 -0
  76. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/extract_categories.py +12 -0
  77. cognee/{modules/data → infrastructure/llm/structured_output_framework/litellm_instructor}/extraction/extract_summary.py +16 -7
  78. cognee/{modules/data → infrastructure/llm/structured_output_framework/litellm_instructor}/extraction/knowledge_graph/extract_content_graph.py +7 -6
  79. cognee/infrastructure/llm/{anthropic → structured_output_framework/litellm_instructor/llm/anthropic}/adapter.py +10 -4
  80. cognee/infrastructure/llm/{gemini → structured_output_framework/litellm_instructor/llm/gemini}/adapter.py +6 -5
  81. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/__init__.py +0 -0
  82. cognee/infrastructure/llm/{generic_llm_api → structured_output_framework/litellm_instructor/llm/generic_llm_api}/adapter.py +7 -3
  83. cognee/infrastructure/llm/{get_llm_client.py → structured_output_framework/litellm_instructor/llm/get_llm_client.py} +18 -6
  84. cognee/infrastructure/llm/{llm_interface.py → structured_output_framework/litellm_instructor/llm/llm_interface.py} +2 -2
  85. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/__init__.py +0 -0
  86. cognee/infrastructure/llm/{ollama → structured_output_framework/litellm_instructor/llm/ollama}/adapter.py +4 -2
  87. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/__init__.py +0 -0
  88. cognee/infrastructure/llm/{openai → structured_output_framework/litellm_instructor/llm/openai}/adapter.py +6 -4
  89. cognee/infrastructure/llm/{rate_limiter.py → structured_output_framework/litellm_instructor/llm/rate_limiter.py} +0 -5
  90. cognee/infrastructure/llm/tokenizer/Gemini/adapter.py +4 -2
  91. cognee/infrastructure/llm/tokenizer/TikToken/adapter.py +7 -3
  92. cognee/infrastructure/llm/tokenizer/__init__.py +4 -0
  93. cognee/infrastructure/llm/utils.py +3 -1
  94. cognee/infrastructure/loaders/LoaderEngine.py +156 -0
  95. cognee/infrastructure/loaders/LoaderInterface.py +73 -0
  96. cognee/infrastructure/loaders/__init__.py +18 -0
  97. cognee/infrastructure/loaders/core/__init__.py +7 -0
  98. cognee/infrastructure/loaders/core/audio_loader.py +98 -0
  99. cognee/infrastructure/loaders/core/image_loader.py +114 -0
  100. cognee/infrastructure/loaders/core/text_loader.py +90 -0
  101. cognee/infrastructure/loaders/create_loader_engine.py +32 -0
  102. cognee/infrastructure/loaders/external/__init__.py +22 -0
  103. cognee/infrastructure/loaders/external/pypdf_loader.py +96 -0
  104. cognee/infrastructure/loaders/external/unstructured_loader.py +127 -0
  105. cognee/infrastructure/loaders/get_loader_engine.py +18 -0
  106. cognee/infrastructure/loaders/supported_loaders.py +18 -0
  107. cognee/infrastructure/loaders/use_loader.py +21 -0
  108. cognee/infrastructure/loaders/utils/__init__.py +0 -0
  109. cognee/modules/data/methods/__init__.py +1 -0
  110. cognee/modules/data/methods/get_authorized_dataset.py +23 -0
  111. cognee/modules/data/models/Data.py +11 -1
  112. cognee/modules/data/processing/document_types/AudioDocument.py +2 -2
  113. cognee/modules/data/processing/document_types/ImageDocument.py +2 -2
  114. cognee/modules/data/processing/document_types/PdfDocument.py +4 -11
  115. cognee/modules/engine/utils/generate_edge_id.py +5 -0
  116. cognee/modules/graph/cognee_graph/CogneeGraph.py +9 -18
  117. cognee/modules/graph/methods/get_formatted_graph_data.py +7 -1
  118. cognee/modules/graph/utils/get_graph_from_model.py +93 -101
  119. cognee/modules/ingestion/data_types/TextData.py +8 -2
  120. cognee/modules/ingestion/save_data_to_file.py +1 -1
  121. cognee/modules/pipelines/exceptions/__init__.py +1 -0
  122. cognee/modules/pipelines/exceptions/exceptions.py +12 -0
  123. cognee/modules/pipelines/models/DataItemStatus.py +5 -0
  124. cognee/modules/pipelines/models/PipelineRunInfo.py +6 -0
  125. cognee/modules/pipelines/models/__init__.py +1 -0
  126. cognee/modules/pipelines/operations/pipeline.py +10 -2
  127. cognee/modules/pipelines/operations/run_tasks.py +251 -19
  128. cognee/modules/retrieval/code_retriever.py +3 -5
  129. cognee/modules/retrieval/completion_retriever.py +1 -1
  130. cognee/modules/retrieval/context_providers/TripletSearchContextProvider.py +0 -2
  131. cognee/modules/retrieval/graph_completion_context_extension_retriever.py +0 -2
  132. cognee/modules/retrieval/graph_completion_cot_retriever.py +8 -9
  133. cognee/modules/retrieval/natural_language_retriever.py +3 -5
  134. cognee/modules/retrieval/utils/completion.py +6 -9
  135. cognee/modules/retrieval/utils/description_to_codepart_search.py +2 -3
  136. cognee/modules/search/methods/search.py +5 -1
  137. cognee/modules/search/operations/__init__.py +1 -0
  138. cognee/modules/search/operations/select_search_type.py +42 -0
  139. cognee/modules/search/types/SearchType.py +1 -0
  140. cognee/modules/settings/get_settings.py +0 -4
  141. cognee/modules/settings/save_vector_db_config.py +1 -1
  142. cognee/shared/data_models.py +3 -1
  143. cognee/shared/logging_utils.py +0 -5
  144. cognee/tasks/chunk_naive_llm_classifier/chunk_naive_llm_classifier.py +2 -2
  145. cognee/tasks/documents/extract_chunks_from_documents.py +10 -12
  146. cognee/tasks/entity_completion/entity_extractors/llm_entity_extractor.py +4 -6
  147. cognee/tasks/graph/cascade_extract/utils/extract_content_nodes_and_relationship_names.py +4 -6
  148. cognee/tasks/graph/cascade_extract/utils/extract_edge_triplets.py +6 -7
  149. cognee/tasks/graph/cascade_extract/utils/extract_nodes.py +4 -7
  150. cognee/tasks/graph/extract_graph_from_code.py +3 -2
  151. cognee/tasks/graph/extract_graph_from_data.py +4 -3
  152. cognee/tasks/graph/infer_data_ontology.py +5 -6
  153. cognee/tasks/ingestion/data_item_to_text_file.py +79 -0
  154. cognee/tasks/ingestion/ingest_data.py +91 -61
  155. cognee/tasks/ingestion/resolve_data_directories.py +3 -0
  156. cognee/tasks/repo_processor/get_repo_file_dependencies.py +3 -0
  157. cognee/tasks/storage/index_data_points.py +1 -1
  158. cognee/tasks/storage/index_graph_edges.py +4 -1
  159. cognee/tasks/summarization/summarize_code.py +2 -3
  160. cognee/tasks/summarization/summarize_text.py +3 -2
  161. cognee/tests/test_cognee_server_start.py +12 -7
  162. cognee/tests/test_deduplication.py +2 -2
  163. cognee/tests/test_deletion.py +58 -17
  164. cognee/tests/test_graph_visualization_permissions.py +161 -0
  165. cognee/tests/test_neptune_analytics_graph.py +309 -0
  166. cognee/tests/test_neptune_analytics_hybrid.py +176 -0
  167. cognee/tests/{test_qdrant.py → test_neptune_analytics_vector.py} +86 -16
  168. cognee/tests/test_pgvector.py +5 -5
  169. cognee/tests/test_s3.py +1 -6
  170. cognee/tests/unit/infrastructure/databases/test_rate_limiter.py +11 -10
  171. cognee/tests/unit/infrastructure/databases/vector/__init__.py +0 -0
  172. cognee/tests/unit/infrastructure/mock_embedding_engine.py +1 -1
  173. cognee/tests/unit/infrastructure/test_embedding_rate_limiting_realistic.py +5 -5
  174. cognee/tests/unit/infrastructure/test_rate_limiting_realistic.py +6 -4
  175. cognee/tests/unit/infrastructure/test_rate_limiting_retry.py +1 -1
  176. cognee/tests/unit/interfaces/graph/get_graph_from_model_unit_test.py +61 -3
  177. cognee/tests/unit/modules/search/search_methods_test.py +55 -0
  178. {cognee-0.2.2.dev0.dist-info → cognee-0.2.3.dist-info}/METADATA +12 -6
  179. {cognee-0.2.2.dev0.dist-info → cognee-0.2.3.dist-info}/RECORD +195 -156
  180. cognee/infrastructure/databases/vector/pinecone/adapter.py +0 -8
  181. cognee/infrastructure/databases/vector/qdrant/QDrantAdapter.py +0 -514
  182. cognee/infrastructure/databases/vector/qdrant/__init__.py +0 -2
  183. cognee/infrastructure/databases/vector/weaviate_db/WeaviateAdapter.py +0 -527
  184. cognee/infrastructure/databases/vector/weaviate_db/__init__.py +0 -1
  185. cognee/modules/data/extraction/extract_categories.py +0 -14
  186. distributed/Dockerfile +0 -34
  187. distributed/app.py +0 -4
  188. distributed/entrypoint.py +0 -71
  189. distributed/entrypoint.sh +0 -5
  190. distributed/modal_image.py +0 -11
  191. distributed/queues.py +0 -5
  192. distributed/tasks/queued_add_data_points.py +0 -13
  193. distributed/tasks/queued_add_edges.py +0 -13
  194. distributed/tasks/queued_add_nodes.py +0 -13
  195. distributed/test.py +0 -28
  196. distributed/utils.py +0 -19
  197. distributed/workers/data_point_saving_worker.py +0 -93
  198. distributed/workers/graph_saving_worker.py +0 -104
  199. /cognee/infrastructure/databases/{graph/memgraph → hybrid/neptune_analytics}/__init__.py +0 -0
  200. /cognee/infrastructure/{llm → databases/vector/embeddings}/embedding_rate_limiter.py +0 -0
  201. /cognee/infrastructure/{databases/vector/pinecone → llm/structured_output_framework}/__init__.py +0 -0
  202. /cognee/infrastructure/llm/{anthropic → structured_output_framework/baml/baml_src}/__init__.py +0 -0
  203. /cognee/infrastructure/llm/{gemini/__init__.py → structured_output_framework/baml/baml_src/extraction/extract_categories.py} +0 -0
  204. /cognee/infrastructure/llm/{generic_llm_api → structured_output_framework/baml/baml_src/extraction/knowledge_graph}/__init__.py +0 -0
  205. /cognee/infrastructure/llm/{ollama → structured_output_framework/litellm_instructor}/__init__.py +0 -0
  206. /cognee/{modules/data → infrastructure/llm/structured_output_framework/litellm_instructor}/extraction/knowledge_graph/__init__.py +0 -0
  207. /cognee/{modules/data → infrastructure/llm/structured_output_framework/litellm_instructor}/extraction/texts.json +0 -0
  208. /cognee/infrastructure/llm/{openai → structured_output_framework/litellm_instructor/llm}/__init__.py +0 -0
  209. {distributed → cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic}/__init__.py +0 -0
  210. {distributed/tasks → cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini}/__init__.py +0 -0
  211. /cognee/modules/data/{extraction/knowledge_graph → methods}/add_model_class_to_graph.py +0 -0
  212. {cognee-0.2.2.dev0.dist-info → cognee-0.2.3.dist-info}/WHEEL +0 -0
  213. {cognee-0.2.2.dev0.dist-info → cognee-0.2.3.dist-info}/licenses/LICENSE +0 -0
  214. {cognee-0.2.2.dev0.dist-info → cognee-0.2.3.dist-info}/licenses/NOTICE.md +0 -0
@@ -0,0 +1,96 @@
1
+ from typing import List
2
+ from cognee.infrastructure.loaders.LoaderInterface import LoaderInterface
3
+ from cognee.shared.logging_utils import get_logger
4
+ from cognee.infrastructure.files.storage import get_file_storage, get_storage_config
5
+ from cognee.infrastructure.files.utils.get_file_metadata import get_file_metadata
6
+
7
+ logger = get_logger(__name__)
8
+
9
+
10
class PyPdfLoader(LoaderInterface):
    """
    PDF loader using pypdf library.

    Extracts text from a PDF page by page, prefixes every non-empty page with
    its 1-based page number, and writes the combined text to cognee's file
    storage.
    """

    @property
    def supported_extensions(self) -> List[str]:
        """File extensions (no leading dot) accepted by this loader."""
        return ["pdf"]

    @property
    def supported_mime_types(self) -> List[str]:
        """MIME types accepted by this loader."""
        return ["application/pdf"]

    @property
    def loader_name(self) -> str:
        """Name identifying this loader."""
        return "pypdf_loader"

    def can_handle(self, extension: str, mime_type: str) -> bool:
        """Return True only when BOTH the extension and the MIME type are supported."""
        return extension in self.supported_extensions and mime_type in self.supported_mime_types

    async def load(self, file_path: str, strict: bool = False, **kwargs) -> str:
        """
        Load PDF file, extract its text content and store it as a text file.

        Args:
            file_path: Path to the PDF file
            strict: Passed through to pypdf's PdfReader as its strict flag
            **kwargs: Additional arguments (accepted but not used by this loader)

        Returns:
            The value returned by the file storage's store() call for the
            extracted text (presumably the full path of the stored file).

        Raises:
            ImportError: If pypdf is not installed
            Exception: If PDF processing fails; the original error is chained as the cause
        """
        # Imported lazily so a missing pypdf only fails when a PDF is actually loaded.
        try:
            from pypdf import PdfReader
        except ImportError as e:
            raise ImportError(
                "pypdf is required for PDF processing. Install with: pip install pypdf"
            ) from e

        try:
            with open(file_path, "rb") as file:
                file_metadata = await get_file_metadata(file)
                # Name ingested file of current loader based on original file content hash
                storage_file_name = "text_" + file_metadata["content_hash"] + ".txt"

                logger.info(f"Reading PDF: {file_path}")
                reader = PdfReader(file, strict=strict)

                content_parts = []

                for page_num, page in enumerate(reader.pages, 1):
                    # A page that fails to extract is logged and skipped so the
                    # remaining pages are still ingested.
                    try:
                        page_text = page.extract_text()
                        if page_text.strip():  # Only add non-empty pages
                            content_parts.append(f"Page {page_num}:\n{page_text}\n")
                    except Exception as e:
                        logger.warning(f"Failed to extract text from page {page_num}: {e}")
                        continue

                # Combine all content
                full_content = "\n".join(content_parts)

                storage_config = get_storage_config()
                data_root_directory = storage_config["data_root_directory"]
                storage = get_file_storage(data_root_directory)

                full_file_path = await storage.store(storage_file_name, full_content)

                return full_file_path

        except Exception as e:
            logger.error(f"Failed to process PDF {file_path}: {e}")
            raise Exception(f"PDF processing failed: {e}") from e
@@ -0,0 +1,127 @@
1
+ from typing import List
2
+ from cognee.infrastructure.loaders.LoaderInterface import LoaderInterface
3
+ from cognee.shared.logging_utils import get_logger
4
+ from cognee.infrastructure.files.storage import get_file_storage, get_storage_config
5
+ from cognee.infrastructure.files.utils.get_file_metadata import get_file_metadata
6
+
7
+ logger = get_logger(__name__)
8
+
9
+
10
class UnstructuredLoader(LoaderInterface):
    """
    Document loader backed by the unstructured library.

    Covers office documents, spreadsheets, presentations, rich text, HTML,
    e-mail and e-book formats by delegating to unstructured's auto partition
    function, then stores the extracted text through cognee's file storage.
    """

    @property
    def supported_extensions(self) -> List[str]:
        """File extensions (no leading dot) accepted by this loader."""
        # NOTE(review): "msg" is listed here, but supported_mime_types has no
        # Outlook MIME type and can_handle requires both to match — confirm
        # whether .msg files are meant to be handled.
        word_documents = ["docx", "doc", "odt"]
        spreadsheets = ["xlsx", "xls", "ods"]
        presentations = ["pptx", "ppt", "odp"]
        rich_text_and_html = ["rtf", "html", "htm"]
        email_formats = ["eml", "msg"]
        ebooks = ["epub"]
        return (
            word_documents
            + spreadsheets
            + presentations
            + rich_text_and_html
            + email_formats
            + ebooks
        )

    @property
    def supported_mime_types(self) -> List[str]:
        """MIME types accepted by this loader."""
        return [
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",  # docx
            "application/msword",  # doc
            "application/vnd.oasis.opendocument.text",  # odt
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",  # xlsx
            "application/vnd.ms-excel",  # xls
            "application/vnd.oasis.opendocument.spreadsheet",  # ods
            "application/vnd.openxmlformats-officedocument.presentationml.presentation",  # pptx
            "application/vnd.ms-powerpoint",  # ppt
            "application/vnd.oasis.opendocument.presentation",  # odp
            "application/rtf",  # rtf
            "text/html",  # html
            "message/rfc822",  # eml
            "application/epub+zip",  # epub
        ]

    @property
    def loader_name(self) -> str:
        """Name identifying this loader."""
        return "unstructured_loader"

    def can_handle(self, extension: str, mime_type: str) -> bool:
        """Return True only when BOTH the extension and the MIME type are supported."""
        if extension not in self.supported_extensions:
            return False
        if mime_type not in self.supported_mime_types:
            return False
        return True

    async def load(self, file_path: str, strategy: str = "auto", **kwargs):
        """
        Extract the text of a document with unstructured and store it as a text file.

        Args:
            file_path: Path to the document file
            strategy: Partitioning strategy ("auto", "fast", "hi_res", "ocr_only")
            **kwargs: Additional arguments passed to unstructured partition

        Returns:
            The value returned by the file storage's store() call for the
            extracted text (presumably the full path of the stored file).

        Raises:
            ImportError: If unstructured is not installed
            Exception: If document processing fails; the original error is chained as the cause
        """
        # Imported lazily so the optional dependency is only required on use.
        try:
            from unstructured.partition.auto import partition
        except ImportError as e:
            raise ImportError(
                "unstructured is required for document processing. "
                "Install with: pip install unstructured"
            ) from e

        try:
            logger.info(f"Processing document: {file_path}")

            # The file handle is only needed for the metadata lookup;
            # partition() below is given the path instead.
            with open(file_path, "rb") as f:
                metadata = await get_file_metadata(f)
                # Stored file is named after the content hash reported for the source file
                target_name = "text_" + metadata["content_hash"] + ".txt"

            # Caller-supplied kwargs are merged last, so they win over the defaults.
            options = {"filename": file_path, "strategy": strategy}
            options.update(kwargs)

            elements = partition(**options)

            # Keep the stripped text of every element that is not empty.
            stripped_texts = (str(element).strip() for element in elements)
            text_parts = [text for text in stripped_texts if text]

            full_content = "\n\n".join(text_parts)

            data_root_directory = get_storage_config()["data_root_directory"]
            storage = get_file_storage(data_root_directory)

            return await storage.store(target_name, full_content)

        except Exception as e:
            logger.error(f"Failed to process document {file_path}: {e}")
            raise Exception(f"Document processing failed: {e}") from e
@@ -0,0 +1,18 @@
1
+ from functools import lru_cache
2
+ from .LoaderEngine import LoaderEngine
3
+ from .create_loader_engine import create_loader_engine
4
+
5
+
6
@lru_cache
def get_loader_engine() -> LoaderEngine:
    """
    Return the shared loader engine.

    The engine is built by create_loader_engine() on the first call. Because
    this function takes no arguments, lru_cache hands the same object back on
    every later call.

    Returns:
        The cached LoaderEngine instance
    """
    engine = create_loader_engine()
    return engine
@@ -0,0 +1,18 @@
1
+ from cognee.infrastructure.loaders.external import PyPdfLoader
2
+ from cognee.infrastructure.loaders.core import TextLoader, AudioLoader, ImageLoader
3
+
4
def _registry_key(loader_class):
    """
    Resolve the name a loader class is registered under.

    Loaders may declare loader_name as an instance-level @property. Reading
    such an attribute from the class returns the property object itself rather
    than the name, which would key the registry by descriptor objects instead
    of strings. In that case an instance is created to obtain the real name.
    """
    name = loader_class.loader_name
    if isinstance(name, property):
        # NOTE(review): assumes loaders can be constructed without arguments — confirm.
        name = loader_class().loader_name
    return name


# Registry for loader implementations, keyed by loader name
supported_loaders = {
    _registry_key(loader_class): loader_class
    for loader_class in (PyPdfLoader, TextLoader, ImageLoader, AudioLoader)
}

# Try adding optional loaders
try:
    from cognee.infrastructure.loaders.external import UnstructuredLoader

    supported_loaders[_registry_key(UnstructuredLoader)] = UnstructuredLoader
except ImportError:
    # The optional loader is not importable; the registry simply omits it.
    pass
@@ -0,0 +1,21 @@
1
+ from .supported_loaders import supported_loaders
2
+
3
+
4
def use_loader(loader_name: str, loader_class):
    """
    Add a loader class to the supported_loaders registry at runtime.

    An existing entry under the same name is replaced. This is the hook for
    external packages and custom loaders.

    Args:
        loader_name: Unique name for the loader
        loader_class: Loader class implementing LoaderInterface

    Example:
        from cognee.infrastructure.loaders import use_loader
        from my_package import MyCustomLoader

        use_loader("my_custom_loader", MyCustomLoader)
    """
    supported_loaders.update({loader_name: loader_class})
File without changes
@@ -6,6 +6,7 @@ from .get_dataset import get_dataset
6
6
  from .get_datasets import get_datasets
7
7
  from .get_datasets_by_name import get_datasets_by_name
8
8
  from .get_dataset_data import get_dataset_data
9
+ from .get_authorized_dataset import get_authorized_dataset
9
10
  from .get_data import get_data
10
11
  from .get_unique_dataset_id import get_unique_dataset_id
11
12
  from .get_authorized_existing_datasets import get_authorized_existing_datasets
@@ -0,0 +1,23 @@
1
+ from typing import Optional
2
+ from uuid import UUID
3
+ from cognee.modules.users.permissions.methods import get_specific_user_permission_datasets
4
+ from ..models import Dataset
5
+
6
+
7
async def get_authorized_dataset(
    user_id: UUID, dataset_id: UUID, permission_type="read"
) -> Optional[Dataset]:
    """
    Look up one dataset, restricted to what the user holds a permission on.

    Args:
        user_id (UUID): user id
        dataset_id (UUID): dataset id
        permission_type (str): permission type(read, write, delete, share), default is read

    Returns:
        Optional[Dataset]: the first dataset returned by the permission lookup,
        or None when the lookup returns nothing
    """
    matches = await get_specific_user_permission_datasets(user_id, permission_type, [dataset_id])

    if not matches:
        return None

    return matches[0]
@@ -1,6 +1,7 @@
1
1
  from datetime import datetime, timezone
2
2
  from uuid import uuid4
3
3
  from sqlalchemy import UUID, Column, DateTime, String, JSON, Integer
4
+ from sqlalchemy.ext.mutable import MutableDict
4
5
  from sqlalchemy.orm import relationship
5
6
 
6
7
  from cognee.infrastructure.databases.relational import Base
@@ -16,12 +17,21 @@ class Data(Base):
16
17
  name = Column(String)
17
18
  extension = Column(String)
18
19
  mime_type = Column(String)
20
+ original_extension = Column(String, nullable=True)
21
+ original_mime_type = Column(String, nullable=True)
22
+ loader_engine = Column(String)
19
23
  raw_data_location = Column(String)
24
+ original_data_location = Column(String)
20
25
  owner_id = Column(UUID, index=True)
21
26
  tenant_id = Column(UUID, index=True, nullable=True)
22
27
  content_hash = Column(String)
28
+ raw_content_hash = Column(String)
23
29
  external_metadata = Column(JSON)
24
- node_set = Column(JSON, nullable=True) # Store NodeSet as JSON list of strings
30
+ # Store NodeSet as JSON list of strings
31
+ node_set = Column(JSON, nullable=True)
32
+ # MutableDict allows SQLAlchemy to notice key-value pair changes, without it changing a value for a key
33
+ # wouldn't be noticed when committing a database session
34
+ pipeline_status = Column(MutableDict.as_mutable(JSON))
25
35
  token_count = Column(Integer)
26
36
  data_size = Column(Integer, nullable=True) # File size in bytes
27
37
  created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc))
@@ -1,5 +1,5 @@
1
- from cognee.infrastructure.llm.get_llm_client import get_llm_client
2
1
  from cognee.modules.chunking.Chunker import Chunker
2
+ from cognee.infrastructure.llm.LLMGateway import LLMGateway
3
3
 
4
4
  from .Document import Document
5
5
 
@@ -8,7 +8,7 @@ class AudioDocument(Document):
8
8
  type: str = "audio"
9
9
 
10
10
  async def create_transcript(self):
11
- result = await get_llm_client().create_transcript(self.raw_data_location)
11
+ result = await LLMGateway.create_transcript(self.raw_data_location)
12
12
  return result.text
13
13
 
14
14
  async def read(self, chunker_cls: Chunker, max_chunk_size: int):
@@ -1,4 +1,4 @@
1
- from cognee.infrastructure.llm.get_llm_client import get_llm_client
1
+ from cognee.infrastructure.llm.LLMGateway import LLMGateway
2
2
  from cognee.modules.chunking.Chunker import Chunker
3
3
 
4
4
  from .Document import Document
@@ -8,7 +8,7 @@ class ImageDocument(Document):
8
8
  type: str = "image"
9
9
 
10
10
  async def transcribe_image(self):
11
- result = await get_llm_client().transcribe_image(self.raw_data_location)
11
+ result = await LLMGateway.transcribe_image(self.raw_data_location)
12
12
  return result.choices[0].message.content
13
13
 
14
14
  async def read(self, chunker_cls: Chunker, max_chunk_size: int):
@@ -5,7 +5,6 @@ from cognee.modules.chunking.Chunker import Chunker
5
5
  from cognee.infrastructure.files.utils.open_data_file import open_data_file
6
6
 
7
7
  from .Document import Document
8
- from .exceptions.exceptions import PyPdfInternalError
9
8
 
10
9
  logger = get_logger("PDFDocument")
11
10
 
@@ -17,18 +16,12 @@ class PdfDocument(Document):
17
16
  async with open_data_file(self.raw_data_location, mode="rb") as stream:
18
17
  logger.info(f"Reading PDF: {self.raw_data_location}")
19
18
 
20
- try:
21
- file = PdfReader(stream, strict=False)
22
- except Exception:
23
- raise PyPdfInternalError()
19
+ file = PdfReader(stream, strict=False)
24
20
 
25
21
  async def get_text():
26
- try:
27
- for page in file.pages:
28
- page_text = page.extract_text()
29
- yield page_text
30
- except Exception:
31
- raise PyPdfInternalError()
22
+ for page in file.pages:
23
+ page_text = page.extract_text()
24
+ yield page_text
32
25
 
33
26
  chunker = chunker_cls(self, get_text=get_text, max_chunk_size=max_chunk_size)
34
27
 
@@ -0,0 +1,5 @@
1
from uuid import NAMESPACE_OID, UUID, uuid5
2
+
3
+
4
def generate_edge_id(edge_id: str) -> UUID:
    """
    Derive a deterministic UUID for an edge from its textual identifier.

    The identifier is normalized (lower-cased, spaces replaced by underscores,
    apostrophes removed) so that variants such as "Works For" and "works_for"
    map to the same id.

    Args:
        edge_id: Textual identifier of the edge

    Returns:
        A version-5 UUID in the NAMESPACE_OID namespace. The result is a
        uuid.UUID object, not a string.
    """
    normalized = edge_id.lower().replace(" ", "_").replace("'", "")
    return uuid5(NAMESPACE_OID, normalized)
@@ -170,28 +170,19 @@ class CogneeGraph(CogneeAbstractGraph):
170
170
 
171
171
  for edge in self.edges:
172
172
  relationship_type = edge.attributes.get("relationship_type")
173
- if relationship_type and relationship_type in embedding_map:
174
- edge.attributes["vector_distance"] = embedding_map[relationship_type]
173
+ distance = embedding_map.get(relationship_type, None)
174
+ if distance is not None:
175
+ edge.attributes["vector_distance"] = distance
175
176
 
176
177
  except Exception as ex:
177
178
  logger.error(f"Error mapping vector distances to edges: {str(ex)}")
178
179
  raise ex
179
180
 
180
181
  async def calculate_top_triplet_importances(self, k: int) -> List:
181
- min_heap = []
182
+ def score(edge):
183
+ n1 = edge.node1.attributes.get("vector_distance", 1)
184
+ n2 = edge.node2.attributes.get("vector_distance", 1)
185
+ e = edge.attributes.get("vector_distance", 1)
186
+ return n1 + n2 + e
182
187
 
183
- for i, edge in enumerate(self.edges):
184
- source_node = self.get_node(edge.node1.id)
185
- target_node = self.get_node(edge.node2.id)
186
-
187
- source_distance = source_node.attributes.get("vector_distance", 1) if source_node else 1
188
- target_distance = target_node.attributes.get("vector_distance", 1) if target_node else 1
189
- edge_distance = edge.attributes.get("vector_distance", 1)
190
-
191
- total_distance = source_distance + target_distance + edge_distance
192
-
193
- heapq.heappush(min_heap, (-total_distance, i, edge))
194
- if len(min_heap) > k:
195
- heapq.heappop(min_heap)
196
-
197
- return [edge for _, _, edge in sorted(min_heap)]
188
+ return heapq.nsmallest(k, self.edges, key=score)
@@ -1,10 +1,16 @@
1
1
  from uuid import UUID
2
2
  from cognee.infrastructure.databases.graph import get_graph_engine
3
3
  from cognee.context_global_variables import set_database_global_context_variables
4
+ from cognee.modules.data.exceptions.exceptions import DatasetNotFoundError
5
+ from cognee.modules.data.methods import get_authorized_dataset
4
6
 
5
7
 
6
8
  async def get_formatted_graph_data(dataset_id: UUID, user_id: UUID):
7
- await set_database_global_context_variables(dataset_id, user_id)
9
+ dataset = await get_authorized_dataset(user_id, dataset_id)
10
+ if not dataset:
11
+ raise DatasetNotFoundError(message="Dataset not found.")
12
+
13
+ await set_database_global_context_variables(dataset_id, dataset.owner_id)
8
14
 
9
15
  graph_client = await get_graph_engine()
10
16
  (nodes, edges) = await graph_client.get_graph_data()