cognee 0.2.2.dev0__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (214) hide show
  1. cognee/api/client.py +41 -3
  2. cognee/api/health.py +332 -0
  3. cognee/api/v1/add/add.py +5 -2
  4. cognee/api/v1/add/routers/get_add_router.py +3 -0
  5. cognee/api/v1/cognify/code_graph_pipeline.py +3 -1
  6. cognee/api/v1/cognify/cognify.py +8 -0
  7. cognee/api/v1/cognify/routers/get_cognify_router.py +8 -1
  8. cognee/api/v1/config/config.py +3 -1
  9. cognee/api/v1/datasets/routers/get_datasets_router.py +1 -7
  10. cognee/api/v1/delete/delete.py +16 -12
  11. cognee/api/v1/responses/routers/get_responses_router.py +3 -1
  12. cognee/api/v1/search/search.py +10 -0
  13. cognee/api/v1/settings/routers/get_settings_router.py +0 -2
  14. cognee/base_config.py +1 -0
  15. cognee/eval_framework/evaluation/direct_llm_eval_adapter.py +5 -6
  16. cognee/infrastructure/databases/graph/config.py +2 -0
  17. cognee/infrastructure/databases/graph/get_graph_engine.py +58 -12
  18. cognee/infrastructure/databases/graph/graph_db_interface.py +15 -10
  19. cognee/infrastructure/databases/graph/kuzu/adapter.py +12 -7
  20. cognee/infrastructure/databases/graph/kuzu/kuzu_migrate.py +1 -1
  21. cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +48 -13
  22. cognee/infrastructure/databases/graph/neptune_driver/__init__.py +15 -0
  23. cognee/infrastructure/databases/graph/neptune_driver/adapter.py +1427 -0
  24. cognee/infrastructure/databases/graph/neptune_driver/exceptions.py +115 -0
  25. cognee/infrastructure/databases/graph/neptune_driver/neptune_utils.py +224 -0
  26. cognee/infrastructure/databases/graph/networkx/adapter.py +3 -3
  27. cognee/infrastructure/databases/hybrid/neptune_analytics/NeptuneAnalyticsAdapter.py +449 -0
  28. cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +1 -0
  29. cognee/infrastructure/databases/vector/chromadb/ChromaDBAdapter.py +8 -3
  30. cognee/infrastructure/databases/vector/create_vector_engine.py +31 -15
  31. cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +3 -1
  32. cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +21 -6
  33. cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +4 -3
  34. cognee/infrastructure/databases/vector/embeddings/get_embedding_engine.py +3 -1
  35. cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +22 -16
  36. cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py +36 -34
  37. cognee/infrastructure/databases/vector/vector_db_interface.py +78 -7
  38. cognee/infrastructure/files/utils/get_data_file_path.py +39 -0
  39. cognee/infrastructure/files/utils/guess_file_type.py +2 -2
  40. cognee/infrastructure/files/utils/open_data_file.py +4 -23
  41. cognee/infrastructure/llm/LLMGateway.py +137 -0
  42. cognee/infrastructure/llm/__init__.py +14 -4
  43. cognee/infrastructure/llm/config.py +29 -1
  44. cognee/infrastructure/llm/prompts/answer_hotpot_question.txt +1 -1
  45. cognee/infrastructure/llm/prompts/answer_hotpot_using_cognee_search.txt +1 -1
  46. cognee/infrastructure/llm/prompts/answer_simple_question.txt +1 -1
  47. cognee/infrastructure/llm/prompts/answer_simple_question_restricted.txt +1 -1
  48. cognee/infrastructure/llm/prompts/categorize_categories.txt +1 -1
  49. cognee/infrastructure/llm/prompts/classify_content.txt +1 -1
  50. cognee/infrastructure/llm/prompts/context_for_question.txt +1 -1
  51. cognee/infrastructure/llm/prompts/graph_context_for_question.txt +1 -1
  52. cognee/infrastructure/llm/prompts/natural_language_retriever_system.txt +1 -1
  53. cognee/infrastructure/llm/prompts/patch_gen_instructions.txt +1 -1
  54. cognee/infrastructure/llm/prompts/search_type_selector_prompt.txt +130 -0
  55. cognee/infrastructure/llm/prompts/summarize_code.txt +2 -2
  56. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/__init__.py +57 -0
  57. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/async_client.py +533 -0
  58. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/config.py +94 -0
  59. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/globals.py +37 -0
  60. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/inlinedbaml.py +21 -0
  61. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/parser.py +131 -0
  62. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/runtime.py +266 -0
  63. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/stream_types.py +137 -0
  64. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/sync_client.py +550 -0
  65. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/tracing.py +26 -0
  66. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/type_builder.py +962 -0
  67. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/type_map.py +52 -0
  68. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/types.py +166 -0
  69. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extract_categories.baml +109 -0
  70. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extract_content_graph.baml +343 -0
  71. cognee/{modules/data → infrastructure/llm/structured_output_framework/baml/baml_src}/extraction/__init__.py +1 -0
  72. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/extract_summary.py +89 -0
  73. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/knowledge_graph/extract_content_graph.py +33 -0
  74. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/generators.baml +18 -0
  75. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/__init__.py +3 -0
  76. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/extract_categories.py +12 -0
  77. cognee/{modules/data → infrastructure/llm/structured_output_framework/litellm_instructor}/extraction/extract_summary.py +16 -7
  78. cognee/{modules/data → infrastructure/llm/structured_output_framework/litellm_instructor}/extraction/knowledge_graph/extract_content_graph.py +7 -6
  79. cognee/infrastructure/llm/{anthropic → structured_output_framework/litellm_instructor/llm/anthropic}/adapter.py +10 -4
  80. cognee/infrastructure/llm/{gemini → structured_output_framework/litellm_instructor/llm/gemini}/adapter.py +6 -5
  81. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/__init__.py +0 -0
  82. cognee/infrastructure/llm/{generic_llm_api → structured_output_framework/litellm_instructor/llm/generic_llm_api}/adapter.py +7 -3
  83. cognee/infrastructure/llm/{get_llm_client.py → structured_output_framework/litellm_instructor/llm/get_llm_client.py} +18 -6
  84. cognee/infrastructure/llm/{llm_interface.py → structured_output_framework/litellm_instructor/llm/llm_interface.py} +2 -2
  85. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/__init__.py +0 -0
  86. cognee/infrastructure/llm/{ollama → structured_output_framework/litellm_instructor/llm/ollama}/adapter.py +4 -2
  87. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/__init__.py +0 -0
  88. cognee/infrastructure/llm/{openai → structured_output_framework/litellm_instructor/llm/openai}/adapter.py +6 -4
  89. cognee/infrastructure/llm/{rate_limiter.py → structured_output_framework/litellm_instructor/llm/rate_limiter.py} +0 -5
  90. cognee/infrastructure/llm/tokenizer/Gemini/adapter.py +4 -2
  91. cognee/infrastructure/llm/tokenizer/TikToken/adapter.py +7 -3
  92. cognee/infrastructure/llm/tokenizer/__init__.py +4 -0
  93. cognee/infrastructure/llm/utils.py +3 -1
  94. cognee/infrastructure/loaders/LoaderEngine.py +156 -0
  95. cognee/infrastructure/loaders/LoaderInterface.py +73 -0
  96. cognee/infrastructure/loaders/__init__.py +18 -0
  97. cognee/infrastructure/loaders/core/__init__.py +7 -0
  98. cognee/infrastructure/loaders/core/audio_loader.py +98 -0
  99. cognee/infrastructure/loaders/core/image_loader.py +114 -0
  100. cognee/infrastructure/loaders/core/text_loader.py +90 -0
  101. cognee/infrastructure/loaders/create_loader_engine.py +32 -0
  102. cognee/infrastructure/loaders/external/__init__.py +22 -0
  103. cognee/infrastructure/loaders/external/pypdf_loader.py +96 -0
  104. cognee/infrastructure/loaders/external/unstructured_loader.py +127 -0
  105. cognee/infrastructure/loaders/get_loader_engine.py +18 -0
  106. cognee/infrastructure/loaders/supported_loaders.py +18 -0
  107. cognee/infrastructure/loaders/use_loader.py +21 -0
  108. cognee/infrastructure/loaders/utils/__init__.py +0 -0
  109. cognee/modules/data/methods/__init__.py +1 -0
  110. cognee/modules/data/methods/get_authorized_dataset.py +23 -0
  111. cognee/modules/data/models/Data.py +11 -1
  112. cognee/modules/data/processing/document_types/AudioDocument.py +2 -2
  113. cognee/modules/data/processing/document_types/ImageDocument.py +2 -2
  114. cognee/modules/data/processing/document_types/PdfDocument.py +4 -11
  115. cognee/modules/engine/utils/generate_edge_id.py +5 -0
  116. cognee/modules/graph/cognee_graph/CogneeGraph.py +9 -18
  117. cognee/modules/graph/methods/get_formatted_graph_data.py +7 -1
  118. cognee/modules/graph/utils/get_graph_from_model.py +93 -101
  119. cognee/modules/ingestion/data_types/TextData.py +8 -2
  120. cognee/modules/ingestion/save_data_to_file.py +1 -1
  121. cognee/modules/pipelines/exceptions/__init__.py +1 -0
  122. cognee/modules/pipelines/exceptions/exceptions.py +12 -0
  123. cognee/modules/pipelines/models/DataItemStatus.py +5 -0
  124. cognee/modules/pipelines/models/PipelineRunInfo.py +6 -0
  125. cognee/modules/pipelines/models/__init__.py +1 -0
  126. cognee/modules/pipelines/operations/pipeline.py +10 -2
  127. cognee/modules/pipelines/operations/run_tasks.py +251 -19
  128. cognee/modules/retrieval/code_retriever.py +3 -5
  129. cognee/modules/retrieval/completion_retriever.py +1 -1
  130. cognee/modules/retrieval/context_providers/TripletSearchContextProvider.py +0 -2
  131. cognee/modules/retrieval/graph_completion_context_extension_retriever.py +0 -2
  132. cognee/modules/retrieval/graph_completion_cot_retriever.py +8 -9
  133. cognee/modules/retrieval/natural_language_retriever.py +3 -5
  134. cognee/modules/retrieval/utils/completion.py +6 -9
  135. cognee/modules/retrieval/utils/description_to_codepart_search.py +2 -3
  136. cognee/modules/search/methods/search.py +5 -1
  137. cognee/modules/search/operations/__init__.py +1 -0
  138. cognee/modules/search/operations/select_search_type.py +42 -0
  139. cognee/modules/search/types/SearchType.py +1 -0
  140. cognee/modules/settings/get_settings.py +0 -4
  141. cognee/modules/settings/save_vector_db_config.py +1 -1
  142. cognee/shared/data_models.py +3 -1
  143. cognee/shared/logging_utils.py +0 -5
  144. cognee/tasks/chunk_naive_llm_classifier/chunk_naive_llm_classifier.py +2 -2
  145. cognee/tasks/documents/extract_chunks_from_documents.py +10 -12
  146. cognee/tasks/entity_completion/entity_extractors/llm_entity_extractor.py +4 -6
  147. cognee/tasks/graph/cascade_extract/utils/extract_content_nodes_and_relationship_names.py +4 -6
  148. cognee/tasks/graph/cascade_extract/utils/extract_edge_triplets.py +6 -7
  149. cognee/tasks/graph/cascade_extract/utils/extract_nodes.py +4 -7
  150. cognee/tasks/graph/extract_graph_from_code.py +3 -2
  151. cognee/tasks/graph/extract_graph_from_data.py +4 -3
  152. cognee/tasks/graph/infer_data_ontology.py +5 -6
  153. cognee/tasks/ingestion/data_item_to_text_file.py +79 -0
  154. cognee/tasks/ingestion/ingest_data.py +91 -61
  155. cognee/tasks/ingestion/resolve_data_directories.py +3 -0
  156. cognee/tasks/repo_processor/get_repo_file_dependencies.py +3 -0
  157. cognee/tasks/storage/index_data_points.py +1 -1
  158. cognee/tasks/storage/index_graph_edges.py +4 -1
  159. cognee/tasks/summarization/summarize_code.py +2 -3
  160. cognee/tasks/summarization/summarize_text.py +3 -2
  161. cognee/tests/test_cognee_server_start.py +12 -7
  162. cognee/tests/test_deduplication.py +2 -2
  163. cognee/tests/test_deletion.py +58 -17
  164. cognee/tests/test_graph_visualization_permissions.py +161 -0
  165. cognee/tests/test_neptune_analytics_graph.py +309 -0
  166. cognee/tests/test_neptune_analytics_hybrid.py +176 -0
  167. cognee/tests/{test_qdrant.py → test_neptune_analytics_vector.py} +86 -16
  168. cognee/tests/test_pgvector.py +5 -5
  169. cognee/tests/test_s3.py +1 -6
  170. cognee/tests/unit/infrastructure/databases/test_rate_limiter.py +11 -10
  171. cognee/tests/unit/infrastructure/databases/vector/__init__.py +0 -0
  172. cognee/tests/unit/infrastructure/mock_embedding_engine.py +1 -1
  173. cognee/tests/unit/infrastructure/test_embedding_rate_limiting_realistic.py +5 -5
  174. cognee/tests/unit/infrastructure/test_rate_limiting_realistic.py +6 -4
  175. cognee/tests/unit/infrastructure/test_rate_limiting_retry.py +1 -1
  176. cognee/tests/unit/interfaces/graph/get_graph_from_model_unit_test.py +61 -3
  177. cognee/tests/unit/modules/search/search_methods_test.py +55 -0
  178. {cognee-0.2.2.dev0.dist-info → cognee-0.2.3.dist-info}/METADATA +12 -6
  179. {cognee-0.2.2.dev0.dist-info → cognee-0.2.3.dist-info}/RECORD +195 -156
  180. cognee/infrastructure/databases/vector/pinecone/adapter.py +0 -8
  181. cognee/infrastructure/databases/vector/qdrant/QDrantAdapter.py +0 -514
  182. cognee/infrastructure/databases/vector/qdrant/__init__.py +0 -2
  183. cognee/infrastructure/databases/vector/weaviate_db/WeaviateAdapter.py +0 -527
  184. cognee/infrastructure/databases/vector/weaviate_db/__init__.py +0 -1
  185. cognee/modules/data/extraction/extract_categories.py +0 -14
  186. distributed/Dockerfile +0 -34
  187. distributed/app.py +0 -4
  188. distributed/entrypoint.py +0 -71
  189. distributed/entrypoint.sh +0 -5
  190. distributed/modal_image.py +0 -11
  191. distributed/queues.py +0 -5
  192. distributed/tasks/queued_add_data_points.py +0 -13
  193. distributed/tasks/queued_add_edges.py +0 -13
  194. distributed/tasks/queued_add_nodes.py +0 -13
  195. distributed/test.py +0 -28
  196. distributed/utils.py +0 -19
  197. distributed/workers/data_point_saving_worker.py +0 -93
  198. distributed/workers/graph_saving_worker.py +0 -104
  199. /cognee/infrastructure/databases/{graph/memgraph → hybrid/neptune_analytics}/__init__.py +0 -0
  200. /cognee/infrastructure/{llm → databases/vector/embeddings}/embedding_rate_limiter.py +0 -0
  201. /cognee/infrastructure/{databases/vector/pinecone → llm/structured_output_framework}/__init__.py +0 -0
  202. /cognee/infrastructure/llm/{anthropic → structured_output_framework/baml/baml_src}/__init__.py +0 -0
  203. /cognee/infrastructure/llm/{gemini/__init__.py → structured_output_framework/baml/baml_src/extraction/extract_categories.py} +0 -0
  204. /cognee/infrastructure/llm/{generic_llm_api → structured_output_framework/baml/baml_src/extraction/knowledge_graph}/__init__.py +0 -0
  205. /cognee/infrastructure/llm/{ollama → structured_output_framework/litellm_instructor}/__init__.py +0 -0
  206. /cognee/{modules/data → infrastructure/llm/structured_output_framework/litellm_instructor}/extraction/knowledge_graph/__init__.py +0 -0
  207. /cognee/{modules/data → infrastructure/llm/structured_output_framework/litellm_instructor}/extraction/texts.json +0 -0
  208. /cognee/infrastructure/llm/{openai → structured_output_framework/litellm_instructor/llm}/__init__.py +0 -0
  209. {distributed → cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic}/__init__.py +0 -0
  210. {distributed/tasks → cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini}/__init__.py +0 -0
  211. /cognee/modules/data/{extraction/knowledge_graph → methods}/add_model_class_to_graph.py +0 -0
  212. {cognee-0.2.2.dev0.dist-info → cognee-0.2.3.dist-info}/WHEEL +0 -0
  213. {cognee-0.2.2.dev0.dist-info → cognee-0.2.3.dist-info}/licenses/LICENSE +0 -0
  214. {cognee-0.2.2.dev0.dist-info → cognee-0.2.3.dist-info}/licenses/NOTICE.md +0 -0
@@ -0,0 +1,156 @@
1
+ import filetype
2
+ from typing import Dict, List, Optional, Any
3
+ from .LoaderInterface import LoaderInterface
4
+ from cognee.shared.logging_utils import get_logger
5
+
6
+ logger = get_logger(__name__)
7
+
8
+
9
class LoaderEngine:
    """
    Main loader engine for managing file loaders.

    Follows cognee's adapter pattern similar to database engines,
    providing a centralized system for file loading operations.
    """

    def __init__(self):
        """
        Initialize the loader engine with empty registries.

        Takes no arguments; the default loader priority is the fixed list
        assigned to ``self.default_loader_priority`` below.
        """
        # Registered loaders keyed by their ``loader_name``.
        self._loaders: Dict[str, LoaderInterface] = {}
        # Reverse indexes: lower-cased extension / MIME type -> loaders declaring it.
        self._extension_map: Dict[str, List[LoaderInterface]] = {}
        self._mime_type_map: Dict[str, List[LoaderInterface]] = {}

        # Order in which loaders are tried when no preferred loader matches.
        self.default_loader_priority = [
            "text_loader",
            "pypdf_loader",
            "image_loader",
            "audio_loader",
            "unstructured_loader",
        ]

    def register_loader(self, loader: LoaderInterface) -> bool:
        """
        Register a loader with the engine.

        A loader registered under an already-used ``loader_name`` replaces the
        previous entry in the name registry.

        Args:
            loader: LoaderInterface implementation to register

        Returns:
            True once the loader has been registered (this implementation
            never returns False).
        """
        self._loaders[loader.loader_name] = loader

        # Map extensions to loaders (extensions are indexed lower-cased).
        for ext in loader.supported_extensions:
            self._extension_map.setdefault(ext.lower(), []).append(loader)

        # Map mime types to loaders.
        for mime_type in loader.supported_mime_types:
            self._mime_type_map.setdefault(mime_type, []).append(loader)

        logger.info(f"Registered loader: {loader.loader_name}")
        return True

    def get_loader(
        self, file_path: str, preferred_loaders: Optional[List[str]] = None
    ) -> Optional[LoaderInterface]:
        """
        Get appropriate loader for a file.

        The file type is detected with ``filetype.guess``; the detected
        extension and MIME type are passed to each candidate's ``can_handle``.

        Args:
            file_path: Path to the file to be processed
            preferred_loaders: List of preferred loader names to try first

        Returns:
            LoaderInterface that can handle the file, or None if the file type
            could not be detected or no registered loader accepts it.

        Raises:
            ValueError: If a name in ``preferred_loaders`` is not registered.
        """
        file_info = filetype.guess(file_path)
        if file_info is None:
            # filetype.guess yields None for unrecognised content; without a
            # detected type there is nothing to match loaders against.
            logger.debug(f"Could not determine file type for: {file_path}")
            return None

        extension = file_info.extension
        mime_type = file_info.mime

        # Try preferred loaders first. These names come from the caller, so an
        # unknown name is reported as an error rather than silently skipped.
        for loader_name in preferred_loaders or []:
            loader = self._loaders.get(loader_name)
            if loader is None:
                raise ValueError(f"Loader does not exist: {loader_name}")
            if loader.can_handle(extension=extension, mime_type=mime_type):
                return loader

        # Try default priority order. Entries here may legitimately be missing
        # (e.g. optional loaders whose dependencies are not installed), so an
        # unregistered name is skipped instead of aborting the lookup.
        for loader_name in self.default_loader_priority:
            loader = self._loaders.get(loader_name)
            if loader is None:
                continue
            if loader.can_handle(extension=extension, mime_type=mime_type):
                return loader

        return None

    async def load_file(
        self,
        file_path: str,
        file_stream: Optional[Any],
        preferred_loaders: Optional[List[str]] = None,
        **kwargs,
    ):
        """
        Load file using appropriate loader.

        Args:
            file_path: Path to the file to be processed
            file_stream: Accepted for interface compatibility; it is currently
                not forwarded to the loader (see TODO below)
            preferred_loaders: List of preferred loader names to try first
            **kwargs: Additional loader-specific configuration

        Returns:
            Whatever the selected loader's ``load`` coroutine returns.

        Raises:
            ValueError: If no suitable loader is found
            Exception: If file processing fails
        """
        loader = self.get_loader(file_path, preferred_loaders)
        if not loader:
            raise ValueError(f"No loader found for file: {file_path}")

        logger.debug(f"Loading {file_path} with {loader.loader_name}")
        # TODO: loading needs to be reworked to work with both file streams and file locations
        return await loader.load(file_path, **kwargs)

    def get_available_loaders(self) -> List[str]:
        """
        Get list of available loader names.

        Returns:
            List of registered loader names
        """
        return list(self._loaders)

    def get_loader_info(self, loader_name: str) -> Dict[str, Any]:
        """
        Get information about a specific loader.

        Args:
            loader_name: Name of the loader to inspect

        Returns:
            Dictionary with the loader's "name", "extensions" and "mime_types",
            or an empty dictionary if no such loader is registered.
        """
        loader = self._loaders.get(loader_name)
        if loader is None:
            return {}

        return {
            "name": loader.loader_name,
            "extensions": loader.supported_extensions,
            "mime_types": loader.supported_mime_types,
        }
@@ -0,0 +1,73 @@
1
+ from abc import ABC, abstractmethod
2
+ from typing import List, Optional, Any
3
+
4
+
5
class LoaderInterface(ABC):
    """
    Base interface for all file loaders in cognee.

    This interface follows cognee's established pattern for database adapters,
    ensuring consistent behavior across all loader implementations.
    """

    @property
    @abstractmethod
    def supported_extensions(self) -> List[str]:
        """
        List of file extensions this loader supports.

        Returns:
            List of extensions WITHOUT a leading dot (e.g., ['txt', 'md']).
            The bundled loaders declare dot-less extensions and compare them
            directly against the extension handed to ``can_handle``.
        """
        pass

    @property
    @abstractmethod
    def supported_mime_types(self) -> List[str]:
        """
        List of MIME types this loader supports.

        Returns:
            List of MIME type strings (e.g., ['text/plain', 'application/pdf'])
        """
        pass

    @property
    @abstractmethod
    def loader_name(self) -> str:
        """
        Unique name identifier for this loader.

        Returns:
            String identifier used for registration and configuration
        """
        pass

    @abstractmethod
    def can_handle(self, extension: str, mime_type: str) -> bool:
        """
        Check if this loader can handle the given file.

        Args:
            extension: File extension (without a leading dot)
            mime_type: MIME type of the file

        Returns:
            True if this loader can process the file, False otherwise
        """
        pass

    @abstractmethod
    async def load(self, file_path: str, file_stream: Optional[Any] = None, **kwargs):
        """
        Load and process the file, returning standardized result.

        Args:
            file_path: Path to the file to be processed
            file_stream: If file stream is provided it will be used to process file instead
            **kwargs: Additional loader-specific configuration

        Returns:
            Implementation-defined. NOTE(review): the core loaders return the
            value produced by the file storage's ``store`` call (named
            ``full_file_path`` there) -- confirm before relying on it.

        Raises:
            Exception: If file cannot be processed
        """
        pass
@@ -0,0 +1,18 @@
1
+ """
2
+ File loader infrastructure for cognee.
3
+
4
+ This package provides a plugin-based system for loading different file formats
5
+ into cognee, following the same patterns as database adapters.
6
+
7
+ Main exports:
8
+ - get_loader_engine(): Factory function to get configured loader engine
9
+ - use_loader(): Register custom loaders at runtime
10
+ - LoaderInterface: Base interface for implementing loaders
11
+ - LoaderResult, ContentType: Data models for loader results (NOTE: not currently imported here or listed in __all__)
12
+ """
13
+
14
+ from .get_loader_engine import get_loader_engine
15
+ from .use_loader import use_loader
16
+ from .LoaderInterface import LoaderInterface
17
+
18
+ __all__ = ["get_loader_engine", "use_loader", "LoaderInterface"]
@@ -0,0 +1,7 @@
1
+ """Core loader implementations that are always available."""
2
+
3
+ from .text_loader import TextLoader
4
+ from .audio_loader import AudioLoader
5
+ from .image_loader import ImageLoader
6
+
7
+ __all__ = ["TextLoader", "AudioLoader", "ImageLoader"]
@@ -0,0 +1,98 @@
1
+ import os
2
+ from typing import List
3
+ from cognee.infrastructure.loaders.LoaderInterface import LoaderInterface
4
+ from cognee.infrastructure.llm.LLMGateway import LLMGateway
5
+ from cognee.infrastructure.files.storage import get_file_storage, get_storage_config
6
+ from cognee.infrastructure.files.utils.get_file_metadata import get_file_metadata
7
+
8
+
9
class AudioLoader(LoaderInterface):
    """
    Core audio file loader.

    Transcribes an audio file through ``LLMGateway.create_transcript`` and
    stores the resulting transcript text via the configured file storage.
    """

    @property
    def supported_extensions(self) -> List[str]:
        """Supported audio file extensions (without a leading dot)."""
        return [
            "aac",  # Audio documents
            "mid",
            "mp3",
            "m4a",
            "ogg",
            "flac",
            "wav",
            "amr",
            "aiff",
        ]

    @property
    def supported_mime_types(self) -> List[str]:
        """Supported MIME types for audio content."""
        return [
            "audio/aac",
            "audio/midi",
            "audio/mpeg",
            "audio/mp4",
            "audio/ogg",
            "audio/flac",
            "audio/wav",
            "audio/amr",
            "audio/aiff",
        ]

    @property
    def loader_name(self) -> str:
        """Unique identifier for this loader."""
        return "audio_loader"

    def can_handle(self, extension: str, mime_type: str) -> bool:
        """
        Check if this loader can handle the given file.

        Both the extension and the MIME type must be in the supported lists.

        Args:
            extension: File extension (without a leading dot)
            mime_type: MIME type of the file

        Returns:
            True if file can be handled, False otherwise
        """
        return extension in self.supported_extensions and mime_type in self.supported_mime_types

    async def load(self, file_path: str, **kwargs):
        """
        Transcribe the audio file and store the transcript as a text file.

        Args:
            file_path: Path to the file to load
            **kwargs: Additional configuration (unused)

        Returns:
            The value returned by the storage's ``store`` call for the
            transcript file (presumably its full path -- confirm against the
            storage implementation).

        Raises:
            FileNotFoundError: If file doesn't exist
            OSError: If file cannot be read
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        with open(file_path, "rb") as f:
            file_metadata = await get_file_metadata(f)
            # Name ingested file of current loader based on original file content hash
            storage_file_name = "text_" + file_metadata["content_hash"] + ".txt"

        result = await LLMGateway.create_transcript(file_path)

        storage_config = get_storage_config()
        data_root_directory = storage_config["data_root_directory"]
        storage = get_file_storage(data_root_directory)

        # Only the transcript text is persisted, not the audio itself.
        full_file_path = await storage.store(storage_file_name, result.text)

        return full_file_path
@@ -0,0 +1,114 @@
1
+ import os
2
+ from typing import List
3
+ from cognee.infrastructure.loaders.LoaderInterface import LoaderInterface
4
+ from cognee.infrastructure.llm.LLMGateway import LLMGateway
5
+ from cognee.infrastructure.files.storage import get_file_storage, get_storage_config
6
+ from cognee.infrastructure.files.utils.get_file_metadata import get_file_metadata
7
+
8
+
9
class ImageLoader(LoaderInterface):
    """
    Core image file loader that handles basic image file formats.

    Describes an image through ``LLMGateway.transcribe_image`` and stores the
    resulting text via the configured file storage.
    """

    @property
    def supported_extensions(self) -> List[str]:
        """Supported image file extensions (without a leading dot)."""
        return [
            "png",
            "dwg",
            "xcf",
            "jpg",
            # Written without a leading dot like every other entry; the dotted
            # forms could never equal a dot-less detected extension.
            "jpe",
            "jpeg",
            "jpx",
            "apng",
            "gif",
            "webp",
            "cr2",
            "tif",
            "tiff",
            "bmp",
            "jxr",
            "psd",
            "ico",
            "heic",
            "avif",
        ]

    @property
    def supported_mime_types(self) -> List[str]:
        """Supported MIME types for image content."""
        return [
            "image/png",
            "image/vnd.dwg",
            "image/x-xcf",
            "image/jpeg",
            "image/jpx",
            "image/apng",
            "image/gif",
            "image/webp",
            "image/x-canon-cr2",
            "image/tiff",
            "image/bmp",
            "image/jxr",
            "image/vnd.adobe.photoshop",
            "image/vnd.microsoft.icon",
            "image/heic",
            "image/avif",
        ]

    @property
    def loader_name(self) -> str:
        """Unique identifier for this loader."""
        return "image_loader"

    def can_handle(self, extension: str, mime_type: str) -> bool:
        """
        Check if this loader can handle the given file.

        Both the extension and the MIME type must be in the supported lists.

        Args:
            extension: File extension (without a leading dot)
            mime_type: MIME type of the file

        Returns:
            True if file can be handled, False otherwise
        """
        return extension in self.supported_extensions and mime_type in self.supported_mime_types

    async def load(self, file_path: str, **kwargs):
        """
        Transcribe the image file and store the resulting text file.

        Args:
            file_path: Path to the file to load
            **kwargs: Additional configuration (unused)

        Returns:
            The value returned by the storage's ``store`` call for the text
            file (presumably its full path -- confirm against the storage
            implementation).

        Raises:
            FileNotFoundError: If file doesn't exist
            OSError: If file cannot be read
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        with open(file_path, "rb") as f:
            file_metadata = await get_file_metadata(f)
            # Name ingested file of current loader based on original file content hash
            storage_file_name = "text_" + file_metadata["content_hash"] + ".txt"

        result = await LLMGateway.transcribe_image(file_path)

        storage_config = get_storage_config()
        data_root_directory = storage_config["data_root_directory"]
        storage = get_file_storage(data_root_directory)

        # The stored text is the content of the first choice's message in the response.
        full_file_path = await storage.store(storage_file_name, result.choices[0].message.content)

        return full_file_path
@@ -0,0 +1,90 @@
1
+ import os
2
+ from typing import List
3
+ from cognee.infrastructure.loaders.LoaderInterface import LoaderInterface
4
+ from cognee.infrastructure.files.storage import get_file_storage, get_storage_config
5
+ from cognee.infrastructure.files.utils.get_file_metadata import get_file_metadata
6
+
7
+
8
class TextLoader(LoaderInterface):
    """
    Loader for plain-text style files.

    Reads the file as text and writes that text to the configured file
    storage under a name derived from the original file's content hash.
    """

    @property
    def supported_extensions(self) -> List[str]:
        """Extensions (no leading dot) accepted by this loader."""
        return ["txt", "md", "csv", "json", "xml", "yaml", "yml", "log"]

    @property
    def supported_mime_types(self) -> List[str]:
        """MIME types accepted by this loader."""
        return [
            "text/plain",
            "text/markdown",
            "text/csv",
            "application/json",
            "text/xml",
            "application/xml",
            "text/yaml",
            "application/yaml",
        ]

    @property
    def loader_name(self) -> str:
        """Name under which this loader is registered."""
        return "text_loader"

    def can_handle(self, extension: str, mime_type: str) -> bool:
        """
        Report whether both the extension and the MIME type are supported.

        Args:
            extension: File extension
            mime_type: MIME type of the file

        Returns:
            True only when the extension and the MIME type are both listed
            as supported; False otherwise.
        """
        extension_ok = extension in self.supported_extensions
        return extension_ok and mime_type in self.supported_mime_types

    async def load(self, file_path: str, encoding: str = "utf-8", **kwargs):
        """
        Read the text file and store its content through the file storage.

        Args:
            file_path: Path to the file to load
            encoding: Text encoding used when reading (default: utf-8)
            **kwargs: Additional configuration (unused)

        Returns:
            The value returned by the storage's ``store`` call.

        Raises:
            FileNotFoundError: If file doesn't exist
            UnicodeDecodeError: If file cannot be decoded with specified encoding
            OSError: If file cannot be read
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        # First pass (binary): obtain metadata, including the content hash.
        with open(file_path, "rb") as binary_file:
            metadata = await get_file_metadata(binary_file)

        # The stored file is named after the hash of the original content.
        target_name = "text_" + metadata["content_hash"] + ".txt"

        # Second pass (text): read the content with the requested encoding.
        with open(file_path, "r", encoding=encoding) as text_file:
            text = text_file.read()

        data_root = get_storage_config()["data_root_directory"]
        file_storage = get_file_storage(data_root)

        return await file_storage.store(target_name, text)
@@ -0,0 +1,32 @@
1
+ from .LoaderEngine import LoaderEngine
2
+ from .supported_loaders import supported_loaders
3
+ from cognee.shared.logging_utils import get_logger
4
+
5
+ logger = get_logger(__name__)
6
+
7
+
8
def create_loader_engine() -> LoaderEngine:
    """
    Build a LoaderEngine and register every loader in ``supported_loaders``.

    Follows cognee's pattern for engine creation functions used
    in database adapters. Takes no arguments.

    Returns:
        LoaderEngine with each successfully instantiated loader registered.
    """
    loader_engine = LoaderEngine()

    # Instantiate and register each loader class from the registry.
    for name, loader_cls in supported_loaders.items():
        try:
            loader_engine.register_loader(loader_cls())
        except Exception as error:
            # Best effort: a loader that fails to build is logged and skipped
            # so the remaining loaders are still registered.
            logger.warning(f"Failed to register loader {name}: {error}")

    return loader_engine
@@ -0,0 +1,22 @@
1
+ """
2
+ External loader implementations for cognee.
3
+
4
+ This module contains loaders that depend on external libraries:
5
+ - pypdf_loader: PDF processing using pypdf
6
+ - unstructured_loader: Document processing using unstructured
7
+ - dlt_loader: Data lake/warehouse integration using DLT (NOTE: not imported by this module)
8
+
9
+ PyPdfLoader is imported unconditionally; UnstructuredLoader is optional and only exported if its import succeeds.
10
+ """
11
+
12
+ from .pypdf_loader import PyPdfLoader
13
+
14
+ __all__ = ["PyPdfLoader"]
15
+
16
+ # Conditional imports based on dependency availability
17
+ try:
18
+ from .unstructured_loader import UnstructuredLoader
19
+
20
+ __all__.append("UnstructuredLoader")
21
+ except ImportError:
22
+ pass