cognee 0.2.1.dev7__py3-none-any.whl → 0.2.2.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cognee/api/client.py +44 -4
- cognee/api/health.py +332 -0
- cognee/api/v1/add/add.py +5 -2
- cognee/api/v1/add/routers/get_add_router.py +3 -0
- cognee/api/v1/cognify/code_graph_pipeline.py +3 -1
- cognee/api/v1/cognify/cognify.py +8 -0
- cognee/api/v1/cognify/routers/get_cognify_router.py +8 -1
- cognee/api/v1/config/config.py +3 -1
- cognee/api/v1/datasets/routers/get_datasets_router.py +2 -8
- cognee/api/v1/delete/delete.py +16 -12
- cognee/api/v1/responses/routers/get_responses_router.py +3 -1
- cognee/api/v1/search/search.py +10 -0
- cognee/api/v1/settings/routers/get_settings_router.py +0 -2
- cognee/base_config.py +1 -0
- cognee/eval_framework/evaluation/direct_llm_eval_adapter.py +5 -6
- cognee/infrastructure/databases/graph/config.py +2 -0
- cognee/infrastructure/databases/graph/get_graph_engine.py +58 -12
- cognee/infrastructure/databases/graph/graph_db_interface.py +15 -10
- cognee/infrastructure/databases/graph/kuzu/adapter.py +43 -16
- cognee/infrastructure/databases/graph/kuzu/kuzu_migrate.py +281 -0
- cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +151 -77
- cognee/infrastructure/databases/graph/neptune_driver/__init__.py +15 -0
- cognee/infrastructure/databases/graph/neptune_driver/adapter.py +1427 -0
- cognee/infrastructure/databases/graph/neptune_driver/exceptions.py +115 -0
- cognee/infrastructure/databases/graph/neptune_driver/neptune_utils.py +224 -0
- cognee/infrastructure/databases/graph/networkx/adapter.py +3 -3
- cognee/infrastructure/databases/hybrid/neptune_analytics/NeptuneAnalyticsAdapter.py +449 -0
- cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +11 -3
- cognee/infrastructure/databases/vector/chromadb/ChromaDBAdapter.py +8 -3
- cognee/infrastructure/databases/vector/create_vector_engine.py +31 -23
- cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +3 -1
- cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +21 -6
- cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +4 -3
- cognee/infrastructure/databases/vector/embeddings/get_embedding_engine.py +3 -1
- cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +22 -16
- cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py +36 -34
- cognee/infrastructure/databases/vector/vector_db_interface.py +78 -7
- cognee/infrastructure/files/utils/get_data_file_path.py +39 -0
- cognee/infrastructure/files/utils/guess_file_type.py +2 -2
- cognee/infrastructure/files/utils/open_data_file.py +4 -23
- cognee/infrastructure/llm/LLMGateway.py +137 -0
- cognee/infrastructure/llm/__init__.py +14 -4
- cognee/infrastructure/llm/config.py +29 -1
- cognee/infrastructure/llm/prompts/answer_hotpot_question.txt +1 -1
- cognee/infrastructure/llm/prompts/answer_hotpot_using_cognee_search.txt +1 -1
- cognee/infrastructure/llm/prompts/answer_simple_question.txt +1 -1
- cognee/infrastructure/llm/prompts/answer_simple_question_restricted.txt +1 -1
- cognee/infrastructure/llm/prompts/categorize_categories.txt +1 -1
- cognee/infrastructure/llm/prompts/classify_content.txt +1 -1
- cognee/infrastructure/llm/prompts/context_for_question.txt +1 -1
- cognee/infrastructure/llm/prompts/graph_context_for_question.txt +1 -1
- cognee/infrastructure/llm/prompts/natural_language_retriever_system.txt +1 -1
- cognee/infrastructure/llm/prompts/patch_gen_instructions.txt +1 -1
- cognee/infrastructure/llm/prompts/search_type_selector_prompt.txt +130 -0
- cognee/infrastructure/llm/prompts/summarize_code.txt +2 -2
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/__init__.py +57 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/async_client.py +533 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/config.py +94 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/globals.py +37 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/inlinedbaml.py +21 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/parser.py +131 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/runtime.py +266 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/stream_types.py +137 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/sync_client.py +550 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/tracing.py +26 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/type_builder.py +962 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/type_map.py +52 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/types.py +166 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extract_categories.baml +109 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extract_content_graph.baml +343 -0
- cognee/{modules/data → infrastructure/llm/structured_output_framework/baml/baml_src}/extraction/__init__.py +1 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/extract_summary.py +89 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/knowledge_graph/extract_content_graph.py +33 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/generators.baml +18 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/__init__.py +3 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/extract_categories.py +12 -0
- cognee/{modules/data → infrastructure/llm/structured_output_framework/litellm_instructor}/extraction/extract_summary.py +16 -7
- cognee/{modules/data → infrastructure/llm/structured_output_framework/litellm_instructor}/extraction/knowledge_graph/extract_content_graph.py +7 -6
- cognee/infrastructure/llm/{anthropic → structured_output_framework/litellm_instructor/llm/anthropic}/adapter.py +10 -4
- cognee/infrastructure/llm/{gemini → structured_output_framework/litellm_instructor/llm/gemini}/adapter.py +6 -5
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/__init__.py +0 -0
- cognee/infrastructure/llm/{generic_llm_api → structured_output_framework/litellm_instructor/llm/generic_llm_api}/adapter.py +7 -3
- cognee/infrastructure/llm/{get_llm_client.py → structured_output_framework/litellm_instructor/llm/get_llm_client.py} +18 -6
- cognee/infrastructure/llm/{llm_interface.py → structured_output_framework/litellm_instructor/llm/llm_interface.py} +2 -2
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/__init__.py +0 -0
- cognee/infrastructure/llm/{ollama → structured_output_framework/litellm_instructor/llm/ollama}/adapter.py +4 -2
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/__init__.py +0 -0
- cognee/infrastructure/llm/{openai → structured_output_framework/litellm_instructor/llm/openai}/adapter.py +6 -4
- cognee/infrastructure/llm/{rate_limiter.py → structured_output_framework/litellm_instructor/llm/rate_limiter.py} +0 -5
- cognee/infrastructure/llm/tokenizer/Gemini/adapter.py +4 -2
- cognee/infrastructure/llm/tokenizer/TikToken/adapter.py +7 -3
- cognee/infrastructure/llm/tokenizer/__init__.py +4 -0
- cognee/infrastructure/llm/utils.py +3 -1
- cognee/infrastructure/loaders/LoaderEngine.py +156 -0
- cognee/infrastructure/loaders/LoaderInterface.py +73 -0
- cognee/infrastructure/loaders/__init__.py +18 -0
- cognee/infrastructure/loaders/core/__init__.py +7 -0
- cognee/infrastructure/loaders/core/audio_loader.py +98 -0
- cognee/infrastructure/loaders/core/image_loader.py +114 -0
- cognee/infrastructure/loaders/core/text_loader.py +90 -0
- cognee/infrastructure/loaders/create_loader_engine.py +32 -0
- cognee/infrastructure/loaders/external/__init__.py +22 -0
- cognee/infrastructure/loaders/external/pypdf_loader.py +96 -0
- cognee/infrastructure/loaders/external/unstructured_loader.py +127 -0
- cognee/infrastructure/loaders/get_loader_engine.py +18 -0
- cognee/infrastructure/loaders/supported_loaders.py +18 -0
- cognee/infrastructure/loaders/use_loader.py +21 -0
- cognee/infrastructure/loaders/utils/__init__.py +0 -0
- cognee/modules/data/methods/__init__.py +1 -0
- cognee/modules/data/methods/get_authorized_dataset.py +23 -0
- cognee/modules/data/models/Data.py +13 -3
- cognee/modules/data/processing/document_types/AudioDocument.py +2 -2
- cognee/modules/data/processing/document_types/ImageDocument.py +2 -2
- cognee/modules/data/processing/document_types/PdfDocument.py +4 -11
- cognee/modules/data/processing/document_types/UnstructuredDocument.py +2 -5
- cognee/modules/engine/utils/generate_edge_id.py +5 -0
- cognee/modules/graph/cognee_graph/CogneeGraph.py +45 -35
- cognee/modules/graph/methods/get_formatted_graph_data.py +8 -2
- cognee/modules/graph/utils/get_graph_from_model.py +93 -101
- cognee/modules/ingestion/data_types/TextData.py +8 -2
- cognee/modules/ingestion/save_data_to_file.py +1 -1
- cognee/modules/pipelines/exceptions/__init__.py +1 -0
- cognee/modules/pipelines/exceptions/exceptions.py +12 -0
- cognee/modules/pipelines/models/DataItemStatus.py +5 -0
- cognee/modules/pipelines/models/PipelineRunInfo.py +6 -0
- cognee/modules/pipelines/models/__init__.py +1 -0
- cognee/modules/pipelines/operations/pipeline.py +10 -2
- cognee/modules/pipelines/operations/run_tasks.py +252 -20
- cognee/modules/pipelines/operations/run_tasks_distributed.py +1 -1
- cognee/modules/retrieval/chunks_retriever.py +23 -1
- cognee/modules/retrieval/code_retriever.py +66 -9
- cognee/modules/retrieval/completion_retriever.py +11 -9
- cognee/modules/retrieval/context_providers/TripletSearchContextProvider.py +0 -2
- cognee/modules/retrieval/graph_completion_context_extension_retriever.py +0 -2
- cognee/modules/retrieval/graph_completion_cot_retriever.py +8 -9
- cognee/modules/retrieval/graph_completion_retriever.py +1 -1
- cognee/modules/retrieval/insights_retriever.py +4 -0
- cognee/modules/retrieval/natural_language_retriever.py +9 -15
- cognee/modules/retrieval/summaries_retriever.py +23 -1
- cognee/modules/retrieval/utils/brute_force_triplet_search.py +23 -4
- cognee/modules/retrieval/utils/completion.py +6 -9
- cognee/modules/retrieval/utils/description_to_codepart_search.py +2 -3
- cognee/modules/search/methods/search.py +5 -1
- cognee/modules/search/operations/__init__.py +1 -0
- cognee/modules/search/operations/select_search_type.py +42 -0
- cognee/modules/search/types/SearchType.py +1 -0
- cognee/modules/settings/get_settings.py +0 -8
- cognee/modules/settings/save_vector_db_config.py +1 -1
- cognee/shared/data_models.py +3 -1
- cognee/shared/logging_utils.py +0 -5
- cognee/tasks/chunk_naive_llm_classifier/chunk_naive_llm_classifier.py +2 -2
- cognee/tasks/documents/extract_chunks_from_documents.py +10 -12
- cognee/tasks/entity_completion/entity_extractors/llm_entity_extractor.py +4 -6
- cognee/tasks/graph/cascade_extract/utils/extract_content_nodes_and_relationship_names.py +4 -6
- cognee/tasks/graph/cascade_extract/utils/extract_edge_triplets.py +6 -7
- cognee/tasks/graph/cascade_extract/utils/extract_nodes.py +4 -7
- cognee/tasks/graph/extract_graph_from_code.py +3 -2
- cognee/tasks/graph/extract_graph_from_data.py +4 -3
- cognee/tasks/graph/infer_data_ontology.py +5 -6
- cognee/tasks/ingestion/data_item_to_text_file.py +79 -0
- cognee/tasks/ingestion/ingest_data.py +91 -61
- cognee/tasks/ingestion/resolve_data_directories.py +3 -0
- cognee/tasks/repo_processor/get_repo_file_dependencies.py +3 -0
- cognee/tasks/storage/index_data_points.py +1 -1
- cognee/tasks/storage/index_graph_edges.py +4 -1
- cognee/tasks/summarization/summarize_code.py +2 -3
- cognee/tasks/summarization/summarize_text.py +3 -2
- cognee/tests/test_cognee_server_start.py +12 -7
- cognee/tests/test_deduplication.py +2 -2
- cognee/tests/test_deletion.py +58 -17
- cognee/tests/test_graph_visualization_permissions.py +161 -0
- cognee/tests/test_neptune_analytics_graph.py +309 -0
- cognee/tests/test_neptune_analytics_hybrid.py +176 -0
- cognee/tests/{test_weaviate.py → test_neptune_analytics_vector.py} +86 -11
- cognee/tests/test_pgvector.py +5 -5
- cognee/tests/test_s3.py +1 -6
- cognee/tests/unit/infrastructure/databases/test_rate_limiter.py +11 -10
- cognee/tests/unit/infrastructure/databases/vector/__init__.py +0 -0
- cognee/tests/unit/infrastructure/mock_embedding_engine.py +1 -1
- cognee/tests/unit/infrastructure/test_embedding_rate_limiting_realistic.py +5 -5
- cognee/tests/unit/infrastructure/test_rate_limiting_realistic.py +6 -4
- cognee/tests/unit/infrastructure/test_rate_limiting_retry.py +1 -1
- cognee/tests/unit/interfaces/graph/get_graph_from_model_unit_test.py +61 -3
- cognee/tests/unit/modules/retrieval/graph_completion_retriever_test.py +84 -9
- cognee/tests/unit/modules/search/search_methods_test.py +55 -0
- {cognee-0.2.1.dev7.dist-info → cognee-0.2.2.dev1.dist-info}/METADATA +13 -9
- {cognee-0.2.1.dev7.dist-info → cognee-0.2.2.dev1.dist-info}/RECORD +203 -164
- cognee/infrastructure/databases/vector/pinecone/adapter.py +0 -8
- cognee/infrastructure/databases/vector/qdrant/QDrantAdapter.py +0 -514
- cognee/infrastructure/databases/vector/qdrant/__init__.py +0 -2
- cognee/infrastructure/databases/vector/weaviate_db/WeaviateAdapter.py +0 -527
- cognee/infrastructure/databases/vector/weaviate_db/__init__.py +0 -1
- cognee/modules/data/extraction/extract_categories.py +0 -14
- cognee/tests/test_qdrant.py +0 -99
- distributed/Dockerfile +0 -34
- distributed/app.py +0 -4
- distributed/entrypoint.py +0 -71
- distributed/entrypoint.sh +0 -5
- distributed/modal_image.py +0 -11
- distributed/queues.py +0 -5
- distributed/tasks/queued_add_data_points.py +0 -13
- distributed/tasks/queued_add_edges.py +0 -13
- distributed/tasks/queued_add_nodes.py +0 -13
- distributed/test.py +0 -28
- distributed/utils.py +0 -19
- distributed/workers/data_point_saving_worker.py +0 -93
- distributed/workers/graph_saving_worker.py +0 -104
- /cognee/infrastructure/databases/{graph/memgraph → hybrid/neptune_analytics}/__init__.py +0 -0
- /cognee/infrastructure/{llm → databases/vector/embeddings}/embedding_rate_limiter.py +0 -0
- /cognee/infrastructure/{databases/vector/pinecone → llm/structured_output_framework}/__init__.py +0 -0
- /cognee/infrastructure/llm/{anthropic → structured_output_framework/baml/baml_src}/__init__.py +0 -0
- /cognee/infrastructure/llm/{gemini/__init__.py → structured_output_framework/baml/baml_src/extraction/extract_categories.py} +0 -0
- /cognee/infrastructure/llm/{generic_llm_api → structured_output_framework/baml/baml_src/extraction/knowledge_graph}/__init__.py +0 -0
- /cognee/infrastructure/llm/{ollama → structured_output_framework/litellm_instructor}/__init__.py +0 -0
- /cognee/{modules/data → infrastructure/llm/structured_output_framework/litellm_instructor}/extraction/knowledge_graph/__init__.py +0 -0
- /cognee/{modules/data → infrastructure/llm/structured_output_framework/litellm_instructor}/extraction/texts.json +0 -0
- /cognee/infrastructure/llm/{openai → structured_output_framework/litellm_instructor/llm}/__init__.py +0 -0
- {distributed → cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic}/__init__.py +0 -0
- {distributed/tasks → cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini}/__init__.py +0 -0
- /cognee/modules/data/{extraction/knowledge_graph → methods}/add_model_class_to_graph.py +0 -0
- {cognee-0.2.1.dev7.dist-info → cognee-0.2.2.dev1.dist-info}/WHEEL +0 -0
- {cognee-0.2.1.dev7.dist-info → cognee-0.2.2.dev1.dist-info}/licenses/LICENSE +0 -0
- {cognee-0.2.1.dev7.dist-info → cognee-0.2.2.dev1.dist-info}/licenses/NOTICE.md +0 -0
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
import filetype
|
|
2
|
+
from typing import Dict, List, Optional, Any
|
|
3
|
+
from .LoaderInterface import LoaderInterface
|
|
4
|
+
from cognee.shared.logging_utils import get_logger
|
|
5
|
+
|
|
6
|
+
logger = get_logger(__name__)
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class LoaderEngine:
    """
    Main loader engine for managing file loaders.

    Follows cognee's adapter pattern similar to database engines,
    providing a centralized system for file loading operations.
    """

    def __init__(self):
        """
        Initialize the loader engine with empty registries.

        The constructor takes no arguments. ``default_loader_priority`` is a
        plain attribute holding the loader names that ``get_loader`` tries,
        in order, after any caller-supplied preferred loaders.
        """
        # Registered loaders keyed by their unique loader_name.
        self._loaders: Dict[str, LoaderInterface] = {}
        # Reverse indexes from lower-cased extension / MIME type to loaders.
        self._extension_map: Dict[str, List[LoaderInterface]] = {}
        self._mime_type_map: Dict[str, List[LoaderInterface]] = {}

        self.default_loader_priority = [
            "text_loader",
            "pypdf_loader",
            "image_loader",
            "audio_loader",
            "unstructured_loader",
        ]

    def register_loader(self, loader: LoaderInterface) -> bool:
        """
        Register a loader with the engine.

        A loader registered under an already-used name replaces the previous
        entry in the name registry.

        Args:
            loader: LoaderInterface implementation to register

        Returns:
            True once the loader has been registered (this method currently
            has no failure path that returns False).
        """
        self._loaders[loader.loader_name] = loader

        # Map extensions to loaders (extensions are indexed lower-cased).
        for ext in loader.supported_extensions:
            self._extension_map.setdefault(ext.lower(), []).append(loader)

        # Map mime types to loaders
        for mime_type in loader.supported_mime_types:
            self._mime_type_map.setdefault(mime_type, []).append(loader)

        logger.info(f"Registered loader: {loader.loader_name}")
        return True

    def get_loader(
        self, file_path: str, preferred_loaders: Optional[List[str]] = None
    ) -> Optional[LoaderInterface]:
        """
        Get appropriate loader for a file.

        The file type is detected with ``filetype.guess``. Preferred loaders
        are tried first, in the given order, then the names in
        ``default_loader_priority``.

        Args:
            file_path: Path to the file to be processed
            preferred_loaders: List of preferred loader names to try first

        Returns:
            LoaderInterface that can handle the file, or None if the file type
            could not be detected or no registered loader accepts it

        Raises:
            ValueError: If a name in ``preferred_loaders`` is not registered
        """
        # filetype.guess returns None when it cannot identify the file, so the
        # result must be checked before its attributes are read.
        file_info = filetype.guess(file_path)

        # Try preferred loaders first. An unknown name is a caller error and is
        # reported even when the file type itself could not be detected.
        for loader_name in preferred_loaders or []:
            if loader_name not in self._loaders:
                raise ValueError(f"Loader does not exist: {loader_name}")
            if file_info is None:
                continue
            loader = self._loaders[loader_name]
            if loader.can_handle(extension=file_info.extension, mime_type=file_info.mime):
                return loader

        if file_info is None:
            logger.debug(f"Could not determine file type for: {file_path}")
            return None

        # Try default priority order. Names that are not registered are
        # skipped rather than treated as errors, so a missing entry does not
        # prevent later loaders in the list from being tried.
        for loader_name in self.default_loader_priority:
            loader = self._loaders.get(loader_name)
            if loader is None:
                continue
            if loader.can_handle(extension=file_info.extension, mime_type=file_info.mime):
                return loader

        return None

    async def load_file(
        self,
        file_path: str,
        file_stream: Optional[Any],
        preferred_loaders: Optional[List[str]] = None,
        **kwargs,
    ):
        """
        Load file using appropriate loader.

        Args:
            file_path: Path to the file to be processed
            file_stream: Accepted for interface compatibility; it is currently
                not forwarded to the loader (see the TODO below)
            preferred_loaders: List of preferred loader names to try first
            **kwargs: Additional loader-specific configuration

        Returns:
            Whatever the selected loader's ``load`` coroutine returns

        Raises:
            ValueError: If no suitable loader is found
            Exception: If file processing fails
        """
        loader = self.get_loader(file_path, preferred_loaders)
        if not loader:
            raise ValueError(f"No loader found for file: {file_path}")

        logger.debug(f"Loading {file_path} with {loader.loader_name}")
        # TODO: loading needs to be reworked to work with both file streams and file locations
        return await loader.load(file_path, **kwargs)

    def get_available_loaders(self) -> List[str]:
        """
        Get list of available loader names.

        Returns:
            List of registered loader names
        """
        return list(self._loaders)

    def get_loader_info(self, loader_name: str) -> Dict[str, Any]:
        """
        Get information about a specific loader.

        Args:
            loader_name: Name of the loader to inspect

        Returns:
            Dictionary with the loader's "name", "extensions" and "mime_types",
            or an empty dictionary if no loader is registered under that name
        """
        loader = self._loaders.get(loader_name)
        if loader is None:
            return {}

        return {
            "name": loader.loader_name,
            "extensions": loader.supported_extensions,
            "mime_types": loader.supported_mime_types,
        }
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from typing import List, Optional, Any
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class LoaderInterface(ABC):
    """
    Base interface for all file loaders in cognee.

    This interface follows cognee's established pattern for database adapters,
    ensuring consistent behavior across all loader implementations.
    """

    @property
    @abstractmethod
    def supported_extensions(self) -> List[str]:
        """
        List of file extensions this loader supports.

        Returns:
            List of extensions without a leading dot (e.g., ['txt', 'md']).
            The loader engine passes ``can_handle`` the extension reported by
            file-type detection, which carries no dot.
        """

    @property
    @abstractmethod
    def supported_mime_types(self) -> List[str]:
        """
        List of MIME types this loader supports.

        Returns:
            List of MIME type strings (e.g., ['text/plain', 'application/pdf'])
        """

    @property
    @abstractmethod
    def loader_name(self) -> str:
        """
        Unique name identifier for this loader.

        Returns:
            String identifier used for registration and configuration
        """

    @abstractmethod
    def can_handle(self, extension: str, mime_type: str) -> bool:
        """
        Check if this loader can handle the given file.

        Args:
            extension: File extension
            mime_type: MIME type of the file

        Returns:
            True if this loader can process the file, False otherwise
        """

    @abstractmethod
    async def load(self, file_path: str, file_stream: Optional[Any] = None, **kwargs):
        """
        Load and process the file, returning standardized result.

        Args:
            file_path: Path to the file to be processed
            file_stream: If file stream is provided it will be used to process file instead
            **kwargs: Additional loader-specific configuration

        Raises:
            Exception: If file cannot be processed
        """
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""
File loader infrastructure for cognee.

This package provides a plugin-based system for loading different file formats
into cognee, following the same patterns as database adapters.

Main exports:
- get_loader_engine(): Factory function to get configured loader engine
- use_loader(): Register custom loaders at runtime
- LoaderInterface: Base interface for implementing loaders
"""

from .get_loader_engine import get_loader_engine
from .use_loader import use_loader
from .LoaderInterface import LoaderInterface

# Keep this list in sync with the "Main exports" section of the docstring.
__all__ = ["get_loader_engine", "use_loader", "LoaderInterface"]
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from typing import List
|
|
3
|
+
from cognee.infrastructure.loaders.LoaderInterface import LoaderInterface
|
|
4
|
+
from cognee.infrastructure.llm.LLMGateway import LLMGateway
|
|
5
|
+
from cognee.infrastructure.files.storage import get_file_storage, get_storage_config
|
|
6
|
+
from cognee.infrastructure.files.utils.get_file_metadata import get_file_metadata
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class AudioLoader(LoaderInterface):
    """
    Core audio file loader.

    Transcribes an audio file through ``LLMGateway.create_transcript`` and
    stores the transcript text as a ``.txt`` file in cognee's data storage.
    """

    @property
    def supported_extensions(self) -> List[str]:
        """Supported audio file extensions (without a leading dot)."""
        return [
            "aac",  # Audio documents
            "mid",
            "mp3",
            "m4a",
            "ogg",
            "flac",
            "wav",
            "amr",
            "aiff",
        ]

    @property
    def supported_mime_types(self) -> List[str]:
        """Supported MIME types for audio content."""
        return [
            "audio/aac",
            "audio/midi",
            "audio/mpeg",
            "audio/mp4",
            "audio/ogg",
            "audio/flac",
            "audio/wav",
            "audio/amr",
            "audio/aiff",
        ]

    @property
    def loader_name(self) -> str:
        """Unique identifier for this loader."""
        return "audio_loader"

    def can_handle(self, extension: str, mime_type: str) -> bool:
        """
        Check if this loader can handle the given file.

        Both the extension and the MIME type must be in the supported lists.

        Args:
            extension: File extension
            mime_type: MIME type of the file

        Returns:
            True if file can be handled, False otherwise
        """
        return extension in self.supported_extensions and mime_type in self.supported_mime_types

    async def load(self, file_path: str, **kwargs):
        """
        Transcribe the audio file and store the transcript as text.

        Args:
            file_path: Path to the file to load
            **kwargs: Additional configuration (unused)

        Returns:
            The value returned by the file storage's ``store`` call for the
            transcript file (named ``full_file_path`` here; presumably the
            stored file's full path -- confirm against the storage backend)

        Raises:
            FileNotFoundError: If file doesn't exist
            OSError: If file cannot be read
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        with open(file_path, "rb") as f:
            file_metadata = await get_file_metadata(f)

        # Name ingested file of current loader based on original file content hash
        storage_file_name = "text_" + file_metadata["content_hash"] + ".txt"

        result = await LLMGateway.create_transcript(file_path)

        storage_config = get_storage_config()
        data_root_directory = storage_config["data_root_directory"]
        storage = get_file_storage(data_root_directory)

        full_file_path = await storage.store(storage_file_name, result.text)

        return full_file_path
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from typing import List
|
|
3
|
+
from cognee.infrastructure.loaders.LoaderInterface import LoaderInterface
|
|
4
|
+
from cognee.infrastructure.llm.LLMGateway import LLMGateway
|
|
5
|
+
from cognee.infrastructure.files.storage import get_file_storage, get_storage_config
|
|
6
|
+
from cognee.infrastructure.files.utils.get_file_metadata import get_file_metadata
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class ImageLoader(LoaderInterface):
    """
    Core image file loader that handles basic image file formats.

    Describes an image through ``LLMGateway.transcribe_image`` and stores the
    resulting text as a ``.txt`` file in cognee's data storage.
    """

    @property
    def supported_extensions(self) -> List[str]:
        """Supported image file extensions (without a leading dot)."""
        return [
            "png",
            "dwg",
            "xcf",
            "jpg",
            "jpe",
            "jpeg",
            "jpx",
            "apng",
            "gif",
            "webp",
            "cr2",
            "tif",
            "tiff",
            "bmp",
            "jxr",
            "psd",
            "ico",
            "heic",
            "avif",
        ]

    @property
    def supported_mime_types(self) -> List[str]:
        """Supported MIME types for image content."""
        return [
            "image/png",
            "image/vnd.dwg",
            "image/x-xcf",
            "image/jpeg",
            "image/jpx",
            "image/apng",
            "image/gif",
            "image/webp",
            "image/x-canon-cr2",
            "image/tiff",
            "image/bmp",
            "image/jxr",
            "image/vnd.adobe.photoshop",
            "image/vnd.microsoft.icon",
            "image/heic",
            "image/avif",
        ]

    @property
    def loader_name(self) -> str:
        """Unique identifier for this loader."""
        return "image_loader"

    def can_handle(self, extension: str, mime_type: str) -> bool:
        """
        Check if this loader can handle the given file.

        Both the extension and the MIME type must be in the supported lists.
        A single leading dot on the extension is tolerated, so the dotted
        spellings ('.jpe', '.jpeg') that the extension list used to contain
        are still accepted.

        Args:
            extension: File extension, with or without a leading dot
            mime_type: MIME type of the file

        Returns:
            True if file can be handled, False otherwise
        """
        if isinstance(extension, str) and extension.startswith("."):
            extension = extension[1:]

        return extension in self.supported_extensions and mime_type in self.supported_mime_types

    async def load(self, file_path: str, **kwargs):
        """
        Transcribe the image file and store the result as text.

        Args:
            file_path: Path to the file to load
            **kwargs: Additional configuration (unused)

        Returns:
            The value returned by the file storage's ``store`` call for the
            generated text file (named ``full_file_path`` here; presumably the
            stored file's full path -- confirm against the storage backend)

        Raises:
            FileNotFoundError: If file doesn't exist
            OSError: If file cannot be read
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        with open(file_path, "rb") as f:
            file_metadata = await get_file_metadata(f)

        # Name ingested file of current loader based on original file content hash
        storage_file_name = "text_" + file_metadata["content_hash"] + ".txt"

        result = await LLMGateway.transcribe_image(file_path)

        storage_config = get_storage_config()
        data_root_directory = storage_config["data_root_directory"]
        storage = get_file_storage(data_root_directory)

        # The transcription is read from the first choice of the response.
        full_file_path = await storage.store(storage_file_name, result.choices[0].message.content)

        return full_file_path
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from typing import List
|
|
3
|
+
from cognee.infrastructure.loaders.LoaderInterface import LoaderInterface
|
|
4
|
+
from cognee.infrastructure.files.storage import get_file_storage, get_storage_config
|
|
5
|
+
from cognee.infrastructure.files.utils.get_file_metadata import get_file_metadata
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class TextLoader(LoaderInterface):
    """
    Loader for plain, text-based file formats.

    Reads a file as text and saves the content to file storage under a name
    derived from the content hash of the original file. Intended as the
    fallback for text-based files when no specialized loader is available.
    """

    @property
    def supported_extensions(self) -> List[str]:
        """File extensions this loader accepts."""
        return ["txt", "md", "csv", "json", "xml", "yaml", "yml", "log"]

    @property
    def supported_mime_types(self) -> List[str]:
        """MIME types this loader accepts."""
        plain_text = ["text/plain", "text/markdown", "text/csv"]
        structured = [
            "application/json",
            "text/xml",
            "application/xml",
            "text/yaml",
            "application/yaml",
        ]
        return plain_text + structured

    @property
    def loader_name(self) -> str:
        """Identifier of this loader."""
        return "text_loader"

    def can_handle(self, extension: str, mime_type: str) -> bool:
        """
        Report whether this loader accepts a file with the given traits.

        Args:
            extension: File extension, checked against ``supported_extensions``
            mime_type: MIME type, checked against ``supported_mime_types``

        Returns:
            True only when both the extension and the MIME type are supported,
            False otherwise
        """
        # Short-circuit: the MIME type list is consulted only for a known extension.
        extension_ok = extension in self.supported_extensions
        return extension_ok and mime_type in self.supported_mime_types

    async def load(self, file_path: str, encoding: str = "utf-8", **kwargs):
        """
        Read a text file and persist its content in file storage.

        Args:
            file_path: Path to the file to load
            encoding: Text encoding used to decode the file (default: utf-8)
            **kwargs: Additional configuration (unused)

        Returns:
            Whatever ``storage.store`` returns for the saved content
            (the original code binds it to ``full_file_path``)

        Raises:
            FileNotFoundError: If file_path does not exist
            UnicodeDecodeError: If the file cannot be decoded with ``encoding``
            OSError: If the file cannot be read
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        # First pass (binary): the stored file is named after the content hash
        # of the original file.
        with open(file_path, "rb") as raw_file:
            metadata = await get_file_metadata(raw_file)
        target_name = "text_" + metadata["content_hash"] + ".txt"

        # Second pass (text): decode the content with the requested encoding.
        with open(file_path, "r", encoding=encoding) as text_file:
            text = text_file.read()

        root_directory = get_storage_config()["data_root_directory"]
        file_storage = get_file_storage(root_directory)

        return await file_storage.store(target_name, text)
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
from .LoaderEngine import LoaderEngine
|
|
2
|
+
from .supported_loaders import supported_loaders
|
|
3
|
+
from cognee.shared.logging_utils import get_logger
|
|
4
|
+
|
|
5
|
+
logger = get_logger(__name__)
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def create_loader_engine() -> LoaderEngine:
    """
    Build a LoaderEngine populated with the loaders in ``supported_loaders``.

    Each registered loader class is instantiated without arguments and passed
    to ``LoaderEngine.register_loader``. A loader that fails to instantiate or
    register is logged as a warning and skipped, so the remaining loaders are
    still registered.

    Returns:
        LoaderEngine instance holding the loaders that registered successfully
    """
    loader_engine = LoaderEngine()

    for loader_name, loader_class in supported_loaders.items():
        try:
            loader_engine.register_loader(loader_class())
        except Exception as e:
            # Best effort: one broken loader must not block the others.
            logger.warning(f"Failed to register loader {loader_name}: {e}")

    return loader_engine
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"""
External loader implementations for cognee.

Loaders in this package rely on third-party libraries:
- pypdf_loader: PDF processing using pypdf (imported unconditionally below)
- unstructured_loader: Document processing using unstructured (exported only
  when its import succeeds)
- dlt_loader: Data lake/warehouse integration using DLT (not imported by this
  module; NOTE(review): confirm whether it should be exposed here)
"""

from .pypdf_loader import PyPdfLoader

__all__ = ["PyPdfLoader"]

# UnstructuredLoader is optional: export it only when its import succeeds.
try:
    from .unstructured_loader import UnstructuredLoader
except ImportError:
    pass
else:
    __all__.append("UnstructuredLoader")
|