cognee 0.3.7.dev2__py3-none-any.whl → 0.3.8__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- cognee/base_config.py +7 -0
- cognee/infrastructure/databases/vector/create_vector_engine.py +10 -4
- cognee/infrastructure/files/utils/guess_file_type.py +0 -83
- cognee/infrastructure/llm/prompts/extract_query_time.txt +13 -15
- cognee/infrastructure/loaders/LoaderEngine.py +2 -1
- cognee/modules/ontology/get_default_ontology_resolver.py +8 -2
- cognee/modules/ontology/rdf_xml/RDFLibOntologyResolver.py +35 -7
- cognee/modules/retrieval/temporal_retriever.py +6 -4
- cognee/modules/visualization/cognee_network_visualization.py +408 -31
- cognee/shared/logging_utils.py +43 -10
- cognee/shared/utils.py +17 -10
- cognee/tasks/ingestion/migrate_relational_database.py +1 -1
- cognee/tests/unit/modules/ontology/test_ontology_adapter.py +151 -0
- {cognee-0.3.7.dev2.dist-info → cognee-0.3.8.dist-info}/METADATA +2 -2
- {cognee-0.3.7.dev2.dist-info → cognee-0.3.8.dist-info}/RECORD +19 -19
- {cognee-0.3.7.dev2.dist-info → cognee-0.3.8.dist-info}/WHEEL +0 -0
- {cognee-0.3.7.dev2.dist-info → cognee-0.3.8.dist-info}/entry_points.txt +0 -0
- {cognee-0.3.7.dev2.dist-info → cognee-0.3.8.dist-info}/licenses/LICENSE +0 -0
- {cognee-0.3.7.dev2.dist-info → cognee-0.3.8.dist-info}/licenses/NOTICE.md +0 -0
cognee/base_config.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import os
|
|
2
|
+
from pathlib import Path
|
|
2
3
|
from typing import Optional
|
|
3
4
|
from functools import lru_cache
|
|
4
5
|
from cognee.root_dir import get_absolute_path, ensure_absolute_path
|
|
@@ -11,6 +12,9 @@ class BaseConfig(BaseSettings):
|
|
|
11
12
|
data_root_directory: str = get_absolute_path(".data_storage")
|
|
12
13
|
system_root_directory: str = get_absolute_path(".cognee_system")
|
|
13
14
|
cache_root_directory: str = get_absolute_path(".cognee_cache")
|
|
15
|
+
logs_root_directory: str = os.getenv(
|
|
16
|
+
"COGNEE_LOGS_DIR", str(os.path.join(os.path.dirname(os.path.dirname(__file__)), "logs"))
|
|
17
|
+
)
|
|
14
18
|
monitoring_tool: object = Observer.NONE
|
|
15
19
|
|
|
16
20
|
@pydantic.model_validator(mode="after")
|
|
@@ -30,6 +34,8 @@ class BaseConfig(BaseSettings):
|
|
|
30
34
|
# Require absolute paths for root directories
|
|
31
35
|
self.data_root_directory = ensure_absolute_path(self.data_root_directory)
|
|
32
36
|
self.system_root_directory = ensure_absolute_path(self.system_root_directory)
|
|
37
|
+
self.logs_root_directory = ensure_absolute_path(self.logs_root_directory)
|
|
38
|
+
|
|
33
39
|
# Set monitoring tool based on available keys
|
|
34
40
|
if self.langfuse_public_key and self.langfuse_secret_key:
|
|
35
41
|
self.monitoring_tool = Observer.LANGFUSE
|
|
@@ -49,6 +55,7 @@ class BaseConfig(BaseSettings):
|
|
|
49
55
|
"system_root_directory": self.system_root_directory,
|
|
50
56
|
"monitoring_tool": self.monitoring_tool,
|
|
51
57
|
"cache_root_directory": self.cache_root_directory,
|
|
58
|
+
"logs_root_directory": self.logs_root_directory,
|
|
52
59
|
}
|
|
53
60
|
|
|
54
61
|
|
|
@@ -47,7 +47,7 @@ def create_vector_engine(
|
|
|
47
47
|
embedding_engine=embedding_engine,
|
|
48
48
|
)
|
|
49
49
|
|
|
50
|
-
if vector_db_provider == "pgvector":
|
|
50
|
+
if vector_db_provider.lower() == "pgvector":
|
|
51
51
|
from cognee.infrastructure.databases.relational import get_relational_config
|
|
52
52
|
|
|
53
53
|
# Get configuration for postgres database
|
|
@@ -78,7 +78,7 @@ def create_vector_engine(
|
|
|
78
78
|
embedding_engine,
|
|
79
79
|
)
|
|
80
80
|
|
|
81
|
-
elif vector_db_provider == "chromadb":
|
|
81
|
+
elif vector_db_provider.lower() == "chromadb":
|
|
82
82
|
try:
|
|
83
83
|
import chromadb
|
|
84
84
|
except ImportError:
|
|
@@ -94,7 +94,7 @@ def create_vector_engine(
|
|
|
94
94
|
embedding_engine=embedding_engine,
|
|
95
95
|
)
|
|
96
96
|
|
|
97
|
-
elif vector_db_provider == "neptune_analytics":
|
|
97
|
+
elif vector_db_provider.lower() == "neptune_analytics":
|
|
98
98
|
try:
|
|
99
99
|
from langchain_aws import NeptuneAnalyticsGraph
|
|
100
100
|
except ImportError:
|
|
@@ -122,7 +122,7 @@ def create_vector_engine(
|
|
|
122
122
|
embedding_engine=embedding_engine,
|
|
123
123
|
)
|
|
124
124
|
|
|
125
|
-
|
|
125
|
+
elif vector_db_provider.lower() == "lancedb":
|
|
126
126
|
from .lancedb.LanceDBAdapter import LanceDBAdapter
|
|
127
127
|
|
|
128
128
|
return LanceDBAdapter(
|
|
@@ -130,3 +130,9 @@ def create_vector_engine(
|
|
|
130
130
|
api_key=vector_db_key,
|
|
131
131
|
embedding_engine=embedding_engine,
|
|
132
132
|
)
|
|
133
|
+
|
|
134
|
+
else:
|
|
135
|
+
raise EnvironmentError(
|
|
136
|
+
f"Unsupported graph database provider: {vector_db_provider}. "
|
|
137
|
+
f"Supported providers are: {', '.join(list(supported_databases.keys()) + ['LanceDB', 'PGVector', 'neptune_analytics', 'ChromaDB'])}"
|
|
138
|
+
)
|
|
@@ -22,89 +22,6 @@ class FileTypeException(Exception):
|
|
|
22
22
|
self.message = message
|
|
23
23
|
|
|
24
24
|
|
|
25
|
-
class TxtFileType(filetype.Type):
|
|
26
|
-
"""
|
|
27
|
-
Represents a text file type with specific MIME and extension properties.
|
|
28
|
-
|
|
29
|
-
Public methods:
|
|
30
|
-
- match: Determines whether a given buffer matches the text file type.
|
|
31
|
-
"""
|
|
32
|
-
|
|
33
|
-
MIME = "text/plain"
|
|
34
|
-
EXTENSION = "txt"
|
|
35
|
-
|
|
36
|
-
def __init__(self):
|
|
37
|
-
super(TxtFileType, self).__init__(mime=TxtFileType.MIME, extension=TxtFileType.EXTENSION)
|
|
38
|
-
|
|
39
|
-
def match(self, buf):
|
|
40
|
-
"""
|
|
41
|
-
Determine if the given buffer contains text content.
|
|
42
|
-
|
|
43
|
-
Parameters:
|
|
44
|
-
-----------
|
|
45
|
-
|
|
46
|
-
- buf: The buffer to check for text content.
|
|
47
|
-
|
|
48
|
-
Returns:
|
|
49
|
-
--------
|
|
50
|
-
|
|
51
|
-
Returns True if the buffer is identified as text content, otherwise False.
|
|
52
|
-
"""
|
|
53
|
-
return is_text_content(buf)
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
txt_file_type = TxtFileType()
|
|
57
|
-
|
|
58
|
-
filetype.add_type(txt_file_type)
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
class CustomPdfMatcher(filetype.Type):
|
|
62
|
-
"""
|
|
63
|
-
Match PDF file types based on MIME type and extension.
|
|
64
|
-
|
|
65
|
-
Public methods:
|
|
66
|
-
- match
|
|
67
|
-
|
|
68
|
-
Instance variables:
|
|
69
|
-
- MIME: The MIME type of the PDF.
|
|
70
|
-
- EXTENSION: The file extension of the PDF.
|
|
71
|
-
"""
|
|
72
|
-
|
|
73
|
-
MIME = "application/pdf"
|
|
74
|
-
EXTENSION = "pdf"
|
|
75
|
-
|
|
76
|
-
def __init__(self):
|
|
77
|
-
super(CustomPdfMatcher, self).__init__(
|
|
78
|
-
mime=CustomPdfMatcher.MIME, extension=CustomPdfMatcher.EXTENSION
|
|
79
|
-
)
|
|
80
|
-
|
|
81
|
-
def match(self, buf):
|
|
82
|
-
"""
|
|
83
|
-
Determine if the provided buffer is a PDF file.
|
|
84
|
-
|
|
85
|
-
This method checks for the presence of the PDF signature in the buffer.
|
|
86
|
-
|
|
87
|
-
Raises:
|
|
88
|
-
- TypeError: If the buffer is not of bytes type.
|
|
89
|
-
|
|
90
|
-
Parameters:
|
|
91
|
-
-----------
|
|
92
|
-
|
|
93
|
-
- buf: The buffer containing the data to be checked.
|
|
94
|
-
|
|
95
|
-
Returns:
|
|
96
|
-
--------
|
|
97
|
-
|
|
98
|
-
Returns True if the buffer contains a PDF signature, otherwise returns False.
|
|
99
|
-
"""
|
|
100
|
-
return b"PDF-" in buf
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
custom_pdf_matcher = CustomPdfMatcher()
|
|
104
|
-
|
|
105
|
-
filetype.add_type(custom_pdf_matcher)
|
|
106
|
-
|
|
107
|
-
|
|
108
25
|
def guess_file_type(file: BinaryIO) -> filetype.Type:
|
|
109
26
|
"""
|
|
110
27
|
Guess the file type from the given binary file stream.
|
|
@@ -1,15 +1,13 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
ends_at: Optional[Timestamp] = None
|
|
15
|
-
```
|
|
1
|
+
You are tasked with identifying relevant time periods where the answer to a given query should be searched.
|
|
2
|
+
Current date is: `{{ time_now }}`. Determine relevant period(s) and return structured intervals.
|
|
3
|
+
|
|
4
|
+
Extraction rules:
|
|
5
|
+
|
|
6
|
+
1. Query without specific timestamp: use the time period with starts_at set to None and ends_at set to now.
|
|
7
|
+
2. Explicit time intervals: If the query specifies a range (e.g., from 2010 to 2020, between January and March 2023), extract both start and end dates. Always assign the earlier date to starts_at and the later date to ends_at.
|
|
8
|
+
3. Single timestamp: If the query refers to one specific moment (e.g., in 2015, on March 5, 2022), set starts_at and ends_at to that same timestamp.
|
|
9
|
+
4. Open-ended time references: For phrases such as "before X" or "after X", represent the unspecified side as None. For example: before 2009 → starts_at: None, ends_at: 2009; after 2009 → starts_at: 2009, ends_at: None.
|
|
10
|
+
5. Current-time references ("now", "current", "today"): If the query explicitly refers to the present, set both starts_at and ends_at to now (the ingestion timestamp).
|
|
11
|
+
6. "Who is" and "Who was" questions: These imply a general identity or biographical inquiry without a specific temporal scope. Set both starts_at and ends_at to None.
|
|
12
|
+
7. Ordering rule: Always ensure the earlier date is assigned to starts_at and the later date to ends_at.
|
|
13
|
+
8. No temporal information: If no valid or inferable time reference is found, set both starts_at and ends_at to None.
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import filetype
|
|
2
2
|
from typing import Dict, List, Optional, Any
|
|
3
3
|
from .LoaderInterface import LoaderInterface
|
|
4
|
+
from cognee.infrastructure.files.utils.guess_file_type import guess_file_type
|
|
4
5
|
from cognee.shared.logging_utils import get_logger
|
|
5
6
|
|
|
6
7
|
logger = get_logger(__name__)
|
|
@@ -80,7 +81,7 @@ class LoaderEngine:
|
|
|
80
81
|
"""
|
|
81
82
|
from pathlib import Path
|
|
82
83
|
|
|
83
|
-
file_info =
|
|
84
|
+
file_info = guess_file_type(file_path)
|
|
84
85
|
|
|
85
86
|
path_extension = Path(file_path).suffix.lstrip(".")
|
|
86
87
|
|
|
@@ -21,7 +21,8 @@ def get_ontology_resolver_from_env(
|
|
|
21
21
|
Supported value: "rdflib".
|
|
22
22
|
matching_strategy (str): The matching strategy to apply.
|
|
23
23
|
Supported value: "fuzzy".
|
|
24
|
-
ontology_file_path (str): Path to the ontology file required for the resolver.
|
|
24
|
+
ontology_file_path (str): Path to the ontology file(s) required for the resolver.
|
|
25
|
+
Can be a single path or comma-separated paths for multiple files.
|
|
25
26
|
|
|
26
27
|
Returns:
|
|
27
28
|
BaseOntologyResolver: An instance of the requested ontology resolver.
|
|
@@ -31,8 +32,13 @@ def get_ontology_resolver_from_env(
|
|
|
31
32
|
or if required parameters are missing.
|
|
32
33
|
"""
|
|
33
34
|
if ontology_resolver == "rdflib" and matching_strategy == "fuzzy" and ontology_file_path:
|
|
35
|
+
if "," in ontology_file_path:
|
|
36
|
+
file_paths = [path.strip() for path in ontology_file_path.split(",")]
|
|
37
|
+
else:
|
|
38
|
+
file_paths = ontology_file_path
|
|
39
|
+
|
|
34
40
|
return RDFLibOntologyResolver(
|
|
35
|
-
matching_strategy=FuzzyMatchingStrategy(), ontology_file=
|
|
41
|
+
matching_strategy=FuzzyMatchingStrategy(), ontology_file=file_paths
|
|
36
42
|
)
|
|
37
43
|
else:
|
|
38
44
|
raise EnvironmentError(
|
|
@@ -2,7 +2,7 @@ import os
|
|
|
2
2
|
import difflib
|
|
3
3
|
from cognee.shared.logging_utils import get_logger
|
|
4
4
|
from collections import deque
|
|
5
|
-
from typing import List, Tuple, Dict, Optional, Any
|
|
5
|
+
from typing import List, Tuple, Dict, Optional, Any, Union
|
|
6
6
|
from rdflib import Graph, URIRef, RDF, RDFS, OWL
|
|
7
7
|
|
|
8
8
|
from cognee.modules.ontology.exceptions import (
|
|
@@ -26,22 +26,50 @@ class RDFLibOntologyResolver(BaseOntologyResolver):
|
|
|
26
26
|
|
|
27
27
|
def __init__(
|
|
28
28
|
self,
|
|
29
|
-
ontology_file: Optional[str] = None,
|
|
29
|
+
ontology_file: Optional[Union[str, List[str]]] = None,
|
|
30
30
|
matching_strategy: Optional[MatchingStrategy] = None,
|
|
31
31
|
) -> None:
|
|
32
32
|
super().__init__(matching_strategy)
|
|
33
33
|
self.ontology_file = ontology_file
|
|
34
34
|
try:
|
|
35
|
-
|
|
35
|
+
files_to_load = []
|
|
36
|
+
if ontology_file is not None:
|
|
37
|
+
if isinstance(ontology_file, str):
|
|
38
|
+
files_to_load = [ontology_file]
|
|
39
|
+
elif isinstance(ontology_file, list):
|
|
40
|
+
files_to_load = ontology_file
|
|
41
|
+
else:
|
|
42
|
+
raise ValueError(
|
|
43
|
+
f"ontology_file must be a string, list of strings, or None. Got: {type(ontology_file)}"
|
|
44
|
+
)
|
|
45
|
+
|
|
46
|
+
if files_to_load:
|
|
36
47
|
self.graph = Graph()
|
|
37
|
-
|
|
38
|
-
|
|
48
|
+
loaded_files = []
|
|
49
|
+
for file_path in files_to_load:
|
|
50
|
+
if os.path.exists(file_path):
|
|
51
|
+
self.graph.parse(file_path)
|
|
52
|
+
loaded_files.append(file_path)
|
|
53
|
+
logger.info("Ontology loaded successfully from file: %s", file_path)
|
|
54
|
+
else:
|
|
55
|
+
logger.warning(
|
|
56
|
+
"Ontology file '%s' not found. Skipping this file.",
|
|
57
|
+
file_path,
|
|
58
|
+
)
|
|
59
|
+
|
|
60
|
+
if not loaded_files:
|
|
61
|
+
logger.info(
|
|
62
|
+
"No valid ontology files found. No owl ontology will be attached to the graph."
|
|
63
|
+
)
|
|
64
|
+
self.graph = None
|
|
65
|
+
else:
|
|
66
|
+
logger.info("Total ontology files loaded: %d", len(loaded_files))
|
|
39
67
|
else:
|
|
40
68
|
logger.info(
|
|
41
|
-
"
|
|
42
|
-
ontology_file,
|
|
69
|
+
"No ontology file provided. No owl ontology will be attached to the graph."
|
|
43
70
|
)
|
|
44
71
|
self.graph = None
|
|
72
|
+
|
|
45
73
|
self.build_lookup()
|
|
46
74
|
except Exception as e:
|
|
47
75
|
logger.error("Failed to load ontology", exc_info=e)
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import os
|
|
2
2
|
import asyncio
|
|
3
3
|
from typing import Any, Optional, List, Type
|
|
4
|
-
|
|
4
|
+
from datetime import datetime
|
|
5
5
|
|
|
6
6
|
from operator import itemgetter
|
|
7
7
|
from cognee.infrastructure.databases.vector import get_vector_engine
|
|
@@ -79,7 +79,11 @@ class TemporalRetriever(GraphCompletionRetriever):
|
|
|
79
79
|
else:
|
|
80
80
|
base_directory = None
|
|
81
81
|
|
|
82
|
-
|
|
82
|
+
time_now = datetime.now().strftime("%d-%m-%Y")
|
|
83
|
+
|
|
84
|
+
system_prompt = render_prompt(
|
|
85
|
+
prompt_path, {"time_now": time_now}, base_directory=base_directory
|
|
86
|
+
)
|
|
83
87
|
|
|
84
88
|
interval = await LLMGateway.acreate_structured_output(query, system_prompt, QueryInterval)
|
|
85
89
|
|
|
@@ -108,8 +112,6 @@ class TemporalRetriever(GraphCompletionRetriever):
|
|
|
108
112
|
|
|
109
113
|
graph_engine = await get_graph_engine()
|
|
110
114
|
|
|
111
|
-
triplets = []
|
|
112
|
-
|
|
113
115
|
if time_from and time_to:
|
|
114
116
|
ids = await graph_engine.collect_time_ids(time_from=time_from, time_to=time_to)
|
|
115
117
|
elif time_from:
|