cognee 0.3.7.dev2__py3-none-any.whl → 0.3.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cognee/base_config.py CHANGED
@@ -1,4 +1,5 @@
1
1
  import os
2
+ from pathlib import Path
2
3
  from typing import Optional
3
4
  from functools import lru_cache
4
5
  from cognee.root_dir import get_absolute_path, ensure_absolute_path
@@ -11,6 +12,9 @@ class BaseConfig(BaseSettings):
11
12
  data_root_directory: str = get_absolute_path(".data_storage")
12
13
  system_root_directory: str = get_absolute_path(".cognee_system")
13
14
  cache_root_directory: str = get_absolute_path(".cognee_cache")
15
+ logs_root_directory: str = os.getenv(
16
+ "COGNEE_LOGS_DIR", str(os.path.join(os.path.dirname(os.path.dirname(__file__)), "logs"))
17
+ )
14
18
  monitoring_tool: object = Observer.NONE
15
19
 
16
20
  @pydantic.model_validator(mode="after")
@@ -30,6 +34,8 @@ class BaseConfig(BaseSettings):
30
34
  # Require absolute paths for root directories
31
35
  self.data_root_directory = ensure_absolute_path(self.data_root_directory)
32
36
  self.system_root_directory = ensure_absolute_path(self.system_root_directory)
37
+ self.logs_root_directory = ensure_absolute_path(self.logs_root_directory)
38
+
33
39
  # Set monitoring tool based on available keys
34
40
  if self.langfuse_public_key and self.langfuse_secret_key:
35
41
  self.monitoring_tool = Observer.LANGFUSE
@@ -49,6 +55,7 @@ class BaseConfig(BaseSettings):
49
55
  "system_root_directory": self.system_root_directory,
50
56
  "monitoring_tool": self.monitoring_tool,
51
57
  "cache_root_directory": self.cache_root_directory,
58
+ "logs_root_directory": self.logs_root_directory,
52
59
  }
53
60
 
54
61
 
@@ -47,7 +47,7 @@ def create_vector_engine(
47
47
  embedding_engine=embedding_engine,
48
48
  )
49
49
 
50
- if vector_db_provider == "pgvector":
50
+ if vector_db_provider.lower() == "pgvector":
51
51
  from cognee.infrastructure.databases.relational import get_relational_config
52
52
 
53
53
  # Get configuration for postgres database
@@ -78,7 +78,7 @@ def create_vector_engine(
78
78
  embedding_engine,
79
79
  )
80
80
 
81
- elif vector_db_provider == "chromadb":
81
+ elif vector_db_provider.lower() == "chromadb":
82
82
  try:
83
83
  import chromadb
84
84
  except ImportError:
@@ -94,7 +94,7 @@ def create_vector_engine(
94
94
  embedding_engine=embedding_engine,
95
95
  )
96
96
 
97
- elif vector_db_provider == "neptune_analytics":
97
+ elif vector_db_provider.lower() == "neptune_analytics":
98
98
  try:
99
99
  from langchain_aws import NeptuneAnalyticsGraph
100
100
  except ImportError:
@@ -122,7 +122,7 @@ def create_vector_engine(
122
122
  embedding_engine=embedding_engine,
123
123
  )
124
124
 
125
- else:
125
+ elif vector_db_provider.lower() == "lancedb":
126
126
  from .lancedb.LanceDBAdapter import LanceDBAdapter
127
127
 
128
128
  return LanceDBAdapter(
@@ -130,3 +130,9 @@ def create_vector_engine(
130
130
  api_key=vector_db_key,
131
131
  embedding_engine=embedding_engine,
132
132
  )
133
+
134
+ else:
135
+ raise EnvironmentError(
136
+ f"Unsupported vector database provider: {vector_db_provider}. "
137
+ f"Supported providers are: {', '.join(list(supported_databases.keys()) + ['LanceDB', 'PGVector', 'neptune_analytics', 'ChromaDB'])}"
138
+ )
@@ -22,89 +22,6 @@ class FileTypeException(Exception):
22
22
  self.message = message
23
23
 
24
24
 
25
- class TxtFileType(filetype.Type):
26
- """
27
- Represents a text file type with specific MIME and extension properties.
28
-
29
- Public methods:
30
- - match: Determines whether a given buffer matches the text file type.
31
- """
32
-
33
- MIME = "text/plain"
34
- EXTENSION = "txt"
35
-
36
- def __init__(self):
37
- super(TxtFileType, self).__init__(mime=TxtFileType.MIME, extension=TxtFileType.EXTENSION)
38
-
39
- def match(self, buf):
40
- """
41
- Determine if the given buffer contains text content.
42
-
43
- Parameters:
44
- -----------
45
-
46
- - buf: The buffer to check for text content.
47
-
48
- Returns:
49
- --------
50
-
51
- Returns True if the buffer is identified as text content, otherwise False.
52
- """
53
- return is_text_content(buf)
54
-
55
-
56
- txt_file_type = TxtFileType()
57
-
58
- filetype.add_type(txt_file_type)
59
-
60
-
61
- class CustomPdfMatcher(filetype.Type):
62
- """
63
- Match PDF file types based on MIME type and extension.
64
-
65
- Public methods:
66
- - match
67
-
68
- Instance variables:
69
- - MIME: The MIME type of the PDF.
70
- - EXTENSION: The file extension of the PDF.
71
- """
72
-
73
- MIME = "application/pdf"
74
- EXTENSION = "pdf"
75
-
76
- def __init__(self):
77
- super(CustomPdfMatcher, self).__init__(
78
- mime=CustomPdfMatcher.MIME, extension=CustomPdfMatcher.EXTENSION
79
- )
80
-
81
- def match(self, buf):
82
- """
83
- Determine if the provided buffer is a PDF file.
84
-
85
- This method checks for the presence of the PDF signature in the buffer.
86
-
87
- Raises:
88
- - TypeError: If the buffer is not of bytes type.
89
-
90
- Parameters:
91
- -----------
92
-
93
- - buf: The buffer containing the data to be checked.
94
-
95
- Returns:
96
- --------
97
-
98
- Returns True if the buffer contains a PDF signature, otherwise returns False.
99
- """
100
- return b"PDF-" in buf
101
-
102
-
103
- custom_pdf_matcher = CustomPdfMatcher()
104
-
105
- filetype.add_type(custom_pdf_matcher)
106
-
107
-
108
25
  def guess_file_type(file: BinaryIO) -> filetype.Type:
109
26
  """
110
27
  Guess the file type from the given binary file stream.
@@ -1,15 +1,13 @@
1
- For the purposes of identifying timestamps in a query, you are tasked with extracting relevant timestamps from the query.
2
- ## Timestamp requirements
3
- - If the query contains interval extrack both starts_at and ends_at properties
4
- - If the query contains an instantaneous timestamp, starts_at and ends_at should be the same
5
- - If the query its open-ended (before 2009 or after 2009), the corresponding non defined end of the time should be none
6
- -For example: "before 2009" -- starts_at: None, ends_at: 2009 or "after 2009" -- starts_at: 2009, ends_at: None
7
- - Put always the data that comes first in time as starts_at and the timestamps that comes second in time as ends_at
8
- - If starts_at or ends_at cannot be extracted both of them has to be None
9
- ## Output Format
10
- Your reply should be a JSON: list of dictionaries with the following structure:
11
- ```python
12
- class QueryInterval(BaseModel):
13
- starts_at: Optional[Timestamp] = None
14
- ends_at: Optional[Timestamp] = None
15
- ```
1
+ You are tasked with identifying relevant time periods where the answer to a given query should be searched.
2
+ Current date is: `{{ time_now }}`. Determine relevant period(s) and return structured intervals.
3
+
4
+ Extraction rules:
5
+
6
+ 1. Query without specific timestamp: use the time period with starts_at set to None and ends_at set to now.
7
+ 2. Explicit time intervals: If the query specifies a range (e.g., from 2010 to 2020, between January and March 2023), extract both start and end dates. Always assign the earlier date to starts_at and the later date to ends_at.
8
+ 3. Single timestamp: If the query refers to one specific moment (e.g., in 2015, on March 5, 2022), set starts_at and ends_at to that same timestamp.
9
+ 4. Open-ended time references: For phrases such as "before X" or "after X", represent the unspecified side as None. For example: before 2009 → starts_at: None, ends_at: 2009; after 2009 → starts_at: 2009, ends_at: None.
10
+ 5. Current-time references ("now", "current", "today"): If the query explicitly refers to the present, set both starts_at and ends_at to now (the ingestion timestamp).
11
+ 6. "Who is" and "Who was" questions: These imply a general identity or biographical inquiry without a specific temporal scope. Set both starts_at and ends_at to None.
12
+ 7. Ordering rule: Always ensure the earlier date is assigned to starts_at and the later date to ends_at.
13
+ 8. No temporal information: If no valid or inferable time reference is found, set both starts_at and ends_at to None.
@@ -1,6 +1,7 @@
1
1
  import filetype
2
2
  from typing import Dict, List, Optional, Any
3
3
  from .LoaderInterface import LoaderInterface
4
+ from cognee.infrastructure.files.utils.guess_file_type import guess_file_type
4
5
  from cognee.shared.logging_utils import get_logger
5
6
 
6
7
  logger = get_logger(__name__)
@@ -80,7 +81,7 @@ class LoaderEngine:
80
81
  """
81
82
  from pathlib import Path
82
83
 
83
- file_info = filetype.guess(file_path)
84
+ file_info = guess_file_type(file_path)
84
85
 
85
86
  path_extension = Path(file_path).suffix.lstrip(".")
86
87
 
@@ -21,7 +21,8 @@ def get_ontology_resolver_from_env(
21
21
  Supported value: "rdflib".
22
22
  matching_strategy (str): The matching strategy to apply.
23
23
  Supported value: "fuzzy".
24
- ontology_file_path (str): Path to the ontology file required for the resolver.
24
+ ontology_file_path (str): Path to the ontology file(s) required for the resolver.
25
+ Can be a single path or comma-separated paths for multiple files.
25
26
 
26
27
  Returns:
27
28
  BaseOntologyResolver: An instance of the requested ontology resolver.
@@ -31,8 +32,13 @@ def get_ontology_resolver_from_env(
31
32
  or if required parameters are missing.
32
33
  """
33
34
  if ontology_resolver == "rdflib" and matching_strategy == "fuzzy" and ontology_file_path:
35
+ if "," in ontology_file_path:
36
+ file_paths = [path.strip() for path in ontology_file_path.split(",")]
37
+ else:
38
+ file_paths = ontology_file_path
39
+
34
40
  return RDFLibOntologyResolver(
35
- matching_strategy=FuzzyMatchingStrategy(), ontology_file=ontology_file_path
41
+ matching_strategy=FuzzyMatchingStrategy(), ontology_file=file_paths
36
42
  )
37
43
  else:
38
44
  raise EnvironmentError(
@@ -2,7 +2,7 @@ import os
2
2
  import difflib
3
3
  from cognee.shared.logging_utils import get_logger
4
4
  from collections import deque
5
- from typing import List, Tuple, Dict, Optional, Any
5
+ from typing import List, Tuple, Dict, Optional, Any, Union
6
6
  from rdflib import Graph, URIRef, RDF, RDFS, OWL
7
7
 
8
8
  from cognee.modules.ontology.exceptions import (
@@ -26,22 +26,50 @@ class RDFLibOntologyResolver(BaseOntologyResolver):
26
26
 
27
27
  def __init__(
28
28
  self,
29
- ontology_file: Optional[str] = None,
29
+ ontology_file: Optional[Union[str, List[str]]] = None,
30
30
  matching_strategy: Optional[MatchingStrategy] = None,
31
31
  ) -> None:
32
32
  super().__init__(matching_strategy)
33
33
  self.ontology_file = ontology_file
34
34
  try:
35
- if ontology_file and os.path.exists(ontology_file):
35
+ files_to_load = []
36
+ if ontology_file is not None:
37
+ if isinstance(ontology_file, str):
38
+ files_to_load = [ontology_file]
39
+ elif isinstance(ontology_file, list):
40
+ files_to_load = ontology_file
41
+ else:
42
+ raise ValueError(
43
+ f"ontology_file must be a string, list of strings, or None. Got: {type(ontology_file)}"
44
+ )
45
+
46
+ if files_to_load:
36
47
  self.graph = Graph()
37
- self.graph.parse(ontology_file)
38
- logger.info("Ontology loaded successfully from file: %s", ontology_file)
48
+ loaded_files = []
49
+ for file_path in files_to_load:
50
+ if os.path.exists(file_path):
51
+ self.graph.parse(file_path)
52
+ loaded_files.append(file_path)
53
+ logger.info("Ontology loaded successfully from file: %s", file_path)
54
+ else:
55
+ logger.warning(
56
+ "Ontology file '%s' not found. Skipping this file.",
57
+ file_path,
58
+ )
59
+
60
+ if not loaded_files:
61
+ logger.info(
62
+ "No valid ontology files found. No owl ontology will be attached to the graph."
63
+ )
64
+ self.graph = None
65
+ else:
66
+ logger.info("Total ontology files loaded: %d", len(loaded_files))
39
67
  else:
40
68
  logger.info(
41
- "Ontology file '%s' not found. No owl ontology will be attached to the graph.",
42
- ontology_file,
69
+ "No ontology file provided. No owl ontology will be attached to the graph."
43
70
  )
44
71
  self.graph = None
72
+
45
73
  self.build_lookup()
46
74
  except Exception as e:
47
75
  logger.error("Failed to load ontology", exc_info=e)
@@ -1,7 +1,7 @@
1
1
  import os
2
2
  import asyncio
3
3
  from typing import Any, Optional, List, Type
4
-
4
+ from datetime import datetime
5
5
 
6
6
  from operator import itemgetter
7
7
  from cognee.infrastructure.databases.vector import get_vector_engine
@@ -79,7 +79,11 @@ class TemporalRetriever(GraphCompletionRetriever):
79
79
  else:
80
80
  base_directory = None
81
81
 
82
- system_prompt = render_prompt(prompt_path, {}, base_directory=base_directory)
82
+ time_now = datetime.now().strftime("%d-%m-%Y")
83
+
84
+ system_prompt = render_prompt(
85
+ prompt_path, {"time_now": time_now}, base_directory=base_directory
86
+ )
83
87
 
84
88
  interval = await LLMGateway.acreate_structured_output(query, system_prompt, QueryInterval)
85
89
 
@@ -108,8 +112,6 @@ class TemporalRetriever(GraphCompletionRetriever):
108
112
 
109
113
  graph_engine = await get_graph_engine()
110
114
 
111
- triplets = []
112
-
113
115
  if time_from and time_to:
114
116
  ids = await graph_engine.collect_time_ids(time_from=time_from, time_to=time_to)
115
117
  elif time_from: