cognee 0.3.6__py3-none-any.whl → 0.3.7.dev1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.

Files changed (182)
  1. cognee/__init__.py +1 -0
  2. cognee/api/health.py +2 -12
  3. cognee/api/v1/add/add.py +46 -6
  4. cognee/api/v1/add/routers/get_add_router.py +11 -2
  5. cognee/api/v1/cognify/cognify.py +29 -9
  6. cognee/api/v1/cognify/routers/get_cognify_router.py +2 -1
  7. cognee/api/v1/datasets/datasets.py +11 -0
  8. cognee/api/v1/datasets/routers/get_datasets_router.py +8 -0
  9. cognee/api/v1/delete/routers/get_delete_router.py +2 -0
  10. cognee/api/v1/memify/routers/get_memify_router.py +2 -1
  11. cognee/api/v1/permissions/routers/get_permissions_router.py +6 -0
  12. cognee/api/v1/responses/default_tools.py +0 -1
  13. cognee/api/v1/responses/dispatch_function.py +1 -1
  14. cognee/api/v1/responses/routers/default_tools.py +0 -1
  15. cognee/api/v1/search/routers/get_search_router.py +3 -3
  16. cognee/api/v1/search/search.py +11 -9
  17. cognee/api/v1/settings/routers/get_settings_router.py +7 -1
  18. cognee/api/v1/sync/routers/get_sync_router.py +3 -0
  19. cognee/api/v1/ui/ui.py +45 -16
  20. cognee/api/v1/update/routers/get_update_router.py +3 -1
  21. cognee/api/v1/update/update.py +3 -3
  22. cognee/api/v1/users/routers/get_visualize_router.py +2 -0
  23. cognee/cli/_cognee.py +61 -10
  24. cognee/cli/commands/add_command.py +3 -3
  25. cognee/cli/commands/cognify_command.py +3 -3
  26. cognee/cli/commands/config_command.py +9 -7
  27. cognee/cli/commands/delete_command.py +3 -3
  28. cognee/cli/commands/search_command.py +3 -7
  29. cognee/cli/config.py +0 -1
  30. cognee/context_global_variables.py +5 -0
  31. cognee/exceptions/exceptions.py +1 -1
  32. cognee/infrastructure/databases/cache/__init__.py +2 -0
  33. cognee/infrastructure/databases/cache/cache_db_interface.py +79 -0
  34. cognee/infrastructure/databases/cache/config.py +44 -0
  35. cognee/infrastructure/databases/cache/get_cache_engine.py +67 -0
  36. cognee/infrastructure/databases/cache/redis/RedisAdapter.py +243 -0
  37. cognee/infrastructure/databases/exceptions/__init__.py +1 -0
  38. cognee/infrastructure/databases/exceptions/exceptions.py +18 -2
  39. cognee/infrastructure/databases/graph/get_graph_engine.py +1 -1
  40. cognee/infrastructure/databases/graph/graph_db_interface.py +5 -0
  41. cognee/infrastructure/databases/graph/kuzu/adapter.py +76 -47
  42. cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +13 -3
  43. cognee/infrastructure/databases/graph/neo4j_driver/deadlock_retry.py +1 -1
  44. cognee/infrastructure/databases/graph/neptune_driver/neptune_utils.py +1 -1
  45. cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +1 -1
  46. cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +21 -3
  47. cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +17 -10
  48. cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +17 -4
  49. cognee/infrastructure/databases/vector/embeddings/config.py +2 -3
  50. cognee/infrastructure/databases/vector/exceptions/exceptions.py +1 -1
  51. cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +0 -1
  52. cognee/infrastructure/files/exceptions.py +1 -1
  53. cognee/infrastructure/files/storage/LocalFileStorage.py +9 -9
  54. cognee/infrastructure/files/storage/S3FileStorage.py +11 -11
  55. cognee/infrastructure/files/utils/guess_file_type.py +6 -0
  56. cognee/infrastructure/llm/prompts/feedback_reaction_prompt.txt +14 -0
  57. cognee/infrastructure/llm/prompts/feedback_report_prompt.txt +13 -0
  58. cognee/infrastructure/llm/prompts/feedback_user_context_prompt.txt +5 -0
  59. cognee/infrastructure/llm/prompts/search_type_selector_prompt.txt +0 -5
  60. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py +19 -9
  61. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py +17 -5
  62. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py +17 -5
  63. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py +32 -0
  64. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/__init__.py +0 -0
  65. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/adapter.py +109 -0
  66. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py +33 -8
  67. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +40 -18
  68. cognee/infrastructure/loaders/LoaderEngine.py +27 -7
  69. cognee/infrastructure/loaders/external/__init__.py +7 -0
  70. cognee/infrastructure/loaders/external/advanced_pdf_loader.py +2 -8
  71. cognee/infrastructure/loaders/external/beautiful_soup_loader.py +310 -0
  72. cognee/infrastructure/loaders/supported_loaders.py +7 -0
  73. cognee/modules/data/exceptions/exceptions.py +1 -1
  74. cognee/modules/data/methods/__init__.py +3 -0
  75. cognee/modules/data/methods/get_dataset_data.py +4 -1
  76. cognee/modules/data/methods/has_dataset_data.py +21 -0
  77. cognee/modules/engine/models/TableRow.py +0 -1
  78. cognee/modules/ingestion/save_data_to_file.py +9 -2
  79. cognee/modules/pipelines/exceptions/exceptions.py +1 -1
  80. cognee/modules/pipelines/operations/pipeline.py +12 -1
  81. cognee/modules/pipelines/operations/run_tasks.py +25 -197
  82. cognee/modules/pipelines/operations/run_tasks_base.py +7 -0
  83. cognee/modules/pipelines/operations/run_tasks_data_item.py +260 -0
  84. cognee/modules/pipelines/operations/run_tasks_distributed.py +121 -38
  85. cognee/modules/pipelines/operations/run_tasks_with_telemetry.py +9 -1
  86. cognee/modules/retrieval/EntityCompletionRetriever.py +48 -8
  87. cognee/modules/retrieval/base_graph_retriever.py +3 -1
  88. cognee/modules/retrieval/base_retriever.py +3 -1
  89. cognee/modules/retrieval/chunks_retriever.py +5 -1
  90. cognee/modules/retrieval/code_retriever.py +20 -2
  91. cognee/modules/retrieval/completion_retriever.py +50 -9
  92. cognee/modules/retrieval/cypher_search_retriever.py +11 -1
  93. cognee/modules/retrieval/graph_completion_context_extension_retriever.py +47 -8
  94. cognee/modules/retrieval/graph_completion_cot_retriever.py +152 -22
  95. cognee/modules/retrieval/graph_completion_retriever.py +54 -10
  96. cognee/modules/retrieval/lexical_retriever.py +20 -2
  97. cognee/modules/retrieval/natural_language_retriever.py +10 -1
  98. cognee/modules/retrieval/summaries_retriever.py +5 -1
  99. cognee/modules/retrieval/temporal_retriever.py +62 -10
  100. cognee/modules/retrieval/user_qa_feedback.py +3 -2
  101. cognee/modules/retrieval/utils/completion.py +30 -4
  102. cognee/modules/retrieval/utils/description_to_codepart_search.py +1 -1
  103. cognee/modules/retrieval/utils/session_cache.py +156 -0
  104. cognee/modules/search/methods/get_search_type_tools.py +0 -5
  105. cognee/modules/search/methods/no_access_control_search.py +12 -1
  106. cognee/modules/search/methods/search.py +51 -5
  107. cognee/modules/search/types/SearchType.py +0 -1
  108. cognee/modules/settings/get_settings.py +23 -0
  109. cognee/modules/users/methods/get_authenticated_user.py +3 -1
  110. cognee/modules/users/methods/get_default_user.py +1 -6
  111. cognee/modules/users/roles/methods/create_role.py +2 -2
  112. cognee/modules/users/tenants/methods/create_tenant.py +2 -2
  113. cognee/shared/exceptions/exceptions.py +1 -1
  114. cognee/shared/logging_utils.py +18 -11
  115. cognee/shared/utils.py +24 -2
  116. cognee/tasks/codingagents/coding_rule_associations.py +1 -2
  117. cognee/tasks/documents/exceptions/exceptions.py +1 -1
  118. cognee/tasks/feedback/__init__.py +13 -0
  119. cognee/tasks/feedback/create_enrichments.py +84 -0
  120. cognee/tasks/feedback/extract_feedback_interactions.py +230 -0
  121. cognee/tasks/feedback/generate_improved_answers.py +130 -0
  122. cognee/tasks/feedback/link_enrichments_to_feedback.py +67 -0
  123. cognee/tasks/feedback/models.py +26 -0
  124. cognee/tasks/graph/extract_graph_from_data.py +2 -0
  125. cognee/tasks/ingestion/data_item_to_text_file.py +3 -3
  126. cognee/tasks/ingestion/ingest_data.py +11 -5
  127. cognee/tasks/ingestion/save_data_item_to_storage.py +12 -1
  128. cognee/tasks/storage/add_data_points.py +3 -10
  129. cognee/tasks/storage/index_data_points.py +19 -14
  130. cognee/tasks/storage/index_graph_edges.py +25 -11
  131. cognee/tasks/web_scraper/__init__.py +34 -0
  132. cognee/tasks/web_scraper/config.py +26 -0
  133. cognee/tasks/web_scraper/default_url_crawler.py +446 -0
  134. cognee/tasks/web_scraper/models.py +46 -0
  135. cognee/tasks/web_scraper/types.py +4 -0
  136. cognee/tasks/web_scraper/utils.py +142 -0
  137. cognee/tasks/web_scraper/web_scraper_task.py +396 -0
  138. cognee/tests/cli_tests/cli_unit_tests/test_cli_utils.py +0 -1
  139. cognee/tests/integration/web_url_crawler/test_default_url_crawler.py +13 -0
  140. cognee/tests/integration/web_url_crawler/test_tavily_crawler.py +19 -0
  141. cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py +344 -0
  142. cognee/tests/subprocesses/reader.py +25 -0
  143. cognee/tests/subprocesses/simple_cognify_1.py +31 -0
  144. cognee/tests/subprocesses/simple_cognify_2.py +31 -0
  145. cognee/tests/subprocesses/writer.py +32 -0
  146. cognee/tests/tasks/descriptive_metrics/metrics_test_utils.py +0 -2
  147. cognee/tests/tasks/descriptive_metrics/neo4j_metrics_test.py +8 -3
  148. cognee/tests/tasks/entity_extraction/entity_extraction_test.py +89 -0
  149. cognee/tests/tasks/web_scraping/web_scraping_test.py +172 -0
  150. cognee/tests/test_add_docling_document.py +56 -0
  151. cognee/tests/test_chromadb.py +7 -11
  152. cognee/tests/test_concurrent_subprocess_access.py +76 -0
  153. cognee/tests/test_conversation_history.py +240 -0
  154. cognee/tests/test_feedback_enrichment.py +174 -0
  155. cognee/tests/test_kuzu.py +27 -15
  156. cognee/tests/test_lancedb.py +7 -11
  157. cognee/tests/test_library.py +32 -2
  158. cognee/tests/test_neo4j.py +24 -16
  159. cognee/tests/test_neptune_analytics_vector.py +7 -11
  160. cognee/tests/test_permissions.py +9 -13
  161. cognee/tests/test_pgvector.py +4 -4
  162. cognee/tests/test_remote_kuzu.py +8 -11
  163. cognee/tests/test_s3_file_storage.py +1 -1
  164. cognee/tests/test_search_db.py +6 -8
  165. cognee/tests/unit/infrastructure/databases/cache/test_cache_config.py +89 -0
  166. cognee/tests/unit/modules/retrieval/conversation_history_test.py +154 -0
  167. cognee/tests/unit/modules/retrieval/graph_completion_retriever_cot_test.py +51 -0
  168. {cognee-0.3.6.dist-info → cognee-0.3.7.dev1.dist-info}/METADATA +21 -6
  169. {cognee-0.3.6.dist-info → cognee-0.3.7.dev1.dist-info}/RECORD +178 -139
  170. {cognee-0.3.6.dist-info → cognee-0.3.7.dev1.dist-info}/entry_points.txt +1 -0
  171. distributed/Dockerfile +0 -3
  172. distributed/entrypoint.py +21 -9
  173. distributed/signal.py +5 -0
  174. distributed/workers/data_point_saving_worker.py +64 -34
  175. distributed/workers/graph_saving_worker.py +71 -47
  176. cognee/infrastructure/databases/graph/memgraph/memgraph_adapter.py +0 -1116
  177. cognee/modules/retrieval/insights_retriever.py +0 -133
  178. cognee/tests/test_memgraph.py +0 -109
  179. cognee/tests/unit/modules/retrieval/insights_retriever_test.py +0 -251
  180. {cognee-0.3.6.dist-info → cognee-0.3.7.dev1.dist-info}/WHEEL +0 -0
  181. {cognee-0.3.6.dist-info → cognee-0.3.7.dev1.dist-info}/licenses/LICENSE +0 -0
  182. {cognee-0.3.6.dist-info → cognee-0.3.7.dev1.dist-info}/licenses/NOTICE.md +0 -0
--- a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py
+++ b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py
@@ -7,6 +7,15 @@ from openai import ContentFilterFinishReasonError
 from litellm.exceptions import ContentPolicyViolationError
 from instructor.core import InstructorRetryException
 
+import logging
+from tenacity import (
+    retry,
+    stop_after_delay,
+    wait_exponential_jitter,
+    retry_if_not_exception_type,
+    before_sleep_log,
+)
+
 from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.llm_interface import (
     LLMInterface,
 )
@@ -14,19 +23,13 @@ from cognee.infrastructure.llm.exceptions import (
     ContentPolicyFilterError,
 )
 from cognee.infrastructure.files.utils.open_data_file import open_data_file
-from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.rate_limiter import (
-    rate_limit_async,
-    rate_limit_sync,
-    sleep_and_retry_async,
-    sleep_and_retry_sync,
-)
 from cognee.modules.observability.get_observe import get_observe
 from cognee.shared.logging_utils import get_logger
 
-observe = get_observe()
-
 logger = get_logger()
 
+observe = get_observe()
+
 
 class OpenAIAdapter(LLMInterface):
     """
@@ -97,8 +100,13 @@ class OpenAIAdapter(LLMInterface):
         self.fallback_endpoint = fallback_endpoint
 
     @observe(as_type="generation")
-    @sleep_and_retry_async()
-    @rate_limit_async
+    @retry(
+        stop=stop_after_delay(128),
+        wait=wait_exponential_jitter(2, 128),
+        retry=retry_if_not_exception_type(litellm.exceptions.NotFoundError),
+        before_sleep=before_sleep_log(logger, logging.DEBUG),
+        reraise=True,
+    )
     async def acreate_structured_output(
         self, text_input: str, system_prompt: str, response_model: Type[BaseModel]
     ) -> BaseModel:
@@ -148,10 +156,7 @@ class OpenAIAdapter(LLMInterface):
             InstructorRetryException,
         ) as e:
             if not (self.fallback_model and self.fallback_api_key):
-                raise ContentPolicyFilterError(
-                    f"The provided input contains content that is not aligned with our content policy: {text_input}"
-                ) from e
-
+                raise e
             try:
                 return await self.aclient.chat.completions.create(
                     model=self.fallback_model,
@@ -186,8 +191,13 @@ class OpenAIAdapter(LLMInterface):
         ) from error
 
     @observe
-    @sleep_and_retry_sync()
-    @rate_limit_sync
+    @retry(
+        stop=stop_after_delay(128),
+        wait=wait_exponential_jitter(2, 128),
+        retry=retry_if_not_exception_type(litellm.exceptions.NotFoundError),
+        before_sleep=before_sleep_log(logger, logging.DEBUG),
+        reraise=True,
+    )
     def create_structured_output(
         self, text_input: str, system_prompt: str, response_model: Type[BaseModel]
     ) -> BaseModel:
@@ -231,7 +241,13 @@ class OpenAIAdapter(LLMInterface):
             max_retries=self.MAX_RETRIES,
         )
 
-    @rate_limit_async
+    @retry(
+        stop=stop_after_delay(128),
+        wait=wait_exponential_jitter(2, 128),
+        retry=retry_if_not_exception_type(litellm.exceptions.NotFoundError),
+        before_sleep=before_sleep_log(logger, logging.DEBUG),
+        reraise=True,
+    )
     async def create_transcript(self, input):
         """
         Generate an audio transcript from a user query.
@@ -263,7 +279,13 @@ class OpenAIAdapter(LLMInterface):
 
         return transcription
 
-    @rate_limit_async
+    @retry(
+        stop=stop_after_delay(128),
+        wait=wait_exponential_jitter(2, 128),
+        retry=retry_if_not_exception_type(litellm.exceptions.NotFoundError),
+        before_sleep=before_sleep_log(logger, logging.DEBUG),
+        reraise=True,
+    )
     async def transcribe_image(self, input) -> BaseModel:
         """
         Generate a transcription of an image from a user query.
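
Note on the retry change above: the custom rate-limiter decorators are replaced by a uniform tenacity policy that retries for up to 128 seconds total with jittered exponential backoff (2s initial, 128s cap), skips retries for litellm's NotFoundError, logs before each sleep, and re-raises the original exception. A minimal standalone sketch of the same policy follows; flaky_call, the attempt counter, and the NotFoundError stand-in are illustrative, not cognee code.

import logging
from tenacity import (
    retry,
    stop_after_delay,
    wait_exponential_jitter,
    retry_if_not_exception_type,
    before_sleep_log,
)

logger = logging.getLogger(__name__)


class NotFoundError(Exception):
    """Illustrative stand-in for litellm.exceptions.NotFoundError (never retried)."""


attempts = {"n": 0}


@retry(
    stop=stop_after_delay(128),  # give up once 128 seconds have elapsed overall
    wait=wait_exponential_jitter(2, 128),  # backoff starts at 2s, capped at 128s
    retry=retry_if_not_exception_type(NotFoundError),  # retry everything else
    before_sleep=before_sleep_log(logger, logging.DEBUG),  # log each backoff
    reraise=True,  # surface the original exception, not tenacity's RetryError
)
def flaky_call() -> str:
    attempts["n"] += 1
    if attempts["n"] < 3:
        raise TimeoutError("transient failure")  # retried with backoff
    return "ok"


print(flaky_call())  # "ok" after two retried failures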
--- a/cognee/infrastructure/loaders/LoaderEngine.py
+++ b/cognee/infrastructure/loaders/LoaderEngine.py
@@ -27,11 +27,11 @@ class LoaderEngine:
 
         self.default_loader_priority = [
             "text_loader",
-            "advanced_pdf_loader",
             "pypdf_loader",
             "image_loader",
             "audio_loader",
             "unstructured_loader",
+            "advanced_pdf_loader",
         ]
 
     def register_loader(self, loader: LoaderInterface) -> bool:
@@ -64,7 +64,9 @@ class LoaderEngine:
         return True
 
     def get_loader(
-        self, file_path: str, preferred_loaders: List[str] = None
+        self,
+        file_path: str,
+        preferred_loaders: dict[str, dict[str, Any]],
     ) -> Optional[LoaderInterface]:
         """
         Get appropriate loader for a file.
@@ -76,14 +78,21 @@ class LoaderEngine:
         Returns:
             LoaderInterface that can handle the file, or None if not found
         """
+        from pathlib import Path
 
         file_info = filetype.guess(file_path)
 
+        path_extension = Path(file_path).suffix.lstrip(".")
+
         # Try preferred loaders first
         if preferred_loaders:
            for loader_name in preferred_loaders:
                if loader_name in self._loaders:
                    loader = self._loaders[loader_name]
+                    # Try with path extension first (for text formats like html)
+                    if loader.can_handle(extension=path_extension, mime_type=file_info.mime):
+                        return loader
+                    # Fall back to content-detected extension
                    if loader.can_handle(extension=file_info.extension, mime_type=file_info.mime):
                        return loader
                else:
@@ -93,6 +102,10 @@ class LoaderEngine:
         for loader_name in self.default_loader_priority:
             if loader_name in self._loaders:
                 loader = self._loaders[loader_name]
+                # Try with path extension first (for text formats like html)
+                if loader.can_handle(extension=path_extension, mime_type=file_info.mime):
+                    return loader
+                # Fall back to content-detected extension
                 if loader.can_handle(extension=file_info.extension, mime_type=file_info.mime):
                     return loader
             else:
@@ -105,8 +118,7 @@ class LoaderEngine:
     async def load_file(
         self,
         file_path: str,
-        file_stream: Optional[Any],
-        preferred_loaders: Optional[List[str]] = None,
+        preferred_loaders: dict[str, dict[str, Any]] = None,
         **kwargs,
     ):
         """
@@ -114,7 +126,7 @@ class LoaderEngine:
 
         Args:
             file_path: Path to the file to be processed
-            preferred_loaders: List of preferred loader names to try first
+            preferred_loaders: Dict of loader names to their configurations
             **kwargs: Additional loader-specific configuration
 
         Raises:
@@ -126,8 +138,16 @@ class LoaderEngine:
             raise ValueError(f"No loader found for file: {file_path}")
 
         logger.debug(f"Loading {file_path} with {loader.loader_name}")
-        # TODO: loading needs to be reworked to work with both file streams and file locations
-        return await loader.load(file_path, **kwargs)
+
+        # Extract loader-specific config from preferred_loaders
+        loader_config = {}
+        if preferred_loaders and loader.loader_name in preferred_loaders:
+            loader_config = preferred_loaders[loader.loader_name]
+
+        # Merge with any additional kwargs (kwargs take precedence)
+        merged_kwargs = {**loader_config, **kwargs}
+
+        return await loader.load(file_path, **merged_kwargs)
 
     def get_available_loaders(self) -> List[str]:
         """
--- a/cognee/infrastructure/loaders/external/__init__.py
+++ b/cognee/infrastructure/loaders/external/__init__.py
@@ -27,3 +27,10 @@ try:
     __all__.append("AdvancedPdfLoader")
 except ImportError:
     pass
+
+try:
+    from .beautiful_soup_loader import BeautifulSoupLoader
+
+    __all__.append("BeautifulSoupLoader")
+except ImportError:
+    pass
--- a/cognee/infrastructure/loaders/external/advanced_pdf_loader.py
+++ b/cognee/infrastructure/loaders/external/advanced_pdf_loader.py
@@ -14,14 +14,6 @@ from cognee.infrastructure.loaders.external.pypdf_loader import PyPdfLoader
 
 logger = get_logger(__name__)
 
-try:
-    from unstructured.partition.pdf import partition_pdf
-except ImportError as e:
-    logger.info(
-        "unstructured[pdf] not installed, can't use AdvancedPdfLoader, will use PyPdfLoader instead."
-    )
-    raise ImportError from e
-
 
 @dataclass
 class _PageBuffer:
@@ -88,6 +80,8 @@ class AdvancedPdfLoader(LoaderInterface):
             **kwargs,
         }
         # Use partition to extract elements
+        from unstructured.partition.pdf import partition_pdf
+
         elements = partition_pdf(**partition_kwargs)
 
         # Process elements into text content
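
Note: moving the unstructured import from module scope into load means importing advanced_pdf_loader no longer raises when unstructured[pdf] is missing; the dependency is only required when the loader actually runs. A sketch of the deferred-import pattern (assumes unstructured is installed at call time; load_pdf is an illustrative helper):

def load_pdf(path: str) -> list:
    # Optional dependency imported lazily: importing this module stays cheap
    # and safe even when unstructured[pdf] is not installed.
    from unstructured.partition.pdf import partition_pdf

    return partition_pdf(filename=path)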
--- /dev/null
+++ b/cognee/infrastructure/loaders/external/beautiful_soup_loader.py
@@ -0,0 +1,310 @@
+"""BeautifulSoup-based web crawler for extracting content from web pages.
+
+This module provides the BeautifulSoupCrawler class for fetching and extracting content
+from web pages using BeautifulSoup or Playwright for JavaScript-rendered pages. It
+supports robots.txt handling, rate limiting, and custom extraction rules.
+"""
+
+from typing import Union, Dict, Any, Optional, List
+from dataclasses import dataclass
+from bs4 import BeautifulSoup
+from cognee.infrastructure.loaders.LoaderInterface import LoaderInterface
+from cognee.shared.logging_utils import get_logger
+
+logger = get_logger(__name__)
+
+
+@dataclass
+class ExtractionRule:
+    """Normalized extraction rule for web content.
+
+    Attributes:
+        selector: CSS selector for extraction (if any).
+        xpath: XPath expression for extraction (if any).
+        attr: HTML attribute to extract (if any).
+        all: If True, extract all matching elements; otherwise, extract first.
+        join_with: String to join multiple extracted elements.
+    """
+
+    selector: Optional[str] = None
+    xpath: Optional[str] = None
+    attr: Optional[str] = None
+    all: bool = False
+    join_with: str = " "
+
+
+class BeautifulSoupLoader(LoaderInterface):
+    """Crawler for fetching and extracting web content using BeautifulSoup.
+
+    Supports asynchronous HTTP requests, Playwright for JavaScript rendering, robots.txt
+    compliance, and rate limiting. Extracts content using CSS selectors or XPath rules.
+
+    Attributes:
+        concurrency: Number of concurrent requests allowed.
+        crawl_delay: Minimum seconds between requests to the same domain.
+        max_crawl_delay: Maximum crawl delay to respect from robots.txt (None = no limit).
+        timeout: Per-request timeout in seconds.
+        max_retries: Number of retries for failed requests.
+        retry_delay_factor: Multiplier for exponential backoff on retries.
+        headers: HTTP headers for requests (e.g., User-Agent).
+        robots_cache_ttl: Time-to-live for robots.txt cache in seconds.
+    """
+
+    @property
+    def supported_extensions(self) -> List[str]:
+        return ["html"]
+
+    @property
+    def supported_mime_types(self) -> List[str]:
+        return ["text/html", "text/plain"]
+
+    @property
+    def loader_name(self) -> str:
+        return "beautiful_soup_loader"
+
+    def can_handle(self, extension: str, mime_type: str) -> bool:
+        can = extension in self.supported_extensions and mime_type in self.supported_mime_types
+        return can
+
+    def _get_default_extraction_rules(self):
+        # Comprehensive default extraction rules for common HTML content
+        return {
+            # Meta information
+            "title": {"selector": "title", "all": False},
+            "meta_description": {
+                "selector": "meta[name='description']",
+                "attr": "content",
+                "all": False,
+            },
+            "meta_keywords": {
+                "selector": "meta[name='keywords']",
+                "attr": "content",
+                "all": False,
+            },
+            # Open Graph meta tags
+            "og_title": {
+                "selector": "meta[property='og:title']",
+                "attr": "content",
+                "all": False,
+            },
+            "og_description": {
+                "selector": "meta[property='og:description']",
+                "attr": "content",
+                "all": False,
+            },
+            # Main content areas (prioritized selectors)
+            "article": {"selector": "article", "all": True, "join_with": "\n\n"},
+            "main": {"selector": "main", "all": True, "join_with": "\n\n"},
+            # Semantic content sections
+            "headers_h1": {"selector": "h1", "all": True, "join_with": "\n"},
+            "headers_h2": {"selector": "h2", "all": True, "join_with": "\n"},
+            "headers_h3": {"selector": "h3", "all": True, "join_with": "\n"},
+            "headers_h4": {"selector": "h4", "all": True, "join_with": "\n"},
+            "headers_h5": {"selector": "h5", "all": True, "join_with": "\n"},
+            "headers_h6": {"selector": "h6", "all": True, "join_with": "\n"},
+            # Text content
+            "paragraphs": {"selector": "p", "all": True, "join_with": "\n\n"},
+            "blockquotes": {"selector": "blockquote", "all": True, "join_with": "\n\n"},
+            "preformatted": {"selector": "pre", "all": True, "join_with": "\n\n"},
+            # Lists
+            "ordered_lists": {"selector": "ol", "all": True, "join_with": "\n"},
+            "unordered_lists": {"selector": "ul", "all": True, "join_with": "\n"},
+            "list_items": {"selector": "li", "all": True, "join_with": "\n"},
+            "definition_lists": {"selector": "dl", "all": True, "join_with": "\n"},
+            # Tables
+            "tables": {"selector": "table", "all": True, "join_with": "\n\n"},
+            "table_captions": {
+                "selector": "caption",
+                "all": True,
+                "join_with": "\n",
+            },
+            # Code blocks
+            "code_blocks": {"selector": "code", "all": True, "join_with": "\n"},
+            # Figures and media descriptions
+            "figures": {"selector": "figure", "all": True, "join_with": "\n\n"},
+            "figcaptions": {"selector": "figcaption", "all": True, "join_with": "\n"},
+            "image_alts": {"selector": "img", "attr": "alt", "all": True, "join_with": " "},
+            # Links (text content, not URLs to avoid clutter)
+            "link_text": {"selector": "a", "all": True, "join_with": " "},
+            # Emphasized text
+            "strong": {"selector": "strong", "all": True, "join_with": " "},
+            "emphasis": {"selector": "em", "all": True, "join_with": " "},
+            "marked": {"selector": "mark", "all": True, "join_with": " "},
+            # Time and data elements
+            "time": {"selector": "time", "all": True, "join_with": " "},
+            "data": {"selector": "data", "all": True, "join_with": " "},
+            # Sections and semantic structure
+            "sections": {"selector": "section", "all": True, "join_with": "\n\n"},
+            "asides": {"selector": "aside", "all": True, "join_with": "\n\n"},
+            "details": {"selector": "details", "all": True, "join_with": "\n"},
+            "summary": {"selector": "summary", "all": True, "join_with": "\n"},
+            # Navigation (may contain important links/structure)
+            "nav": {"selector": "nav", "all": True, "join_with": "\n"},
+            # Footer information
+            "footer": {"selector": "footer", "all": True, "join_with": "\n"},
+            # Divs with specific content roles
+            "content_divs": {
+                "selector": "div[role='main'], div[role='article'], div.content, div#content",
+                "all": True,
+                "join_with": "\n\n",
+            },
+        }
+
+    async def load(
+        self,
+        file_path: str,
+        extraction_rules: dict[str, Any] = None,
+        join_all_matches: bool = False,
+        **kwargs,
+    ):
+        """Load an HTML file, extract content, and save to storage.
+
+        Args:
+            file_path: Path to the HTML file
+            extraction_rules: Dict of CSS selector rules for content extraction
+            join_all_matches: If True, extract all matching elements for each rule
+            **kwargs: Additional arguments
+
+        Returns:
+            Path to the stored extracted text file
+        """
+        if extraction_rules is None:
+            extraction_rules = self._get_default_extraction_rules()
+            logger.info("Using default comprehensive extraction rules for HTML content")
+
+        logger.info(f"Processing HTML file: {file_path}")
+
+        from cognee.infrastructure.files.utils.get_file_metadata import get_file_metadata
+        from cognee.infrastructure.files.storage import get_file_storage, get_storage_config
+
+        with open(file_path, "rb") as f:
+            file_metadata = await get_file_metadata(f)
+            f.seek(0)
+            html = f.read()
+
+        storage_file_name = "text_" + file_metadata["content_hash"] + ".txt"
+
+        # Normalize extraction rules
+        normalized_rules: List[ExtractionRule] = []
+        for _, rule in extraction_rules.items():
+            r = self._normalize_rule(rule)
+            if join_all_matches:
+                r.all = True
+            normalized_rules.append(r)
+
+        pieces = []
+        for rule in normalized_rules:
+            text = self._extract_from_html(html, rule)
+            if text:
+                pieces.append(text)
+
+        full_content = " ".join(pieces).strip()
+
+        # remove after defaults for extraction rules
+        # Fallback: If no content extracted, check if the file is plain text (not HTML)
+        if not full_content:
+            from bs4 import BeautifulSoup
+
+            soup = BeautifulSoup(html, "html.parser")
+            # If there are no HTML tags, treat as plain text
+            if not soup.find():
+                logger.warning(
+                    f"No HTML tags found in {file_path}. Treating as plain text. "
+                    "This may happen when content is pre-extracted (e.g., via Tavily with text format)."
+                )
+                full_content = html.decode("utf-8") if isinstance(html, bytes) else html
+                full_content = full_content.strip()
+
+        if not full_content:
+            logger.warning(f"No content extracted from HTML file: {file_path}")
+
+        # Store the extracted content
+        storage_config = get_storage_config()
+        data_root_directory = storage_config["data_root_directory"]
+        storage = get_file_storage(data_root_directory)
+
+        full_file_path = await storage.store(storage_file_name, full_content)
+
+        logger.info(f"Extracted {len(full_content)} characters from HTML")
+        return full_file_path
+
+    def _normalize_rule(self, rule: Union[str, Dict[str, Any]]) -> ExtractionRule:
+        """Normalize an extraction rule to an ExtractionRule dataclass.
+
+        Args:
+            rule: A string (CSS selector) or dict with extraction parameters.
+
+        Returns:
+            ExtractionRule: Normalized extraction rule.
+
+        Raises:
+            ValueError: If the rule is invalid.
+        """
+        if isinstance(rule, str):
+            return ExtractionRule(selector=rule)
+        if isinstance(rule, dict):
+            return ExtractionRule(
+                selector=rule.get("selector"),
+                xpath=rule.get("xpath"),
+                attr=rule.get("attr"),
+                all=bool(rule.get("all", False)),
+                join_with=rule.get("join_with", " "),
+            )
+        raise ValueError(f"Invalid extraction rule: {rule}")
+
+    def _extract_from_html(self, html: str, rule: ExtractionRule) -> str:
+        """Extract content from HTML using BeautifulSoup or lxml XPath.
+
+        Args:
+            html: The HTML content to extract from.
+            rule: The extraction rule to apply.
+
+        Returns:
+            str: The extracted content.
+
+        Raises:
+            RuntimeError: If XPath is used but lxml is not installed.
+        """
+        soup = BeautifulSoup(html, "html.parser")
+
+        if rule.xpath:
+            try:
+                from lxml import html as lxml_html
+            except ImportError:
+                raise RuntimeError(
+                    "XPath requested but lxml is not available. Install lxml or use CSS selectors."
+                )
+            doc = lxml_html.fromstring(html)
+            nodes = doc.xpath(rule.xpath)
+            texts = []
+            for n in nodes:
+                if hasattr(n, "text_content"):
+                    texts.append(n.text_content().strip())
+                else:
+                    texts.append(str(n).strip())
+            return rule.join_with.join(t for t in texts if t)
+
+        if not rule.selector:
+            return ""
+
+        if rule.all:
+            nodes = soup.select(rule.selector)
+            pieces = []
+            for el in nodes:
+                if rule.attr:
+                    val = el.get(rule.attr)
+                    if val:
+                        pieces.append(val.strip())
+                else:
+                    text = el.get_text(strip=True)
+                    if text:
+                        pieces.append(text)
+            return rule.join_with.join(pieces).strip()
+        else:
+            el = soup.select_one(rule.selector)
+            if el is None:
+                return ""
+            if rule.attr:
+                val = el.get(rule.attr)
+                return (val or "").strip()
+            return el.get_text(strip=True)
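
Note on the new loader above: each extraction rule pairs a CSS selector (or XPath) with an optional attribute and a join strategy, as implemented in _extract_from_html. A standalone sketch of the same selector-driven extraction, runnable with only beautifulsoup4 installed; the sample HTML is illustrative:

from bs4 import BeautifulSoup

html = """
<html>
  <head>
    <title>Release notes</title>
    <meta name="description" content="What changed">
  </head>
  <body><p>First paragraph.</p><p>Second paragraph.</p></body>
</html>
"""
soup = BeautifulSoup(html, "html.parser")

# Rule {"selector": "title", "all": False}: first match, text content
title = soup.select_one("title").get_text(strip=True)

# Rule {"selector": "meta[name='description']", "attr": "content"}: attribute value
description = (soup.select_one("meta[name='description']").get("content") or "").strip()

# Rule {"selector": "p", "all": True, "join_with": "\n\n"}: all matches, joined
paragraphs = "\n\n".join(p.get_text(strip=True) for p in soup.select("p"))

print(title)        # Release notes
print(description)  # What changed
print(paragraphs)   # First paragraph. / Second paragraph.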
--- a/cognee/infrastructure/loaders/supported_loaders.py
+++ b/cognee/infrastructure/loaders/supported_loaders.py
@@ -23,3 +23,10 @@ try:
     supported_loaders[AdvancedPdfLoader.loader_name] = AdvancedPdfLoader
 except ImportError:
     pass
+
+try:
+    from cognee.infrastructure.loaders.external import BeautifulSoupLoader
+
+    supported_loaders[BeautifulSoupLoader.loader_name] = BeautifulSoupLoader
+except ImportError:
+    pass
--- a/cognee/infrastructure/files/exceptions.py
+++ b/cognee/infrastructure/files/exceptions.py
@@ -10,7 +10,7 @@ class UnstructuredLibraryImportError(CogneeConfigurationError):
         self,
         message: str = "Import error. Unstructured library is not installed.",
         name: str = "UnstructuredModuleImportError",
-        status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
+        status_code=status.HTTP_422_UNPROCESSABLE_CONTENT,
     ):
         super().__init__(message, name, status_code)
 
--- a/cognee/modules/data/methods/__init__.py
+++ b/cognee/modules/data/methods/__init__.py
@@ -23,3 +23,6 @@ from .create_authorized_dataset import create_authorized_dataset
 
 # Check
 from .check_dataset_name import check_dataset_name
+
+# Boolean check
+from .has_dataset_data import has_dataset_data
--- a/cognee/modules/data/methods/get_dataset_data.py
+++ b/cognee/modules/data/methods/get_dataset_data.py
@@ -9,7 +9,10 @@ async def get_dataset_data(dataset_id: UUID) -> list[Data]:
 
     async with db_engine.get_async_session() as session:
         result = await session.execute(
-            select(Data).join(Data.datasets).filter((Dataset.id == dataset_id))
+            select(Data)
+            .join(Data.datasets)
+            .filter((Dataset.id == dataset_id))
+            .order_by(Data.data_size.desc())
         )
 
         data = list(result.scalars().all())
--- /dev/null
+++ b/cognee/modules/data/methods/has_dataset_data.py
@@ -0,0 +1,21 @@
+from uuid import UUID
+
+from sqlalchemy import select
+from sqlalchemy.sql import func
+
+from cognee.infrastructure.databases.relational import get_relational_engine
+from cognee.modules.data.models import DatasetData
+
+
+async def has_dataset_data(dataset_id: UUID) -> bool:
+    db_engine = get_relational_engine()
+
+    async with db_engine.get_async_session() as session:
+        count_query = (
+            select(func.count())
+            .select_from(DatasetData)
+            .where(DatasetData.dataset_id == dataset_id)
+        )
+        count = await session.execute(count_query)
+
+        return count.scalar_one() > 0
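
Note: has_dataset_data is a COUNT-based existence check over the DatasetData association table. A synchronous sketch of the same query shape against an in-memory SQLite database; the integer-keyed model here is an illustrative stand-in for cognee's UUID-keyed one:

from sqlalchemy import Column, Integer, create_engine, select
from sqlalchemy.orm import Session, declarative_base
from sqlalchemy.sql import func

Base = declarative_base()


class DatasetData(Base):  # illustrative stand-in for cognee's model
    __tablename__ = "dataset_data"
    id = Column(Integer, primary_key=True)
    dataset_id = Column(Integer, index=True)


engine = create_engine("sqlite://")
Base.metadata.create_all(engine)

with Session(engine) as session:
    session.add(DatasetData(dataset_id=1))
    session.commit()

    count_query = (
        select(func.count())
        .select_from(DatasetData)
        .where(DatasetData.dataset_id == 1)
    )
    print(session.execute(count_query).scalar_one() > 0)  # True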
--- a/cognee/modules/engine/models/TableRow.py
+++ b/cognee/modules/engine/models/TableRow.py
@@ -5,7 +5,6 @@ from typing import Optional
 
 class TableRow(DataPoint):
     name: str
-    is_a: Optional[TableType] = None
     description: str
     properties: str
 
--- a/cognee/modules/ingestion/save_data_to_file.py
+++ b/cognee/modules/ingestion/save_data_to_file.py
@@ -1,10 +1,12 @@
-from typing import BinaryIO, Union
+from typing import BinaryIO, Union, Optional
 from cognee.infrastructure.files.storage import get_file_storage, get_storage_config
 from .classify import classify
 import hashlib
 
 
-async def save_data_to_file(data: Union[str, BinaryIO], filename: str = None):
+async def save_data_to_file(
+    data: Union[str, BinaryIO], filename: str = None, file_extension: Optional[str] = None
+):
     storage_config = get_storage_config()
 
     data_root_directory = storage_config["data_root_directory"]
@@ -21,6 +23,11 @@ async def save_data_to_file(data: Union[str, BinaryIO], filename: str = None):
 
     file_name = file_metadata["name"]
 
+    if file_extension is not None:
+        extension = file_extension.lstrip(".")
+        file_name_without_ext = file_name.rsplit(".", 1)[0]
+        file_name = f"{file_name_without_ext}.{extension}"
+
     storage = get_file_storage(data_root_directory)
 
     full_file_path = await storage.store(file_name, data)
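
Note on the new file_extension parameter above: it rewrites the stored file's suffix after metadata-based naming, and the leading dot is optional. A minimal sketch of just that renaming step (apply_extension is an illustrative helper, not cognee's API):

from typing import Optional


def apply_extension(file_name: str, file_extension: Optional[str]) -> str:
    if file_extension is None:
        return file_name
    extension = file_extension.lstrip(".")
    file_name_without_ext = file_name.rsplit(".", 1)[0]
    return f"{file_name_without_ext}.{extension}"


print(apply_extension("text_abc123.bin", ".html"))  # text_abc123.html
print(apply_extension("text_abc123.bin", None))     # text_abc123.bin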
--- a/cognee/modules/pipelines/exceptions/exceptions.py
+++ b/cognee/modules/pipelines/exceptions/exceptions.py
@@ -7,6 +7,6 @@ class PipelineRunFailedError(CogneeSystemError):
         self,
         message: str = "Pipeline run failed.",
         name: str = "PipelineRunFailedError",
-        status_code: int = status.HTTP_422_UNPROCESSABLE_ENTITY,
+        status_code: int = status.HTTP_422_UNPROCESSABLE_CONTENT,
     ):
         super().__init__(message, name, status_code)