cognee 0.3.6__py3-none-any.whl → 0.3.7.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cognee/__init__.py +1 -0
- cognee/api/health.py +2 -12
- cognee/api/v1/add/add.py +46 -6
- cognee/api/v1/add/routers/get_add_router.py +11 -2
- cognee/api/v1/cognify/cognify.py +29 -9
- cognee/api/v1/cognify/routers/get_cognify_router.py +2 -1
- cognee/api/v1/datasets/datasets.py +11 -0
- cognee/api/v1/datasets/routers/get_datasets_router.py +8 -0
- cognee/api/v1/delete/routers/get_delete_router.py +2 -0
- cognee/api/v1/memify/routers/get_memify_router.py +2 -1
- cognee/api/v1/permissions/routers/get_permissions_router.py +6 -0
- cognee/api/v1/responses/default_tools.py +0 -1
- cognee/api/v1/responses/dispatch_function.py +1 -1
- cognee/api/v1/responses/routers/default_tools.py +0 -1
- cognee/api/v1/search/routers/get_search_router.py +3 -3
- cognee/api/v1/search/search.py +11 -9
- cognee/api/v1/settings/routers/get_settings_router.py +7 -1
- cognee/api/v1/sync/routers/get_sync_router.py +3 -0
- cognee/api/v1/ui/ui.py +45 -16
- cognee/api/v1/update/routers/get_update_router.py +3 -1
- cognee/api/v1/update/update.py +3 -3
- cognee/api/v1/users/routers/get_visualize_router.py +2 -0
- cognee/cli/_cognee.py +61 -10
- cognee/cli/commands/add_command.py +3 -3
- cognee/cli/commands/cognify_command.py +3 -3
- cognee/cli/commands/config_command.py +9 -7
- cognee/cli/commands/delete_command.py +3 -3
- cognee/cli/commands/search_command.py +3 -7
- cognee/cli/config.py +0 -1
- cognee/context_global_variables.py +5 -0
- cognee/exceptions/exceptions.py +1 -1
- cognee/infrastructure/databases/cache/__init__.py +2 -0
- cognee/infrastructure/databases/cache/cache_db_interface.py +79 -0
- cognee/infrastructure/databases/cache/config.py +44 -0
- cognee/infrastructure/databases/cache/get_cache_engine.py +67 -0
- cognee/infrastructure/databases/cache/redis/RedisAdapter.py +243 -0
- cognee/infrastructure/databases/exceptions/__init__.py +1 -0
- cognee/infrastructure/databases/exceptions/exceptions.py +18 -2
- cognee/infrastructure/databases/graph/get_graph_engine.py +1 -1
- cognee/infrastructure/databases/graph/graph_db_interface.py +5 -0
- cognee/infrastructure/databases/graph/kuzu/adapter.py +76 -47
- cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +13 -3
- cognee/infrastructure/databases/graph/neo4j_driver/deadlock_retry.py +1 -1
- cognee/infrastructure/databases/graph/neptune_driver/neptune_utils.py +1 -1
- cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +1 -1
- cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +21 -3
- cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +17 -10
- cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +17 -4
- cognee/infrastructure/databases/vector/embeddings/config.py +2 -3
- cognee/infrastructure/databases/vector/exceptions/exceptions.py +1 -1
- cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +0 -1
- cognee/infrastructure/files/exceptions.py +1 -1
- cognee/infrastructure/files/storage/LocalFileStorage.py +9 -9
- cognee/infrastructure/files/storage/S3FileStorage.py +11 -11
- cognee/infrastructure/files/utils/guess_file_type.py +6 -0
- cognee/infrastructure/llm/prompts/feedback_reaction_prompt.txt +14 -0
- cognee/infrastructure/llm/prompts/feedback_report_prompt.txt +13 -0
- cognee/infrastructure/llm/prompts/feedback_user_context_prompt.txt +5 -0
- cognee/infrastructure/llm/prompts/search_type_selector_prompt.txt +0 -5
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py +19 -9
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py +17 -5
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py +17 -5
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py +32 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/__init__.py +0 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/adapter.py +109 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py +33 -8
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +40 -18
- cognee/infrastructure/loaders/LoaderEngine.py +27 -7
- cognee/infrastructure/loaders/external/__init__.py +7 -0
- cognee/infrastructure/loaders/external/advanced_pdf_loader.py +2 -8
- cognee/infrastructure/loaders/external/beautiful_soup_loader.py +310 -0
- cognee/infrastructure/loaders/supported_loaders.py +7 -0
- cognee/modules/data/exceptions/exceptions.py +1 -1
- cognee/modules/data/methods/__init__.py +3 -0
- cognee/modules/data/methods/get_dataset_data.py +4 -1
- cognee/modules/data/methods/has_dataset_data.py +21 -0
- cognee/modules/engine/models/TableRow.py +0 -1
- cognee/modules/ingestion/save_data_to_file.py +9 -2
- cognee/modules/pipelines/exceptions/exceptions.py +1 -1
- cognee/modules/pipelines/operations/pipeline.py +12 -1
- cognee/modules/pipelines/operations/run_tasks.py +25 -197
- cognee/modules/pipelines/operations/run_tasks_base.py +7 -0
- cognee/modules/pipelines/operations/run_tasks_data_item.py +260 -0
- cognee/modules/pipelines/operations/run_tasks_distributed.py +121 -38
- cognee/modules/pipelines/operations/run_tasks_with_telemetry.py +9 -1
- cognee/modules/retrieval/EntityCompletionRetriever.py +48 -8
- cognee/modules/retrieval/base_graph_retriever.py +3 -1
- cognee/modules/retrieval/base_retriever.py +3 -1
- cognee/modules/retrieval/chunks_retriever.py +5 -1
- cognee/modules/retrieval/code_retriever.py +20 -2
- cognee/modules/retrieval/completion_retriever.py +50 -9
- cognee/modules/retrieval/cypher_search_retriever.py +11 -1
- cognee/modules/retrieval/graph_completion_context_extension_retriever.py +47 -8
- cognee/modules/retrieval/graph_completion_cot_retriever.py +152 -22
- cognee/modules/retrieval/graph_completion_retriever.py +54 -10
- cognee/modules/retrieval/lexical_retriever.py +20 -2
- cognee/modules/retrieval/natural_language_retriever.py +10 -1
- cognee/modules/retrieval/summaries_retriever.py +5 -1
- cognee/modules/retrieval/temporal_retriever.py +62 -10
- cognee/modules/retrieval/user_qa_feedback.py +3 -2
- cognee/modules/retrieval/utils/completion.py +30 -4
- cognee/modules/retrieval/utils/description_to_codepart_search.py +1 -1
- cognee/modules/retrieval/utils/session_cache.py +156 -0
- cognee/modules/search/methods/get_search_type_tools.py +0 -5
- cognee/modules/search/methods/no_access_control_search.py +12 -1
- cognee/modules/search/methods/search.py +51 -5
- cognee/modules/search/types/SearchType.py +0 -1
- cognee/modules/settings/get_settings.py +23 -0
- cognee/modules/users/methods/get_authenticated_user.py +3 -1
- cognee/modules/users/methods/get_default_user.py +1 -6
- cognee/modules/users/roles/methods/create_role.py +2 -2
- cognee/modules/users/tenants/methods/create_tenant.py +2 -2
- cognee/shared/exceptions/exceptions.py +1 -1
- cognee/shared/logging_utils.py +18 -11
- cognee/shared/utils.py +24 -2
- cognee/tasks/codingagents/coding_rule_associations.py +1 -2
- cognee/tasks/documents/exceptions/exceptions.py +1 -1
- cognee/tasks/feedback/__init__.py +13 -0
- cognee/tasks/feedback/create_enrichments.py +84 -0
- cognee/tasks/feedback/extract_feedback_interactions.py +230 -0
- cognee/tasks/feedback/generate_improved_answers.py +130 -0
- cognee/tasks/feedback/link_enrichments_to_feedback.py +67 -0
- cognee/tasks/feedback/models.py +26 -0
- cognee/tasks/graph/extract_graph_from_data.py +2 -0
- cognee/tasks/ingestion/data_item_to_text_file.py +3 -3
- cognee/tasks/ingestion/ingest_data.py +11 -5
- cognee/tasks/ingestion/save_data_item_to_storage.py +12 -1
- cognee/tasks/storage/add_data_points.py +3 -10
- cognee/tasks/storage/index_data_points.py +19 -14
- cognee/tasks/storage/index_graph_edges.py +25 -11
- cognee/tasks/web_scraper/__init__.py +34 -0
- cognee/tasks/web_scraper/config.py +26 -0
- cognee/tasks/web_scraper/default_url_crawler.py +446 -0
- cognee/tasks/web_scraper/models.py +46 -0
- cognee/tasks/web_scraper/types.py +4 -0
- cognee/tasks/web_scraper/utils.py +142 -0
- cognee/tasks/web_scraper/web_scraper_task.py +396 -0
- cognee/tests/cli_tests/cli_unit_tests/test_cli_utils.py +0 -1
- cognee/tests/integration/web_url_crawler/test_default_url_crawler.py +13 -0
- cognee/tests/integration/web_url_crawler/test_tavily_crawler.py +19 -0
- cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py +344 -0
- cognee/tests/subprocesses/reader.py +25 -0
- cognee/tests/subprocesses/simple_cognify_1.py +31 -0
- cognee/tests/subprocesses/simple_cognify_2.py +31 -0
- cognee/tests/subprocesses/writer.py +32 -0
- cognee/tests/tasks/descriptive_metrics/metrics_test_utils.py +0 -2
- cognee/tests/tasks/descriptive_metrics/neo4j_metrics_test.py +8 -3
- cognee/tests/tasks/entity_extraction/entity_extraction_test.py +89 -0
- cognee/tests/tasks/web_scraping/web_scraping_test.py +172 -0
- cognee/tests/test_add_docling_document.py +56 -0
- cognee/tests/test_chromadb.py +7 -11
- cognee/tests/test_concurrent_subprocess_access.py +76 -0
- cognee/tests/test_conversation_history.py +240 -0
- cognee/tests/test_feedback_enrichment.py +174 -0
- cognee/tests/test_kuzu.py +27 -15
- cognee/tests/test_lancedb.py +7 -11
- cognee/tests/test_library.py +32 -2
- cognee/tests/test_neo4j.py +24 -16
- cognee/tests/test_neptune_analytics_vector.py +7 -11
- cognee/tests/test_permissions.py +9 -13
- cognee/tests/test_pgvector.py +4 -4
- cognee/tests/test_remote_kuzu.py +8 -11
- cognee/tests/test_s3_file_storage.py +1 -1
- cognee/tests/test_search_db.py +6 -8
- cognee/tests/unit/infrastructure/databases/cache/test_cache_config.py +89 -0
- cognee/tests/unit/modules/retrieval/conversation_history_test.py +154 -0
- cognee/tests/unit/modules/retrieval/graph_completion_retriever_cot_test.py +51 -0
- {cognee-0.3.6.dist-info → cognee-0.3.7.dev1.dist-info}/METADATA +21 -6
- {cognee-0.3.6.dist-info → cognee-0.3.7.dev1.dist-info}/RECORD +178 -139
- {cognee-0.3.6.dist-info → cognee-0.3.7.dev1.dist-info}/entry_points.txt +1 -0
- distributed/Dockerfile +0 -3
- distributed/entrypoint.py +21 -9
- distributed/signal.py +5 -0
- distributed/workers/data_point_saving_worker.py +64 -34
- distributed/workers/graph_saving_worker.py +71 -47
- cognee/infrastructure/databases/graph/memgraph/memgraph_adapter.py +0 -1116
- cognee/modules/retrieval/insights_retriever.py +0 -133
- cognee/tests/test_memgraph.py +0 -109
- cognee/tests/unit/modules/retrieval/insights_retriever_test.py +0 -251
- {cognee-0.3.6.dist-info → cognee-0.3.7.dev1.dist-info}/WHEEL +0 -0
- {cognee-0.3.6.dist-info → cognee-0.3.7.dev1.dist-info}/licenses/LICENSE +0 -0
- {cognee-0.3.6.dist-info → cognee-0.3.7.dev1.dist-info}/licenses/NOTICE.md +0 -0
cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py
CHANGED
@@ -7,6 +7,15 @@ from openai import ContentFilterFinishReasonError
 from litellm.exceptions import ContentPolicyViolationError
 from instructor.core import InstructorRetryException

+import logging
+from tenacity import (
+    retry,
+    stop_after_delay,
+    wait_exponential_jitter,
+    retry_if_not_exception_type,
+    before_sleep_log,
+)
+
 from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.llm_interface import (
     LLMInterface,
 )
@@ -14,19 +23,13 @@ from cognee.infrastructure.llm.exceptions import (
     ContentPolicyFilterError,
 )
 from cognee.infrastructure.files.utils.open_data_file import open_data_file
-from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.rate_limiter import (
-    rate_limit_async,
-    rate_limit_sync,
-    sleep_and_retry_async,
-    sleep_and_retry_sync,
-)
 from cognee.modules.observability.get_observe import get_observe
 from cognee.shared.logging_utils import get_logger

-observe = get_observe()
-
 logger = get_logger()

+observe = get_observe()
+

 class OpenAIAdapter(LLMInterface):
     """
@@ -97,8 +100,13 @@ class OpenAIAdapter(LLMInterface):
         self.fallback_endpoint = fallback_endpoint

     @observe(as_type="generation")
-    @
-
+    @retry(
+        stop=stop_after_delay(128),
+        wait=wait_exponential_jitter(2, 128),
+        retry=retry_if_not_exception_type(litellm.exceptions.NotFoundError),
+        before_sleep=before_sleep_log(logger, logging.DEBUG),
+        reraise=True,
+    )
     async def acreate_structured_output(
         self, text_input: str, system_prompt: str, response_model: Type[BaseModel]
     ) -> BaseModel:
@@ -148,10 +156,7 @@ class OpenAIAdapter(LLMInterface):
             InstructorRetryException,
         ) as e:
             if not (self.fallback_model and self.fallback_api_key):
-                raise
-                    f"The provided input contains content that is not aligned with our content policy: {text_input}"
-                ) from e
-
+                raise e
             try:
                 return await self.aclient.chat.completions.create(
                     model=self.fallback_model,
@@ -186,8 +191,13 @@ class OpenAIAdapter(LLMInterface):
             ) from error

     @observe
-    @
-
+    @retry(
+        stop=stop_after_delay(128),
+        wait=wait_exponential_jitter(2, 128),
+        retry=retry_if_not_exception_type(litellm.exceptions.NotFoundError),
+        before_sleep=before_sleep_log(logger, logging.DEBUG),
+        reraise=True,
+    )
     def create_structured_output(
         self, text_input: str, system_prompt: str, response_model: Type[BaseModel]
     ) -> BaseModel:
@@ -231,7 +241,13 @@ class OpenAIAdapter(LLMInterface):
             max_retries=self.MAX_RETRIES,
         )

-    @
+    @retry(
+        stop=stop_after_delay(128),
+        wait=wait_exponential_jitter(2, 128),
+        retry=retry_if_not_exception_type(litellm.exceptions.NotFoundError),
+        before_sleep=before_sleep_log(logger, logging.DEBUG),
+        reraise=True,
+    )
     async def create_transcript(self, input):
         """
         Generate an audio transcript from a user query.
@@ -263,7 +279,13 @@ class OpenAIAdapter(LLMInterface):

         return transcription

-    @
+    @retry(
+        stop=stop_after_delay(128),
+        wait=wait_exponential_jitter(2, 128),
+        retry=retry_if_not_exception_type(litellm.exceptions.NotFoundError),
+        before_sleep=before_sleep_log(logger, logging.DEBUG),
+        reraise=True,
+    )
     async def transcribe_image(self, input) -> BaseModel:
         """
         Generate a transcription of an image from a user query.
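Note: all four call paths above trade the removed rate_limiter decorators for one tenacity policy: exponential backoff with jitter starting at 2s and capped at 128s, a 128s overall deadline, no retries for litellm.exceptions.NotFoundError, and reraise=True. A minimal standalone sketch of the same policy; flaky_call and the ValueError stand-in are hypothetical, not cognee code:

import logging

from tenacity import (
    retry,
    stop_after_delay,
    wait_exponential_jitter,
    retry_if_not_exception_type,
    before_sleep_log,
)

logger = logging.getLogger(__name__)


@retry(
    stop=stop_after_delay(128),  # stop retrying once 128s have elapsed in total
    wait=wait_exponential_jitter(2, 128),  # backoff starts at 2s, caps at 128s, adds jitter
    retry=retry_if_not_exception_type(ValueError),  # stand-in for a "permanent" error type
    before_sleep=before_sleep_log(logger, logging.DEBUG),  # log each wait at DEBUG
    reraise=True,  # re-raise the last exception instead of tenacity's RetryError
)
async def flaky_call():
    ...  # any awaitable that can fail transiently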
cognee/infrastructure/loaders/LoaderEngine.py
CHANGED
@@ -27,11 +27,11 @@ class LoaderEngine:

         self.default_loader_priority = [
             "text_loader",
-            "advanced_pdf_loader",
             "pypdf_loader",
             "image_loader",
             "audio_loader",
             "unstructured_loader",
+            "advanced_pdf_loader",
         ]

     def register_loader(self, loader: LoaderInterface) -> bool:
@@ -64,7 +64,9 @@ class LoaderEngine:
         return True

     def get_loader(
-        self,
+        self,
+        file_path: str,
+        preferred_loaders: dict[str, dict[str, Any]],
     ) -> Optional[LoaderInterface]:
         """
         Get appropriate loader for a file.
@@ -76,14 +78,21 @@ class LoaderEngine:
         Returns:
             LoaderInterface that can handle the file, or None if not found
         """
+        from pathlib import Path

         file_info = filetype.guess(file_path)

+        path_extension = Path(file_path).suffix.lstrip(".")
+
         # Try preferred loaders first
         if preferred_loaders:
             for loader_name in preferred_loaders:
                 if loader_name in self._loaders:
                     loader = self._loaders[loader_name]
+                    # Try with path extension first (for text formats like html)
+                    if loader.can_handle(extension=path_extension, mime_type=file_info.mime):
+                        return loader
+                    # Fall back to content-detected extension
                     if loader.can_handle(extension=file_info.extension, mime_type=file_info.mime):
                         return loader
         else:
@@ -93,6 +102,10 @@ class LoaderEngine:
         for loader_name in self.default_loader_priority:
             if loader_name in self._loaders:
                 loader = self._loaders[loader_name]
+                # Try with path extension first (for text formats like html)
+                if loader.can_handle(extension=path_extension, mime_type=file_info.mime):
+                    return loader
+                # Fall back to content-detected extension
                 if loader.can_handle(extension=file_info.extension, mime_type=file_info.mime):
                     return loader
         else:
@@ -105,8 +118,7 @@ class LoaderEngine:
     async def load_file(
         self,
         file_path: str,
-
-        preferred_loaders: Optional[List[str]] = None,
+        preferred_loaders: dict[str, dict[str, Any]] = None,
         **kwargs,
     ):
         """
@@ -114,7 +126,7 @@ class LoaderEngine:

         Args:
             file_path: Path to the file to be processed
-            preferred_loaders:
+            preferred_loaders: Dict of loader names to their configurations
             **kwargs: Additional loader-specific configuration

         Raises:
@@ -126,8 +138,16 @@ class LoaderEngine:
             raise ValueError(f"No loader found for file: {file_path}")

         logger.debug(f"Loading {file_path} with {loader.loader_name}")
-
-
+
+        # Extract loader-specific config from preferred_loaders
+        loader_config = {}
+        if preferred_loaders and loader.loader_name in preferred_loaders:
+            loader_config = preferred_loaders[loader.loader_name]
+
+        # Merge with any additional kwargs (kwargs take precedence)
+        merged_kwargs = {**loader_config, **kwargs}
+
+        return await loader.load(file_path, **merged_kwargs)

     def get_available_loaders(self) -> List[str]:
         """
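Note: preferred_loaders changes shape here, from a list of loader names to a dict mapping loader names to keyword arguments, which load_file merges into loader.load(). A standalone sketch of the merge semantics; merge_loader_kwargs is a hypothetical helper, not part of LoaderEngine:

def merge_loader_kwargs(preferred_loaders, loader_name, **kwargs):
    # Per-loader config, if the matched loader appears in preferred_loaders
    loader_config = {}
    if preferred_loaders and loader_name in preferred_loaders:
        loader_config = preferred_loaders[loader_name]
    # Direct kwargs win over per-loader config, mirroring load_file above
    return {**loader_config, **kwargs}


merged = merge_loader_kwargs(
    {"beautiful_soup_loader": {"join_all_matches": True}},
    "beautiful_soup_loader",
    join_all_matches=False,
)
assert merged == {"join_all_matches": False}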
cognee/infrastructure/loaders/external/advanced_pdf_loader.py
CHANGED
@@ -14,14 +14,6 @@ from cognee.infrastructure.loaders.external.pypdf_loader import PyPdfLoader

 logger = get_logger(__name__)

-try:
-    from unstructured.partition.pdf import partition_pdf
-except ImportError as e:
-    logger.info(
-        "unstructured[pdf] not installed, can't use AdvancedPdfLoader, will use PyPdfLoader instead."
-    )
-    raise ImportError from e
-

 @dataclass
 class _PageBuffer:
@@ -88,6 +80,8 @@ class AdvancedPdfLoader(LoaderInterface):
             **kwargs,
         }
         # Use partition to extract elements
+        from unstructured.partition.pdf import partition_pdf
+
         elements = partition_pdf(**partition_kwargs)

         # Process elements into text content
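Note: the unstructured import moves from module scope into load(), the usual deferred-import pattern for optional dependencies: the module now imports cleanly without the extra installed, and the ImportError surfaces only when the loader actually runs. The pattern in isolation; load_pdf is a hypothetical function:

def load_pdf(path: str) -> list:
    # Optional dependency resolved at call time, not import time
    from unstructured.partition.pdf import partition_pdf

    return partition_pdf(filename=path)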
cognee/infrastructure/loaders/external/beautiful_soup_loader.py
ADDED
@@ -0,0 +1,310 @@
+"""BeautifulSoup-based web crawler for extracting content from web pages.
+
+This module provides the BeautifulSoupCrawler class for fetching and extracting content
+from web pages using BeautifulSoup or Playwright for JavaScript-rendered pages. It
+supports robots.txt handling, rate limiting, and custom extraction rules.
+"""
+
+from typing import Union, Dict, Any, Optional, List
+from dataclasses import dataclass
+from bs4 import BeautifulSoup
+from cognee.infrastructure.loaders.LoaderInterface import LoaderInterface
+from cognee.shared.logging_utils import get_logger
+
+logger = get_logger(__name__)
+
+
+@dataclass
+class ExtractionRule:
+    """Normalized extraction rule for web content.
+
+    Attributes:
+        selector: CSS selector for extraction (if any).
+        xpath: XPath expression for extraction (if any).
+        attr: HTML attribute to extract (if any).
+        all: If True, extract all matching elements; otherwise, extract first.
+        join_with: String to join multiple extracted elements.
+    """
+
+    selector: Optional[str] = None
+    xpath: Optional[str] = None
+    attr: Optional[str] = None
+    all: bool = False
+    join_with: str = " "
+
+
+class BeautifulSoupLoader(LoaderInterface):
+    """Crawler for fetching and extracting web content using BeautifulSoup.
+
+    Supports asynchronous HTTP requests, Playwright for JavaScript rendering, robots.txt
+    compliance, and rate limiting. Extracts content using CSS selectors or XPath rules.
+
+    Attributes:
+        concurrency: Number of concurrent requests allowed.
+        crawl_delay: Minimum seconds between requests to the same domain.
+        max_crawl_delay: Maximum crawl delay to respect from robots.txt (None = no limit).
+        timeout: Per-request timeout in seconds.
+        max_retries: Number of retries for failed requests.
+        retry_delay_factor: Multiplier for exponential backoff on retries.
+        headers: HTTP headers for requests (e.g., User-Agent).
+        robots_cache_ttl: Time-to-live for robots.txt cache in seconds.
+    """
+
+    @property
+    def supported_extensions(self) -> List[str]:
+        return ["html"]
+
+    @property
+    def supported_mime_types(self) -> List[str]:
+        return ["text/html", "text/plain"]
+
+    @property
+    def loader_name(self) -> str:
+        return "beautiful_soup_loader"
+
+    def can_handle(self, extension: str, mime_type: str) -> bool:
+        can = extension in self.supported_extensions and mime_type in self.supported_mime_types
+        return can
+
+    def _get_default_extraction_rules(self):
+        # Comprehensive default extraction rules for common HTML content
+        return {
+            # Meta information
+            "title": {"selector": "title", "all": False},
+            "meta_description": {
+                "selector": "meta[name='description']",
+                "attr": "content",
+                "all": False,
+            },
+            "meta_keywords": {
+                "selector": "meta[name='keywords']",
+                "attr": "content",
+                "all": False,
+            },
+            # Open Graph meta tags
+            "og_title": {
+                "selector": "meta[property='og:title']",
+                "attr": "content",
+                "all": False,
+            },
+            "og_description": {
+                "selector": "meta[property='og:description']",
+                "attr": "content",
+                "all": False,
+            },
+            # Main content areas (prioritized selectors)
+            "article": {"selector": "article", "all": True, "join_with": "\n\n"},
+            "main": {"selector": "main", "all": True, "join_with": "\n\n"},
+            # Semantic content sections
+            "headers_h1": {"selector": "h1", "all": True, "join_with": "\n"},
+            "headers_h2": {"selector": "h2", "all": True, "join_with": "\n"},
+            "headers_h3": {"selector": "h3", "all": True, "join_with": "\n"},
+            "headers_h4": {"selector": "h4", "all": True, "join_with": "\n"},
+            "headers_h5": {"selector": "h5", "all": True, "join_with": "\n"},
+            "headers_h6": {"selector": "h6", "all": True, "join_with": "\n"},
+            # Text content
+            "paragraphs": {"selector": "p", "all": True, "join_with": "\n\n"},
+            "blockquotes": {"selector": "blockquote", "all": True, "join_with": "\n\n"},
+            "preformatted": {"selector": "pre", "all": True, "join_with": "\n\n"},
+            # Lists
+            "ordered_lists": {"selector": "ol", "all": True, "join_with": "\n"},
+            "unordered_lists": {"selector": "ul", "all": True, "join_with": "\n"},
+            "list_items": {"selector": "li", "all": True, "join_with": "\n"},
+            "definition_lists": {"selector": "dl", "all": True, "join_with": "\n"},
+            # Tables
+            "tables": {"selector": "table", "all": True, "join_with": "\n\n"},
+            "table_captions": {
+                "selector": "caption",
+                "all": True,
+                "join_with": "\n",
+            },
+            # Code blocks
+            "code_blocks": {"selector": "code", "all": True, "join_with": "\n"},
+            # Figures and media descriptions
+            "figures": {"selector": "figure", "all": True, "join_with": "\n\n"},
+            "figcaptions": {"selector": "figcaption", "all": True, "join_with": "\n"},
+            "image_alts": {"selector": "img", "attr": "alt", "all": True, "join_with": " "},
+            # Links (text content, not URLs to avoid clutter)
+            "link_text": {"selector": "a", "all": True, "join_with": " "},
+            # Emphasized text
+            "strong": {"selector": "strong", "all": True, "join_with": " "},
+            "emphasis": {"selector": "em", "all": True, "join_with": " "},
+            "marked": {"selector": "mark", "all": True, "join_with": " "},
+            # Time and data elements
+            "time": {"selector": "time", "all": True, "join_with": " "},
+            "data": {"selector": "data", "all": True, "join_with": " "},
+            # Sections and semantic structure
+            "sections": {"selector": "section", "all": True, "join_with": "\n\n"},
+            "asides": {"selector": "aside", "all": True, "join_with": "\n\n"},
+            "details": {"selector": "details", "all": True, "join_with": "\n"},
+            "summary": {"selector": "summary", "all": True, "join_with": "\n"},
+            # Navigation (may contain important links/structure)
+            "nav": {"selector": "nav", "all": True, "join_with": "\n"},
+            # Footer information
+            "footer": {"selector": "footer", "all": True, "join_with": "\n"},
+            # Divs with specific content roles
+            "content_divs": {
+                "selector": "div[role='main'], div[role='article'], div.content, div#content",
+                "all": True,
+                "join_with": "\n\n",
+            },
+        }
+
+    async def load(
+        self,
+        file_path: str,
+        extraction_rules: dict[str, Any] = None,
+        join_all_matches: bool = False,
+        **kwargs,
+    ):
+        """Load an HTML file, extract content, and save to storage.
+
+        Args:
+            file_path: Path to the HTML file
+            extraction_rules: Dict of CSS selector rules for content extraction
+            join_all_matches: If True, extract all matching elements for each rule
+            **kwargs: Additional arguments
+
+        Returns:
+            Path to the stored extracted text file
+        """
+        if extraction_rules is None:
+            extraction_rules = self._get_default_extraction_rules()
+            logger.info("Using default comprehensive extraction rules for HTML content")
+
+        logger.info(f"Processing HTML file: {file_path}")
+
+        from cognee.infrastructure.files.utils.get_file_metadata import get_file_metadata
+        from cognee.infrastructure.files.storage import get_file_storage, get_storage_config
+
+        with open(file_path, "rb") as f:
+            file_metadata = await get_file_metadata(f)
+            f.seek(0)
+            html = f.read()
+
+        storage_file_name = "text_" + file_metadata["content_hash"] + ".txt"
+
+        # Normalize extraction rules
+        normalized_rules: List[ExtractionRule] = []
+        for _, rule in extraction_rules.items():
+            r = self._normalize_rule(rule)
+            if join_all_matches:
+                r.all = True
+            normalized_rules.append(r)
+
+        pieces = []
+        for rule in normalized_rules:
+            text = self._extract_from_html(html, rule)
+            if text:
+                pieces.append(text)
+
+        full_content = " ".join(pieces).strip()
+
+        # remove after defaults for extraction rules
+        # Fallback: If no content extracted, check if the file is plain text (not HTML)
+        if not full_content:
+            from bs4 import BeautifulSoup
+
+            soup = BeautifulSoup(html, "html.parser")
+            # If there are no HTML tags, treat as plain text
+            if not soup.find():
+                logger.warning(
+                    f"No HTML tags found in {file_path}. Treating as plain text. "
+                    "This may happen when content is pre-extracted (e.g., via Tavily with text format)."
+                )
+                full_content = html.decode("utf-8") if isinstance(html, bytes) else html
+                full_content = full_content.strip()
+
+        if not full_content:
+            logger.warning(f"No content extracted from HTML file: {file_path}")
+
+        # Store the extracted content
+        storage_config = get_storage_config()
+        data_root_directory = storage_config["data_root_directory"]
+        storage = get_file_storage(data_root_directory)
+
+        full_file_path = await storage.store(storage_file_name, full_content)
+
+        logger.info(f"Extracted {len(full_content)} characters from HTML")
+        return full_file_path
+
+    def _normalize_rule(self, rule: Union[str, Dict[str, Any]]) -> ExtractionRule:
+        """Normalize an extraction rule to an ExtractionRule dataclass.
+
+        Args:
+            rule: A string (CSS selector) or dict with extraction parameters.
+
+        Returns:
+            ExtractionRule: Normalized extraction rule.
+
+        Raises:
+            ValueError: If the rule is invalid.
+        """
+        if isinstance(rule, str):
+            return ExtractionRule(selector=rule)
+        if isinstance(rule, dict):
+            return ExtractionRule(
+                selector=rule.get("selector"),
+                xpath=rule.get("xpath"),
+                attr=rule.get("attr"),
+                all=bool(rule.get("all", False)),
+                join_with=rule.get("join_with", " "),
+            )
+        raise ValueError(f"Invalid extraction rule: {rule}")
+
+    def _extract_from_html(self, html: str, rule: ExtractionRule) -> str:
+        """Extract content from HTML using BeautifulSoup or lxml XPath.
+
+        Args:
+            html: The HTML content to extract from.
+            rule: The extraction rule to apply.
+
+        Returns:
+            str: The extracted content.
+
+        Raises:
+            RuntimeError: If XPath is used but lxml is not installed.
+        """
+        soup = BeautifulSoup(html, "html.parser")
+
+        if rule.xpath:
+            try:
+                from lxml import html as lxml_html
+            except ImportError:
+                raise RuntimeError(
+                    "XPath requested but lxml is not available. Install lxml or use CSS selectors."
+                )
+            doc = lxml_html.fromstring(html)
+            nodes = doc.xpath(rule.xpath)
+            texts = []
+            for n in nodes:
+                if hasattr(n, "text_content"):
+                    texts.append(n.text_content().strip())
+                else:
+                    texts.append(str(n).strip())
+            return rule.join_with.join(t for t in texts if t)
+
+        if not rule.selector:
+            return ""
+
+        if rule.all:
+            nodes = soup.select(rule.selector)
+            pieces = []
+            for el in nodes:
+                if rule.attr:
+                    val = el.get(rule.attr)
+                    if val:
+                        pieces.append(val.strip())
+                else:
+                    text = el.get_text(strip=True)
+                    if text:
+                        pieces.append(text)
+            return rule.join_with.join(pieces).strip()
+        else:
+            el = soup.select_one(rule.selector)
+            if el is None:
+                return ""
+            if rule.attr:
+                val = el.get(rule.attr)
+                return (val or "").strip()
+            return el.get_text(strip=True)
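Note: each default rule above is plain BeautifulSoup selection plus join semantics. A standalone sketch of one rule applied outside the loader; the HTML snippet is hypothetical:

from bs4 import BeautifulSoup

html = "<html><body><p>First paragraph.</p><p>Second paragraph.</p></body></html>"
soup = BeautifulSoup(html, "html.parser")

# Same shape as the loader's defaults: selector, all, join_with (attr is optional)
rule = {"selector": "p", "all": True, "join_with": "\n\n"}

texts = [el.get_text(strip=True) for el in soup.select(rule["selector"])]
print(rule["join_with"].join(texts))  # "First paragraph.\n\nSecond paragraph."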
cognee/infrastructure/loaders/supported_loaders.py
CHANGED
@@ -23,3 +23,10 @@ try:
     supported_loaders[AdvancedPdfLoader.loader_name] = AdvancedPdfLoader
 except ImportError:
     pass
+
+try:
+    from cognee.infrastructure.loaders.external import BeautifulSoupLoader
+
+    supported_loaders[BeautifulSoupLoader.loader_name] = BeautifulSoupLoader
+except ImportError:
+    pass
cognee/modules/data/exceptions/exceptions.py
CHANGED
@@ -10,7 +10,7 @@ class UnstructuredLibraryImportError(CogneeConfigurationError):
         self,
         message: str = "Import error. Unstructured library is not installed.",
         name: str = "UnstructuredModuleImportError",
-        status_code=status.
+        status_code=status.HTTP_422_UNPROCESSABLE_CONTENT,
     ):
         super().__init__(message, name, status_code)

cognee/modules/data/methods/get_dataset_data.py
CHANGED
@@ -9,7 +9,10 @@ async def get_dataset_data(dataset_id: UUID) -> list[Data]:

     async with db_engine.get_async_session() as session:
         result = await session.execute(
-            select(Data)
+            select(Data)
+            .join(Data.datasets)
+            .filter((Dataset.id == dataset_id))
+            .order_by(Data.data_size.desc())
         )

         data = list(result.scalars().all())
cognee/modules/data/methods/has_dataset_data.py
ADDED
@@ -0,0 +1,21 @@
+from uuid import UUID
+
+from sqlalchemy import select
+from sqlalchemy.sql import func
+
+from cognee.infrastructure.databases.relational import get_relational_engine
+from cognee.modules.data.models import DatasetData
+
+
+async def has_dataset_data(dataset_id: UUID) -> bool:
+    db_engine = get_relational_engine()
+
+    async with db_engine.get_async_session() as session:
+        count_query = (
+            select(func.count())
+            .select_from(DatasetData)
+            .where(DatasetData.dataset_id == dataset_id)
+        )
+        count = await session.execute(count_query)
+
+        return count.scalar_one() > 0
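Note: has_dataset_data tests for existence with a single COUNT over the association table instead of loading rows. A standalone sketch of the same query shape; the dataset_data table here is a hypothetical stand-in for the DatasetData model:

from uuid import uuid4

from sqlalchemy import Column, MetaData, Table, Uuid, select
from sqlalchemy.sql import func

metadata = MetaData()
# Hypothetical stand-in for the DatasetData association table
dataset_data = Table("dataset_data", metadata, Column("dataset_id", Uuid))

query = (
    select(func.count())
    .select_from(dataset_data)
    .where(dataset_data.c.dataset_id == uuid4())
)
print(query)  # SELECT count(*) AS count_1 FROM dataset_data WHERE ...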
cognee/modules/ingestion/save_data_to_file.py
CHANGED
@@ -1,10 +1,12 @@
-from typing import BinaryIO, Union
+from typing import BinaryIO, Union, Optional
 from cognee.infrastructure.files.storage import get_file_storage, get_storage_config
 from .classify import classify
 import hashlib


-async def save_data_to_file(
+async def save_data_to_file(
+    data: Union[str, BinaryIO], filename: str = None, file_extension: Optional[str] = None
+):
     storage_config = get_storage_config()

     data_root_directory = storage_config["data_root_directory"]
@@ -21,6 +23,11 @@ async def save_data_to_file(data: Union[str, BinaryIO], filename: str = None):

     file_name = file_metadata["name"]

+    if file_extension is not None:
+        extension = file_extension.lstrip(".")
+        file_name_without_ext = file_name.rsplit(".", 1)[0]
+        file_name = f"{file_name_without_ext}.{extension}"
+
     storage = get_file_storage(data_root_directory)

     full_file_path = await storage.store(file_name, data)
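Note: the new file_extension parameter overrides the stored file's suffix after metadata extraction. The replacement logic in isolation; apply_extension is a hypothetical helper:

from typing import Optional


def apply_extension(file_name: str, file_extension: Optional[str] = None) -> str:
    if file_extension is not None:
        extension = file_extension.lstrip(".")
        file_name_without_ext = file_name.rsplit(".", 1)[0]
        file_name = f"{file_name_without_ext}.{extension}"
    return file_name


assert apply_extension("page.bin", ".html") == "page.html"
assert apply_extension("notes.txt", None) == "notes.txt"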
cognee/modules/pipelines/exceptions/exceptions.py
CHANGED
@@ -7,6 +7,6 @@ class PipelineRunFailedError(CogneeSystemError):
         self,
         message: str = "Pipeline run failed.",
         name: str = "PipelineRunFailedError",
-        status_code: int = status.
+        status_code: int = status.HTTP_422_UNPROCESSABLE_CONTENT,
     ):
         super().__init__(message, name, status_code)