cognee 0.3.6__py3-none-any.whl → 0.3.7.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cognee/__init__.py +1 -0
- cognee/api/health.py +2 -12
- cognee/api/v1/add/add.py +46 -6
- cognee/api/v1/add/routers/get_add_router.py +11 -2
- cognee/api/v1/cognify/cognify.py +29 -9
- cognee/api/v1/cognify/routers/get_cognify_router.py +2 -1
- cognee/api/v1/datasets/datasets.py +11 -0
- cognee/api/v1/datasets/routers/get_datasets_router.py +8 -0
- cognee/api/v1/delete/routers/get_delete_router.py +2 -0
- cognee/api/v1/memify/routers/get_memify_router.py +2 -1
- cognee/api/v1/permissions/routers/get_permissions_router.py +6 -0
- cognee/api/v1/responses/default_tools.py +0 -1
- cognee/api/v1/responses/dispatch_function.py +1 -1
- cognee/api/v1/responses/routers/default_tools.py +0 -1
- cognee/api/v1/search/routers/get_search_router.py +3 -3
- cognee/api/v1/search/search.py +11 -9
- cognee/api/v1/settings/routers/get_settings_router.py +7 -1
- cognee/api/v1/sync/routers/get_sync_router.py +3 -0
- cognee/api/v1/ui/ui.py +45 -16
- cognee/api/v1/update/routers/get_update_router.py +3 -1
- cognee/api/v1/update/update.py +3 -3
- cognee/api/v1/users/routers/get_visualize_router.py +2 -0
- cognee/cli/_cognee.py +61 -10
- cognee/cli/commands/add_command.py +3 -3
- cognee/cli/commands/cognify_command.py +3 -3
- cognee/cli/commands/config_command.py +9 -7
- cognee/cli/commands/delete_command.py +3 -3
- cognee/cli/commands/search_command.py +3 -7
- cognee/cli/config.py +0 -1
- cognee/context_global_variables.py +5 -0
- cognee/exceptions/exceptions.py +1 -1
- cognee/infrastructure/databases/cache/__init__.py +2 -0
- cognee/infrastructure/databases/cache/cache_db_interface.py +79 -0
- cognee/infrastructure/databases/cache/config.py +44 -0
- cognee/infrastructure/databases/cache/get_cache_engine.py +67 -0
- cognee/infrastructure/databases/cache/redis/RedisAdapter.py +243 -0
- cognee/infrastructure/databases/exceptions/__init__.py +1 -0
- cognee/infrastructure/databases/exceptions/exceptions.py +18 -2
- cognee/infrastructure/databases/graph/get_graph_engine.py +1 -1
- cognee/infrastructure/databases/graph/graph_db_interface.py +5 -0
- cognee/infrastructure/databases/graph/kuzu/adapter.py +76 -47
- cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +13 -3
- cognee/infrastructure/databases/graph/neo4j_driver/deadlock_retry.py +1 -1
- cognee/infrastructure/databases/graph/neptune_driver/neptune_utils.py +1 -1
- cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +1 -1
- cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +21 -3
- cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +17 -10
- cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +17 -4
- cognee/infrastructure/databases/vector/embeddings/config.py +2 -3
- cognee/infrastructure/databases/vector/exceptions/exceptions.py +1 -1
- cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +0 -1
- cognee/infrastructure/files/exceptions.py +1 -1
- cognee/infrastructure/files/storage/LocalFileStorage.py +9 -9
- cognee/infrastructure/files/storage/S3FileStorage.py +11 -11
- cognee/infrastructure/files/utils/guess_file_type.py +6 -0
- cognee/infrastructure/llm/prompts/feedback_reaction_prompt.txt +14 -0
- cognee/infrastructure/llm/prompts/feedback_report_prompt.txt +13 -0
- cognee/infrastructure/llm/prompts/feedback_user_context_prompt.txt +5 -0
- cognee/infrastructure/llm/prompts/search_type_selector_prompt.txt +0 -5
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py +19 -9
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py +17 -5
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py +17 -5
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py +32 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/__init__.py +0 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/adapter.py +109 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py +33 -8
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +40 -18
- cognee/infrastructure/loaders/LoaderEngine.py +27 -7
- cognee/infrastructure/loaders/external/__init__.py +7 -0
- cognee/infrastructure/loaders/external/advanced_pdf_loader.py +2 -8
- cognee/infrastructure/loaders/external/beautiful_soup_loader.py +310 -0
- cognee/infrastructure/loaders/supported_loaders.py +7 -0
- cognee/modules/data/exceptions/exceptions.py +1 -1
- cognee/modules/data/methods/__init__.py +3 -0
- cognee/modules/data/methods/get_dataset_data.py +4 -1
- cognee/modules/data/methods/has_dataset_data.py +21 -0
- cognee/modules/engine/models/TableRow.py +0 -1
- cognee/modules/ingestion/save_data_to_file.py +9 -2
- cognee/modules/pipelines/exceptions/exceptions.py +1 -1
- cognee/modules/pipelines/operations/pipeline.py +12 -1
- cognee/modules/pipelines/operations/run_tasks.py +25 -197
- cognee/modules/pipelines/operations/run_tasks_base.py +7 -0
- cognee/modules/pipelines/operations/run_tasks_data_item.py +260 -0
- cognee/modules/pipelines/operations/run_tasks_distributed.py +121 -38
- cognee/modules/pipelines/operations/run_tasks_with_telemetry.py +9 -1
- cognee/modules/retrieval/EntityCompletionRetriever.py +48 -8
- cognee/modules/retrieval/base_graph_retriever.py +3 -1
- cognee/modules/retrieval/base_retriever.py +3 -1
- cognee/modules/retrieval/chunks_retriever.py +5 -1
- cognee/modules/retrieval/code_retriever.py +20 -2
- cognee/modules/retrieval/completion_retriever.py +50 -9
- cognee/modules/retrieval/cypher_search_retriever.py +11 -1
- cognee/modules/retrieval/graph_completion_context_extension_retriever.py +47 -8
- cognee/modules/retrieval/graph_completion_cot_retriever.py +152 -22
- cognee/modules/retrieval/graph_completion_retriever.py +54 -10
- cognee/modules/retrieval/lexical_retriever.py +20 -2
- cognee/modules/retrieval/natural_language_retriever.py +10 -1
- cognee/modules/retrieval/summaries_retriever.py +5 -1
- cognee/modules/retrieval/temporal_retriever.py +62 -10
- cognee/modules/retrieval/user_qa_feedback.py +3 -2
- cognee/modules/retrieval/utils/completion.py +30 -4
- cognee/modules/retrieval/utils/description_to_codepart_search.py +1 -1
- cognee/modules/retrieval/utils/session_cache.py +156 -0
- cognee/modules/search/methods/get_search_type_tools.py +0 -5
- cognee/modules/search/methods/no_access_control_search.py +12 -1
- cognee/modules/search/methods/search.py +51 -5
- cognee/modules/search/types/SearchType.py +0 -1
- cognee/modules/settings/get_settings.py +23 -0
- cognee/modules/users/methods/get_authenticated_user.py +3 -1
- cognee/modules/users/methods/get_default_user.py +1 -6
- cognee/modules/users/roles/methods/create_role.py +2 -2
- cognee/modules/users/tenants/methods/create_tenant.py +2 -2
- cognee/shared/exceptions/exceptions.py +1 -1
- cognee/shared/logging_utils.py +18 -11
- cognee/shared/utils.py +24 -2
- cognee/tasks/codingagents/coding_rule_associations.py +1 -2
- cognee/tasks/documents/exceptions/exceptions.py +1 -1
- cognee/tasks/feedback/__init__.py +13 -0
- cognee/tasks/feedback/create_enrichments.py +84 -0
- cognee/tasks/feedback/extract_feedback_interactions.py +230 -0
- cognee/tasks/feedback/generate_improved_answers.py +130 -0
- cognee/tasks/feedback/link_enrichments_to_feedback.py +67 -0
- cognee/tasks/feedback/models.py +26 -0
- cognee/tasks/graph/extract_graph_from_data.py +2 -0
- cognee/tasks/ingestion/data_item_to_text_file.py +3 -3
- cognee/tasks/ingestion/ingest_data.py +11 -5
- cognee/tasks/ingestion/save_data_item_to_storage.py +12 -1
- cognee/tasks/storage/add_data_points.py +3 -10
- cognee/tasks/storage/index_data_points.py +19 -14
- cognee/tasks/storage/index_graph_edges.py +25 -11
- cognee/tasks/web_scraper/__init__.py +34 -0
- cognee/tasks/web_scraper/config.py +26 -0
- cognee/tasks/web_scraper/default_url_crawler.py +446 -0
- cognee/tasks/web_scraper/models.py +46 -0
- cognee/tasks/web_scraper/types.py +4 -0
- cognee/tasks/web_scraper/utils.py +142 -0
- cognee/tasks/web_scraper/web_scraper_task.py +396 -0
- cognee/tests/cli_tests/cli_unit_tests/test_cli_utils.py +0 -1
- cognee/tests/integration/web_url_crawler/test_default_url_crawler.py +13 -0
- cognee/tests/integration/web_url_crawler/test_tavily_crawler.py +19 -0
- cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py +344 -0
- cognee/tests/subprocesses/reader.py +25 -0
- cognee/tests/subprocesses/simple_cognify_1.py +31 -0
- cognee/tests/subprocesses/simple_cognify_2.py +31 -0
- cognee/tests/subprocesses/writer.py +32 -0
- cognee/tests/tasks/descriptive_metrics/metrics_test_utils.py +0 -2
- cognee/tests/tasks/descriptive_metrics/neo4j_metrics_test.py +8 -3
- cognee/tests/tasks/entity_extraction/entity_extraction_test.py +89 -0
- cognee/tests/tasks/web_scraping/web_scraping_test.py +172 -0
- cognee/tests/test_add_docling_document.py +56 -0
- cognee/tests/test_chromadb.py +7 -11
- cognee/tests/test_concurrent_subprocess_access.py +76 -0
- cognee/tests/test_conversation_history.py +240 -0
- cognee/tests/test_feedback_enrichment.py +174 -0
- cognee/tests/test_kuzu.py +27 -15
- cognee/tests/test_lancedb.py +7 -11
- cognee/tests/test_library.py +32 -2
- cognee/tests/test_neo4j.py +24 -16
- cognee/tests/test_neptune_analytics_vector.py +7 -11
- cognee/tests/test_permissions.py +9 -13
- cognee/tests/test_pgvector.py +4 -4
- cognee/tests/test_remote_kuzu.py +8 -11
- cognee/tests/test_s3_file_storage.py +1 -1
- cognee/tests/test_search_db.py +6 -8
- cognee/tests/unit/infrastructure/databases/cache/test_cache_config.py +89 -0
- cognee/tests/unit/modules/retrieval/conversation_history_test.py +154 -0
- cognee/tests/unit/modules/retrieval/graph_completion_retriever_cot_test.py +51 -0
- {cognee-0.3.6.dist-info → cognee-0.3.7.dev1.dist-info}/METADATA +21 -6
- {cognee-0.3.6.dist-info → cognee-0.3.7.dev1.dist-info}/RECORD +178 -139
- {cognee-0.3.6.dist-info → cognee-0.3.7.dev1.dist-info}/entry_points.txt +1 -0
- distributed/Dockerfile +0 -3
- distributed/entrypoint.py +21 -9
- distributed/signal.py +5 -0
- distributed/workers/data_point_saving_worker.py +64 -34
- distributed/workers/graph_saving_worker.py +71 -47
- cognee/infrastructure/databases/graph/memgraph/memgraph_adapter.py +0 -1116
- cognee/modules/retrieval/insights_retriever.py +0 -133
- cognee/tests/test_memgraph.py +0 -109
- cognee/tests/unit/modules/retrieval/insights_retriever_test.py +0 -251
- {cognee-0.3.6.dist-info → cognee-0.3.7.dev1.dist-info}/WHEEL +0 -0
- {cognee-0.3.6.dist-info → cognee-0.3.7.dev1.dist-info}/licenses/LICENSE +0 -0
- {cognee-0.3.6.dist-info → cognee-0.3.7.dev1.dist-info}/licenses/NOTICE.md +0 -0
cognee/tasks/storage/add_data_points.py

```diff
@@ -10,9 +10,7 @@ from cognee.tasks.storage.exceptions import (
 )
 
 
-async def add_data_points(
-    data_points: List[DataPoint], update_edge_collection: bool = True
-) -> List[DataPoint]:
+async def add_data_points(data_points: List[DataPoint]) -> List[DataPoint]:
     """
     Add a batch of data points to the graph database by extracting nodes and edges,
     deduplicating them, and indexing them for retrieval.
@@ -25,9 +23,6 @@ async def add_data_points(
     Args:
         data_points (List[DataPoint]):
            A list of data points to process and insert into the graph.
-        update_edge_collection (bool, optional):
-            Whether to update the edge index after adding edges.
-            Defaults to True.
 
     Returns:
         List[DataPoint]:
@@ -73,12 +68,10 @@ async def add_data_points(
 
     graph_engine = await get_graph_engine()
 
+    await graph_engine.add_nodes(nodes)
     await index_data_points(nodes)
 
-    await graph_engine.add_nodes(nodes)
     await graph_engine.add_edges(edges)
-
-    if update_edge_collection:
-        await index_graph_edges()
+    await index_graph_edges(edges)
 
     return data_points
```
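
This hunk removes the `update_edge_collection` flag: edge indexing now always runs and is fed the edges extracted from the current batch instead of re-reading the whole graph. A minimal caller sketch against the new signature, assuming an already-configured cognee environment (the `store_batch` wrapper is illustrative, not part of the package):

```python
from typing import List

from cognee.infrastructure.engine import DataPoint
from cognee.tasks.storage.add_data_points import add_data_points


async def store_batch(points: List[DataPoint]) -> List[DataPoint]:
    # 0.3.6 accepted add_data_points(points, update_edge_collection=False);
    # in 0.3.7.dev1 the keyword is gone and index_graph_edges(edges) always runs.
    return await add_data_points(points)
```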
cognee/tasks/storage/index_data_points.py

```diff
@@ -1,6 +1,6 @@
-
+import asyncio
 
-from cognee.
+from cognee.shared.logging_utils import get_logger
 from cognee.infrastructure.databases.vector import get_vector_engine
 from cognee.infrastructure.engine import DataPoint
 
@@ -33,18 +33,23 @@ async def index_data_points(data_points: list[DataPoint]):
             indexed_data_point.metadata["index_fields"] = [field_name]
             index_points[index_name].append(indexed_data_point)
 
-[12 removed lines not rendered in the source diff view]
+    tasks: list[asyncio.Task] = []
+    batch_size = vector_engine.embedding_engine.get_batch_size()
+
+    for index_name_and_field, points in index_points.items():
+        first = index_name_and_field.index("_")
+        index_name = index_name_and_field[:first]
+        field_name = index_name_and_field[first + 1 :]
+
+        # Create embedding requests per batch to run in parallel later
+        for i in range(0, len(points), batch_size):
+            batch = points[i : i + batch_size]
+            tasks.append(
+                asyncio.create_task(vector_engine.index_data_points(index_name, field_name, batch))
+            )
+
+    # Run all embedding requests in parallel
+    await asyncio.gather(*tasks)
 
     return data_points
 
```
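
The rewritten function fans embedding work out instead of indexing sequentially: points are grouped under `<index_name>_<field_name>` keys, cut into batches of `embedding_engine.get_batch_size()`, and all `vector_engine.index_data_points(...)` calls are awaited together. A standalone sketch of that batching pattern, with a stub coroutine standing in for the vector-engine call:

```python
import asyncio
from typing import Any, Dict, List


async def index_batch(index_name: str, field_name: str, batch: List[Any]) -> None:
    # Stand-in for vector_engine.index_data_points(index_name, field_name, batch).
    await asyncio.sleep(0)


async def index_in_parallel(index_points: Dict[str, List[Any]], batch_size: int) -> None:
    tasks: list[asyncio.Task] = []
    for key, points in index_points.items():
        # Keys are "<index_name>_<field_name>"; split on the first underscore.
        first = key.index("_")
        index_name, field_name = key[:first], key[first + 1 :]
        for i in range(0, len(points), batch_size):
            batch = points[i : i + batch_size]
            tasks.append(asyncio.create_task(index_batch(index_name, field_name, batch)))
    # All embedding requests run concurrently instead of one after another.
    await asyncio.gather(*tasks)


asyncio.run(index_in_parallel({"Entity_name": list(range(7))}, batch_size=3))
```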
cognee/tasks/storage/index_graph_edges.py

```diff
@@ -1,15 +1,20 @@
+import asyncio
+
 from cognee.modules.engine.utils.generate_edge_id import generate_edge_id
-from cognee.shared.logging_utils import get_logger
+from cognee.shared.logging_utils import get_logger
 from collections import Counter
-
+from typing import Optional, Dict, Any, List, Tuple, Union
 from cognee.infrastructure.databases.vector import get_vector_engine
 from cognee.infrastructure.databases.graph import get_graph_engine
 from cognee.modules.graph.models.EdgeType import EdgeType
+from cognee.infrastructure.databases.graph.graph_db_interface import EdgeData
 
-logger = get_logger(
+logger = get_logger()
 
 
-async def index_graph_edges():
+async def index_graph_edges(
+    edges_data: Union[List[EdgeData], List[Tuple[str, str, str, Optional[Dict[str, Any]]]]] = None,
+):
     """
     Indexes graph edges by creating and managing vector indexes for relationship types.
 
@@ -35,13 +40,17 @@ async def index_graph_edges():
         index_points = {}
 
         vector_engine = get_vector_engine()
-
+
+        if edges_data is None:
+            graph_engine = await get_graph_engine()
+            _, edges_data = await graph_engine.get_graph_data()
+            logger.warning(
+                "Your graph edge embedding is deprecated, please pass edges to the index_graph_edges directly."
+            )
     except Exception as e:
         logger.error("Failed to initialize engines: %s", e)
         raise RuntimeError("Initialization error") from e
 
-    _, edges_data = await graph_engine.get_graph_data()
-
     edge_types = Counter(
         item.get("relationship_name")
         for edge in edges_data
@@ -69,15 +78,20 @@ async def index_graph_edges():
             indexed_data_point.metadata["index_fields"] = [field_name]
             index_points[index_name].append(indexed_data_point)
 
+    # Get maximum batch size for embedding model
+    batch_size = vector_engine.embedding_engine.get_batch_size()
+    tasks: list[asyncio.Task] = []
+
     for index_name, indexable_points in index_points.items():
         index_name, field_name = index_name.split(".")
 
-        #
-        batch_size = vector_engine.embedding_engine.get_batch_size()
-        # We save the data in batches of {batch_size} to not put a lot of pressure on the database
+        # Create embedding tasks to run in parallel later
         for start in range(0, len(indexable_points), batch_size):
             batch = indexable_points[start : start + batch_size]
 
-
+            tasks.append(vector_engine.index_data_points(index_name, field_name, batch))
+
+    # Start all embedding tasks and wait for completion
+    await asyncio.gather(*tasks)
 
     return None
```
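
With the new optional `edges_data` parameter, callers can index just the edges they have written; calling `index_graph_edges()` with no argument still works, but it re-reads the full graph and logs a deprecation warning. A hedged call sketch, assuming a configured cognee environment (the edge tuple values are illustrative only):

```python
import asyncio

from cognee.tasks.storage.index_graph_edges import index_graph_edges


async def main() -> None:
    # Preferred in 0.3.7.dev1: pass the freshly written edges,
    # shaped as (source, target, relationship_name, properties) tuples.
    edges = [("node-a", "node-b", "works_at", {"weight": 1.0})]
    await index_graph_edges(edges)

    # Legacy call: still accepted, but fetches the whole graph and warns.
    await index_graph_edges()


asyncio.run(main())
```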
cognee/tasks/web_scraper/__init__.py (new file)

```diff
@@ -0,0 +1,34 @@
+"""Web scraping module for cognee.
+
+This module provides tools for scraping web content, managing scraping jobs, and storing
+data in a graph database. It includes classes and functions for crawling web pages using
+BeautifulSoup or Tavily, defining data models, and handling scraping configurations.
+"""
+
+from .utils import fetch_page_content
+from .default_url_crawler import DefaultUrlCrawler
+
+# Lazy import for web_scraper_task to avoid requiring apscheduler
+# Import these directly if needed: from cognee.tasks.web_scraper.web_scraper_task import ...
+
+
+def __getattr__(name):
+    """Lazy load web scraper task functions that require apscheduler."""
+    if name == "cron_web_scraper_task":
+        from .web_scraper_task import cron_web_scraper_task
+
+        return cron_web_scraper_task
+    elif name == "web_scraper_task":
+        from .web_scraper_task import web_scraper_task
+
+        return web_scraper_task
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+
+
+__all__ = [
+    "BeautifulSoupCrawler",
+    "fetch_page_content",
+    "cron_web_scraper_task",
+    "web_scraper_task",
+    "DefaultUrlCrawler",
+]
```
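
The `__getattr__` hook is the PEP 562 module-level lazy-import pattern: the `apscheduler`-backed task functions are only imported on first access, so importing `cognee.tasks.web_scraper` does not require the scheduler dependency. A generic sketch of the same pattern (module and attribute names are placeholders, not part of cognee):

```python
# mypackage/__init__.py -- placeholder package illustrating PEP 562 lazy loading
from typing import Any


def __getattr__(name: str) -> Any:
    """Resolve optional attributes on first access instead of at import time."""
    if name == "heavy_task":
        # Deferred import: the optional dependency is only pulled in here.
        from .heavy_module import heavy_task

        return heavy_task
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
```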
cognee/tasks/web_scraper/config.py (new file)

```diff
@@ -0,0 +1,26 @@
+from pydantic import BaseModel, Field
+from typing import Any, Dict, Optional, Literal
+import os
+
+
+class TavilyConfig(BaseModel):
+    api_key: Optional[str] = os.getenv("TAVILY_API_KEY")
+    extract_depth: Literal["basic", "advanced"] = "basic"
+    proxies: Optional[Dict[str, str]] = None
+    timeout: Optional[int] = Field(default=10, ge=1, le=60)
+
+
+class DefaultCrawlerConfig(BaseModel):
+    concurrency: int = 5
+    crawl_delay: float = 0.5
+    max_crawl_delay: Optional[float] = (
+        10.0  # Maximum crawl delay to respect from robots.txt (None = no limit)
+    )
+    timeout: float = 15.0
+    max_retries: int = 2
+    retry_delay_factor: float = 0.5
+    headers: Optional[Dict[str, str]] = None
+    use_playwright: bool = False
+    playwright_js_wait: float = 0.8
+    robots_cache_ttl: float = 3600.0
+    join_all_matches: bool = False
```
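
Both classes are plain pydantic models, so crawler behaviour can be tuned per call site with keyword overrides; `TavilyConfig.api_key` defaults to the `TAVILY_API_KEY` environment variable read when the module is imported. A small usage sketch with illustrative values:

```python
from cognee.tasks.web_scraper.config import DefaultCrawlerConfig, TavilyConfig

# Defaults with a couple of overrides for a slower, JavaScript-rendering crawl.
crawler_cfg = DefaultCrawlerConfig(concurrency=2, use_playwright=True)
tavily_cfg = TavilyConfig(extract_depth="advanced", timeout=30)

print(crawler_cfg.crawl_delay)   # 0.5 (default)
print(tavily_cfg.extract_depth)  # "advanced"
```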
cognee/tasks/web_scraper/default_url_crawler.py (new file)

```diff
@@ -0,0 +1,446 @@
+import asyncio
+from dataclasses import dataclass, field
+from functools import lru_cache
+import time
+from typing import Any, Union, List, Dict, Optional
+from urllib.parse import urlparse
+import httpx
+
+from cognee.shared.logging_utils import get_logger
+from cognee.tasks.web_scraper.types import UrlsToHtmls
+
+logger = get_logger()
+
+try:
+    from protego import Protego
+except ImportError:
+    logger.warning("Failed to import protego, make sure to install using pip install protego>=0.1")
+    Protego = None
+
+try:
+    from playwright.async_api import async_playwright
+except ImportError:
+    logger.warning(
+        "Failed to import playwright, make sure to install using pip install playwright>=1.9.0"
+    )
+    async_playwright = None
+
+
+@dataclass
+class RobotsTxtCache:
+    """Cache for robots.txt data.
+
+    Attributes:
+        protego: Parsed robots.txt object (Protego instance).
+        crawl_delay: Delay between requests (in seconds).
+        timestamp: Time when the cache entry was created.
+    """
+
+    protego: Any
+    crawl_delay: float
+    timestamp: float = field(default_factory=time.time)
+
+
+class DefaultUrlCrawler:
+    def __init__(
+        self,
+        *,
+        concurrency: int = 5,
+        crawl_delay: float = 0.5,
+        max_crawl_delay: Optional[float] = 10.0,
+        timeout: float = 15.0,
+        max_retries: int = 2,
+        retry_delay_factor: float = 0.5,
+        headers: Optional[Dict[str, str]] = None,
+        robots_cache_ttl: float = 3600.0,
+    ):
+        """Initialize the BeautifulSoupCrawler.
+
+        Args:
+            concurrency: Number of concurrent requests allowed.
+            crawl_delay: Minimum seconds between requests to the same domain.
+            max_crawl_delay: Maximum crawl delay to respect from robots.txt (None = no limit).
+            timeout: Per-request timeout in seconds.
+            max_retries: Number of retries for failed requests.
+            retry_delay_factor: Multiplier for exponential backoff on retries.
+            headers: HTTP headers for requests (defaults to User-Agent: Cognee-Scraper/1.0).
+            robots_cache_ttl: Time-to-live for robots.txt cache in seconds.
+        """
+        self.concurrency = concurrency
+        self._sem = asyncio.Semaphore(concurrency)
+        self.crawl_delay = crawl_delay
+        self.max_crawl_delay = max_crawl_delay
+        self.timeout = timeout
+        self.max_retries = max_retries
+        self.retry_delay_factor = retry_delay_factor
+        self.headers = headers or {"User-Agent": "Cognee-Scraper/1.0"}
+        self.robots_cache_ttl = robots_cache_ttl
+        self._last_request_time_per_domain: Dict[str, float] = {}
+        self._robots_cache: Dict[str, RobotsTxtCache] = {}
+        self._client: Optional[httpx.AsyncClient] = None
+        self._robots_lock = asyncio.Lock()
+
+    async def _ensure_client(self):
+        """Initialize the HTTP client if not already created."""
+        if self._client is None:
+            self._client = httpx.AsyncClient(timeout=self.timeout, headers=self.headers)
+
+    async def close(self):
+        """Close the HTTP client."""
+        if self._client:
+            await self._client.aclose()
+            self._client = None
+
+    async def __aenter__(self):
+        """Enter the context manager, initializing the HTTP client."""
+        await self._ensure_client()
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        """Exit the context manager, closing the HTTP client."""
+        await self.close()
+
+    @lru_cache(maxsize=1024)
+    def _domain_from_url(self, url: str) -> str:
+        """Extract the domain (netloc) from a URL.
+
+        Args:
+            url: The URL to parse.
+
+        Returns:
+            str: The domain (netloc) of the URL.
+        """
+        try:
+            return urlparse(url).netloc
+        except Exception:
+            return url
+
+    @lru_cache(maxsize=1024)
+    def _get_domain_root(self, url: str) -> str:
+        """Get the root URL (scheme and netloc) from a URL.
+
+        Args:
+            url: The URL to parse.
+
+        Returns:
+            str: The root URL (e.g., "https://example.com").
+        """
+        parsed = urlparse(url)
+        return f"{parsed.scheme}://{parsed.netloc}"
+
+    async def _respect_rate_limit(self, url: str, crawl_delay: Optional[float] = None):
+        """Enforce rate limiting for requests to the same domain.
+
+        Args:
+            url: The URL to check.
+            crawl_delay: Custom crawl delay in seconds (if any).
+        """
+        domain = self._domain_from_url(url)
+        last = self._last_request_time_per_domain.get(domain)
+        delay = crawl_delay if crawl_delay is not None else self.crawl_delay
+
+        if last is None:
+            self._last_request_time_per_domain[domain] = time.time()
+            return
+
+        elapsed = time.time() - last
+        wait_for = delay - elapsed
+        if wait_for > 0:
+            logger.info(
+                f"Rate limiting: waiting {wait_for:.2f}s before requesting {url} (crawl_delay={delay}s from robots.txt)"
+            )
+            await asyncio.sleep(wait_for)
+            logger.info(f"Rate limit wait completed for {url}")
+        self._last_request_time_per_domain[domain] = time.time()
+
+    async def _get_robots_cache(self, domain_root: str) -> Optional[RobotsTxtCache]:
+        """Get cached robots.txt data if valid.
+
+        Args:
+            domain_root: The root URL (e.g., "https://example.com").
+
+        Returns:
+            Optional[RobotsTxtCache]: Cached robots.txt data or None if expired or not found.
+        """
+        if Protego is None:
+            return None
+
+        cached = self._robots_cache.get(domain_root)
+        if cached and (time.time() - cached.timestamp) < self.robots_cache_ttl:
+            return cached
+        return None
+
+    async def _fetch_and_cache_robots(self, domain_root: str) -> RobotsTxtCache:
+        """Fetch and cache robots.txt data.
+
+        Args:
+            domain_root: The root URL (e.g., "https://example.com").
+
+        Returns:
+            RobotsTxtCache: Cached robots.txt data with crawl delay.
+
+        Raises:
+            Exception: If fetching robots.txt fails.
+        """
+        async with self._robots_lock:
+            cached = await self._get_robots_cache(domain_root)
+            if cached:
+                return cached
+
+            robots_url = f"{domain_root}/robots.txt"
+            try:
+                await self._ensure_client()
+                await self._respect_rate_limit(robots_url, self.crawl_delay)
+                resp = await self._client.get(robots_url, timeout=5.0)
+                content = resp.text if resp.status_code == 200 else ""
+            except Exception as e:
+                logger.debug(f"Failed to fetch robots.txt from {domain_root}: {e}")
+                content = ""
+
+            protego = Protego.parse(content) if content.strip() else None
+            agent = next((v for k, v in self.headers.items() if k.lower() == "user-agent"), "*")
+
+            crawl_delay = self.crawl_delay
+            if protego:
+                delay = protego.crawl_delay(agent) or protego.crawl_delay("*")
+                if delay:
+                    # Apply max_crawl_delay cap if configured
+                    if self.max_crawl_delay is not None and delay > self.max_crawl_delay:
+                        logger.warning(
+                            f"robots.txt specifies crawl_delay={delay}s for {domain_root}, "
+                            f"capping to max_crawl_delay={self.max_crawl_delay}s"
+                        )
+                        crawl_delay = self.max_crawl_delay
+                    else:
+                        crawl_delay = delay
+
+            cache_entry = RobotsTxtCache(protego=protego, crawl_delay=crawl_delay)
+            self._robots_cache[domain_root] = cache_entry
+            return cache_entry
+
+    async def _is_url_allowed(self, url: str) -> bool:
+        """Check if a URL is allowed by robots.txt.
+
+        Args:
+            url: The URL to check.
+
+        Returns:
+            bool: True if the URL is allowed, False otherwise.
+        """
+        if Protego is None:
+            return True
+
+        try:
+            domain_root = self._get_domain_root(url)
+            cache = await self._get_robots_cache(domain_root)
+            if cache is None:
+                cache = await self._fetch_and_cache_robots(domain_root)
+
+            if cache.protego is None:
+                return True
+
+            agent = next((v for k, v in self.headers.items() if k.lower() == "user-agent"), "*")
+            return cache.protego.can_fetch(agent, url) or cache.protego.can_fetch("*", url)
+        except Exception as e:
+            logger.debug(f"Error checking robots.txt for {url}: {e}")
+            return True
+
+    async def _get_crawl_delay(self, url: str) -> float:
+        """Get the crawl delay for a URL from robots.txt.
+
+        Args:
+            url: The URL to check.
+
+        Returns:
+            float: Crawl delay in seconds.
+        """
+        if Protego is None:
+            return self.crawl_delay
+
+        try:
+            domain_root = self._get_domain_root(url)
+            cache = await self._get_robots_cache(domain_root)
+            if cache is None:
+                cache = await self._fetch_and_cache_robots(domain_root)
+            return cache.crawl_delay
+        except Exception:
+            return self.crawl_delay
+
+    async def _fetch_httpx(self, url: str) -> str:
+        """Fetch a URL using HTTPX with retries.
+
+        Args:
+            url: The URL to fetch.
+
+        Returns:
+            str: The HTML content of the page.
+
+        Raises:
+            Exception: If all retry attempts fail.
+        """
+        await self._ensure_client()
+        assert self._client is not None, "HTTP client not initialized"
+
+        attempt = 0
+        crawl_delay = await self._get_crawl_delay(url)
+        logger.info(f"Fetching URL with httpx (crawl_delay={crawl_delay}s): {url}")
+
+        while True:
+            try:
+                await self._respect_rate_limit(url, crawl_delay)
+                resp = await self._client.get(url)
+                resp.raise_for_status()
+                logger.info(
+                    f"Successfully fetched {url} (status={resp.status_code}, size={len(resp.text)} bytes)"
+                )
+                return resp.text
+            except Exception as exc:
+                attempt += 1
+                if attempt > self.max_retries:
+                    logger.error(f"Fetch failed for {url} after {attempt} attempts: {exc}")
+                    raise
+
+                delay = self.retry_delay_factor * (2 ** (attempt - 1))
+                logger.warning(
+                    f"Retrying {url} after {delay:.2f}s (attempt {attempt}) due to {exc}"
+                )
+                await asyncio.sleep(delay)
+
+    async def _render_with_playwright(
+        self, url: str, js_wait: float = 1.0, timeout: Optional[float] = None
+    ) -> str:
+        """Fetch and render a URL using Playwright for JavaScript content.
+
+        Args:
+            url: The URL to fetch.
+            js_wait: Seconds to wait for JavaScript to load.
+            timeout: Timeout for the request (in seconds, defaults to instance timeout).
+
+        Returns:
+            str: The rendered HTML content.
+
+        Raises:
+            RuntimeError: If Playwright is not installed.
+            Exception: If all retry attempts fail.
+        """
+        if async_playwright is None:
+            raise RuntimeError(
+                "Playwright is not installed. Install with `pip install playwright` and run `playwright install`."
+            )
+
+        timeout_val = timeout or self.timeout
+        logger.info(
+            f"Rendering URL with Playwright (js_wait={js_wait}s, timeout={timeout_val}s): {url}"
+        )
+
+        attempt = 0
+        while True:
+            try:
+                async with async_playwright() as p:
+                    logger.info(f"Launching headless Chromium browser for {url}")
+                    browser = await p.chromium.launch(headless=True)
+                    try:
+                        context = await browser.new_context()
+                        page = await context.new_page()
+                        logger.info(f"Navigating to {url} and waiting for network idle")
+                        await page.goto(
+                            url,
+                            wait_until="networkidle",
+                            timeout=int(timeout_val * 1000),
+                        )
+                        if js_wait:
+                            logger.info(f"Waiting {js_wait}s for JavaScript to execute")
+                            await asyncio.sleep(js_wait)
+                        content = await page.content()
+                        logger.info(
+                            f"Successfully rendered {url} with Playwright (size={len(content)} bytes)"
+                        )
+                        return content
+                    finally:
+                        await browser.close()
+            except Exception as exc:
+                attempt += 1
+                if attempt > self.max_retries:
+                    logger.error(f"Playwright fetch failed for {url}: {exc}")
+                    raise
+                backoff = self.retry_delay_factor * (2 ** (attempt - 1))
+                logger.warning(
+                    f"Retrying playwright fetch {url} after {backoff:.2f}s (attempt {attempt})"
+                )
+                await asyncio.sleep(backoff)
+
+    async def fetch_urls(
+        self,
+        urls: Union[str, List[str]],
+        *,
+        use_playwright: bool = False,
+        playwright_js_wait: float = 0.8,
+    ) -> UrlsToHtmls:
+        """Fetch and extract content from URLs using BeautifulSoup or Playwright.
+
+        Args:
+            urls: A single URL, list of URLs, or dict mapping URLs to extraction rules.
+            extraction_rules: Default extraction rules for string or list URLs.
+            use_playwright: If True, use Playwright for JavaScript rendering.
+            playwright_js_wait: Seconds to wait for JavaScript to load.
+            join_all_matches: If True, extract all matching elements for each rule.
+
+        Returns:
+            Dict[str, str]: A dictionary mapping URLs to their extracted content.
+
+        Raises:
+            ValueError: If extraction_rules are missing when required or if urls is invalid.
+            Exception: If fetching or extraction fails.
+        """
+        if isinstance(urls, str):
+            urls = [urls]
+        else:
+            raise ValueError(f"Invalid urls type: {type(urls)}")
+
+        async def _task(url: str):
+            async with self._sem:
+                try:
+                    logger.info(f"Processing URL: {url}")
+
+                    # Check robots.txt
+                    allowed = await self._is_url_allowed(url)
+                    if not allowed:
+                        logger.warning(f"URL disallowed by robots.txt: {url}")
+                        return url, ""
+
+                    logger.info(f"Robots.txt check passed for {url}")
+
+                    # Fetch HTML
+                    if use_playwright:
+                        logger.info(
+                            f"Rendering {url} with Playwright (JS wait: {playwright_js_wait}s)"
+                        )
+                        html = await self._render_with_playwright(
+                            url, js_wait=playwright_js_wait, timeout=self.timeout
+                        )
+                    else:
+                        logger.info(f"Fetching {url} with httpx")
+                        html = await self._fetch_httpx(url)
+
+                    logger.info(f"Successfully fetched HTML from {url} ({len(html)} bytes)")
+
+                    return url, html
+
+                except Exception as e:
+                    logger.error(f"Error processing {url}: {e}")
+                    return url, ""
+
+        logger.info(f"Creating {len(urls)} async tasks for concurrent fetching")
+        tasks = [asyncio.create_task(_task(u)) for u in urls]
+        results = {}
+        completed = 0
+        total = len(tasks)
+
+        for coro in asyncio.as_completed(tasks):
+            url, html = await coro
+            results[url] = html
+            completed += 1
+            logger.info(f"Progress: {completed}/{total} URLs processed")
+
+        logger.info(f"Completed fetching all {len(results)} URL(s)")
+        return results
```