cognee 0.3.6__py3-none-any.whl → 0.3.7.dev1__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to their respective public registries. It is provided for informational purposes only.
- cognee/__init__.py +1 -0
- cognee/api/health.py +2 -12
- cognee/api/v1/add/add.py +46 -6
- cognee/api/v1/add/routers/get_add_router.py +11 -2
- cognee/api/v1/cognify/cognify.py +29 -9
- cognee/api/v1/cognify/routers/get_cognify_router.py +2 -1
- cognee/api/v1/datasets/datasets.py +11 -0
- cognee/api/v1/datasets/routers/get_datasets_router.py +8 -0
- cognee/api/v1/delete/routers/get_delete_router.py +2 -0
- cognee/api/v1/memify/routers/get_memify_router.py +2 -1
- cognee/api/v1/permissions/routers/get_permissions_router.py +6 -0
- cognee/api/v1/responses/default_tools.py +0 -1
- cognee/api/v1/responses/dispatch_function.py +1 -1
- cognee/api/v1/responses/routers/default_tools.py +0 -1
- cognee/api/v1/search/routers/get_search_router.py +3 -3
- cognee/api/v1/search/search.py +11 -9
- cognee/api/v1/settings/routers/get_settings_router.py +7 -1
- cognee/api/v1/sync/routers/get_sync_router.py +3 -0
- cognee/api/v1/ui/ui.py +45 -16
- cognee/api/v1/update/routers/get_update_router.py +3 -1
- cognee/api/v1/update/update.py +3 -3
- cognee/api/v1/users/routers/get_visualize_router.py +2 -0
- cognee/cli/_cognee.py +61 -10
- cognee/cli/commands/add_command.py +3 -3
- cognee/cli/commands/cognify_command.py +3 -3
- cognee/cli/commands/config_command.py +9 -7
- cognee/cli/commands/delete_command.py +3 -3
- cognee/cli/commands/search_command.py +3 -7
- cognee/cli/config.py +0 -1
- cognee/context_global_variables.py +5 -0
- cognee/exceptions/exceptions.py +1 -1
- cognee/infrastructure/databases/cache/__init__.py +2 -0
- cognee/infrastructure/databases/cache/cache_db_interface.py +79 -0
- cognee/infrastructure/databases/cache/config.py +44 -0
- cognee/infrastructure/databases/cache/get_cache_engine.py +67 -0
- cognee/infrastructure/databases/cache/redis/RedisAdapter.py +243 -0
- cognee/infrastructure/databases/exceptions/__init__.py +1 -0
- cognee/infrastructure/databases/exceptions/exceptions.py +18 -2
- cognee/infrastructure/databases/graph/get_graph_engine.py +1 -1
- cognee/infrastructure/databases/graph/graph_db_interface.py +5 -0
- cognee/infrastructure/databases/graph/kuzu/adapter.py +76 -47
- cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +13 -3
- cognee/infrastructure/databases/graph/neo4j_driver/deadlock_retry.py +1 -1
- cognee/infrastructure/databases/graph/neptune_driver/neptune_utils.py +1 -1
- cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +1 -1
- cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +21 -3
- cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +17 -10
- cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +17 -4
- cognee/infrastructure/databases/vector/embeddings/config.py +2 -3
- cognee/infrastructure/databases/vector/exceptions/exceptions.py +1 -1
- cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +0 -1
- cognee/infrastructure/files/exceptions.py +1 -1
- cognee/infrastructure/files/storage/LocalFileStorage.py +9 -9
- cognee/infrastructure/files/storage/S3FileStorage.py +11 -11
- cognee/infrastructure/files/utils/guess_file_type.py +6 -0
- cognee/infrastructure/llm/prompts/feedback_reaction_prompt.txt +14 -0
- cognee/infrastructure/llm/prompts/feedback_report_prompt.txt +13 -0
- cognee/infrastructure/llm/prompts/feedback_user_context_prompt.txt +5 -0
- cognee/infrastructure/llm/prompts/search_type_selector_prompt.txt +0 -5
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py +19 -9
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py +17 -5
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py +17 -5
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py +32 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/__init__.py +0 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/adapter.py +109 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py +33 -8
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +40 -18
- cognee/infrastructure/loaders/LoaderEngine.py +27 -7
- cognee/infrastructure/loaders/external/__init__.py +7 -0
- cognee/infrastructure/loaders/external/advanced_pdf_loader.py +2 -8
- cognee/infrastructure/loaders/external/beautiful_soup_loader.py +310 -0
- cognee/infrastructure/loaders/supported_loaders.py +7 -0
- cognee/modules/data/exceptions/exceptions.py +1 -1
- cognee/modules/data/methods/__init__.py +3 -0
- cognee/modules/data/methods/get_dataset_data.py +4 -1
- cognee/modules/data/methods/has_dataset_data.py +21 -0
- cognee/modules/engine/models/TableRow.py +0 -1
- cognee/modules/ingestion/save_data_to_file.py +9 -2
- cognee/modules/pipelines/exceptions/exceptions.py +1 -1
- cognee/modules/pipelines/operations/pipeline.py +12 -1
- cognee/modules/pipelines/operations/run_tasks.py +25 -197
- cognee/modules/pipelines/operations/run_tasks_base.py +7 -0
- cognee/modules/pipelines/operations/run_tasks_data_item.py +260 -0
- cognee/modules/pipelines/operations/run_tasks_distributed.py +121 -38
- cognee/modules/pipelines/operations/run_tasks_with_telemetry.py +9 -1
- cognee/modules/retrieval/EntityCompletionRetriever.py +48 -8
- cognee/modules/retrieval/base_graph_retriever.py +3 -1
- cognee/modules/retrieval/base_retriever.py +3 -1
- cognee/modules/retrieval/chunks_retriever.py +5 -1
- cognee/modules/retrieval/code_retriever.py +20 -2
- cognee/modules/retrieval/completion_retriever.py +50 -9
- cognee/modules/retrieval/cypher_search_retriever.py +11 -1
- cognee/modules/retrieval/graph_completion_context_extension_retriever.py +47 -8
- cognee/modules/retrieval/graph_completion_cot_retriever.py +152 -22
- cognee/modules/retrieval/graph_completion_retriever.py +54 -10
- cognee/modules/retrieval/lexical_retriever.py +20 -2
- cognee/modules/retrieval/natural_language_retriever.py +10 -1
- cognee/modules/retrieval/summaries_retriever.py +5 -1
- cognee/modules/retrieval/temporal_retriever.py +62 -10
- cognee/modules/retrieval/user_qa_feedback.py +3 -2
- cognee/modules/retrieval/utils/completion.py +30 -4
- cognee/modules/retrieval/utils/description_to_codepart_search.py +1 -1
- cognee/modules/retrieval/utils/session_cache.py +156 -0
- cognee/modules/search/methods/get_search_type_tools.py +0 -5
- cognee/modules/search/methods/no_access_control_search.py +12 -1
- cognee/modules/search/methods/search.py +51 -5
- cognee/modules/search/types/SearchType.py +0 -1
- cognee/modules/settings/get_settings.py +23 -0
- cognee/modules/users/methods/get_authenticated_user.py +3 -1
- cognee/modules/users/methods/get_default_user.py +1 -6
- cognee/modules/users/roles/methods/create_role.py +2 -2
- cognee/modules/users/tenants/methods/create_tenant.py +2 -2
- cognee/shared/exceptions/exceptions.py +1 -1
- cognee/shared/logging_utils.py +18 -11
- cognee/shared/utils.py +24 -2
- cognee/tasks/codingagents/coding_rule_associations.py +1 -2
- cognee/tasks/documents/exceptions/exceptions.py +1 -1
- cognee/tasks/feedback/__init__.py +13 -0
- cognee/tasks/feedback/create_enrichments.py +84 -0
- cognee/tasks/feedback/extract_feedback_interactions.py +230 -0
- cognee/tasks/feedback/generate_improved_answers.py +130 -0
- cognee/tasks/feedback/link_enrichments_to_feedback.py +67 -0
- cognee/tasks/feedback/models.py +26 -0
- cognee/tasks/graph/extract_graph_from_data.py +2 -0
- cognee/tasks/ingestion/data_item_to_text_file.py +3 -3
- cognee/tasks/ingestion/ingest_data.py +11 -5
- cognee/tasks/ingestion/save_data_item_to_storage.py +12 -1
- cognee/tasks/storage/add_data_points.py +3 -10
- cognee/tasks/storage/index_data_points.py +19 -14
- cognee/tasks/storage/index_graph_edges.py +25 -11
- cognee/tasks/web_scraper/__init__.py +34 -0
- cognee/tasks/web_scraper/config.py +26 -0
- cognee/tasks/web_scraper/default_url_crawler.py +446 -0
- cognee/tasks/web_scraper/models.py +46 -0
- cognee/tasks/web_scraper/types.py +4 -0
- cognee/tasks/web_scraper/utils.py +142 -0
- cognee/tasks/web_scraper/web_scraper_task.py +396 -0
- cognee/tests/cli_tests/cli_unit_tests/test_cli_utils.py +0 -1
- cognee/tests/integration/web_url_crawler/test_default_url_crawler.py +13 -0
- cognee/tests/integration/web_url_crawler/test_tavily_crawler.py +19 -0
- cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py +344 -0
- cognee/tests/subprocesses/reader.py +25 -0
- cognee/tests/subprocesses/simple_cognify_1.py +31 -0
- cognee/tests/subprocesses/simple_cognify_2.py +31 -0
- cognee/tests/subprocesses/writer.py +32 -0
- cognee/tests/tasks/descriptive_metrics/metrics_test_utils.py +0 -2
- cognee/tests/tasks/descriptive_metrics/neo4j_metrics_test.py +8 -3
- cognee/tests/tasks/entity_extraction/entity_extraction_test.py +89 -0
- cognee/tests/tasks/web_scraping/web_scraping_test.py +172 -0
- cognee/tests/test_add_docling_document.py +56 -0
- cognee/tests/test_chromadb.py +7 -11
- cognee/tests/test_concurrent_subprocess_access.py +76 -0
- cognee/tests/test_conversation_history.py +240 -0
- cognee/tests/test_feedback_enrichment.py +174 -0
- cognee/tests/test_kuzu.py +27 -15
- cognee/tests/test_lancedb.py +7 -11
- cognee/tests/test_library.py +32 -2
- cognee/tests/test_neo4j.py +24 -16
- cognee/tests/test_neptune_analytics_vector.py +7 -11
- cognee/tests/test_permissions.py +9 -13
- cognee/tests/test_pgvector.py +4 -4
- cognee/tests/test_remote_kuzu.py +8 -11
- cognee/tests/test_s3_file_storage.py +1 -1
- cognee/tests/test_search_db.py +6 -8
- cognee/tests/unit/infrastructure/databases/cache/test_cache_config.py +89 -0
- cognee/tests/unit/modules/retrieval/conversation_history_test.py +154 -0
- cognee/tests/unit/modules/retrieval/graph_completion_retriever_cot_test.py +51 -0
- {cognee-0.3.6.dist-info → cognee-0.3.7.dev1.dist-info}/METADATA +21 -6
- {cognee-0.3.6.dist-info → cognee-0.3.7.dev1.dist-info}/RECORD +178 -139
- {cognee-0.3.6.dist-info → cognee-0.3.7.dev1.dist-info}/entry_points.txt +1 -0
- distributed/Dockerfile +0 -3
- distributed/entrypoint.py +21 -9
- distributed/signal.py +5 -0
- distributed/workers/data_point_saving_worker.py +64 -34
- distributed/workers/graph_saving_worker.py +71 -47
- cognee/infrastructure/databases/graph/memgraph/memgraph_adapter.py +0 -1116
- cognee/modules/retrieval/insights_retriever.py +0 -133
- cognee/tests/test_memgraph.py +0 -109
- cognee/tests/unit/modules/retrieval/insights_retriever_test.py +0 -251
- {cognee-0.3.6.dist-info → cognee-0.3.7.dev1.dist-info}/WHEEL +0 -0
- {cognee-0.3.6.dist-info → cognee-0.3.7.dev1.dist-info}/licenses/LICENSE +0 -0
- {cognee-0.3.6.dist-info → cognee-0.3.7.dev1.dist-info}/licenses/NOTICE.md +0 -0
cognee/tasks/web_scraper/models.py
@@ -0,0 +1,46 @@
+from cognee.infrastructure.engine import DataPoint
+from typing import Optional, Dict, Any, List
+from datetime import datetime
+
+
+class WebPage(DataPoint):
+    """Represents a scraped web page with metadata"""
+
+    name: Optional[str]
+    content: str
+    content_hash: str
+    scraped_at: datetime
+    last_modified: Optional[datetime]
+    status_code: int
+    content_type: str
+    page_size: int
+    extraction_rules: Dict[str, Any]  # CSS selectors, XPath rules used
+    description: str
+    metadata: dict = {"index_fields": ["name", "description", "content"]}
+
+
+class WebSite(DataPoint):
+    """Represents a website or domain being scraped"""
+
+    name: str
+    base_url: str
+    robots_txt: Optional[str]
+    crawl_delay: float
+    last_crawled: datetime
+    page_count: int
+    scraping_config: Dict[str, Any]
+    description: str
+    metadata: dict = {"index_fields": ["name", "description"]}
+
+
+class ScrapingJob(DataPoint):
+    """Represents a scraping job configuration"""
+
+    name: str
+    urls: List[str]
+    schedule: Optional[str]  # Cron-like schedule for recurring scrapes
+    status: str  # "active", "paused", "completed", "failed"
+    last_run: Optional[datetime]
+    next_run: Optional[datetime]
+    description: str
+    metadata: dict = {"index_fields": ["name", "description"]}
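For orientation, below is a minimal, hypothetical sketch of how these new DataPoint models might be instantiated. The field names come from `models.py` above; the example values and the `uuid5` id convention are assumptions, mirrored from how `web_scraper_task.py` later in this diff constructs the same objects.

```python
# Hypothetical usage sketch (not part of the diff). Values are illustrative only.
from datetime import datetime
from uuid import uuid5, NAMESPACE_OID

from cognee.tasks.web_scraper.models import WebPage, WebSite

now = datetime.now()

# A WebSite node keyed by domain, as web_scraper_task.py does with uuid5(NAMESPACE_OID, ...).
website = WebSite(
    id=uuid5(NAMESPACE_OID, name="example.com"),
    name="example.com",
    base_url="https://example.com",
    robots_txt=None,
    crawl_delay=0.5,
    last_crawled=now,
    page_count=1,
    scraping_config={"tool": "beautifulsoup", "extraction_rules": {}},
    description="Website: example.com",
)

# A WebPage node keyed by its URL; content_hash is a sha256 hex digest in the real task.
page = WebPage(
    id=uuid5(NAMESPACE_OID, name="https://example.com/docs"),
    name="https://example.com/docs",
    content="<html>...</html>",
    content_hash="0" * 64,  # placeholder digest for illustration
    scraped_at=now,
    last_modified=None,
    status_code=200,
    content_type="text/html",
    page_size=18,
    extraction_rules={},
    description="Webpage: docs",
)
```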
cognee/tasks/web_scraper/utils.py
@@ -0,0 +1,142 @@
+"""Utilities for fetching web content using BeautifulSoup or Tavily.
+
+This module provides functions to fetch and extract content from web pages, supporting
+both BeautifulSoup for custom extraction rules and Tavily for API-based scraping.
+"""
+
+import os
+from typing import List, Union
+from cognee.shared.logging_utils import get_logger
+from cognee.tasks.web_scraper.types import UrlsToHtmls
+from .default_url_crawler import DefaultUrlCrawler
+from .config import DefaultCrawlerConfig, TavilyConfig
+
+logger = get_logger(__name__)
+
+
+async def fetch_page_content(urls: Union[str, List[str]]) -> UrlsToHtmls:
+    """Fetch content from one or more URLs using the specified tool.
+
+    This function retrieves web page content using either BeautifulSoup (with custom
+    extraction rules) or Tavily (API-based scraping). It handles single URLs or lists of
+    URLs and returns a dictionary mapping URLs to their extracted content.
+
+    Args:
+        urls: A single URL (str) or a list of URLs (List[str]) to scrape.
+        preferred_tool: The scraping tool to use ("tavily" or "beautifulsoup").
+            Defaults to "beautifulsoup".
+        tavily_config: Configuration for Tavily API, including API key.
+            Required if preferred_tool is "tavily".
+        default_crawler_config: Configuration for BeautifulSoup crawler, including
+            extraction rules. Required if preferred_tool is "beautifulsoup" and
+            extraction_rules are needed.
+
+    Returns:
+        Dict[str, str]: A dictionary mapping each URL to its
+        extracted content (as a string for BeautifulSoup or a dict for Tavily).
+
+    Raises:
+        ValueError: If Tavily API key is missing when using Tavily, or if
+            extraction_rules are not provided when using BeautifulSoup.
+        ImportError: If required dependencies (beautifulsoup4 or tavily-python) are not
+            installed.
+    """
+    url_list = [urls] if isinstance(urls, str) else urls
+
+    if os.getenv("TAVILY_API_KEY"):
+        logger.info("Using Tavily API for url fetching")
+        return await fetch_with_tavily(urls)
+    else:
+        logger.info("Using default crawler for content extraction")
+
+        default_crawler_config = (
+            DefaultCrawlerConfig()
+        )  # We've decided to use defaults, and configure through env vars as needed
+
+        logger.info(
+            f"Initializing BeautifulSoup crawler with concurrency={default_crawler_config.concurrency}, timeout={default_crawler_config.timeout}s, max_crawl_delay={default_crawler_config.max_crawl_delay}s"
+        )
+
+        crawler = DefaultUrlCrawler(
+            concurrency=default_crawler_config.concurrency,
+            crawl_delay=default_crawler_config.crawl_delay,
+            max_crawl_delay=default_crawler_config.max_crawl_delay,
+            timeout=default_crawler_config.timeout,
+            max_retries=default_crawler_config.max_retries,
+            retry_delay_factor=default_crawler_config.retry_delay_factor,
+            headers=default_crawler_config.headers,
+            robots_cache_ttl=default_crawler_config.robots_cache_ttl,
+        )
+        try:
+            logger.info(
+                f"Starting to crawl {len(url_list)} URL(s) with BeautifulSoup (use_playwright={default_crawler_config.use_playwright})"
+            )
+            results = await crawler.fetch_urls(
+                urls,
+                use_playwright=default_crawler_config.use_playwright,
+                playwright_js_wait=default_crawler_config.playwright_js_wait,
+            )
+            logger.info(f"Successfully fetched content from {len(results)} URL(s)")
+            return results
+        except Exception as e:
+            logger.error(f"Error fetching page content: {str(e)}")
+            raise
+        finally:
+            logger.info("Closing BeautifulSoup crawler")
+            await crawler.close()
+
+
+async def fetch_with_tavily(urls: Union[str, List[str]]) -> UrlsToHtmls:
+    """Fetch content from URLs using the Tavily API.
+
+    Args:
+        urls: A single URL (str) or a list of URLs (List[str]) to scrape.
+
+    Returns:
+        Dict[str, str]: A dictionary mapping each URL to its raw content as a string.
+
+    Raises:
+        ImportError: If tavily-python is not installed.
+        Exception: If the Tavily API request fails.
+    """
+    try:
+        from tavily import AsyncTavilyClient
+    except ImportError:
+        logger.error(
+            "Failed to import tavily, make sure to install using pip install tavily-python>=0.7.0"
+        )
+        raise
+
+    tavily_config = TavilyConfig()
+    url_list = [urls] if isinstance(urls, str) else urls
+    extract_depth = tavily_config.extract_depth if tavily_config else "basic"
+    timeout = tavily_config.timeout if tavily_config else 10
+
+    logger.info(
+        f"Initializing Tavily client with extract_depth={extract_depth}, timeout={timeout}s"
+    )
+    client = AsyncTavilyClient(
+        api_key=tavily_config.api_key,
+        proxies=tavily_config.proxies,
+    )
+
+    logger.info(f"Sending extract request to Tavily API for {len(url_list)} URL(s)")
+    results = await client.extract(
+        urls,
+        format="text",
+        extract_depth=extract_depth,
+        timeout=timeout,
+    )
+
+    failed_count = len(results.get("failed_results", []))
+    if failed_count > 0:
+        logger.warning(f"Tavily API failed to fetch {failed_count} URL(s)")
+        for failed_result in results.get("failed_results", []):
+            logger.warning(f"Failed to fetch {failed_result}")
+
+    return_results = {}
+    for result in results.get("results", []):
+        return_results[result["url"]] = result["raw_content"]
+
+    logger.info(f"Successfully fetched content from {len(return_results)} URL(s) via Tavily")
+    return return_results
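A minimal usage sketch of the fetcher added above. Note that tool selection is driven entirely by the TAVILY_API_KEY environment variable, since `fetch_page_content` takes only the URLs; the entry-point wrapper below is an assumption for illustration.

```python
# Minimal usage sketch (not part of the diff): fetch_page_content uses Tavily when
# TAVILY_API_KEY is set, otherwise it falls back to the default BeautifulSoup crawler.
import asyncio

from cognee.tasks.web_scraper.utils import fetch_page_content


async def main():
    # Accepts a single URL or a list; returns a dict mapping each URL to its content.
    results = await fetch_page_content(
        ["https://example.com", "https://example.com/docs"]
    )
    for url, content in results.items():
        print(url, len(content))


if __name__ == "__main__":
    asyncio.run(main())
```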
cognee/tasks/web_scraper/web_scraper_task.py
@@ -0,0 +1,396 @@
+"""Web scraping tasks for storing scraped data in a graph database.
+
+This module provides functions to scrape web content, create or update WebPage, WebSite,
+and ScrapingJob data points, and store them in a Kuzu graph database. It supports
+scheduled scraping tasks and ensures that node updates preserve existing graph edges.
+"""
+
+import os
+import hashlib
+from datetime import datetime
+from typing import Union, List
+from urllib.parse import urlparse
+from uuid import uuid5, NAMESPACE_OID
+
+from cognee.infrastructure.databases.graph import get_graph_engine
+from cognee.shared.logging_utils import get_logger
+from cognee.tasks.storage.index_data_points import index_data_points
+from cognee.tasks.storage.index_graph_edges import index_graph_edges
+from cognee.modules.engine.operations.setup import setup
+
+from .models import WebPage, WebSite, ScrapingJob
+from .config import DefaultCrawlerConfig, TavilyConfig
+from .utils import fetch_page_content
+
+try:
+    from apscheduler.triggers.cron import CronTrigger
+    from apscheduler.schedulers.asyncio import AsyncIOScheduler
+except ImportError:
+    raise ImportError("Please install apscheduler by pip install APScheduler>=3.10")
+
+logger = get_logger(__name__)
+
+
+_scheduler = None
+
+
+def get_scheduler():
+    global _scheduler
+    if _scheduler is None:
+        _scheduler = AsyncIOScheduler()
+    return _scheduler
+
+
+async def cron_web_scraper_task(
+    url: Union[str, List[str]],
+    *,
+    schedule: str = None,
+    extraction_rules: dict = None,
+    tavily_api_key: str = os.getenv("TAVILY_API_KEY"),
+    soup_crawler_config: DefaultCrawlerConfig = None,
+    tavily_config: TavilyConfig = None,
+    job_name: str = "scraping",
+):
+    """Schedule or run a web scraping task.
+
+    This function schedules a recurring web scraping task using APScheduler or runs it
+    immediately if no schedule is provided. It delegates to web_scraper_task for actual
+    scraping and graph storage.
+
+    Args:
+        url: A single URL or list of URLs to scrape.
+        schedule: A cron expression for scheduling (e.g., "0 0 * * *"). If None, runs immediately.
+        extraction_rules: Dictionary of extraction rules for BeautifulSoup (e.g., CSS selectors).
+        tavily_api_key: API key for Tavily. Defaults to TAVILY_API_KEY environment variable.
+        soup_crawler_config: Configuration for BeautifulSoup crawler.
+        tavily_config: Configuration for Tavily API.
+        job_name: Name of the scraping job. Defaults to "scraping".
+
+    Returns:
+        Any: The result of web_scraper_task if run immediately, or None if scheduled.
+
+    Raises:
+        ValueError: If the schedule is an invalid cron expression.
+        ImportError: If APScheduler is not installed.
+    """
+    now = datetime.now()
+    job_name = job_name or f"scrape_{now.strftime('%Y%m%d_%H%M%S')}"
+    if schedule:
+        try:
+            trigger = CronTrigger.from_crontab(schedule)
+        except ValueError as e:
+            raise ValueError(f"Invalid cron string '{schedule}': {e}")
+
+        scheduler = get_scheduler()
+        scheduler.add_job(
+            web_scraper_task,
+            kwargs={
+                "url": url,
+                "schedule": schedule,
+                "extraction_rules": extraction_rules,
+                "tavily_api_key": tavily_api_key,
+                "soup_crawler_config": soup_crawler_config,
+                "tavily_config": tavily_config,
+                "job_name": job_name,
+            },
+            trigger=trigger,
+            id=job_name,
+            name=job_name or f"WebScraper_{uuid5(NAMESPACE_OID, name=job_name)}",
+            replace_existing=True,
+        )
+        if not scheduler.running:
+            scheduler.start()
+        return
+
+    # If no schedule, run immediately
+    logger.info(f"[{datetime.now()}] Running web scraper task immediately...")
+    return await web_scraper_task(
+        url=url,
+        schedule=schedule,
+        extraction_rules=extraction_rules,
+        tavily_api_key=tavily_api_key,
+        soup_crawler_config=soup_crawler_config,
+        tavily_config=tavily_config,
+        job_name=job_name,
+    )
+
+
+async def web_scraper_task(
+    url: Union[str, List[str]],
+    *,
+    schedule: str = None,
+    extraction_rules: dict = None,
+    tavily_api_key: str = os.getenv("TAVILY_API_KEY"),
+    soup_crawler_config: DefaultCrawlerConfig = None,
+    tavily_config: TavilyConfig = None,
+    job_name: str = None,
+):
+    """Scrape URLs and store data points in a Graph database.
+
+    This function scrapes content from the provided URLs, creates or updates WebPage,
+    WebSite, and ScrapingJob data points, and stores them in a Graph database.
+    Each data point includes a description field summarizing its attributes. It creates
+    'is_scraping' (ScrapingJob to WebSite) and 'is_part_of' (WebPage to WebSite)
+    relationships, preserving existing edges during node updates.
+
+    Args:
+        url: A single URL or list of URLs to scrape.
+        schedule: A cron expression for scheduling (e.g., "0 0 * * *"). If None, runs once.
+        extraction_rules: Dictionary of extraction rules for BeautifulSoup (e.g., CSS selectors).
+        tavily_api_key: API key for Tavily. Defaults to TAVILY_API_KEY environment variable.
+        soup_crawler_config: Configuration for BeautifulSoup crawler.
+        tavily_config: Configuration for Tavily API.
+        job_name: Name of the scraping job. Defaults to a timestamp-based name.
+
+    Returns:
+        Any: The graph data returned by the graph database.
+
+    Raises:
+        TypeError: If neither tavily_config nor soup_crawler_config is provided.
+        Exception: If fetching content or database operations fail.
+    """
+    await setup()
+    graph_db = await get_graph_engine()
+
+    if isinstance(url, str):
+        url = [url]
+
+    soup_crawler_config, tavily_config, preferred_tool = check_arguments(
+        tavily_api_key, extraction_rules, tavily_config, soup_crawler_config
+    )
+    now = datetime.now()
+    job_name = job_name or f"scrape_{now.strftime('%Y%m%d_%H%M%S')}"
+    status = "active"
+    trigger = CronTrigger.from_crontab(schedule) if schedule else None
+    next_run = trigger.get_next_fire_time(None, now) if trigger else None
+    scraping_job_created = await graph_db.get_node(uuid5(NAMESPACE_OID, name=job_name))
+
+    # Create description for ScrapingJob
+    scraping_job_description = (
+        f"Scraping job: {job_name}\n"
+        f"URLs: {', '.join(url)}\n"
+        f"Status: {status}\n"
+        f"Schedule: {schedule}\n"
+        f"Last run: {now.strftime('%Y-%m-%d %H:%M:%S')}\n"
+        f"Next run: {next_run.strftime('%Y-%m-%d %H:%M:%S') if next_run else 'Not scheduled'}"
+    )
+
+    scraping_job = ScrapingJob(
+        id=uuid5(NAMESPACE_OID, name=job_name),
+        name=job_name,
+        urls=url,
+        status=status,
+        schedule=schedule,
+        last_run=now,
+        next_run=next_run,
+        description=scraping_job_description,
+    )
+
+    if scraping_job_created:
+        await graph_db.add_node(scraping_job)  # Update existing scraping job
+    websites_dict = {}
+    webpages = []
+
+    # Fetch content
+    results = await fetch_page_content(
+        urls=url,
+        preferred_tool=preferred_tool,
+        tavily_config=tavily_config,
+        soup_crawler_config=soup_crawler_config,
+    )
+    for page_url, content in results.items():
+        parsed_url = urlparse(page_url)
+        domain = parsed_url.netloc
+        base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
+
+        # Create or update WebSite
+        if base_url not in websites_dict:
+            # Create description for WebSite
+            website_description = (
+                f"Website: {domain}\n"
+                f"Base URL: {base_url}\n"
+                f"Last crawled: {now.strftime('%Y-%m-%d %H:%M:%S')}\n"
+                f"Page count: 1\n"
+                f"Scraping tool: {preferred_tool}\n"
+                f"Robots.txt: {'Available' if websites_dict.get(base_url, {}).get('robots_txt') else 'Not set'}\n"
+                f"Crawl delay: 0.5 seconds"
+            )
+
+            websites_dict[base_url] = WebSite(
+                id=uuid5(NAMESPACE_OID, name=domain),
+                name=domain,
+                base_url=base_url,
+                robots_txt=None,
+                crawl_delay=0.5,
+                last_crawled=now,
+                page_count=1,
+                scraping_config={
+                    "extraction_rules": extraction_rules or {},
+                    "tool": preferred_tool,
+                },
+                description=website_description,
+            )
+            if scraping_job_created:
+                await graph_db.add_node(websites_dict[base_url])
+        else:
+            websites_dict[base_url].page_count += 1
+            # Update description for existing WebSite
+            websites_dict[base_url].description = (
+                f"Website: {domain}\n"
+                f"Base URL: {base_url}\n"
+                f"Last crawled: {now.strftime('%Y-%m-%d %H:%M:%S')}\n"
+                f"Page count: {websites_dict[base_url].page_count}\n"
+                f"Scraping tool: {preferred_tool}\n"
+                f"Robots.txt: {'Available' if websites_dict[base_url].robots_txt else 'Not set'}\n"
+                f"Crawl delay: {websites_dict[base_url].crawl_delay} seconds"
+            )
+            if scraping_job_created:
+                await graph_db.add_node(websites_dict[base_url])
+
+        # Create WebPage
+        content_str = content if isinstance(content, str) else str(content)
+        content_hash = hashlib.sha256(content_str.encode("utf-8")).hexdigest()
+        content_preview = content_str[:500] + ("..." if len(content_str) > 500 else "")
+        # Create description for WebPage
+        webpage_description = (
+            f"Webpage: {parsed_url.path.lstrip('/') or 'Home'}\n"
+            f"URL: {page_url}\n"
+            f"Scraped at: {now.strftime('%Y-%m-%d %H:%M:%S')}\n"
+            f"Content: {content_preview}\n"
+            f"Content type: text\n"
+            f"Page size: {len(content_str)} bytes\n"
+            f"Status code: 200"
+        )
+        page_extraction_rules = extraction_rules
+        webpage = WebPage(
+            id=uuid5(NAMESPACE_OID, name=page_url),
+            name=page_url,
+            content=content_str,
+            content_hash=content_hash,
+            scraped_at=now,
+            last_modified=None,
+            status_code=200,
+            content_type="text/html",
+            page_size=len(content_str),
+            extraction_rules=page_extraction_rules or {},
+            description=webpage_description,
+        )
+        webpages.append(webpage)
+
+    scraping_job.status = "completed" if webpages else "failed"
+    # Update ScrapingJob description with final status
+    scraping_job.description = (
+        f"Scraping job: {job_name}\n"
+        f"URLs: {', '.join(url)}\n"
+        f"Status: {scraping_job.status}\n"
+        f"Last run: {now.strftime('%Y-%m-%d %H:%M:%S')}\n"
+        f"Next run: {next_run.strftime('%Y-%m-%d %H:%M:%S') if next_run else 'Not scheduled'}"
+    )
+
+    websites = list(websites_dict.values())
+    # Adding Nodes and Edges
+    node_mapping = {scraping_job.id: scraping_job}
+    edge_mapping = []
+
+    for website in websites:
+        node_mapping[website.id] = website
+        edge_mapping.append(
+            (
+                scraping_job.id,
+                website.id,
+                "is_scraping",
+                {
+                    "source_node_id": scraping_job.id,
+                    "target_node_id": website.id,
+                    "relationship_name": "is_scraping",
+                },
+            )
+        )
+    for webpage in webpages:
+        node_mapping[webpage.id] = webpage
+        parsed_url = urlparse(webpage.name)
+        base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
+        edge_mapping.append(
+            (
+                webpage.id,
+                websites_dict[base_url].id,
+                "is_part_of",
+                {
+                    "source_node_id": webpage.id,
+                    "target_node_id": websites_dict[base_url].id,
+                    "relationship_name": "is_part_of",
+                },
+            )
+        )
+
+    await graph_db.add_nodes(list(node_mapping.values()))
+    await graph_db.add_edges(edge_mapping)
+    await index_data_points(list(node_mapping.values()))
+    await index_graph_edges()
+
+    return await graph_db.get_graph_data()
+
+
+def check_arguments(tavily_api_key, extraction_rules, tavily_config, soup_crawler_config):
+    """Validate and configure arguments for web_scraper_task.
+
+    Args:
+        tavily_api_key: API key for Tavily.
+        extraction_rules: Extraction rules for BeautifulSoup.
+        tavily_config: Configuration for Tavily API.
+        soup_crawler_config: Configuration for BeautifulSoup crawler.
+
+    Returns:
+        Tuple[DefaultCrawlerConfig, TavilyConfig, str]: Configured soup_crawler_config,
+        tavily_config, and preferred_tool ("tavily" or "beautifulsoup").
+
+    Raises:
+        TypeError: If neither tavily_config nor soup_crawler_config is provided.
+    """
+    preferred_tool = "beautifulsoup"
+
+    if extraction_rules and not soup_crawler_config:
+        soup_crawler_config = DefaultCrawlerConfig(extraction_rules=extraction_rules)
+
+    if tavily_api_key:
+        if not tavily_config:
+            tavily_config = TavilyConfig(api_key=tavily_api_key)
+        else:
+            tavily_config.api_key = tavily_api_key
+        if not extraction_rules and not soup_crawler_config:
+            preferred_tool = "tavily"
+
+    if not tavily_config and not soup_crawler_config:
+        raise TypeError("Make sure you pass arguments for web_scraper_task")
+
+    return soup_crawler_config, tavily_config, preferred_tool
+
+
+def get_path_after_base(base_url: str, url: str) -> str:
+    """Extract the path after the base URL.
+
+    Args:
+        base_url: The base URL (e.g., "https://example.com").
+        url: The full URL to extract the path from.
+
+    Returns:
+        str: The path after the base URL, with leading slashes removed.
+
+    Raises:
+        ValueError: If the base URL and target URL are from different domains.
+    """
+    parsed_base = urlparse(base_url)
+    parsed_url = urlparse(url)
+
+    # Ensure they have the same netloc (domain)
+    if parsed_base.netloc != parsed_url.netloc:
+        raise ValueError("Base URL and target URL are from different domains")
+
+    # Return everything after base_url path
+    base_path = parsed_base.path.rstrip("/")
+    full_path = parsed_url.path
+
+    if full_path.startswith(base_path):
+        return full_path[len(base_path) :].lstrip("/")
+    else:
+        return full_path.lstrip("/")
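A hypothetical invocation sketch for the task above, not taken from the diff: passing `extraction_rules` leads `check_arguments` to build a `DefaultCrawlerConfig` and keep "beautifulsoup" as the tool, while providing a `schedule` registers the job with APScheduler instead of running it once. The `extraction_rules` keys shown here are assumptions for illustration only.

```python
# Hypothetical usage sketch (not part of the diff); argument values are illustrative.
import asyncio

from cognee.tasks.web_scraper.web_scraper_task import cron_web_scraper_task


async def main():
    # Run once, immediately (no schedule). The extraction_rules schema is an assumption;
    # it is forwarded to the BeautifulSoup crawler configuration.
    await cron_web_scraper_task(
        "https://example.com/blog",
        extraction_rules={"title": "h1", "body": "article p"},
        job_name="example_blog_scrape",
    )

    # Or register a recurring scrape every day at midnight via APScheduler.
    await cron_web_scraper_task(
        "https://example.com/blog",
        schedule="0 0 * * *",
        extraction_rules={"title": "h1", "body": "article p"},
        job_name="example_blog_scrape_daily",
    )


if __name__ == "__main__":
    asyncio.run(main())
```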
cognee/tests/integration/web_url_crawler/test_default_url_crawler.py
@@ -0,0 +1,13 @@
+import pytest
+from cognee.tasks.web_scraper import DefaultUrlCrawler
+
+
+@pytest.mark.asyncio
+async def test_fetch():
+    crawler = DefaultUrlCrawler()
+    url = "https://en.wikipedia.org/wiki/Large_language_model"
+    results = await crawler.fetch_urls(url)
+    assert len(results) == 1
+    assert isinstance(results, dict)
+    html = results[url]
+    assert isinstance(html, str)
cognee/tests/integration/web_url_crawler/test_tavily_crawler.py
@@ -0,0 +1,19 @@
+import os
+import pytest
+from cognee.tasks.web_scraper.utils import fetch_with_tavily
+
+skip_in_ci = pytest.mark.skipif(
+    os.getenv("GITHUB_ACTIONS") == "true",
+    reason="Skipping in Github for now - before we get TAVILY_API_KEY",
+)
+
+
+@skip_in_ci
+@pytest.mark.asyncio
+async def test_fetch():
+    url = "https://en.wikipedia.org/wiki/Large_language_model"
+    results = await fetch_with_tavily(url)
+    assert isinstance(results, dict)
+    assert len(results) == 1
+    html = results[url]
+    assert isinstance(html, str)