cognee 0.3.6__py3-none-any.whl → 0.3.7.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (182)
  1. cognee/__init__.py +1 -0
  2. cognee/api/health.py +2 -12
  3. cognee/api/v1/add/add.py +46 -6
  4. cognee/api/v1/add/routers/get_add_router.py +11 -2
  5. cognee/api/v1/cognify/cognify.py +29 -9
  6. cognee/api/v1/cognify/routers/get_cognify_router.py +2 -1
  7. cognee/api/v1/datasets/datasets.py +11 -0
  8. cognee/api/v1/datasets/routers/get_datasets_router.py +8 -0
  9. cognee/api/v1/delete/routers/get_delete_router.py +2 -0
  10. cognee/api/v1/memify/routers/get_memify_router.py +2 -1
  11. cognee/api/v1/permissions/routers/get_permissions_router.py +6 -0
  12. cognee/api/v1/responses/default_tools.py +0 -1
  13. cognee/api/v1/responses/dispatch_function.py +1 -1
  14. cognee/api/v1/responses/routers/default_tools.py +0 -1
  15. cognee/api/v1/search/routers/get_search_router.py +3 -3
  16. cognee/api/v1/search/search.py +11 -9
  17. cognee/api/v1/settings/routers/get_settings_router.py +7 -1
  18. cognee/api/v1/sync/routers/get_sync_router.py +3 -0
  19. cognee/api/v1/ui/ui.py +45 -16
  20. cognee/api/v1/update/routers/get_update_router.py +3 -1
  21. cognee/api/v1/update/update.py +3 -3
  22. cognee/api/v1/users/routers/get_visualize_router.py +2 -0
  23. cognee/cli/_cognee.py +61 -10
  24. cognee/cli/commands/add_command.py +3 -3
  25. cognee/cli/commands/cognify_command.py +3 -3
  26. cognee/cli/commands/config_command.py +9 -7
  27. cognee/cli/commands/delete_command.py +3 -3
  28. cognee/cli/commands/search_command.py +3 -7
  29. cognee/cli/config.py +0 -1
  30. cognee/context_global_variables.py +5 -0
  31. cognee/exceptions/exceptions.py +1 -1
  32. cognee/infrastructure/databases/cache/__init__.py +2 -0
  33. cognee/infrastructure/databases/cache/cache_db_interface.py +79 -0
  34. cognee/infrastructure/databases/cache/config.py +44 -0
  35. cognee/infrastructure/databases/cache/get_cache_engine.py +67 -0
  36. cognee/infrastructure/databases/cache/redis/RedisAdapter.py +243 -0
  37. cognee/infrastructure/databases/exceptions/__init__.py +1 -0
  38. cognee/infrastructure/databases/exceptions/exceptions.py +18 -2
  39. cognee/infrastructure/databases/graph/get_graph_engine.py +1 -1
  40. cognee/infrastructure/databases/graph/graph_db_interface.py +5 -0
  41. cognee/infrastructure/databases/graph/kuzu/adapter.py +76 -47
  42. cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +13 -3
  43. cognee/infrastructure/databases/graph/neo4j_driver/deadlock_retry.py +1 -1
  44. cognee/infrastructure/databases/graph/neptune_driver/neptune_utils.py +1 -1
  45. cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +1 -1
  46. cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +21 -3
  47. cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +17 -10
  48. cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +17 -4
  49. cognee/infrastructure/databases/vector/embeddings/config.py +2 -3
  50. cognee/infrastructure/databases/vector/exceptions/exceptions.py +1 -1
  51. cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +0 -1
  52. cognee/infrastructure/files/exceptions.py +1 -1
  53. cognee/infrastructure/files/storage/LocalFileStorage.py +9 -9
  54. cognee/infrastructure/files/storage/S3FileStorage.py +11 -11
  55. cognee/infrastructure/files/utils/guess_file_type.py +6 -0
  56. cognee/infrastructure/llm/prompts/feedback_reaction_prompt.txt +14 -0
  57. cognee/infrastructure/llm/prompts/feedback_report_prompt.txt +13 -0
  58. cognee/infrastructure/llm/prompts/feedback_user_context_prompt.txt +5 -0
  59. cognee/infrastructure/llm/prompts/search_type_selector_prompt.txt +0 -5
  60. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py +19 -9
  61. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py +17 -5
  62. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py +17 -5
  63. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py +32 -0
  64. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/__init__.py +0 -0
  65. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/adapter.py +109 -0
  66. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py +33 -8
  67. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +40 -18
  68. cognee/infrastructure/loaders/LoaderEngine.py +27 -7
  69. cognee/infrastructure/loaders/external/__init__.py +7 -0
  70. cognee/infrastructure/loaders/external/advanced_pdf_loader.py +2 -8
  71. cognee/infrastructure/loaders/external/beautiful_soup_loader.py +310 -0
  72. cognee/infrastructure/loaders/supported_loaders.py +7 -0
  73. cognee/modules/data/exceptions/exceptions.py +1 -1
  74. cognee/modules/data/methods/__init__.py +3 -0
  75. cognee/modules/data/methods/get_dataset_data.py +4 -1
  76. cognee/modules/data/methods/has_dataset_data.py +21 -0
  77. cognee/modules/engine/models/TableRow.py +0 -1
  78. cognee/modules/ingestion/save_data_to_file.py +9 -2
  79. cognee/modules/pipelines/exceptions/exceptions.py +1 -1
  80. cognee/modules/pipelines/operations/pipeline.py +12 -1
  81. cognee/modules/pipelines/operations/run_tasks.py +25 -197
  82. cognee/modules/pipelines/operations/run_tasks_base.py +7 -0
  83. cognee/modules/pipelines/operations/run_tasks_data_item.py +260 -0
  84. cognee/modules/pipelines/operations/run_tasks_distributed.py +121 -38
  85. cognee/modules/pipelines/operations/run_tasks_with_telemetry.py +9 -1
  86. cognee/modules/retrieval/EntityCompletionRetriever.py +48 -8
  87. cognee/modules/retrieval/base_graph_retriever.py +3 -1
  88. cognee/modules/retrieval/base_retriever.py +3 -1
  89. cognee/modules/retrieval/chunks_retriever.py +5 -1
  90. cognee/modules/retrieval/code_retriever.py +20 -2
  91. cognee/modules/retrieval/completion_retriever.py +50 -9
  92. cognee/modules/retrieval/cypher_search_retriever.py +11 -1
  93. cognee/modules/retrieval/graph_completion_context_extension_retriever.py +47 -8
  94. cognee/modules/retrieval/graph_completion_cot_retriever.py +152 -22
  95. cognee/modules/retrieval/graph_completion_retriever.py +54 -10
  96. cognee/modules/retrieval/lexical_retriever.py +20 -2
  97. cognee/modules/retrieval/natural_language_retriever.py +10 -1
  98. cognee/modules/retrieval/summaries_retriever.py +5 -1
  99. cognee/modules/retrieval/temporal_retriever.py +62 -10
  100. cognee/modules/retrieval/user_qa_feedback.py +3 -2
  101. cognee/modules/retrieval/utils/completion.py +30 -4
  102. cognee/modules/retrieval/utils/description_to_codepart_search.py +1 -1
  103. cognee/modules/retrieval/utils/session_cache.py +156 -0
  104. cognee/modules/search/methods/get_search_type_tools.py +0 -5
  105. cognee/modules/search/methods/no_access_control_search.py +12 -1
  106. cognee/modules/search/methods/search.py +51 -5
  107. cognee/modules/search/types/SearchType.py +0 -1
  108. cognee/modules/settings/get_settings.py +23 -0
  109. cognee/modules/users/methods/get_authenticated_user.py +3 -1
  110. cognee/modules/users/methods/get_default_user.py +1 -6
  111. cognee/modules/users/roles/methods/create_role.py +2 -2
  112. cognee/modules/users/tenants/methods/create_tenant.py +2 -2
  113. cognee/shared/exceptions/exceptions.py +1 -1
  114. cognee/shared/logging_utils.py +18 -11
  115. cognee/shared/utils.py +24 -2
  116. cognee/tasks/codingagents/coding_rule_associations.py +1 -2
  117. cognee/tasks/documents/exceptions/exceptions.py +1 -1
  118. cognee/tasks/feedback/__init__.py +13 -0
  119. cognee/tasks/feedback/create_enrichments.py +84 -0
  120. cognee/tasks/feedback/extract_feedback_interactions.py +230 -0
  121. cognee/tasks/feedback/generate_improved_answers.py +130 -0
  122. cognee/tasks/feedback/link_enrichments_to_feedback.py +67 -0
  123. cognee/tasks/feedback/models.py +26 -0
  124. cognee/tasks/graph/extract_graph_from_data.py +2 -0
  125. cognee/tasks/ingestion/data_item_to_text_file.py +3 -3
  126. cognee/tasks/ingestion/ingest_data.py +11 -5
  127. cognee/tasks/ingestion/save_data_item_to_storage.py +12 -1
  128. cognee/tasks/storage/add_data_points.py +3 -10
  129. cognee/tasks/storage/index_data_points.py +19 -14
  130. cognee/tasks/storage/index_graph_edges.py +25 -11
  131. cognee/tasks/web_scraper/__init__.py +34 -0
  132. cognee/tasks/web_scraper/config.py +26 -0
  133. cognee/tasks/web_scraper/default_url_crawler.py +446 -0
  134. cognee/tasks/web_scraper/models.py +46 -0
  135. cognee/tasks/web_scraper/types.py +4 -0
  136. cognee/tasks/web_scraper/utils.py +142 -0
  137. cognee/tasks/web_scraper/web_scraper_task.py +396 -0
  138. cognee/tests/cli_tests/cli_unit_tests/test_cli_utils.py +0 -1
  139. cognee/tests/integration/web_url_crawler/test_default_url_crawler.py +13 -0
  140. cognee/tests/integration/web_url_crawler/test_tavily_crawler.py +19 -0
  141. cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py +344 -0
  142. cognee/tests/subprocesses/reader.py +25 -0
  143. cognee/tests/subprocesses/simple_cognify_1.py +31 -0
  144. cognee/tests/subprocesses/simple_cognify_2.py +31 -0
  145. cognee/tests/subprocesses/writer.py +32 -0
  146. cognee/tests/tasks/descriptive_metrics/metrics_test_utils.py +0 -2
  147. cognee/tests/tasks/descriptive_metrics/neo4j_metrics_test.py +8 -3
  148. cognee/tests/tasks/entity_extraction/entity_extraction_test.py +89 -0
  149. cognee/tests/tasks/web_scraping/web_scraping_test.py +172 -0
  150. cognee/tests/test_add_docling_document.py +56 -0
  151. cognee/tests/test_chromadb.py +7 -11
  152. cognee/tests/test_concurrent_subprocess_access.py +76 -0
  153. cognee/tests/test_conversation_history.py +240 -0
  154. cognee/tests/test_feedback_enrichment.py +174 -0
  155. cognee/tests/test_kuzu.py +27 -15
  156. cognee/tests/test_lancedb.py +7 -11
  157. cognee/tests/test_library.py +32 -2
  158. cognee/tests/test_neo4j.py +24 -16
  159. cognee/tests/test_neptune_analytics_vector.py +7 -11
  160. cognee/tests/test_permissions.py +9 -13
  161. cognee/tests/test_pgvector.py +4 -4
  162. cognee/tests/test_remote_kuzu.py +8 -11
  163. cognee/tests/test_s3_file_storage.py +1 -1
  164. cognee/tests/test_search_db.py +6 -8
  165. cognee/tests/unit/infrastructure/databases/cache/test_cache_config.py +89 -0
  166. cognee/tests/unit/modules/retrieval/conversation_history_test.py +154 -0
  167. cognee/tests/unit/modules/retrieval/graph_completion_retriever_cot_test.py +51 -0
  168. {cognee-0.3.6.dist-info → cognee-0.3.7.dev1.dist-info}/METADATA +21 -6
  169. {cognee-0.3.6.dist-info → cognee-0.3.7.dev1.dist-info}/RECORD +178 -139
  170. {cognee-0.3.6.dist-info → cognee-0.3.7.dev1.dist-info}/entry_points.txt +1 -0
  171. distributed/Dockerfile +0 -3
  172. distributed/entrypoint.py +21 -9
  173. distributed/signal.py +5 -0
  174. distributed/workers/data_point_saving_worker.py +64 -34
  175. distributed/workers/graph_saving_worker.py +71 -47
  176. cognee/infrastructure/databases/graph/memgraph/memgraph_adapter.py +0 -1116
  177. cognee/modules/retrieval/insights_retriever.py +0 -133
  178. cognee/tests/test_memgraph.py +0 -109
  179. cognee/tests/unit/modules/retrieval/insights_retriever_test.py +0 -251
  180. {cognee-0.3.6.dist-info → cognee-0.3.7.dev1.dist-info}/WHEEL +0 -0
  181. {cognee-0.3.6.dist-info → cognee-0.3.7.dev1.dist-info}/licenses/LICENSE +0 -0
  182. {cognee-0.3.6.dist-info → cognee-0.3.7.dev1.dist-info}/licenses/NOTICE.md +0 -0
cognee/tasks/web_scraper/models.py
@@ -0,0 +1,46 @@
+ from cognee.infrastructure.engine import DataPoint
+ from typing import Optional, Dict, Any, List
+ from datetime import datetime
+
+
+ class WebPage(DataPoint):
+     """Represents a scraped web page with metadata"""
+
+     name: Optional[str]
+     content: str
+     content_hash: str
+     scraped_at: datetime
+     last_modified: Optional[datetime]
+     status_code: int
+     content_type: str
+     page_size: int
+     extraction_rules: Dict[str, Any]  # CSS selectors, XPath rules used
+     description: str
+     metadata: dict = {"index_fields": ["name", "description", "content"]}
+
+
+ class WebSite(DataPoint):
+     """Represents a website or domain being scraped"""
+
+     name: str
+     base_url: str
+     robots_txt: Optional[str]
+     crawl_delay: float
+     last_crawled: datetime
+     page_count: int
+     scraping_config: Dict[str, Any]
+     description: str
+     metadata: dict = {"index_fields": ["name", "description"]}
+
+
+ class ScrapingJob(DataPoint):
+     """Represents a scraping job configuration"""
+
+     name: str
+     urls: List[str]
+     schedule: Optional[str]  # Cron-like schedule for recurring scrapes
+     status: str  # "active", "paused", "completed", "failed"
+     last_run: Optional[datetime]
+     next_run: Optional[datetime]
+     description: str
+     metadata: dict = {"index_fields": ["name", "description"]}
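For orientation, a minimal sketch of constructing these data points by hand; all field values here are placeholders, and the id keyword plus the SHA-256 content hash mirror how web_scraper_task.py (later in this diff) populates them:

import hashlib
from datetime import datetime
from uuid import uuid5, NAMESPACE_OID

from cognee.tasks.web_scraper.models import WebPage, WebSite

content = "<html><body>Hello</body></html>"  # placeholder page body

# Hypothetical website node for example.com.
site = WebSite(
    id=uuid5(NAMESPACE_OID, name="example.com"),
    name="example.com",
    base_url="https://example.com",
    robots_txt=None,
    crawl_delay=0.5,
    last_crawled=datetime.now(),
    page_count=1,
    scraping_config={"tool": "beautifulsoup", "extraction_rules": {}},
    description="Website: example.com",
)

# Hypothetical page node belonging to that site.
page = WebPage(
    id=uuid5(NAMESPACE_OID, name="https://example.com/"),
    name="https://example.com/",
    content=content,
    content_hash=hashlib.sha256(content.encode("utf-8")).hexdigest(),
    scraped_at=datetime.now(),
    last_modified=None,
    status_code=200,
    content_type="text/html",
    page_size=len(content),
    extraction_rules={},
    description="Webpage: Home",
)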
cognee/tasks/web_scraper/types.py
@@ -0,0 +1,4 @@
+ from typing import TypeAlias
+
+
+ UrlsToHtmls: TypeAlias = dict[str, str]
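The alias simply names the URL-to-HTML mapping returned by the fetch helpers below; a trivial illustration with a placeholder entry:

from cognee.tasks.web_scraper.types import UrlsToHtmls

# Maps each fetched URL to the raw page content as a string.
pages: UrlsToHtmls = {
    "https://example.com/": "<html><body>Hello</body></html>",
}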
cognee/tasks/web_scraper/utils.py
@@ -0,0 +1,142 @@
+ """Utilities for fetching web content using BeautifulSoup or Tavily.
+
+ This module provides functions to fetch and extract content from web pages, supporting
+ both BeautifulSoup for custom extraction rules and Tavily for API-based scraping.
+ """
+
+ import os
+ from typing import List, Union
+ from cognee.shared.logging_utils import get_logger
+ from cognee.tasks.web_scraper.types import UrlsToHtmls
+ from .default_url_crawler import DefaultUrlCrawler
+ from .config import DefaultCrawlerConfig, TavilyConfig
+
+ logger = get_logger(__name__)
+
+
+ async def fetch_page_content(urls: Union[str, List[str]]) -> UrlsToHtmls:
+     """Fetch content from one or more URLs using the specified tool.
+
+     This function retrieves web page content using either BeautifulSoup (with custom
+     extraction rules) or Tavily (API-based scraping). It handles single URLs or lists of
+     URLs and returns a dictionary mapping URLs to their extracted content.
+
+     Args:
+         urls: A single URL (str) or a list of URLs (List[str]) to scrape.
+         preferred_tool: The scraping tool to use ("tavily" or "beautifulsoup").
+             Defaults to "beautifulsoup".
+         tavily_config: Configuration for Tavily API, including API key.
+             Required if preferred_tool is "tavily".
+         default_crawler_config: Configuration for BeautifulSoup crawler, including
+             extraction rules. Required if preferred_tool is "beautifulsoup" and
+             extraction_rules are needed.
+
+     Returns:
+         Dict[str, str]: A dictionary mapping each URL to its
+         extracted content (as a string for BeautifulSoup or a dict for Tavily).
+
+     Raises:
+         ValueError: If Tavily API key is missing when using Tavily, or if
+             extraction_rules are not provided when using BeautifulSoup.
+         ImportError: If required dependencies (beautifulsoup4 or tavily-python) are not
+             installed.
+     """
+     url_list = [urls] if isinstance(urls, str) else urls
+
+     if os.getenv("TAVILY_API_KEY"):
+         logger.info("Using Tavily API for url fetching")
+         return await fetch_with_tavily(urls)
+     else:
+         logger.info("Using default crawler for content extraction")
+
+         default_crawler_config = (
+             DefaultCrawlerConfig()
+         )  # We've decided to use defaults, and configure through env vars as needed
+
+         logger.info(
+             f"Initializing BeautifulSoup crawler with concurrency={default_crawler_config.concurrency}, timeout={default_crawler_config.timeout}s, max_crawl_delay={default_crawler_config.max_crawl_delay}s"
+         )
+
+         crawler = DefaultUrlCrawler(
+             concurrency=default_crawler_config.concurrency,
+             crawl_delay=default_crawler_config.crawl_delay,
+             max_crawl_delay=default_crawler_config.max_crawl_delay,
+             timeout=default_crawler_config.timeout,
+             max_retries=default_crawler_config.max_retries,
+             retry_delay_factor=default_crawler_config.retry_delay_factor,
+             headers=default_crawler_config.headers,
+             robots_cache_ttl=default_crawler_config.robots_cache_ttl,
+         )
+         try:
+             logger.info(
+                 f"Starting to crawl {len(url_list)} URL(s) with BeautifulSoup (use_playwright={default_crawler_config.use_playwright})"
+             )
+             results = await crawler.fetch_urls(
+                 urls,
+                 use_playwright=default_crawler_config.use_playwright,
+                 playwright_js_wait=default_crawler_config.playwright_js_wait,
+             )
+             logger.info(f"Successfully fetched content from {len(results)} URL(s)")
+             return results
+         except Exception as e:
+             logger.error(f"Error fetching page content: {str(e)}")
+             raise
+         finally:
+             logger.info("Closing BeautifulSoup crawler")
+             await crawler.close()
+
+
+ async def fetch_with_tavily(urls: Union[str, List[str]]) -> UrlsToHtmls:
+     """Fetch content from URLs using the Tavily API.
+
+     Args:
+         urls: A single URL (str) or a list of URLs (List[str]) to scrape.
+
+     Returns:
+         Dict[str, str]: A dictionary mapping each URL to its raw content as a string.
+
+     Raises:
+         ImportError: If tavily-python is not installed.
+         Exception: If the Tavily API request fails.
+     """
+     try:
+         from tavily import AsyncTavilyClient
+     except ImportError:
+         logger.error(
+             "Failed to import tavily, make sure to install using pip install tavily-python>=0.7.0"
+         )
+         raise
+
+     tavily_config = TavilyConfig()
+     url_list = [urls] if isinstance(urls, str) else urls
+     extract_depth = tavily_config.extract_depth if tavily_config else "basic"
+     timeout = tavily_config.timeout if tavily_config else 10
+
+     logger.info(
+         f"Initializing Tavily client with extract_depth={extract_depth}, timeout={timeout}s"
+     )
+     client = AsyncTavilyClient(
+         api_key=tavily_config.api_key,
+         proxies=tavily_config.proxies,
+     )
+
+     logger.info(f"Sending extract request to Tavily API for {len(url_list)} URL(s)")
+     results = await client.extract(
+         urls,
+         format="text",
+         extract_depth=extract_depth,
+         timeout=timeout,
+     )
+
+     failed_count = len(results.get("failed_results", []))
+     if failed_count > 0:
+         logger.warning(f"Tavily API failed to fetch {failed_count} URL(s)")
+         for failed_result in results.get("failed_results", []):
+             logger.warning(f"Failed to fetch {failed_result}")
+
+     return_results = {}
+     for result in results.get("results", []):
+         return_results[result["url"]] = result["raw_content"]
+
+     logger.info(f"Successfully fetched content from {len(return_results)} URL(s) via Tavily")
+     return return_results
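A minimal sketch of driving fetch_page_content from a script, assuming this version of cognee is installed; which backend runs is decided solely by whether TAVILY_API_KEY is set, as the branch above shows. The Wikipedia URL is only an example:

import asyncio

from cognee.tasks.web_scraper.utils import fetch_page_content


async def main():
    # With TAVILY_API_KEY set, fetch_page_content() delegates to fetch_with_tavily();
    # otherwise it falls back to DefaultUrlCrawler, configured through environment variables.
    pages = await fetch_page_content("https://en.wikipedia.org/wiki/Large_language_model")
    for url, html in pages.items():
        print(url, len(html))


asyncio.run(main())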
cognee/tasks/web_scraper/web_scraper_task.py
@@ -0,0 +1,396 @@
+ """Web scraping tasks for storing scraped data in a graph database.
+
+ This module provides functions to scrape web content, create or update WebPage, WebSite,
+ and ScrapingJob data points, and store them in a Kuzu graph database. It supports
+ scheduled scraping tasks and ensures that node updates preserve existing graph edges.
+ """
+
+ import os
+ import hashlib
+ from datetime import datetime
+ from typing import Union, List
+ from urllib.parse import urlparse
+ from uuid import uuid5, NAMESPACE_OID
+
+ from cognee.infrastructure.databases.graph import get_graph_engine
+ from cognee.shared.logging_utils import get_logger
+ from cognee.tasks.storage.index_data_points import index_data_points
+ from cognee.tasks.storage.index_graph_edges import index_graph_edges
+ from cognee.modules.engine.operations.setup import setup
+
+ from .models import WebPage, WebSite, ScrapingJob
+ from .config import DefaultCrawlerConfig, TavilyConfig
+ from .utils import fetch_page_content
+
+ try:
+     from apscheduler.triggers.cron import CronTrigger
+     from apscheduler.schedulers.asyncio import AsyncIOScheduler
+ except ImportError:
+     raise ImportError("Please install apscheduler by pip install APScheduler>=3.10")
+
+ logger = get_logger(__name__)
+
+
+ _scheduler = None
+
+
+ def get_scheduler():
+     global _scheduler
+     if _scheduler is None:
+         _scheduler = AsyncIOScheduler()
+     return _scheduler
+
+
+ async def cron_web_scraper_task(
+     url: Union[str, List[str]],
+     *,
+     schedule: str = None,
+     extraction_rules: dict = None,
+     tavily_api_key: str = os.getenv("TAVILY_API_KEY"),
+     soup_crawler_config: DefaultCrawlerConfig = None,
+     tavily_config: TavilyConfig = None,
+     job_name: str = "scraping",
+ ):
+     """Schedule or run a web scraping task.
+
+     This function schedules a recurring web scraping task using APScheduler or runs it
+     immediately if no schedule is provided. It delegates to web_scraper_task for actual
+     scraping and graph storage.
+
+     Args:
+         url: A single URL or list of URLs to scrape.
+         schedule: A cron expression for scheduling (e.g., "0 0 * * *"). If None, runs immediately.
+         extraction_rules: Dictionary of extraction rules for BeautifulSoup (e.g., CSS selectors).
+         tavily_api_key: API key for Tavily. Defaults to TAVILY_API_KEY environment variable.
+         soup_crawler_config: Configuration for BeautifulSoup crawler.
+         tavily_config: Configuration for Tavily API.
+         job_name: Name of the scraping job. Defaults to "scraping".
+
+     Returns:
+         Any: The result of web_scraper_task if run immediately, or None if scheduled.
+
+     Raises:
+         ValueError: If the schedule is an invalid cron expression.
+         ImportError: If APScheduler is not installed.
+     """
+     now = datetime.now()
+     job_name = job_name or f"scrape_{now.strftime('%Y%m%d_%H%M%S')}"
+     if schedule:
+         try:
+             trigger = CronTrigger.from_crontab(schedule)
+         except ValueError as e:
+             raise ValueError(f"Invalid cron string '{schedule}': {e}")
+
+         scheduler = get_scheduler()
+         scheduler.add_job(
+             web_scraper_task,
+             kwargs={
+                 "url": url,
+                 "schedule": schedule,
+                 "extraction_rules": extraction_rules,
+                 "tavily_api_key": tavily_api_key,
+                 "soup_crawler_config": soup_crawler_config,
+                 "tavily_config": tavily_config,
+                 "job_name": job_name,
+             },
+             trigger=trigger,
+             id=job_name,
+             name=job_name or f"WebScraper_{uuid5(NAMESPACE_OID, name=job_name)}",
+             replace_existing=True,
+         )
+         if not scheduler.running:
+             scheduler.start()
+         return
+
+     # If no schedule, run immediately
+     logger.info(f"[{datetime.now()}] Running web scraper task immediately...")
+     return await web_scraper_task(
+         url=url,
+         schedule=schedule,
+         extraction_rules=extraction_rules,
+         tavily_api_key=tavily_api_key,
+         soup_crawler_config=soup_crawler_config,
+         tavily_config=tavily_config,
+         job_name=job_name,
+     )
+
+
+ async def web_scraper_task(
+     url: Union[str, List[str]],
+     *,
+     schedule: str = None,
+     extraction_rules: dict = None,
+     tavily_api_key: str = os.getenv("TAVILY_API_KEY"),
+     soup_crawler_config: DefaultCrawlerConfig = None,
+     tavily_config: TavilyConfig = None,
+     job_name: str = None,
+ ):
+     """Scrape URLs and store data points in a Graph database.
+
+     This function scrapes content from the provided URLs, creates or updates WebPage,
+     WebSite, and ScrapingJob data points, and stores them in a Graph database.
+     Each data point includes a description field summarizing its attributes. It creates
+     'is_scraping' (ScrapingJob to WebSite) and 'is_part_of' (WebPage to WebSite)
+     relationships, preserving existing edges during node updates.
+
+     Args:
+         url: A single URL or list of URLs to scrape.
+         schedule: A cron expression for scheduling (e.g., "0 0 * * *"). If None, runs once.
+         extraction_rules: Dictionary of extraction rules for BeautifulSoup (e.g., CSS selectors).
+         tavily_api_key: API key for Tavily. Defaults to TAVILY_API_KEY environment variable.
+         soup_crawler_config: Configuration for BeautifulSoup crawler.
+         tavily_config: Configuration for Tavily API.
+         job_name: Name of the scraping job. Defaults to a timestamp-based name.
+
+     Returns:
+         Any: The graph data returned by the graph database.
+
+     Raises:
+         TypeError: If neither tavily_config nor soup_crawler_config is provided.
+         Exception: If fetching content or database operations fail.
+     """
+     await setup()
+     graph_db = await get_graph_engine()
+
+     if isinstance(url, str):
+         url = [url]
+
+     soup_crawler_config, tavily_config, preferred_tool = check_arguments(
+         tavily_api_key, extraction_rules, tavily_config, soup_crawler_config
+     )
+     now = datetime.now()
+     job_name = job_name or f"scrape_{now.strftime('%Y%m%d_%H%M%S')}"
+     status = "active"
+     trigger = CronTrigger.from_crontab(schedule) if schedule else None
+     next_run = trigger.get_next_fire_time(None, now) if trigger else None
+     scraping_job_created = await graph_db.get_node(uuid5(NAMESPACE_OID, name=job_name))
+
+     # Create description for ScrapingJob
+     scraping_job_description = (
+         f"Scraping job: {job_name}\n"
+         f"URLs: {', '.join(url)}\n"
+         f"Status: {status}\n"
+         f"Schedule: {schedule}\n"
+         f"Last run: {now.strftime('%Y-%m-%d %H:%M:%S')}\n"
+         f"Next run: {next_run.strftime('%Y-%m-%d %H:%M:%S') if next_run else 'Not scheduled'}"
+     )
+
+     scraping_job = ScrapingJob(
+         id=uuid5(NAMESPACE_OID, name=job_name),
+         name=job_name,
+         urls=url,
+         status=status,
+         schedule=schedule,
+         last_run=now,
+         next_run=next_run,
+         description=scraping_job_description,
+     )
+
+     if scraping_job_created:
+         await graph_db.add_node(scraping_job)  # Update existing scraping job
+     websites_dict = {}
+     webpages = []
+
+     # Fetch content
+     results = await fetch_page_content(
+         urls=url,
+         preferred_tool=preferred_tool,
+         tavily_config=tavily_config,
+         soup_crawler_config=soup_crawler_config,
+     )
+     for page_url, content in results.items():
+         parsed_url = urlparse(page_url)
+         domain = parsed_url.netloc
+         base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
+
+         # Create or update WebSite
+         if base_url not in websites_dict:
+             # Create description for WebSite
+             website_description = (
+                 f"Website: {domain}\n"
+                 f"Base URL: {base_url}\n"
+                 f"Last crawled: {now.strftime('%Y-%m-%d %H:%M:%S')}\n"
+                 f"Page count: 1\n"
+                 f"Scraping tool: {preferred_tool}\n"
+                 f"Robots.txt: {'Available' if websites_dict.get(base_url, {}).get('robots_txt') else 'Not set'}\n"
+                 f"Crawl delay: 0.5 seconds"
+             )
+
+             websites_dict[base_url] = WebSite(
+                 id=uuid5(NAMESPACE_OID, name=domain),
+                 name=domain,
+                 base_url=base_url,
+                 robots_txt=None,
+                 crawl_delay=0.5,
+                 last_crawled=now,
+                 page_count=1,
+                 scraping_config={
+                     "extraction_rules": extraction_rules or {},
+                     "tool": preferred_tool,
+                 },
+                 description=website_description,
+             )
+             if scraping_job_created:
+                 await graph_db.add_node(websites_dict[base_url])
+         else:
+             websites_dict[base_url].page_count += 1
+             # Update description for existing WebSite
+             websites_dict[base_url].description = (
+                 f"Website: {domain}\n"
+                 f"Base URL: {base_url}\n"
+                 f"Last crawled: {now.strftime('%Y-%m-%d %H:%M:%S')}\n"
+                 f"Page count: {websites_dict[base_url].page_count}\n"
+                 f"Scraping tool: {preferred_tool}\n"
+                 f"Robots.txt: {'Available' if websites_dict[base_url].robots_txt else 'Not set'}\n"
+                 f"Crawl delay: {websites_dict[base_url].crawl_delay} seconds"
+             )
+             if scraping_job_created:
+                 await graph_db.add_node(websites_dict[base_url])
+
+         # Create WebPage
+         content_str = content if isinstance(content, str) else str(content)
+         content_hash = hashlib.sha256(content_str.encode("utf-8")).hexdigest()
+         content_preview = content_str[:500] + ("..." if len(content_str) > 500 else "")
+         # Create description for WebPage
+         webpage_description = (
+             f"Webpage: {parsed_url.path.lstrip('/') or 'Home'}\n"
+             f"URL: {page_url}\n"
+             f"Scraped at: {now.strftime('%Y-%m-%d %H:%M:%S')}\n"
+             f"Content: {content_preview}\n"
+             f"Content type: text\n"
+             f"Page size: {len(content_str)} bytes\n"
+             f"Status code: 200"
+         )
+         page_extraction_rules = extraction_rules
+         webpage = WebPage(
+             id=uuid5(NAMESPACE_OID, name=page_url),
+             name=page_url,
+             content=content_str,
+             content_hash=content_hash,
+             scraped_at=now,
+             last_modified=None,
+             status_code=200,
+             content_type="text/html",
+             page_size=len(content_str),
+             extraction_rules=page_extraction_rules or {},
+             description=webpage_description,
+         )
+         webpages.append(webpage)
+
+     scraping_job.status = "completed" if webpages else "failed"
+     # Update ScrapingJob description with final status
+     scraping_job.description = (
+         f"Scraping job: {job_name}\n"
+         f"URLs: {', '.join(url)}\n"
+         f"Status: {scraping_job.status}\n"
+         f"Last run: {now.strftime('%Y-%m-%d %H:%M:%S')}\n"
+         f"Next run: {next_run.strftime('%Y-%m-%d %H:%M:%S') if next_run else 'Not scheduled'}"
+     )
+
+     websites = list(websites_dict.values())
+     # Adding Nodes and Edges
+     node_mapping = {scraping_job.id: scraping_job}
+     edge_mapping = []
+
+     for website in websites:
+         node_mapping[website.id] = website
+         edge_mapping.append(
+             (
+                 scraping_job.id,
+                 website.id,
+                 "is_scraping",
+                 {
+                     "source_node_id": scraping_job.id,
+                     "target_node_id": website.id,
+                     "relationship_name": "is_scraping",
+                 },
+             )
+         )
+     for webpage in webpages:
+         node_mapping[webpage.id] = webpage
+         parsed_url = urlparse(webpage.name)
+         base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
+         edge_mapping.append(
+             (
+                 webpage.id,
+                 websites_dict[base_url].id,
+                 "is_part_of",
+                 {
+                     "source_node_id": webpage.id,
+                     "target_node_id": websites_dict[base_url].id,
+                     "relationship_name": "is_part_of",
+                 },
+             )
+         )
+
+     await graph_db.add_nodes(list(node_mapping.values()))
+     await graph_db.add_edges(edge_mapping)
+     await index_data_points(list(node_mapping.values()))
+     await index_graph_edges()
+
+     return await graph_db.get_graph_data()
+
+
+ def check_arguments(tavily_api_key, extraction_rules, tavily_config, soup_crawler_config):
+     """Validate and configure arguments for web_scraper_task.
+
+     Args:
+         tavily_api_key: API key for Tavily.
+         extraction_rules: Extraction rules for BeautifulSoup.
+         tavily_config: Configuration for Tavily API.
+         soup_crawler_config: Configuration for BeautifulSoup crawler.
+
+     Returns:
+         Tuple[DefaultCrawlerConfig, TavilyConfig, str]: Configured soup_crawler_config,
+         tavily_config, and preferred_tool ("tavily" or "beautifulsoup").
+
+     Raises:
+         TypeError: If neither tavily_config nor soup_crawler_config is provided.
+     """
+     preferred_tool = "beautifulsoup"
+
+     if extraction_rules and not soup_crawler_config:
+         soup_crawler_config = DefaultCrawlerConfig(extraction_rules=extraction_rules)
+
+     if tavily_api_key:
+         if not tavily_config:
+             tavily_config = TavilyConfig(api_key=tavily_api_key)
+         else:
+             tavily_config.api_key = tavily_api_key
+         if not extraction_rules and not soup_crawler_config:
+             preferred_tool = "tavily"
+
+     if not tavily_config and not soup_crawler_config:
+         raise TypeError("Make sure you pass arguments for web_scraper_task")
+
+     return soup_crawler_config, tavily_config, preferred_tool
+
+
+ def get_path_after_base(base_url: str, url: str) -> str:
+     """Extract the path after the base URL.
+
+     Args:
+         base_url: The base URL (e.g., "https://example.com").
+         url: The full URL to extract the path from.
+
+     Returns:
+         str: The path after the base URL, with leading slashes removed.
+
+     Raises:
+         ValueError: If the base URL and target URL are from different domains.
+     """
+     parsed_base = urlparse(base_url)
+     parsed_url = urlparse(url)
+
+     # Ensure they have the same netloc (domain)
+     if parsed_base.netloc != parsed_url.netloc:
+         raise ValueError("Base URL and target URL are from different domains")
+
+     # Return everything after base_url path
+     base_path = parsed_base.path.rstrip("/")
+     full_path = parsed_url.path
+
+     if full_path.startswith(base_path):
+         return full_path[len(base_path) :].lstrip("/")
+     else:
+         return full_path.lstrip("/")
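A usage sketch under assumed inputs (the URL, cron expression, extraction rules, and job names below are hypothetical; the keyword arguments come from the signatures above): cron_web_scraper_task either runs the scrape immediately or registers it on the shared AsyncIOScheduler.

import asyncio

from cognee.tasks.web_scraper.web_scraper_task import cron_web_scraper_task


async def main():
    # Run once, immediately: schedule=None skips APScheduler and awaits web_scraper_task,
    # which returns the graph data after nodes and edges are stored and indexed.
    graph_data = await cron_web_scraper_task(
        "https://example.com",
        extraction_rules={"title": "h1"},  # hypothetical CSS-selector rules
        job_name="example_scrape",
    )

    # Or register a recurring job: a valid cron expression schedules web_scraper_task
    # on the shared AsyncIOScheduler returned by get_scheduler() and returns None.
    await cron_web_scraper_task(
        "https://example.com",
        schedule="0 3 * * *",
        extraction_rules={"title": "h1"},
        job_name="nightly_scrape",
    )


asyncio.run(main())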
cognee/tests/cli_tests/cli_unit_tests/test_cli_utils.py
@@ -53,7 +53,6 @@ class TestCliConfig:
         expected_types = [
             "GRAPH_COMPLETION",
             "RAG_COMPLETION",
-            "INSIGHTS",
             "CHUNKS",
             "SUMMARIES",
             "CODE",
cognee/tests/integration/web_url_crawler/test_default_url_crawler.py
@@ -0,0 +1,13 @@
+ import pytest
+ from cognee.tasks.web_scraper import DefaultUrlCrawler
+
+
+ @pytest.mark.asyncio
+ async def test_fetch():
+     crawler = DefaultUrlCrawler()
+     url = "https://en.wikipedia.org/wiki/Large_language_model"
+     results = await crawler.fetch_urls(url)
+     assert len(results) == 1
+     assert isinstance(results, dict)
+     html = results[url]
+     assert isinstance(html, str)
cognee/tests/integration/web_url_crawler/test_tavily_crawler.py
@@ -0,0 +1,19 @@
+ import os
+ import pytest
+ from cognee.tasks.web_scraper.utils import fetch_with_tavily
+
+ skip_in_ci = pytest.mark.skipif(
+     os.getenv("GITHUB_ACTIONS") == "true",
+     reason="Skipping in Github for now - before we get TAVILY_API_KEY",
+ )
+
+
+ @skip_in_ci
+ @pytest.mark.asyncio
+ async def test_fetch():
+     url = "https://en.wikipedia.org/wiki/Large_language_model"
+     results = await fetch_with_tavily(url)
+     assert isinstance(results, dict)
+     assert len(results) == 1
+     html = results[url]
+     assert isinstance(html, str)