cognee 0.3.6__py3-none-any.whl → 0.3.7__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in the supported public registry. It is provided for informational purposes only.
- cognee/__init__.py +1 -0
- cognee/api/health.py +2 -12
- cognee/api/v1/add/add.py +46 -6
- cognee/api/v1/add/routers/get_add_router.py +5 -1
- cognee/api/v1/cognify/cognify.py +29 -9
- cognee/api/v1/datasets/datasets.py +11 -0
- cognee/api/v1/responses/default_tools.py +0 -1
- cognee/api/v1/responses/dispatch_function.py +1 -1
- cognee/api/v1/responses/routers/default_tools.py +0 -1
- cognee/api/v1/search/search.py +11 -9
- cognee/api/v1/settings/routers/get_settings_router.py +7 -1
- cognee/api/v1/ui/ui.py +47 -16
- cognee/api/v1/update/routers/get_update_router.py +1 -1
- cognee/api/v1/update/update.py +3 -3
- cognee/cli/_cognee.py +61 -10
- cognee/cli/commands/add_command.py +3 -3
- cognee/cli/commands/cognify_command.py +3 -3
- cognee/cli/commands/config_command.py +9 -7
- cognee/cli/commands/delete_command.py +3 -3
- cognee/cli/commands/search_command.py +3 -7
- cognee/cli/config.py +0 -1
- cognee/context_global_variables.py +5 -0
- cognee/exceptions/exceptions.py +1 -1
- cognee/infrastructure/databases/cache/__init__.py +2 -0
- cognee/infrastructure/databases/cache/cache_db_interface.py +79 -0
- cognee/infrastructure/databases/cache/config.py +44 -0
- cognee/infrastructure/databases/cache/get_cache_engine.py +67 -0
- cognee/infrastructure/databases/cache/redis/RedisAdapter.py +243 -0
- cognee/infrastructure/databases/exceptions/__init__.py +1 -0
- cognee/infrastructure/databases/exceptions/exceptions.py +18 -2
- cognee/infrastructure/databases/graph/get_graph_engine.py +1 -1
- cognee/infrastructure/databases/graph/graph_db_interface.py +5 -0
- cognee/infrastructure/databases/graph/kuzu/adapter.py +67 -44
- cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +13 -3
- cognee/infrastructure/databases/graph/neo4j_driver/deadlock_retry.py +1 -1
- cognee/infrastructure/databases/graph/neptune_driver/neptune_utils.py +1 -1
- cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +1 -1
- cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +21 -3
- cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +17 -10
- cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +17 -4
- cognee/infrastructure/databases/vector/embeddings/config.py +2 -3
- cognee/infrastructure/databases/vector/exceptions/exceptions.py +1 -1
- cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +0 -1
- cognee/infrastructure/files/exceptions.py +1 -1
- cognee/infrastructure/files/storage/LocalFileStorage.py +9 -9
- cognee/infrastructure/files/storage/S3FileStorage.py +11 -11
- cognee/infrastructure/files/utils/guess_file_type.py +6 -0
- cognee/infrastructure/llm/prompts/search_type_selector_prompt.txt +0 -5
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py +19 -9
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py +17 -5
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py +17 -5
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py +32 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/__init__.py +0 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/adapter.py +109 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py +33 -8
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +40 -18
- cognee/infrastructure/loaders/LoaderEngine.py +27 -7
- cognee/infrastructure/loaders/external/__init__.py +7 -0
- cognee/infrastructure/loaders/external/advanced_pdf_loader.py +2 -8
- cognee/infrastructure/loaders/external/beautiful_soup_loader.py +310 -0
- cognee/infrastructure/loaders/supported_loaders.py +7 -0
- cognee/modules/data/exceptions/exceptions.py +1 -1
- cognee/modules/data/methods/__init__.py +3 -0
- cognee/modules/data/methods/get_dataset_data.py +4 -1
- cognee/modules/data/methods/has_dataset_data.py +21 -0
- cognee/modules/engine/models/TableRow.py +0 -1
- cognee/modules/ingestion/save_data_to_file.py +9 -2
- cognee/modules/pipelines/exceptions/exceptions.py +1 -1
- cognee/modules/pipelines/operations/pipeline.py +12 -1
- cognee/modules/pipelines/operations/run_tasks.py +25 -197
- cognee/modules/pipelines/operations/run_tasks_data_item.py +260 -0
- cognee/modules/pipelines/operations/run_tasks_distributed.py +121 -38
- cognee/modules/retrieval/EntityCompletionRetriever.py +48 -8
- cognee/modules/retrieval/base_graph_retriever.py +3 -1
- cognee/modules/retrieval/base_retriever.py +3 -1
- cognee/modules/retrieval/chunks_retriever.py +5 -1
- cognee/modules/retrieval/code_retriever.py +20 -2
- cognee/modules/retrieval/completion_retriever.py +50 -9
- cognee/modules/retrieval/cypher_search_retriever.py +11 -1
- cognee/modules/retrieval/graph_completion_context_extension_retriever.py +47 -8
- cognee/modules/retrieval/graph_completion_cot_retriever.py +32 -1
- cognee/modules/retrieval/graph_completion_retriever.py +54 -10
- cognee/modules/retrieval/lexical_retriever.py +20 -2
- cognee/modules/retrieval/natural_language_retriever.py +10 -1
- cognee/modules/retrieval/summaries_retriever.py +5 -1
- cognee/modules/retrieval/temporal_retriever.py +62 -10
- cognee/modules/retrieval/user_qa_feedback.py +3 -2
- cognee/modules/retrieval/utils/completion.py +5 -0
- cognee/modules/retrieval/utils/description_to_codepart_search.py +1 -1
- cognee/modules/retrieval/utils/session_cache.py +156 -0
- cognee/modules/search/methods/get_search_type_tools.py +0 -5
- cognee/modules/search/methods/no_access_control_search.py +12 -1
- cognee/modules/search/methods/search.py +34 -2
- cognee/modules/search/types/SearchType.py +0 -1
- cognee/modules/settings/get_settings.py +23 -0
- cognee/modules/users/methods/get_authenticated_user.py +3 -1
- cognee/modules/users/methods/get_default_user.py +1 -6
- cognee/modules/users/roles/methods/create_role.py +2 -2
- cognee/modules/users/tenants/methods/create_tenant.py +2 -2
- cognee/shared/exceptions/exceptions.py +1 -1
- cognee/tasks/codingagents/coding_rule_associations.py +1 -2
- cognee/tasks/documents/exceptions/exceptions.py +1 -1
- cognee/tasks/graph/extract_graph_from_data.py +2 -0
- cognee/tasks/ingestion/data_item_to_text_file.py +3 -3
- cognee/tasks/ingestion/ingest_data.py +11 -5
- cognee/tasks/ingestion/save_data_item_to_storage.py +12 -1
- cognee/tasks/storage/add_data_points.py +3 -10
- cognee/tasks/storage/index_data_points.py +19 -14
- cognee/tasks/storage/index_graph_edges.py +25 -11
- cognee/tasks/web_scraper/__init__.py +34 -0
- cognee/tasks/web_scraper/config.py +26 -0
- cognee/tasks/web_scraper/default_url_crawler.py +446 -0
- cognee/tasks/web_scraper/models.py +46 -0
- cognee/tasks/web_scraper/types.py +4 -0
- cognee/tasks/web_scraper/utils.py +142 -0
- cognee/tasks/web_scraper/web_scraper_task.py +396 -0
- cognee/tests/cli_tests/cli_unit_tests/test_cli_utils.py +0 -1
- cognee/tests/integration/web_url_crawler/test_default_url_crawler.py +13 -0
- cognee/tests/integration/web_url_crawler/test_tavily_crawler.py +19 -0
- cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py +344 -0
- cognee/tests/subprocesses/reader.py +25 -0
- cognee/tests/subprocesses/simple_cognify_1.py +31 -0
- cognee/tests/subprocesses/simple_cognify_2.py +31 -0
- cognee/tests/subprocesses/writer.py +32 -0
- cognee/tests/tasks/descriptive_metrics/metrics_test_utils.py +0 -2
- cognee/tests/tasks/descriptive_metrics/neo4j_metrics_test.py +8 -3
- cognee/tests/tasks/entity_extraction/entity_extraction_test.py +89 -0
- cognee/tests/tasks/web_scraping/web_scraping_test.py +172 -0
- cognee/tests/test_add_docling_document.py +56 -0
- cognee/tests/test_chromadb.py +7 -11
- cognee/tests/test_concurrent_subprocess_access.py +76 -0
- cognee/tests/test_conversation_history.py +240 -0
- cognee/tests/test_kuzu.py +27 -15
- cognee/tests/test_lancedb.py +7 -11
- cognee/tests/test_library.py +32 -2
- cognee/tests/test_neo4j.py +24 -16
- cognee/tests/test_neptune_analytics_vector.py +7 -11
- cognee/tests/test_permissions.py +9 -13
- cognee/tests/test_pgvector.py +4 -4
- cognee/tests/test_remote_kuzu.py +8 -11
- cognee/tests/test_s3_file_storage.py +1 -1
- cognee/tests/test_search_db.py +6 -8
- cognee/tests/unit/infrastructure/databases/cache/test_cache_config.py +89 -0
- cognee/tests/unit/modules/retrieval/conversation_history_test.py +154 -0
- {cognee-0.3.6.dist-info → cognee-0.3.7.dist-info}/METADATA +21 -6
- {cognee-0.3.6.dist-info → cognee-0.3.7.dist-info}/RECORD +155 -126
- {cognee-0.3.6.dist-info → cognee-0.3.7.dist-info}/entry_points.txt +1 -0
- distributed/Dockerfile +0 -3
- distributed/entrypoint.py +21 -9
- distributed/signal.py +5 -0
- distributed/workers/data_point_saving_worker.py +64 -34
- distributed/workers/graph_saving_worker.py +71 -47
- cognee/infrastructure/databases/graph/memgraph/memgraph_adapter.py +0 -1116
- cognee/modules/retrieval/insights_retriever.py +0 -133
- cognee/tests/test_memgraph.py +0 -109
- cognee/tests/unit/modules/retrieval/insights_retriever_test.py +0 -251
- {cognee-0.3.6.dist-info → cognee-0.3.7.dist-info}/WHEEL +0 -0
- {cognee-0.3.6.dist-info → cognee-0.3.7.dist-info}/licenses/LICENSE +0 -0
- {cognee-0.3.6.dist-info → cognee-0.3.7.dist-info}/licenses/NOTICE.md +0 -0
cognee/tasks/web_scraper/web_scraper_task.py
@@ -0,0 +1,396 @@
+"""Web scraping tasks for storing scraped data in a graph database.
+
+This module provides functions to scrape web content, create or update WebPage, WebSite,
+and ScrapingJob data points, and store them in a Kuzu graph database. It supports
+scheduled scraping tasks and ensures that node updates preserve existing graph edges.
+"""
+
+import os
+import hashlib
+from datetime import datetime
+from typing import Union, List
+from urllib.parse import urlparse
+from uuid import uuid5, NAMESPACE_OID
+
+from cognee.infrastructure.databases.graph import get_graph_engine
+from cognee.shared.logging_utils import get_logger
+from cognee.tasks.storage.index_data_points import index_data_points
+from cognee.tasks.storage.index_graph_edges import index_graph_edges
+from cognee.modules.engine.operations.setup import setup
+
+from .models import WebPage, WebSite, ScrapingJob
+from .config import DefaultCrawlerConfig, TavilyConfig
+from .utils import fetch_page_content
+
+try:
+    from apscheduler.triggers.cron import CronTrigger
+    from apscheduler.schedulers.asyncio import AsyncIOScheduler
+except ImportError:
+    raise ImportError("Please install apscheduler by pip install APScheduler>=3.10")
+
+logger = get_logger(__name__)
+
+
+_scheduler = None
+
+
+def get_scheduler():
+    global _scheduler
+    if _scheduler is None:
+        _scheduler = AsyncIOScheduler()
+    return _scheduler
+
+
+async def cron_web_scraper_task(
+    url: Union[str, List[str]],
+    *,
+    schedule: str = None,
+    extraction_rules: dict = None,
+    tavily_api_key: str = os.getenv("TAVILY_API_KEY"),
+    soup_crawler_config: DefaultCrawlerConfig = None,
+    tavily_config: TavilyConfig = None,
+    job_name: str = "scraping",
+):
+    """Schedule or run a web scraping task.
+
+    This function schedules a recurring web scraping task using APScheduler or runs it
+    immediately if no schedule is provided. It delegates to web_scraper_task for actual
+    scraping and graph storage.
+
+    Args:
+        url: A single URL or list of URLs to scrape.
+        schedule: A cron expression for scheduling (e.g., "0 0 * * *"). If None, runs immediately.
+        extraction_rules: Dictionary of extraction rules for BeautifulSoup (e.g., CSS selectors).
+        tavily_api_key: API key for Tavily. Defaults to TAVILY_API_KEY environment variable.
+        soup_crawler_config: Configuration for BeautifulSoup crawler.
+        tavily_config: Configuration for Tavily API.
+        job_name: Name of the scraping job. Defaults to "scraping".
+
+    Returns:
+        Any: The result of web_scraper_task if run immediately, or None if scheduled.
+
+    Raises:
+        ValueError: If the schedule is an invalid cron expression.
+        ImportError: If APScheduler is not installed.
+    """
+    now = datetime.now()
+    job_name = job_name or f"scrape_{now.strftime('%Y%m%d_%H%M%S')}"
+    if schedule:
+        try:
+            trigger = CronTrigger.from_crontab(schedule)
+        except ValueError as e:
+            raise ValueError(f"Invalid cron string '{schedule}': {e}")
+
+        scheduler = get_scheduler()
+        scheduler.add_job(
+            web_scraper_task,
+            kwargs={
+                "url": url,
+                "schedule": schedule,
+                "extraction_rules": extraction_rules,
+                "tavily_api_key": tavily_api_key,
+                "soup_crawler_config": soup_crawler_config,
+                "tavily_config": tavily_config,
+                "job_name": job_name,
+            },
+            trigger=trigger,
+            id=job_name,
+            name=job_name or f"WebScraper_{uuid5(NAMESPACE_OID, name=job_name)}",
+            replace_existing=True,
+        )
+        if not scheduler.running:
+            scheduler.start()
+        return
+
+    # If no schedule, run immediately
+    logger.info(f"[{datetime.now()}] Running web scraper task immediately...")
+    return await web_scraper_task(
+        url=url,
+        schedule=schedule,
+        extraction_rules=extraction_rules,
+        tavily_api_key=tavily_api_key,
+        soup_crawler_config=soup_crawler_config,
+        tavily_config=tavily_config,
+        job_name=job_name,
+    )
+
+
+async def web_scraper_task(
+    url: Union[str, List[str]],
+    *,
+    schedule: str = None,
+    extraction_rules: dict = None,
+    tavily_api_key: str = os.getenv("TAVILY_API_KEY"),
+    soup_crawler_config: DefaultCrawlerConfig = None,
+    tavily_config: TavilyConfig = None,
+    job_name: str = None,
+):
+    """Scrape URLs and store data points in a Graph database.
+
+    This function scrapes content from the provided URLs, creates or updates WebPage,
+    WebSite, and ScrapingJob data points, and stores them in a Graph database.
+    Each data point includes a description field summarizing its attributes. It creates
+    'is_scraping' (ScrapingJob to WebSite) and 'is_part_of' (WebPage to WebSite)
+    relationships, preserving existing edges during node updates.
+
+    Args:
+        url: A single URL or list of URLs to scrape.
+        schedule: A cron expression for scheduling (e.g., "0 0 * * *"). If None, runs once.
+        extraction_rules: Dictionary of extraction rules for BeautifulSoup (e.g., CSS selectors).
+        tavily_api_key: API key for Tavily. Defaults to TAVILY_API_KEY environment variable.
+        soup_crawler_config: Configuration for BeautifulSoup crawler.
+        tavily_config: Configuration for Tavily API.
+        job_name: Name of the scraping job. Defaults to a timestamp-based name.
+
+    Returns:
+        Any: The graph data returned by the graph database.
+
+    Raises:
+        TypeError: If neither tavily_config nor soup_crawler_config is provided.
+        Exception: If fetching content or database operations fail.
+    """
+    await setup()
+    graph_db = await get_graph_engine()
+
+    if isinstance(url, str):
+        url = [url]
+
+    soup_crawler_config, tavily_config, preferred_tool = check_arguments(
+        tavily_api_key, extraction_rules, tavily_config, soup_crawler_config
+    )
+    now = datetime.now()
+    job_name = job_name or f"scrape_{now.strftime('%Y%m%d_%H%M%S')}"
+    status = "active"
+    trigger = CronTrigger.from_crontab(schedule) if schedule else None
+    next_run = trigger.get_next_fire_time(None, now) if trigger else None
+    scraping_job_created = await graph_db.get_node(uuid5(NAMESPACE_OID, name=job_name))
+
+    # Create description for ScrapingJob
+    scraping_job_description = (
+        f"Scraping job: {job_name}\n"
+        f"URLs: {', '.join(url)}\n"
+        f"Status: {status}\n"
+        f"Schedule: {schedule}\n"
+        f"Last run: {now.strftime('%Y-%m-%d %H:%M:%S')}\n"
+        f"Next run: {next_run.strftime('%Y-%m-%d %H:%M:%S') if next_run else 'Not scheduled'}"
+    )
+
+    scraping_job = ScrapingJob(
+        id=uuid5(NAMESPACE_OID, name=job_name),
+        name=job_name,
+        urls=url,
+        status=status,
+        schedule=schedule,
+        last_run=now,
+        next_run=next_run,
+        description=scraping_job_description,
+    )
+
+    if scraping_job_created:
+        await graph_db.add_node(scraping_job)  # Update existing scraping job
+    websites_dict = {}
+    webpages = []
+
+    # Fetch content
+    results = await fetch_page_content(
+        urls=url,
+        preferred_tool=preferred_tool,
+        tavily_config=tavily_config,
+        soup_crawler_config=soup_crawler_config,
+    )
+    for page_url, content in results.items():
+        parsed_url = urlparse(page_url)
+        domain = parsed_url.netloc
+        base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
+
+        # Create or update WebSite
+        if base_url not in websites_dict:
+            # Create description for WebSite
+            website_description = (
+                f"Website: {domain}\n"
+                f"Base URL: {base_url}\n"
+                f"Last crawled: {now.strftime('%Y-%m-%d %H:%M:%S')}\n"
+                f"Page count: 1\n"
+                f"Scraping tool: {preferred_tool}\n"
+                f"Robots.txt: {'Available' if websites_dict.get(base_url, {}).get('robots_txt') else 'Not set'}\n"
+                f"Crawl delay: 0.5 seconds"
+            )
+
+            websites_dict[base_url] = WebSite(
+                id=uuid5(NAMESPACE_OID, name=domain),
+                name=domain,
+                base_url=base_url,
+                robots_txt=None,
+                crawl_delay=0.5,
+                last_crawled=now,
+                page_count=1,
+                scraping_config={
+                    "extraction_rules": extraction_rules or {},
+                    "tool": preferred_tool,
+                },
+                description=website_description,
+            )
+            if scraping_job_created:
+                await graph_db.add_node(websites_dict[base_url])
+        else:
+            websites_dict[base_url].page_count += 1
+            # Update description for existing WebSite
+            websites_dict[base_url].description = (
+                f"Website: {domain}\n"
+                f"Base URL: {base_url}\n"
+                f"Last crawled: {now.strftime('%Y-%m-%d %H:%M:%S')}\n"
+                f"Page count: {websites_dict[base_url].page_count}\n"
+                f"Scraping tool: {preferred_tool}\n"
+                f"Robots.txt: {'Available' if websites_dict[base_url].robots_txt else 'Not set'}\n"
+                f"Crawl delay: {websites_dict[base_url].crawl_delay} seconds"
+            )
+            if scraping_job_created:
+                await graph_db.add_node(websites_dict[base_url])
+
+        # Create WebPage
+        content_str = content if isinstance(content, str) else str(content)
+        content_hash = hashlib.sha256(content_str.encode("utf-8")).hexdigest()
+        content_preview = content_str[:500] + ("..." if len(content_str) > 500 else "")
+        # Create description for WebPage
+        webpage_description = (
+            f"Webpage: {parsed_url.path.lstrip('/') or 'Home'}\n"
+            f"URL: {page_url}\n"
+            f"Scraped at: {now.strftime('%Y-%m-%d %H:%M:%S')}\n"
+            f"Content: {content_preview}\n"
+            f"Content type: text\n"
+            f"Page size: {len(content_str)} bytes\n"
+            f"Status code: 200"
+        )
+        page_extraction_rules = extraction_rules
+        webpage = WebPage(
+            id=uuid5(NAMESPACE_OID, name=page_url),
+            name=page_url,
+            content=content_str,
+            content_hash=content_hash,
+            scraped_at=now,
+            last_modified=None,
+            status_code=200,
+            content_type="text/html",
+            page_size=len(content_str),
+            extraction_rules=page_extraction_rules or {},
+            description=webpage_description,
+        )
+        webpages.append(webpage)
+
+    scraping_job.status = "completed" if webpages else "failed"
+    # Update ScrapingJob description with final status
+    scraping_job.description = (
+        f"Scraping job: {job_name}\n"
+        f"URLs: {', '.join(url)}\n"
+        f"Status: {scraping_job.status}\n"
+        f"Last run: {now.strftime('%Y-%m-%d %H:%M:%S')}\n"
+        f"Next run: {next_run.strftime('%Y-%m-%d %H:%M:%S') if next_run else 'Not scheduled'}"
+    )
+
+    websites = list(websites_dict.values())
+    # Adding Nodes and Edges
+    node_mapping = {scraping_job.id: scraping_job}
+    edge_mapping = []
+
+    for website in websites:
+        node_mapping[website.id] = website
+        edge_mapping.append(
+            (
+                scraping_job.id,
+                website.id,
+                "is_scraping",
+                {
+                    "source_node_id": scraping_job.id,
+                    "target_node_id": website.id,
+                    "relationship_name": "is_scraping",
+                },
+            )
+        )
+    for webpage in webpages:
+        node_mapping[webpage.id] = webpage
+        parsed_url = urlparse(webpage.name)
+        base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
+        edge_mapping.append(
+            (
+                webpage.id,
+                websites_dict[base_url].id,
+                "is_part_of",
+                {
+                    "source_node_id": webpage.id,
+                    "target_node_id": websites_dict[base_url].id,
+                    "relationship_name": "is_part_of",
+                },
+            )
+        )
+
+    await graph_db.add_nodes(list(node_mapping.values()))
+    await graph_db.add_edges(edge_mapping)
+    await index_data_points(list(node_mapping.values()))
+    await index_graph_edges()
+
+    return await graph_db.get_graph_data()
+
+
+def check_arguments(tavily_api_key, extraction_rules, tavily_config, soup_crawler_config):
+    """Validate and configure arguments for web_scraper_task.
+
+    Args:
+        tavily_api_key: API key for Tavily.
+        extraction_rules: Extraction rules for BeautifulSoup.
+        tavily_config: Configuration for Tavily API.
+        soup_crawler_config: Configuration for BeautifulSoup crawler.
+
+    Returns:
+        Tuple[DefaultCrawlerConfig, TavilyConfig, str]: Configured soup_crawler_config,
+        tavily_config, and preferred_tool ("tavily" or "beautifulsoup").
+
+    Raises:
+        TypeError: If neither tavily_config nor soup_crawler_config is provided.
+    """
+    preferred_tool = "beautifulsoup"
+
+    if extraction_rules and not soup_crawler_config:
+        soup_crawler_config = DefaultCrawlerConfig(extraction_rules=extraction_rules)
+
+    if tavily_api_key:
+        if not tavily_config:
+            tavily_config = TavilyConfig(api_key=tavily_api_key)
+        else:
+            tavily_config.api_key = tavily_api_key
+        if not extraction_rules and not soup_crawler_config:
+            preferred_tool = "tavily"
+
+    if not tavily_config and not soup_crawler_config:
+        raise TypeError("Make sure you pass arguments for web_scraper_task")
+
+    return soup_crawler_config, tavily_config, preferred_tool


+def get_path_after_base(base_url: str, url: str) -> str:
+    """Extract the path after the base URL.
+
+    Args:
+        base_url: The base URL (e.g., "https://example.com").
+        url: The full URL to extract the path from.
+
+    Returns:
+        str: The path after the base URL, with leading slashes removed.
+
+    Raises:
+        ValueError: If the base URL and target URL are from different domains.
+    """
+    parsed_base = urlparse(base_url)
+    parsed_url = urlparse(url)
+
+    # Ensure they have the same netloc (domain)
+    if parsed_base.netloc != parsed_url.netloc:
+        raise ValueError("Base URL and target URL are from different domains")
+
+    # Return everything after base_url path
+    base_path = parsed_base.path.rstrip("/")
+    full_path = parsed_url.path
+
+    if full_path.startswith(base_path):
+        return full_path[len(base_path) :].lstrip("/")
+    else:
+        return full_path.lstrip("/")
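For orientation, here is a minimal sketch of invoking the new task directly. It assumes a cognee environment whose graph and vector databases are already configured; the URL, extraction rules, and job name are illustrative placeholders, not values from the package.

import asyncio

from cognee.tasks.web_scraper.web_scraper_task import cron_web_scraper_task


async def main():
    # Run once, immediately (no cron schedule). The extraction_rules dict is a
    # hypothetical example of CSS-selector rules handled via DefaultCrawlerConfig;
    # adjust it to the pages being scraped.
    await cron_web_scraper_task(
        "https://example.com/blog",
        extraction_rules={"title": "h1", "body": "article"},
        job_name="example_blog_scrape",
    )
    # Passing schedule="0 0 * * *" instead registers a nightly APScheduler job on
    # the running event loop rather than scraping right away.


asyncio.run(main())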
cognee/tests/integration/web_url_crawler/test_default_url_crawler.py
@@ -0,0 +1,13 @@
+import pytest
+from cognee.tasks.web_scraper import DefaultUrlCrawler
+
+
+@pytest.mark.asyncio
+async def test_fetch():
+    crawler = DefaultUrlCrawler()
+    url = "https://en.wikipedia.org/wiki/Large_language_model"
+    results = await crawler.fetch_urls(url)
+    assert len(results) == 1
+    assert isinstance(results, dict)
+    html = results[url]
+    assert isinstance(html, str)
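Outside pytest, the same crawler can be driven directly. This is a small sketch mirroring the test above; it assumes only the import and the fetch_urls signature shown in this diff, plus network access.

import asyncio

from cognee.tasks.web_scraper import DefaultUrlCrawler


async def main():
    crawler = DefaultUrlCrawler()
    # fetch_urls accepts a URL (the test passes a single string) and returns a
    # dict mapping each requested URL to the fetched content as a string.
    results = await crawler.fetch_urls("https://en.wikipedia.org/wiki/Large_language_model")
    for url, html in results.items():
        print(url, len(html))


asyncio.run(main())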
cognee/tests/integration/web_url_crawler/test_tavily_crawler.py
@@ -0,0 +1,19 @@
+import os
+import pytest
+from cognee.tasks.web_scraper.utils import fetch_with_tavily
+
+skip_in_ci = pytest.mark.skipif(
+    os.getenv("GITHUB_ACTIONS") == "true",
+    reason="Skipping in Github for now - before we get TAVILY_API_KEY",
+)
+
+
+@skip_in_ci
+@pytest.mark.asyncio
+async def test_fetch():
+    url = "https://en.wikipedia.org/wiki/Large_language_model"
+    results = await fetch_with_tavily(url)
+    assert isinstance(results, dict)
+    assert len(results) == 1
+    html = results[url]
+    assert isinstance(html, str)