cognee 0.3.5__py3-none-any.whl → 0.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cognee/__init__.py +1 -0
- cognee/api/health.py +2 -12
- cognee/api/v1/add/add.py +46 -6
- cognee/api/v1/add/routers/get_add_router.py +5 -1
- cognee/api/v1/cognify/cognify.py +29 -9
- cognee/api/v1/datasets/datasets.py +11 -0
- cognee/api/v1/responses/default_tools.py +0 -1
- cognee/api/v1/responses/dispatch_function.py +1 -1
- cognee/api/v1/responses/routers/default_tools.py +0 -1
- cognee/api/v1/search/search.py +11 -9
- cognee/api/v1/settings/routers/get_settings_router.py +7 -1
- cognee/api/v1/ui/ui.py +47 -16
- cognee/api/v1/update/routers/get_update_router.py +1 -1
- cognee/api/v1/update/update.py +3 -3
- cognee/cli/_cognee.py +61 -10
- cognee/cli/commands/add_command.py +3 -3
- cognee/cli/commands/cognify_command.py +3 -3
- cognee/cli/commands/config_command.py +9 -7
- cognee/cli/commands/delete_command.py +3 -3
- cognee/cli/commands/search_command.py +3 -7
- cognee/cli/config.py +0 -1
- cognee/context_global_variables.py +5 -0
- cognee/exceptions/exceptions.py +1 -1
- cognee/infrastructure/databases/cache/__init__.py +2 -0
- cognee/infrastructure/databases/cache/cache_db_interface.py +79 -0
- cognee/infrastructure/databases/cache/config.py +44 -0
- cognee/infrastructure/databases/cache/get_cache_engine.py +67 -0
- cognee/infrastructure/databases/cache/redis/RedisAdapter.py +243 -0
- cognee/infrastructure/databases/exceptions/__init__.py +1 -0
- cognee/infrastructure/databases/exceptions/exceptions.py +18 -2
- cognee/infrastructure/databases/graph/get_graph_engine.py +1 -1
- cognee/infrastructure/databases/graph/graph_db_interface.py +5 -0
- cognee/infrastructure/databases/graph/kuzu/adapter.py +67 -44
- cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +13 -3
- cognee/infrastructure/databases/graph/neo4j_driver/deadlock_retry.py +1 -1
- cognee/infrastructure/databases/graph/neptune_driver/neptune_utils.py +1 -1
- cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +1 -1
- cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +21 -3
- cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +17 -10
- cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +17 -4
- cognee/infrastructure/databases/vector/embeddings/config.py +2 -3
- cognee/infrastructure/databases/vector/exceptions/exceptions.py +1 -1
- cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +0 -1
- cognee/infrastructure/files/exceptions.py +1 -1
- cognee/infrastructure/files/storage/LocalFileStorage.py +9 -9
- cognee/infrastructure/files/storage/S3FileStorage.py +11 -11
- cognee/infrastructure/files/utils/guess_file_type.py +6 -0
- cognee/infrastructure/llm/prompts/search_type_selector_prompt.txt +0 -5
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py +19 -9
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py +17 -5
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py +17 -5
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py +32 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/__init__.py +0 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/adapter.py +109 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py +33 -8
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +40 -18
- cognee/infrastructure/loaders/LoaderEngine.py +27 -7
- cognee/infrastructure/loaders/external/__init__.py +7 -0
- cognee/infrastructure/loaders/external/advanced_pdf_loader.py +2 -8
- cognee/infrastructure/loaders/external/beautiful_soup_loader.py +310 -0
- cognee/infrastructure/loaders/supported_loaders.py +7 -0
- cognee/modules/data/exceptions/exceptions.py +1 -1
- cognee/modules/data/methods/__init__.py +3 -0
- cognee/modules/data/methods/get_dataset_data.py +4 -1
- cognee/modules/data/methods/has_dataset_data.py +21 -0
- cognee/modules/engine/models/TableRow.py +0 -1
- cognee/modules/ingestion/save_data_to_file.py +9 -2
- cognee/modules/pipelines/exceptions/exceptions.py +1 -1
- cognee/modules/pipelines/operations/pipeline.py +12 -1
- cognee/modules/pipelines/operations/run_tasks.py +25 -197
- cognee/modules/pipelines/operations/run_tasks_data_item.py +260 -0
- cognee/modules/pipelines/operations/run_tasks_distributed.py +121 -38
- cognee/modules/retrieval/EntityCompletionRetriever.py +48 -8
- cognee/modules/retrieval/base_graph_retriever.py +3 -1
- cognee/modules/retrieval/base_retriever.py +3 -1
- cognee/modules/retrieval/chunks_retriever.py +5 -1
- cognee/modules/retrieval/code_retriever.py +20 -2
- cognee/modules/retrieval/completion_retriever.py +50 -9
- cognee/modules/retrieval/cypher_search_retriever.py +11 -1
- cognee/modules/retrieval/graph_completion_context_extension_retriever.py +47 -8
- cognee/modules/retrieval/graph_completion_cot_retriever.py +32 -1
- cognee/modules/retrieval/graph_completion_retriever.py +54 -10
- cognee/modules/retrieval/lexical_retriever.py +20 -2
- cognee/modules/retrieval/natural_language_retriever.py +10 -1
- cognee/modules/retrieval/summaries_retriever.py +5 -1
- cognee/modules/retrieval/temporal_retriever.py +62 -10
- cognee/modules/retrieval/user_qa_feedback.py +3 -2
- cognee/modules/retrieval/utils/completion.py +5 -0
- cognee/modules/retrieval/utils/description_to_codepart_search.py +1 -1
- cognee/modules/retrieval/utils/session_cache.py +156 -0
- cognee/modules/search/methods/get_search_type_tools.py +0 -5
- cognee/modules/search/methods/no_access_control_search.py +12 -1
- cognee/modules/search/methods/search.py +34 -2
- cognee/modules/search/types/SearchType.py +0 -1
- cognee/modules/settings/get_settings.py +23 -0
- cognee/modules/users/methods/get_authenticated_user.py +3 -1
- cognee/modules/users/methods/get_default_user.py +1 -6
- cognee/modules/users/roles/methods/create_role.py +2 -2
- cognee/modules/users/tenants/methods/create_tenant.py +2 -2
- cognee/shared/exceptions/exceptions.py +1 -1
- cognee/tasks/codingagents/coding_rule_associations.py +1 -2
- cognee/tasks/documents/exceptions/exceptions.py +1 -1
- cognee/tasks/graph/extract_graph_from_data.py +2 -0
- cognee/tasks/ingestion/data_item_to_text_file.py +3 -3
- cognee/tasks/ingestion/ingest_data.py +11 -5
- cognee/tasks/ingestion/save_data_item_to_storage.py +12 -1
- cognee/tasks/storage/add_data_points.py +3 -10
- cognee/tasks/storage/index_data_points.py +19 -14
- cognee/tasks/storage/index_graph_edges.py +25 -11
- cognee/tasks/web_scraper/__init__.py +34 -0
- cognee/tasks/web_scraper/config.py +26 -0
- cognee/tasks/web_scraper/default_url_crawler.py +446 -0
- cognee/tasks/web_scraper/models.py +46 -0
- cognee/tasks/web_scraper/types.py +4 -0
- cognee/tasks/web_scraper/utils.py +142 -0
- cognee/tasks/web_scraper/web_scraper_task.py +396 -0
- cognee/tests/cli_tests/cli_unit_tests/test_cli_utils.py +0 -1
- cognee/tests/integration/web_url_crawler/test_default_url_crawler.py +13 -0
- cognee/tests/integration/web_url_crawler/test_tavily_crawler.py +19 -0
- cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py +344 -0
- cognee/tests/subprocesses/reader.py +25 -0
- cognee/tests/subprocesses/simple_cognify_1.py +31 -0
- cognee/tests/subprocesses/simple_cognify_2.py +31 -0
- cognee/tests/subprocesses/writer.py +32 -0
- cognee/tests/tasks/descriptive_metrics/metrics_test_utils.py +0 -2
- cognee/tests/tasks/descriptive_metrics/neo4j_metrics_test.py +8 -3
- cognee/tests/tasks/entity_extraction/entity_extraction_test.py +89 -0
- cognee/tests/tasks/web_scraping/web_scraping_test.py +172 -0
- cognee/tests/test_add_docling_document.py +56 -0
- cognee/tests/test_chromadb.py +7 -11
- cognee/tests/test_concurrent_subprocess_access.py +76 -0
- cognee/tests/test_conversation_history.py +240 -0
- cognee/tests/test_kuzu.py +27 -15
- cognee/tests/test_lancedb.py +7 -11
- cognee/tests/test_library.py +32 -2
- cognee/tests/test_neo4j.py +24 -16
- cognee/tests/test_neptune_analytics_vector.py +7 -11
- cognee/tests/test_permissions.py +9 -13
- cognee/tests/test_pgvector.py +4 -4
- cognee/tests/test_remote_kuzu.py +8 -11
- cognee/tests/test_s3_file_storage.py +1 -1
- cognee/tests/test_search_db.py +6 -8
- cognee/tests/unit/infrastructure/databases/cache/test_cache_config.py +89 -0
- cognee/tests/unit/modules/retrieval/conversation_history_test.py +154 -0
- {cognee-0.3.5.dist-info → cognee-0.3.7.dist-info}/METADATA +22 -7
- {cognee-0.3.5.dist-info → cognee-0.3.7.dist-info}/RECORD +155 -128
- {cognee-0.3.5.dist-info → cognee-0.3.7.dist-info}/entry_points.txt +1 -0
- distributed/Dockerfile +0 -3
- distributed/entrypoint.py +21 -9
- distributed/signal.py +5 -0
- distributed/workers/data_point_saving_worker.py +64 -34
- distributed/workers/graph_saving_worker.py +71 -47
- cognee/infrastructure/databases/graph/memgraph/memgraph_adapter.py +0 -1116
- cognee/modules/retrieval/insights_retriever.py +0 -133
- cognee/tests/test_memgraph.py +0 -109
- cognee/tests/unit/modules/retrieval/insights_retriever_test.py +0 -251
- distributed/poetry.lock +0 -12238
- distributed/pyproject.toml +0 -185
- {cognee-0.3.5.dist-info → cognee-0.3.7.dist-info}/WHEEL +0 -0
- {cognee-0.3.5.dist-info → cognee-0.3.7.dist-info}/licenses/LICENSE +0 -0
- {cognee-0.3.5.dist-info → cognee-0.3.7.dist-info}/licenses/NOTICE.md +0 -0
cognee/tasks/web_scraper/default_url_crawler.py (new file)

@@ -0,0 +1,446 @@
import asyncio
from dataclasses import dataclass, field
from functools import lru_cache
import time
from typing import Any, Union, List, Dict, Optional
from urllib.parse import urlparse
import httpx

from cognee.shared.logging_utils import get_logger
from cognee.tasks.web_scraper.types import UrlsToHtmls

logger = get_logger()

try:
    from protego import Protego
except ImportError:
    logger.warning("Failed to import protego, make sure to install using pip install protego>=0.1")
    Protego = None

try:
    from playwright.async_api import async_playwright
except ImportError:
    logger.warning(
        "Failed to import playwright, make sure to install using pip install playwright>=1.9.0"
    )
    async_playwright = None


@dataclass
class RobotsTxtCache:
    """Cache for robots.txt data.

    Attributes:
        protego: Parsed robots.txt object (Protego instance).
        crawl_delay: Delay between requests (in seconds).
        timestamp: Time when the cache entry was created.
    """

    protego: Any
    crawl_delay: float
    timestamp: float = field(default_factory=time.time)


class DefaultUrlCrawler:
    def __init__(
        self,
        *,
        concurrency: int = 5,
        crawl_delay: float = 0.5,
        max_crawl_delay: Optional[float] = 10.0,
        timeout: float = 15.0,
        max_retries: int = 2,
        retry_delay_factor: float = 0.5,
        headers: Optional[Dict[str, str]] = None,
        robots_cache_ttl: float = 3600.0,
    ):
        """Initialize the BeautifulSoupCrawler.

        Args:
            concurrency: Number of concurrent requests allowed.
            crawl_delay: Minimum seconds between requests to the same domain.
            max_crawl_delay: Maximum crawl delay to respect from robots.txt (None = no limit).
            timeout: Per-request timeout in seconds.
            max_retries: Number of retries for failed requests.
            retry_delay_factor: Multiplier for exponential backoff on retries.
            headers: HTTP headers for requests (defaults to User-Agent: Cognee-Scraper/1.0).
            robots_cache_ttl: Time-to-live for robots.txt cache in seconds.
        """
        self.concurrency = concurrency
        self._sem = asyncio.Semaphore(concurrency)
        self.crawl_delay = crawl_delay
        self.max_crawl_delay = max_crawl_delay
        self.timeout = timeout
        self.max_retries = max_retries
        self.retry_delay_factor = retry_delay_factor
        self.headers = headers or {"User-Agent": "Cognee-Scraper/1.0"}
        self.robots_cache_ttl = robots_cache_ttl
        self._last_request_time_per_domain: Dict[str, float] = {}
        self._robots_cache: Dict[str, RobotsTxtCache] = {}
        self._client: Optional[httpx.AsyncClient] = None
        self._robots_lock = asyncio.Lock()

    async def _ensure_client(self):
        """Initialize the HTTP client if not already created."""
        if self._client is None:
            self._client = httpx.AsyncClient(timeout=self.timeout, headers=self.headers)

    async def close(self):
        """Close the HTTP client."""
        if self._client:
            await self._client.aclose()
            self._client = None

    async def __aenter__(self):
        """Enter the context manager, initializing the HTTP client."""
        await self._ensure_client()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Exit the context manager, closing the HTTP client."""
        await self.close()

    @lru_cache(maxsize=1024)
    def _domain_from_url(self, url: str) -> str:
        """Extract the domain (netloc) from a URL.

        Args:
            url: The URL to parse.

        Returns:
            str: The domain (netloc) of the URL.
        """
        try:
            return urlparse(url).netloc
        except Exception:
            return url

    @lru_cache(maxsize=1024)
    def _get_domain_root(self, url: str) -> str:
        """Get the root URL (scheme and netloc) from a URL.

        Args:
            url: The URL to parse.

        Returns:
            str: The root URL (e.g., "https://example.com").
        """
        parsed = urlparse(url)
        return f"{parsed.scheme}://{parsed.netloc}"

    async def _respect_rate_limit(self, url: str, crawl_delay: Optional[float] = None):
        """Enforce rate limiting for requests to the same domain.

        Args:
            url: The URL to check.
            crawl_delay: Custom crawl delay in seconds (if any).
        """
        domain = self._domain_from_url(url)
        last = self._last_request_time_per_domain.get(domain)
        delay = crawl_delay if crawl_delay is not None else self.crawl_delay

        if last is None:
            self._last_request_time_per_domain[domain] = time.time()
            return

        elapsed = time.time() - last
        wait_for = delay - elapsed
        if wait_for > 0:
            logger.info(
                f"Rate limiting: waiting {wait_for:.2f}s before requesting {url} (crawl_delay={delay}s from robots.txt)"
            )
            await asyncio.sleep(wait_for)
            logger.info(f"Rate limit wait completed for {url}")
        self._last_request_time_per_domain[domain] = time.time()

    async def _get_robots_cache(self, domain_root: str) -> Optional[RobotsTxtCache]:
        """Get cached robots.txt data if valid.

        Args:
            domain_root: The root URL (e.g., "https://example.com").

        Returns:
            Optional[RobotsTxtCache]: Cached robots.txt data or None if expired or not found.
        """
        if Protego is None:
            return None

        cached = self._robots_cache.get(domain_root)
        if cached and (time.time() - cached.timestamp) < self.robots_cache_ttl:
            return cached
        return None

    async def _fetch_and_cache_robots(self, domain_root: str) -> RobotsTxtCache:
        """Fetch and cache robots.txt data.

        Args:
            domain_root: The root URL (e.g., "https://example.com").

        Returns:
            RobotsTxtCache: Cached robots.txt data with crawl delay.

        Raises:
            Exception: If fetching robots.txt fails.
        """
        async with self._robots_lock:
            cached = await self._get_robots_cache(domain_root)
            if cached:
                return cached

            robots_url = f"{domain_root}/robots.txt"
            try:
                await self._ensure_client()
                await self._respect_rate_limit(robots_url, self.crawl_delay)
                resp = await self._client.get(robots_url, timeout=5.0)
                content = resp.text if resp.status_code == 200 else ""
            except Exception as e:
                logger.debug(f"Failed to fetch robots.txt from {domain_root}: {e}")
                content = ""

            protego = Protego.parse(content) if content.strip() else None
            agent = next((v for k, v in self.headers.items() if k.lower() == "user-agent"), "*")

            crawl_delay = self.crawl_delay
            if protego:
                delay = protego.crawl_delay(agent) or protego.crawl_delay("*")
                if delay:
                    # Apply max_crawl_delay cap if configured
                    if self.max_crawl_delay is not None and delay > self.max_crawl_delay:
                        logger.warning(
                            f"robots.txt specifies crawl_delay={delay}s for {domain_root}, "
                            f"capping to max_crawl_delay={self.max_crawl_delay}s"
                        )
                        crawl_delay = self.max_crawl_delay
                    else:
                        crawl_delay = delay

            cache_entry = RobotsTxtCache(protego=protego, crawl_delay=crawl_delay)
            self._robots_cache[domain_root] = cache_entry
            return cache_entry

    async def _is_url_allowed(self, url: str) -> bool:
        """Check if a URL is allowed by robots.txt.

        Args:
            url: The URL to check.

        Returns:
            bool: True if the URL is allowed, False otherwise.
        """
        if Protego is None:
            return True

        try:
            domain_root = self._get_domain_root(url)
            cache = await self._get_robots_cache(domain_root)
            if cache is None:
                cache = await self._fetch_and_cache_robots(domain_root)

            if cache.protego is None:
                return True

            agent = next((v for k, v in self.headers.items() if k.lower() == "user-agent"), "*")
            return cache.protego.can_fetch(agent, url) or cache.protego.can_fetch("*", url)
        except Exception as e:
            logger.debug(f"Error checking robots.txt for {url}: {e}")
            return True

    async def _get_crawl_delay(self, url: str) -> float:
        """Get the crawl delay for a URL from robots.txt.

        Args:
            url: The URL to check.

        Returns:
            float: Crawl delay in seconds.
        """
        if Protego is None:
            return self.crawl_delay

        try:
            domain_root = self._get_domain_root(url)
            cache = await self._get_robots_cache(domain_root)
            if cache is None:
                cache = await self._fetch_and_cache_robots(domain_root)
            return cache.crawl_delay
        except Exception:
            return self.crawl_delay

    async def _fetch_httpx(self, url: str) -> str:
        """Fetch a URL using HTTPX with retries.

        Args:
            url: The URL to fetch.

        Returns:
            str: The HTML content of the page.

        Raises:
            Exception: If all retry attempts fail.
        """
        await self._ensure_client()
        assert self._client is not None, "HTTP client not initialized"

        attempt = 0
        crawl_delay = await self._get_crawl_delay(url)
        logger.info(f"Fetching URL with httpx (crawl_delay={crawl_delay}s): {url}")

        while True:
            try:
                await self._respect_rate_limit(url, crawl_delay)
                resp = await self._client.get(url)
                resp.raise_for_status()
                logger.info(
                    f"Successfully fetched {url} (status={resp.status_code}, size={len(resp.text)} bytes)"
                )
                return resp.text
            except Exception as exc:
                attempt += 1
                if attempt > self.max_retries:
                    logger.error(f"Fetch failed for {url} after {attempt} attempts: {exc}")
                    raise

                delay = self.retry_delay_factor * (2 ** (attempt - 1))
                logger.warning(
                    f"Retrying {url} after {delay:.2f}s (attempt {attempt}) due to {exc}"
                )
                await asyncio.sleep(delay)

    async def _render_with_playwright(
        self, url: str, js_wait: float = 1.0, timeout: Optional[float] = None
    ) -> str:
        """Fetch and render a URL using Playwright for JavaScript content.

        Args:
            url: The URL to fetch.
            js_wait: Seconds to wait for JavaScript to load.
            timeout: Timeout for the request (in seconds, defaults to instance timeout).

        Returns:
            str: The rendered HTML content.

        Raises:
            RuntimeError: If Playwright is not installed.
            Exception: If all retry attempts fail.
        """
        if async_playwright is None:
            raise RuntimeError(
                "Playwright is not installed. Install with `pip install playwright` and run `playwright install`."
            )

        timeout_val = timeout or self.timeout
        logger.info(
            f"Rendering URL with Playwright (js_wait={js_wait}s, timeout={timeout_val}s): {url}"
        )

        attempt = 0
        while True:
            try:
                async with async_playwright() as p:
                    logger.info(f"Launching headless Chromium browser for {url}")
                    browser = await p.chromium.launch(headless=True)
                    try:
                        context = await browser.new_context()
                        page = await context.new_page()
                        logger.info(f"Navigating to {url} and waiting for network idle")
                        await page.goto(
                            url,
                            wait_until="networkidle",
                            timeout=int(timeout_val * 1000),
                        )
                        if js_wait:
                            logger.info(f"Waiting {js_wait}s for JavaScript to execute")
                            await asyncio.sleep(js_wait)
                        content = await page.content()
                        logger.info(
                            f"Successfully rendered {url} with Playwright (size={len(content)} bytes)"
                        )
                        return content
                    finally:
                        await browser.close()
            except Exception as exc:
                attempt += 1
                if attempt > self.max_retries:
                    logger.error(f"Playwright fetch failed for {url}: {exc}")
                    raise
                backoff = self.retry_delay_factor * (2 ** (attempt - 1))
                logger.warning(
                    f"Retrying playwright fetch {url} after {backoff:.2f}s (attempt {attempt})"
                )
                await asyncio.sleep(backoff)

    async def fetch_urls(
        self,
        urls: Union[str, List[str]],
        *,
        use_playwright: bool = False,
        playwright_js_wait: float = 0.8,
    ) -> UrlsToHtmls:
        """Fetch and extract content from URLs using BeautifulSoup or Playwright.

        Args:
            urls: A single URL, list of URLs, or dict mapping URLs to extraction rules.
            extraction_rules: Default extraction rules for string or list URLs.
            use_playwright: If True, use Playwright for JavaScript rendering.
            playwright_js_wait: Seconds to wait for JavaScript to load.
            join_all_matches: If True, extract all matching elements for each rule.

        Returns:
            Dict[str, str]: A dictionary mapping URLs to their extracted content.

        Raises:
            ValueError: If extraction_rules are missing when required or if urls is invalid.
            Exception: If fetching or extraction fails.
        """
        if isinstance(urls, str):
            urls = [urls]
        else:
            raise ValueError(f"Invalid urls type: {type(urls)}")

        async def _task(url: str):
            async with self._sem:
                try:
                    logger.info(f"Processing URL: {url}")

                    # Check robots.txt
                    allowed = await self._is_url_allowed(url)
                    if not allowed:
                        logger.warning(f"URL disallowed by robots.txt: {url}")
                        return url, ""

                    logger.info(f"Robots.txt check passed for {url}")

                    # Fetch HTML
                    if use_playwright:
                        logger.info(
                            f"Rendering {url} with Playwright (JS wait: {playwright_js_wait}s)"
                        )
                        html = await self._render_with_playwright(
                            url, js_wait=playwright_js_wait, timeout=self.timeout
                        )
                    else:
                        logger.info(f"Fetching {url} with httpx")
                        html = await self._fetch_httpx(url)

                    logger.info(f"Successfully fetched HTML from {url} ({len(html)} bytes)")

                    return url, html

                except Exception as e:
                    logger.error(f"Error processing {url}: {e}")
                    return url, ""

        logger.info(f"Creating {len(urls)} async tasks for concurrent fetching")
        tasks = [asyncio.create_task(_task(u)) for u in urls]
        results = {}
        completed = 0
        total = len(tasks)

        for coro in asyncio.as_completed(tasks):
            url, html = await coro
            results[url] = html
            completed += 1
            logger.info(f"Progress: {completed}/{total} URLs processed")

        logger.info(f"Completed fetching all {len(results)} URL(s)")
        return results
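For orientation, here is a minimal usage sketch of the new crawler (not part of the diff). The URL and constructor arguments are illustrative; a single URL string is passed, since that is the input form the `fetch_urls` implementation above accepts directly.

import asyncio

from cognee.tasks.web_scraper.default_url_crawler import DefaultUrlCrawler


async def main():
    # The crawler is an async context manager: it opens an httpx client on enter
    # and closes it on exit. The URL below is only a placeholder.
    async with DefaultUrlCrawler(concurrency=2, crawl_delay=1.0) as crawler:
        pages = await crawler.fetch_urls("https://example.com")
        for url, html in pages.items():
            print(url, len(html), "bytes")


asyncio.run(main())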
cognee/tasks/web_scraper/models.py (new file)

@@ -0,0 +1,46 @@
from cognee.infrastructure.engine import DataPoint
from typing import Optional, Dict, Any, List
from datetime import datetime


class WebPage(DataPoint):
    """Represents a scraped web page with metadata"""

    name: Optional[str]
    content: str
    content_hash: str
    scraped_at: datetime
    last_modified: Optional[datetime]
    status_code: int
    content_type: str
    page_size: int
    extraction_rules: Dict[str, Any]  # CSS selectors, XPath rules used
    description: str
    metadata: dict = {"index_fields": ["name", "description", "content"]}


class WebSite(DataPoint):
    """Represents a website or domain being scraped"""

    name: str
    base_url: str
    robots_txt: Optional[str]
    crawl_delay: float
    last_crawled: datetime
    page_count: int
    scraping_config: Dict[str, Any]
    description: str
    metadata: dict = {"index_fields": ["name", "description"]}


class ScrapingJob(DataPoint):
    """Represents a scraping job configuration"""

    name: str
    urls: List[str]
    schedule: Optional[str]  # Cron-like schedule for recurring scrapes
    status: str  # "active", "paused", "completed", "failed"
    last_run: Optional[datetime]
    next_run: Optional[datetime]
    description: str
    metadata: dict = {"index_fields": ["name", "description"]}
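As a sketch of how one of these models might be instantiated (not part of the diff), assuming the DataPoint base class auto-populates its own fields such as the id, as it does for other cognee DataPoint models; all values below are hypothetical:

from datetime import datetime, timezone

from cognee.tasks.web_scraper.models import WebPage

# Hypothetical values; assumes DataPoint supplies defaults for its own fields (e.g. id).
page = WebPage(
    name="Example page",
    content="<html>...</html>",
    content_hash="9f86d081",
    scraped_at=datetime.now(timezone.utc),
    last_modified=None,
    status_code=200,
    content_type="text/html",
    page_size=1024,
    extraction_rules={"title": "h1"},
    description="Landing page scraped for testing",
)

print(page.metadata["index_fields"])  # ["name", "description", "content"]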
cognee/tasks/web_scraper/utils.py (new file)

@@ -0,0 +1,142 @@
"""Utilities for fetching web content using BeautifulSoup or Tavily.

This module provides functions to fetch and extract content from web pages, supporting
both BeautifulSoup for custom extraction rules and Tavily for API-based scraping.
"""

import os
from typing import List, Union
from cognee.shared.logging_utils import get_logger
from cognee.tasks.web_scraper.types import UrlsToHtmls
from .default_url_crawler import DefaultUrlCrawler
from .config import DefaultCrawlerConfig, TavilyConfig

logger = get_logger(__name__)


async def fetch_page_content(urls: Union[str, List[str]]) -> UrlsToHtmls:
    """Fetch content from one or more URLs using the specified tool.

    This function retrieves web page content using either BeautifulSoup (with custom
    extraction rules) or Tavily (API-based scraping). It handles single URLs or lists of
    URLs and returns a dictionary mapping URLs to their extracted content.

    Args:
        urls: A single URL (str) or a list of URLs (List[str]) to scrape.
        preferred_tool: The scraping tool to use ("tavily" or "beautifulsoup").
            Defaults to "beautifulsoup".
        tavily_config: Configuration for Tavily API, including API key.
            Required if preferred_tool is "tavily".
        default_crawler_config: Configuration for BeautifulSoup crawler, including
            extraction rules. Required if preferred_tool is "beautifulsoup" and
            extraction_rules are needed.

    Returns:
        Dict[str, str]: A dictionary mapping each URL to its
            extracted content (as a string for BeautifulSoup or a dict for Tavily).

    Raises:
        ValueError: If Tavily API key is missing when using Tavily, or if
            extraction_rules are not provided when using BeautifulSoup.
        ImportError: If required dependencies (beautifulsoup4 or tavily-python) are not
            installed.
    """
    url_list = [urls] if isinstance(urls, str) else urls

    if os.getenv("TAVILY_API_KEY"):
        logger.info("Using Tavily API for url fetching")
        return await fetch_with_tavily(urls)
    else:
        logger.info("Using default crawler for content extraction")

        default_crawler_config = (
            DefaultCrawlerConfig()
        )  # We've decided to use defaults, and configure through env vars as needed

        logger.info(
            f"Initializing BeautifulSoup crawler with concurrency={default_crawler_config.concurrency}, timeout={default_crawler_config.timeout}s, max_crawl_delay={default_crawler_config.max_crawl_delay}s"
        )

        crawler = DefaultUrlCrawler(
            concurrency=default_crawler_config.concurrency,
            crawl_delay=default_crawler_config.crawl_delay,
            max_crawl_delay=default_crawler_config.max_crawl_delay,
            timeout=default_crawler_config.timeout,
            max_retries=default_crawler_config.max_retries,
            retry_delay_factor=default_crawler_config.retry_delay_factor,
            headers=default_crawler_config.headers,
            robots_cache_ttl=default_crawler_config.robots_cache_ttl,
        )
        try:
            logger.info(
                f"Starting to crawl {len(url_list)} URL(s) with BeautifulSoup (use_playwright={default_crawler_config.use_playwright})"
            )
            results = await crawler.fetch_urls(
                urls,
                use_playwright=default_crawler_config.use_playwright,
                playwright_js_wait=default_crawler_config.playwright_js_wait,
            )
            logger.info(f"Successfully fetched content from {len(results)} URL(s)")
            return results
        except Exception as e:
            logger.error(f"Error fetching page content: {str(e)}")
            raise
        finally:
            logger.info("Closing BeautifulSoup crawler")
            await crawler.close()


async def fetch_with_tavily(urls: Union[str, List[str]]) -> UrlsToHtmls:
    """Fetch content from URLs using the Tavily API.

    Args:
        urls: A single URL (str) or a list of URLs (List[str]) to scrape.

    Returns:
        Dict[str, str]: A dictionary mapping each URL to its raw content as a string.

    Raises:
        ImportError: If tavily-python is not installed.
        Exception: If the Tavily API request fails.
    """
    try:
        from tavily import AsyncTavilyClient
    except ImportError:
        logger.error(
            "Failed to import tavily, make sure to install using pip install tavily-python>=0.7.0"
        )
        raise

    tavily_config = TavilyConfig()
    url_list = [urls] if isinstance(urls, str) else urls
    extract_depth = tavily_config.extract_depth if tavily_config else "basic"
    timeout = tavily_config.timeout if tavily_config else 10

    logger.info(
        f"Initializing Tavily client with extract_depth={extract_depth}, timeout={timeout}s"
    )
    client = AsyncTavilyClient(
        api_key=tavily_config.api_key,
        proxies=tavily_config.proxies,
    )

    logger.info(f"Sending extract request to Tavily API for {len(url_list)} URL(s)")
    results = await client.extract(
        urls,
        format="text",
        extract_depth=extract_depth,
        timeout=timeout,
    )

    failed_count = len(results.get("failed_results", []))
    if failed_count > 0:
        logger.warning(f"Tavily API failed to fetch {failed_count} URL(s)")
        for failed_result in results.get("failed_results", []):
            logger.warning(f"Failed to fetch {failed_result}")

    return_results = {}
    for result in results.get("results", []):
        return_results[result["url"]] = result["raw_content"]

    logger.info(f"Successfully fetched content from {len(return_results)} URL(s) via Tavily")
    return return_results
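A minimal end-to-end sketch of the helper above (not part of the diff): when TAVILY_API_KEY is set in the environment the Tavily path is taken, otherwise the DefaultUrlCrawler path with its env-var-driven defaults; the URL is a placeholder, and a single string is used since that form is accepted by both paths.

import asyncio

from cognee.tasks.web_scraper.utils import fetch_page_content


async def main():
    # Returns a dict mapping each URL to its fetched content.
    pages = await fetch_page_content("https://example.com")
    for url, content in pages.items():
        print(url, "->", len(content), "characters")


asyncio.run(main())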