cognee 0.3.6__py3-none-any.whl → 0.3.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (159)
  1. cognee/__init__.py +1 -0
  2. cognee/api/health.py +2 -12
  3. cognee/api/v1/add/add.py +46 -6
  4. cognee/api/v1/add/routers/get_add_router.py +5 -1
  5. cognee/api/v1/cognify/cognify.py +29 -9
  6. cognee/api/v1/datasets/datasets.py +11 -0
  7. cognee/api/v1/responses/default_tools.py +0 -1
  8. cognee/api/v1/responses/dispatch_function.py +1 -1
  9. cognee/api/v1/responses/routers/default_tools.py +0 -1
  10. cognee/api/v1/search/search.py +11 -9
  11. cognee/api/v1/settings/routers/get_settings_router.py +7 -1
  12. cognee/api/v1/ui/ui.py +47 -16
  13. cognee/api/v1/update/routers/get_update_router.py +1 -1
  14. cognee/api/v1/update/update.py +3 -3
  15. cognee/cli/_cognee.py +61 -10
  16. cognee/cli/commands/add_command.py +3 -3
  17. cognee/cli/commands/cognify_command.py +3 -3
  18. cognee/cli/commands/config_command.py +9 -7
  19. cognee/cli/commands/delete_command.py +3 -3
  20. cognee/cli/commands/search_command.py +3 -7
  21. cognee/cli/config.py +0 -1
  22. cognee/context_global_variables.py +5 -0
  23. cognee/exceptions/exceptions.py +1 -1
  24. cognee/infrastructure/databases/cache/__init__.py +2 -0
  25. cognee/infrastructure/databases/cache/cache_db_interface.py +79 -0
  26. cognee/infrastructure/databases/cache/config.py +44 -0
  27. cognee/infrastructure/databases/cache/get_cache_engine.py +67 -0
  28. cognee/infrastructure/databases/cache/redis/RedisAdapter.py +243 -0
  29. cognee/infrastructure/databases/exceptions/__init__.py +1 -0
  30. cognee/infrastructure/databases/exceptions/exceptions.py +18 -2
  31. cognee/infrastructure/databases/graph/get_graph_engine.py +1 -1
  32. cognee/infrastructure/databases/graph/graph_db_interface.py +5 -0
  33. cognee/infrastructure/databases/graph/kuzu/adapter.py +67 -44
  34. cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +13 -3
  35. cognee/infrastructure/databases/graph/neo4j_driver/deadlock_retry.py +1 -1
  36. cognee/infrastructure/databases/graph/neptune_driver/neptune_utils.py +1 -1
  37. cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +1 -1
  38. cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +21 -3
  39. cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +17 -10
  40. cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +17 -4
  41. cognee/infrastructure/databases/vector/embeddings/config.py +2 -3
  42. cognee/infrastructure/databases/vector/exceptions/exceptions.py +1 -1
  43. cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +0 -1
  44. cognee/infrastructure/files/exceptions.py +1 -1
  45. cognee/infrastructure/files/storage/LocalFileStorage.py +9 -9
  46. cognee/infrastructure/files/storage/S3FileStorage.py +11 -11
  47. cognee/infrastructure/files/utils/guess_file_type.py +6 -0
  48. cognee/infrastructure/llm/prompts/search_type_selector_prompt.txt +0 -5
  49. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py +19 -9
  50. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py +17 -5
  51. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py +17 -5
  52. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py +32 -0
  53. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/__init__.py +0 -0
  54. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/adapter.py +109 -0
  55. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py +33 -8
  56. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +40 -18
  57. cognee/infrastructure/loaders/LoaderEngine.py +27 -7
  58. cognee/infrastructure/loaders/external/__init__.py +7 -0
  59. cognee/infrastructure/loaders/external/advanced_pdf_loader.py +2 -8
  60. cognee/infrastructure/loaders/external/beautiful_soup_loader.py +310 -0
  61. cognee/infrastructure/loaders/supported_loaders.py +7 -0
  62. cognee/modules/data/exceptions/exceptions.py +1 -1
  63. cognee/modules/data/methods/__init__.py +3 -0
  64. cognee/modules/data/methods/get_dataset_data.py +4 -1
  65. cognee/modules/data/methods/has_dataset_data.py +21 -0
  66. cognee/modules/engine/models/TableRow.py +0 -1
  67. cognee/modules/ingestion/save_data_to_file.py +9 -2
  68. cognee/modules/pipelines/exceptions/exceptions.py +1 -1
  69. cognee/modules/pipelines/operations/pipeline.py +12 -1
  70. cognee/modules/pipelines/operations/run_tasks.py +25 -197
  71. cognee/modules/pipelines/operations/run_tasks_data_item.py +260 -0
  72. cognee/modules/pipelines/operations/run_tasks_distributed.py +121 -38
  73. cognee/modules/retrieval/EntityCompletionRetriever.py +48 -8
  74. cognee/modules/retrieval/base_graph_retriever.py +3 -1
  75. cognee/modules/retrieval/base_retriever.py +3 -1
  76. cognee/modules/retrieval/chunks_retriever.py +5 -1
  77. cognee/modules/retrieval/code_retriever.py +20 -2
  78. cognee/modules/retrieval/completion_retriever.py +50 -9
  79. cognee/modules/retrieval/cypher_search_retriever.py +11 -1
  80. cognee/modules/retrieval/graph_completion_context_extension_retriever.py +47 -8
  81. cognee/modules/retrieval/graph_completion_cot_retriever.py +32 -1
  82. cognee/modules/retrieval/graph_completion_retriever.py +54 -10
  83. cognee/modules/retrieval/lexical_retriever.py +20 -2
  84. cognee/modules/retrieval/natural_language_retriever.py +10 -1
  85. cognee/modules/retrieval/summaries_retriever.py +5 -1
  86. cognee/modules/retrieval/temporal_retriever.py +62 -10
  87. cognee/modules/retrieval/user_qa_feedback.py +3 -2
  88. cognee/modules/retrieval/utils/completion.py +5 -0
  89. cognee/modules/retrieval/utils/description_to_codepart_search.py +1 -1
  90. cognee/modules/retrieval/utils/session_cache.py +156 -0
  91. cognee/modules/search/methods/get_search_type_tools.py +0 -5
  92. cognee/modules/search/methods/no_access_control_search.py +12 -1
  93. cognee/modules/search/methods/search.py +34 -2
  94. cognee/modules/search/types/SearchType.py +0 -1
  95. cognee/modules/settings/get_settings.py +23 -0
  96. cognee/modules/users/methods/get_authenticated_user.py +3 -1
  97. cognee/modules/users/methods/get_default_user.py +1 -6
  98. cognee/modules/users/roles/methods/create_role.py +2 -2
  99. cognee/modules/users/tenants/methods/create_tenant.py +2 -2
  100. cognee/shared/exceptions/exceptions.py +1 -1
  101. cognee/tasks/codingagents/coding_rule_associations.py +1 -2
  102. cognee/tasks/documents/exceptions/exceptions.py +1 -1
  103. cognee/tasks/graph/extract_graph_from_data.py +2 -0
  104. cognee/tasks/ingestion/data_item_to_text_file.py +3 -3
  105. cognee/tasks/ingestion/ingest_data.py +11 -5
  106. cognee/tasks/ingestion/save_data_item_to_storage.py +12 -1
  107. cognee/tasks/storage/add_data_points.py +3 -10
  108. cognee/tasks/storage/index_data_points.py +19 -14
  109. cognee/tasks/storage/index_graph_edges.py +25 -11
  110. cognee/tasks/web_scraper/__init__.py +34 -0
  111. cognee/tasks/web_scraper/config.py +26 -0
  112. cognee/tasks/web_scraper/default_url_crawler.py +446 -0
  113. cognee/tasks/web_scraper/models.py +46 -0
  114. cognee/tasks/web_scraper/types.py +4 -0
  115. cognee/tasks/web_scraper/utils.py +142 -0
  116. cognee/tasks/web_scraper/web_scraper_task.py +396 -0
  117. cognee/tests/cli_tests/cli_unit_tests/test_cli_utils.py +0 -1
  118. cognee/tests/integration/web_url_crawler/test_default_url_crawler.py +13 -0
  119. cognee/tests/integration/web_url_crawler/test_tavily_crawler.py +19 -0
  120. cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py +344 -0
  121. cognee/tests/subprocesses/reader.py +25 -0
  122. cognee/tests/subprocesses/simple_cognify_1.py +31 -0
  123. cognee/tests/subprocesses/simple_cognify_2.py +31 -0
  124. cognee/tests/subprocesses/writer.py +32 -0
  125. cognee/tests/tasks/descriptive_metrics/metrics_test_utils.py +0 -2
  126. cognee/tests/tasks/descriptive_metrics/neo4j_metrics_test.py +8 -3
  127. cognee/tests/tasks/entity_extraction/entity_extraction_test.py +89 -0
  128. cognee/tests/tasks/web_scraping/web_scraping_test.py +172 -0
  129. cognee/tests/test_add_docling_document.py +56 -0
  130. cognee/tests/test_chromadb.py +7 -11
  131. cognee/tests/test_concurrent_subprocess_access.py +76 -0
  132. cognee/tests/test_conversation_history.py +240 -0
  133. cognee/tests/test_kuzu.py +27 -15
  134. cognee/tests/test_lancedb.py +7 -11
  135. cognee/tests/test_library.py +32 -2
  136. cognee/tests/test_neo4j.py +24 -16
  137. cognee/tests/test_neptune_analytics_vector.py +7 -11
  138. cognee/tests/test_permissions.py +9 -13
  139. cognee/tests/test_pgvector.py +4 -4
  140. cognee/tests/test_remote_kuzu.py +8 -11
  141. cognee/tests/test_s3_file_storage.py +1 -1
  142. cognee/tests/test_search_db.py +6 -8
  143. cognee/tests/unit/infrastructure/databases/cache/test_cache_config.py +89 -0
  144. cognee/tests/unit/modules/retrieval/conversation_history_test.py +154 -0
  145. {cognee-0.3.6.dist-info → cognee-0.3.7.dist-info}/METADATA +21 -6
  146. {cognee-0.3.6.dist-info → cognee-0.3.7.dist-info}/RECORD +155 -126
  147. {cognee-0.3.6.dist-info → cognee-0.3.7.dist-info}/entry_points.txt +1 -0
  148. distributed/Dockerfile +0 -3
  149. distributed/entrypoint.py +21 -9
  150. distributed/signal.py +5 -0
  151. distributed/workers/data_point_saving_worker.py +64 -34
  152. distributed/workers/graph_saving_worker.py +71 -47
  153. cognee/infrastructure/databases/graph/memgraph/memgraph_adapter.py +0 -1116
  154. cognee/modules/retrieval/insights_retriever.py +0 -133
  155. cognee/tests/test_memgraph.py +0 -109
  156. cognee/tests/unit/modules/retrieval/insights_retriever_test.py +0 -251
  157. {cognee-0.3.6.dist-info → cognee-0.3.7.dist-info}/WHEEL +0 -0
  158. {cognee-0.3.6.dist-info → cognee-0.3.7.dist-info}/licenses/LICENSE +0 -0
  159. {cognee-0.3.6.dist-info → cognee-0.3.7.dist-info}/licenses/NOTICE.md +0 -0
cognee/tasks/web_scraper/default_url_crawler.py
@@ -0,0 +1,446 @@
+import asyncio
+from dataclasses import dataclass, field
+from functools import lru_cache
+import time
+from typing import Any, Union, List, Dict, Optional
+from urllib.parse import urlparse
+import httpx
+
+from cognee.shared.logging_utils import get_logger
+from cognee.tasks.web_scraper.types import UrlsToHtmls
+
+logger = get_logger()
+
+try:
+    from protego import Protego
+except ImportError:
+    logger.warning("Failed to import protego, make sure to install using pip install protego>=0.1")
+    Protego = None
+
+try:
+    from playwright.async_api import async_playwright
+except ImportError:
+    logger.warning(
+        "Failed to import playwright, make sure to install using pip install playwright>=1.9.0"
+    )
+    async_playwright = None
+
+
+@dataclass
+class RobotsTxtCache:
+    """Cache for robots.txt data.
+
+    Attributes:
+        protego: Parsed robots.txt object (Protego instance).
+        crawl_delay: Delay between requests (in seconds).
+        timestamp: Time when the cache entry was created.
+    """
+
+    protego: Any
+    crawl_delay: float
+    timestamp: float = field(default_factory=time.time)
+
+
+class DefaultUrlCrawler:
+    def __init__(
+        self,
+        *,
+        concurrency: int = 5,
+        crawl_delay: float = 0.5,
+        max_crawl_delay: Optional[float] = 10.0,
+        timeout: float = 15.0,
+        max_retries: int = 2,
+        retry_delay_factor: float = 0.5,
+        headers: Optional[Dict[str, str]] = None,
+        robots_cache_ttl: float = 3600.0,
+    ):
+ """Initialize the BeautifulSoupCrawler.
58
+
59
+ Args:
60
+ concurrency: Number of concurrent requests allowed.
61
+ crawl_delay: Minimum seconds between requests to the same domain.
62
+ max_crawl_delay: Maximum crawl delay to respect from robots.txt (None = no limit).
63
+ timeout: Per-request timeout in seconds.
64
+ max_retries: Number of retries for failed requests.
65
+ retry_delay_factor: Multiplier for exponential backoff on retries.
66
+ headers: HTTP headers for requests (defaults to User-Agent: Cognee-Scraper/1.0).
67
+ robots_cache_ttl: Time-to-live for robots.txt cache in seconds.
68
+ """
69
+        self.concurrency = concurrency
+        self._sem = asyncio.Semaphore(concurrency)
+        self.crawl_delay = crawl_delay
+        self.max_crawl_delay = max_crawl_delay
+        self.timeout = timeout
+        self.max_retries = max_retries
+        self.retry_delay_factor = retry_delay_factor
+        self.headers = headers or {"User-Agent": "Cognee-Scraper/1.0"}
+        self.robots_cache_ttl = robots_cache_ttl
+        self._last_request_time_per_domain: Dict[str, float] = {}
+        self._robots_cache: Dict[str, RobotsTxtCache] = {}
+        self._client: Optional[httpx.AsyncClient] = None
+        self._robots_lock = asyncio.Lock()
+
+    async def _ensure_client(self):
+        """Initialize the HTTP client if not already created."""
+        if self._client is None:
+            self._client = httpx.AsyncClient(timeout=self.timeout, headers=self.headers)
+
+    async def close(self):
+        """Close the HTTP client."""
+        if self._client:
+            await self._client.aclose()
+            self._client = None
+
+    async def __aenter__(self):
+        """Enter the context manager, initializing the HTTP client."""
+        await self._ensure_client()
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        """Exit the context manager, closing the HTTP client."""
+        await self.close()
+
+    @lru_cache(maxsize=1024)
+    def _domain_from_url(self, url: str) -> str:
+        """Extract the domain (netloc) from a URL.
+
+        Args:
+            url: The URL to parse.
+
+        Returns:
+            str: The domain (netloc) of the URL.
+        """
+        try:
+            return urlparse(url).netloc
+        except Exception:
+            return url
+
+    @lru_cache(maxsize=1024)
+    def _get_domain_root(self, url: str) -> str:
+        """Get the root URL (scheme and netloc) from a URL.
+
+        Args:
+            url: The URL to parse.
+
+        Returns:
+            str: The root URL (e.g., "https://example.com").
+        """
+        parsed = urlparse(url)
+        return f"{parsed.scheme}://{parsed.netloc}"
+
+    async def _respect_rate_limit(self, url: str, crawl_delay: Optional[float] = None):
+        """Enforce rate limiting for requests to the same domain.
+
+        Args:
+            url: The URL to check.
+            crawl_delay: Custom crawl delay in seconds (if any).
+        """
+        domain = self._domain_from_url(url)
+        last = self._last_request_time_per_domain.get(domain)
+        delay = crawl_delay if crawl_delay is not None else self.crawl_delay
+
+        if last is None:
+            self._last_request_time_per_domain[domain] = time.time()
+            return
+
+        elapsed = time.time() - last
+        wait_for = delay - elapsed
+        if wait_for > 0:
+            logger.info(
+                f"Rate limiting: waiting {wait_for:.2f}s before requesting {url} (crawl_delay={delay}s from robots.txt)"
+            )
+            await asyncio.sleep(wait_for)
+            logger.info(f"Rate limit wait completed for {url}")
+        self._last_request_time_per_domain[domain] = time.time()
+
+    async def _get_robots_cache(self, domain_root: str) -> Optional[RobotsTxtCache]:
+        """Get cached robots.txt data if valid.
+
+        Args:
+            domain_root: The root URL (e.g., "https://example.com").
+
+        Returns:
+            Optional[RobotsTxtCache]: Cached robots.txt data or None if expired or not found.
+        """
+        if Protego is None:
+            return None
+
+        cached = self._robots_cache.get(domain_root)
+        if cached and (time.time() - cached.timestamp) < self.robots_cache_ttl:
+            return cached
+        return None
+
+    async def _fetch_and_cache_robots(self, domain_root: str) -> RobotsTxtCache:
+        """Fetch and cache robots.txt data.
+
+        Args:
+            domain_root: The root URL (e.g., "https://example.com").
+
+        Returns:
+            RobotsTxtCache: Cached robots.txt data with crawl delay.
+
+        Raises:
+            Exception: If fetching robots.txt fails.
+        """
+        async with self._robots_lock:
+            cached = await self._get_robots_cache(domain_root)
+            if cached:
+                return cached
+
+            robots_url = f"{domain_root}/robots.txt"
+            try:
+                await self._ensure_client()
+                await self._respect_rate_limit(robots_url, self.crawl_delay)
+                resp = await self._client.get(robots_url, timeout=5.0)
+                content = resp.text if resp.status_code == 200 else ""
+            except Exception as e:
+                logger.debug(f"Failed to fetch robots.txt from {domain_root}: {e}")
+                content = ""
+
+            protego = Protego.parse(content) if content.strip() else None
+            agent = next((v for k, v in self.headers.items() if k.lower() == "user-agent"), "*")
+
+            crawl_delay = self.crawl_delay
+            if protego:
+                delay = protego.crawl_delay(agent) or protego.crawl_delay("*")
+                if delay:
+                    # Apply max_crawl_delay cap if configured
+                    if self.max_crawl_delay is not None and delay > self.max_crawl_delay:
+                        logger.warning(
+                            f"robots.txt specifies crawl_delay={delay}s for {domain_root}, "
+                            f"capping to max_crawl_delay={self.max_crawl_delay}s"
+                        )
+                        crawl_delay = self.max_crawl_delay
+                    else:
+                        crawl_delay = delay
+
+            cache_entry = RobotsTxtCache(protego=protego, crawl_delay=crawl_delay)
+            self._robots_cache[domain_root] = cache_entry
+            return cache_entry
+
+    async def _is_url_allowed(self, url: str) -> bool:
+        """Check if a URL is allowed by robots.txt.
+
+        Args:
+            url: The URL to check.
+
+        Returns:
+            bool: True if the URL is allowed, False otherwise.
+        """
+        if Protego is None:
+            return True
+
+        try:
+            domain_root = self._get_domain_root(url)
+            cache = await self._get_robots_cache(domain_root)
+            if cache is None:
+                cache = await self._fetch_and_cache_robots(domain_root)
+
+            if cache.protego is None:
+                return True
+
+            agent = next((v for k, v in self.headers.items() if k.lower() == "user-agent"), "*")
+            return cache.protego.can_fetch(agent, url) or cache.protego.can_fetch("*", url)
+        except Exception as e:
+            logger.debug(f"Error checking robots.txt for {url}: {e}")
+            return True
+
+    async def _get_crawl_delay(self, url: str) -> float:
+        """Get the crawl delay for a URL from robots.txt.
+
+        Args:
+            url: The URL to check.
+
+        Returns:
+            float: Crawl delay in seconds.
+        """
+        if Protego is None:
+            return self.crawl_delay
+
+        try:
+            domain_root = self._get_domain_root(url)
+            cache = await self._get_robots_cache(domain_root)
+            if cache is None:
+                cache = await self._fetch_and_cache_robots(domain_root)
+            return cache.crawl_delay
+        except Exception:
+            return self.crawl_delay
+
+    async def _fetch_httpx(self, url: str) -> str:
+        """Fetch a URL using HTTPX with retries.
+
+        Args:
+            url: The URL to fetch.
+
+        Returns:
+            str: The HTML content of the page.
+
+        Raises:
+            Exception: If all retry attempts fail.
+        """
+        await self._ensure_client()
+        assert self._client is not None, "HTTP client not initialized"
+
+        attempt = 0
+        crawl_delay = await self._get_crawl_delay(url)
+        logger.info(f"Fetching URL with httpx (crawl_delay={crawl_delay}s): {url}")
+
+        while True:
+            try:
+                await self._respect_rate_limit(url, crawl_delay)
+                resp = await self._client.get(url)
+                resp.raise_for_status()
+                logger.info(
+                    f"Successfully fetched {url} (status={resp.status_code}, size={len(resp.text)} bytes)"
+                )
+                return resp.text
+            except Exception as exc:
+                attempt += 1
+                if attempt > self.max_retries:
+                    logger.error(f"Fetch failed for {url} after {attempt} attempts: {exc}")
+                    raise
+
+                delay = self.retry_delay_factor * (2 ** (attempt - 1))
+                logger.warning(
+                    f"Retrying {url} after {delay:.2f}s (attempt {attempt}) due to {exc}"
+                )
+                await asyncio.sleep(delay)
+
+    async def _render_with_playwright(
+        self, url: str, js_wait: float = 1.0, timeout: Optional[float] = None
+    ) -> str:
+        """Fetch and render a URL using Playwright for JavaScript content.
+
+        Args:
+            url: The URL to fetch.
+            js_wait: Seconds to wait for JavaScript to load.
+            timeout: Timeout for the request (in seconds, defaults to instance timeout).
+
+        Returns:
+            str: The rendered HTML content.
+
+        Raises:
+            RuntimeError: If Playwright is not installed.
+            Exception: If all retry attempts fail.
+        """
+        if async_playwright is None:
+            raise RuntimeError(
+                "Playwright is not installed. Install with `pip install playwright` and run `playwright install`."
+            )
+
+        timeout_val = timeout or self.timeout
+        logger.info(
+            f"Rendering URL with Playwright (js_wait={js_wait}s, timeout={timeout_val}s): {url}"
+        )
+
+        attempt = 0
+        while True:
+            try:
+                async with async_playwright() as p:
+                    logger.info(f"Launching headless Chromium browser for {url}")
+                    browser = await p.chromium.launch(headless=True)
+                    try:
+                        context = await browser.new_context()
+                        page = await context.new_page()
+                        logger.info(f"Navigating to {url} and waiting for network idle")
+                        await page.goto(
+                            url,
+                            wait_until="networkidle",
+                            timeout=int(timeout_val * 1000),
+                        )
+                        if js_wait:
+                            logger.info(f"Waiting {js_wait}s for JavaScript to execute")
+                            await asyncio.sleep(js_wait)
+                        content = await page.content()
+                        logger.info(
+                            f"Successfully rendered {url} with Playwright (size={len(content)} bytes)"
+                        )
+                        return content
+                    finally:
+                        await browser.close()
+            except Exception as exc:
+                attempt += 1
+                if attempt > self.max_retries:
+                    logger.error(f"Playwright fetch failed for {url}: {exc}")
+                    raise
+                backoff = self.retry_delay_factor * (2 ** (attempt - 1))
+                logger.warning(
+                    f"Retrying playwright fetch {url} after {backoff:.2f}s (attempt {attempt})"
+                )
+                await asyncio.sleep(backoff)
+
+    async def fetch_urls(
+        self,
+        urls: Union[str, List[str]],
+        *,
+        use_playwright: bool = False,
+        playwright_js_wait: float = 0.8,
+    ) -> UrlsToHtmls:
+        """Fetch raw HTML from one or more URLs using httpx or Playwright.
+
+        Args:
+            urls: A single URL or a list of URLs to fetch.
+            use_playwright: If True, render pages with Playwright for JavaScript-heavy content.
+            playwright_js_wait: Seconds to wait for JavaScript to load.
+
+        Returns:
+            UrlsToHtmls: A dictionary mapping each URL to its HTML content.
+            URLs disallowed by robots.txt or that fail to fetch map to an empty string.
+
+        Raises:
+            ValueError: If urls is neither a string nor a list of strings.
+        """
+        if isinstance(urls, str):
+            urls = [urls]
+        elif not isinstance(urls, list):
+            raise ValueError(f"Invalid urls type: {type(urls)}")
+
+        async def _task(url: str):
+            async with self._sem:
+                try:
+                    logger.info(f"Processing URL: {url}")
+
+                    # Check robots.txt
+                    allowed = await self._is_url_allowed(url)
+                    if not allowed:
+                        logger.warning(f"URL disallowed by robots.txt: {url}")
+                        return url, ""
+
+                    logger.info(f"Robots.txt check passed for {url}")
+
+                    # Fetch HTML
+                    if use_playwright:
+                        logger.info(
+                            f"Rendering {url} with Playwright (JS wait: {playwright_js_wait}s)"
+                        )
+                        html = await self._render_with_playwright(
+                            url, js_wait=playwright_js_wait, timeout=self.timeout
+                        )
+                    else:
+                        logger.info(f"Fetching {url} with httpx")
+                        html = await self._fetch_httpx(url)
+
+                    logger.info(f"Successfully fetched HTML from {url} ({len(html)} bytes)")
+
+                    return url, html
+
+                except Exception as e:
+                    logger.error(f"Error processing {url}: {e}")
+                    return url, ""
+
+        logger.info(f"Creating {len(urls)} async tasks for concurrent fetching")
+        tasks = [asyncio.create_task(_task(u)) for u in urls]
+        results = {}
+        completed = 0
+        total = len(tasks)
+
+        for coro in asyncio.as_completed(tasks):
+            url, html = await coro
+            results[url] = html
+            completed += 1
+            logger.info(f"Progress: {completed}/{total} URLs processed")
+
+        logger.info(f"Completed fetching all {len(results)} URL(s)")
+        return results
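
For orientation, here is a minimal usage sketch of the new DefaultUrlCrawler (not part of the diff; the URLs are placeholders, httpx is required, and protego/playwright are optional extras):

import asyncio

from cognee.tasks.web_scraper.default_url_crawler import DefaultUrlCrawler


async def main():
    # The async context manager opens a shared httpx client and closes it on exit.
    async with DefaultUrlCrawler(concurrency=2, crawl_delay=1.0) as crawler:
        # Returns a dict mapping each URL to its HTML ("" if disallowed or failed).
        pages = await crawler.fetch_urls(
            ["https://example.com", "https://example.org"],
            use_playwright=False,  # set True to render JavaScript-heavy pages
        )
        for url, html in pages.items():
            print(url, len(html), "bytes")


asyncio.run(main())
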
cognee/tasks/web_scraper/models.py
@@ -0,0 +1,46 @@
+from cognee.infrastructure.engine import DataPoint
+from typing import Optional, Dict, Any, List
+from datetime import datetime
+
+
+class WebPage(DataPoint):
+    """Represents a scraped web page with metadata"""
+
+    name: Optional[str]
+    content: str
+    content_hash: str
+    scraped_at: datetime
+    last_modified: Optional[datetime]
+    status_code: int
+    content_type: str
+    page_size: int
+    extraction_rules: Dict[str, Any]  # CSS selectors, XPath rules used
+    description: str
+    metadata: dict = {"index_fields": ["name", "description", "content"]}
+
+
+class WebSite(DataPoint):
+    """Represents a website or domain being scraped"""
+
+    name: str
+    base_url: str
+    robots_txt: Optional[str]
+    crawl_delay: float
+    last_crawled: datetime
+    page_count: int
+    scraping_config: Dict[str, Any]
+    description: str
+    metadata: dict = {"index_fields": ["name", "description"]}
+
+
+class ScrapingJob(DataPoint):
+    """Represents a scraping job configuration"""
+
+    name: str
+    urls: List[str]
+    schedule: Optional[str]  # Cron-like schedule for recurring scrapes
+    status: str  # "active", "paused", "completed", "failed"
+    last_run: Optional[datetime]
+    next_run: Optional[datetime]
+    description: str
+    metadata: dict = {"index_fields": ["name", "description"]}
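
A minimal sketch of constructing one of the new data points (not part of the diff; the values are illustrative, and it assumes DataPoint accepts keyword fields like a pydantic model):

from datetime import datetime, timezone

from cognee.tasks.web_scraper.models import WebPage

# Illustrative values only; field names come from the WebPage model above.
page = WebPage(
    name="Example Domain",
    content="<html>...</html>",
    content_hash="9b2c0e8f",
    scraped_at=datetime.now(timezone.utc),
    last_modified=None,
    status_code=200,
    content_type="text/html",
    page_size=1256,
    extraction_rules={},
    description="Landing page fetched by the web_scraper task",
)
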
cognee/tasks/web_scraper/types.py
@@ -0,0 +1,4 @@
+from typing import TypeAlias
+
+
+UrlsToHtmls: TypeAlias = dict[str, str]
cognee/tasks/web_scraper/utils.py
@@ -0,0 +1,142 @@
+"""Utilities for fetching web content using the default crawler or Tavily.
+
+This module provides functions to fetch content from web pages, using either the
+built-in DefaultUrlCrawler or the Tavily API when TAVILY_API_KEY is configured.
+"""
+
+import os
+from typing import List, Union
+from cognee.shared.logging_utils import get_logger
+from cognee.tasks.web_scraper.types import UrlsToHtmls
+from .default_url_crawler import DefaultUrlCrawler
+from .config import DefaultCrawlerConfig, TavilyConfig
+
+logger = get_logger(__name__)
+
+
+async def fetch_page_content(urls: Union[str, List[str]]) -> UrlsToHtmls:
+    """Fetch content from one or more URLs.
+
+    This function retrieves web page content using the Tavily API when the
+    TAVILY_API_KEY environment variable is set, and the built-in DefaultUrlCrawler
+    otherwise. It handles a single URL or a list of URLs and returns a dictionary
+    mapping each URL to its fetched content.
+
+    Args:
+        urls: A single URL (str) or a list of URLs (List[str]) to fetch.
+
+    Returns:
+        UrlsToHtmls: A dictionary mapping each URL to its fetched content as a
+        string (raw HTML from the default crawler, or extracted text from Tavily).
+
+    Raises:
+        ImportError: If tavily-python is not installed when the Tavily path is used.
+        Exception: If fetching fails after all retries.
+    """
45
+
46
+ if os.getenv("TAVILY_API_KEY"):
47
+ logger.info("Using Tavily API for url fetching")
48
+ return await fetch_with_tavily(urls)
49
+ else:
50
+ logger.info("Using default crawler for content extraction")
51
+
52
+ default_crawler_config = (
53
+ DefaultCrawlerConfig()
54
+ ) # We've decided to use defaults, and configure through env vars as needed
55
+
56
+ logger.info(
57
+ f"Initializing BeautifulSoup crawler with concurrency={default_crawler_config.concurrency}, timeout={default_crawler_config.timeout}s, max_crawl_delay={default_crawler_config.max_crawl_delay}s"
58
+ )
59
+
60
+ crawler = DefaultUrlCrawler(
61
+ concurrency=default_crawler_config.concurrency,
62
+ crawl_delay=default_crawler_config.crawl_delay,
63
+ max_crawl_delay=default_crawler_config.max_crawl_delay,
64
+ timeout=default_crawler_config.timeout,
65
+ max_retries=default_crawler_config.max_retries,
66
+ retry_delay_factor=default_crawler_config.retry_delay_factor,
67
+ headers=default_crawler_config.headers,
68
+ robots_cache_ttl=default_crawler_config.robots_cache_ttl,
69
+ )
70
+ try:
71
+ logger.info(
72
+ f"Starting to crawl {len(url_list)} URL(s) with BeautifulSoup (use_playwright={default_crawler_config.use_playwright})"
73
+ )
74
+ results = await crawler.fetch_urls(
75
+ urls,
76
+ use_playwright=default_crawler_config.use_playwright,
77
+ playwright_js_wait=default_crawler_config.playwright_js_wait,
78
+ )
79
+ logger.info(f"Successfully fetched content from {len(results)} URL(s)")
80
+ return results
81
+ except Exception as e:
82
+ logger.error(f"Error fetching page content: {str(e)}")
83
+ raise
84
+ finally:
85
+ logger.info("Closing BeautifulSoup crawler")
86
+ await crawler.close()
87
+
88
+
89
+ async def fetch_with_tavily(urls: Union[str, List[str]]) -> UrlsToHtmls:
90
+ """Fetch content from URLs using the Tavily API.
91
+
92
+ Args:
93
+ urls: A single URL (str) or a list of URLs (List[str]) to scrape.
94
+
95
+ Returns:
96
+ Dict[str, str]: A dictionary mapping each URL to its raw content as a string.
97
+
98
+ Raises:
99
+ ImportError: If tavily-python is not installed.
100
+ Exception: If the Tavily API request fails.
101
+ """
102
+ try:
103
+ from tavily import AsyncTavilyClient
104
+ except ImportError:
105
+ logger.error(
106
+ "Failed to import tavily, make sure to install using pip install tavily-python>=0.7.0"
107
+ )
108
+ raise
109
+
110
+ tavily_config = TavilyConfig()
111
+ url_list = [urls] if isinstance(urls, str) else urls
112
+ extract_depth = tavily_config.extract_depth if tavily_config else "basic"
113
+ timeout = tavily_config.timeout if tavily_config else 10
114
+
115
+ logger.info(
116
+ f"Initializing Tavily client with extract_depth={extract_depth}, timeout={timeout}s"
117
+ )
118
+ client = AsyncTavilyClient(
119
+ api_key=tavily_config.api_key,
120
+ proxies=tavily_config.proxies,
121
+ )
122
+
123
+ logger.info(f"Sending extract request to Tavily API for {len(url_list)} URL(s)")
124
+ results = await client.extract(
125
+ urls,
126
+ format="text",
127
+ extract_depth=extract_depth,
128
+ timeout=timeout,
129
+ )
130
+
131
+ failed_count = len(results.get("failed_results", []))
132
+ if failed_count > 0:
133
+ logger.warning(f"Tavily API failed to fetch {failed_count} URL(s)")
134
+ for failed_result in results.get("failed_results", []):
135
+ logger.warning(f"Failed to fetch {failed_result}")
136
+
137
+ return_results = {}
138
+ for result in results.get("results", []):
139
+ return_results[result["url"]] = result["raw_content"]
140
+
141
+ logger.info(f"Successfully fetched content from {len(return_results)} URL(s) via Tavily")
142
+ return return_results
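
A minimal end-to-end sketch of the new fetch_page_content helper (not part of the diff; the URL is a placeholder):

import asyncio

from cognee.tasks.web_scraper.utils import fetch_page_content


async def main():
    # With TAVILY_API_KEY set in the environment this goes through Tavily;
    # otherwise it falls back to the DefaultUrlCrawler configured via env vars.
    pages = await fetch_page_content(["https://example.com"])
    for url, content in pages.items():
        print(url, "->", len(content), "characters")


asyncio.run(main())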