cognee 0.3.6__py3-none-any.whl → 0.3.7.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (182)
  1. cognee/__init__.py +1 -0
  2. cognee/api/health.py +2 -12
  3. cognee/api/v1/add/add.py +46 -6
  4. cognee/api/v1/add/routers/get_add_router.py +11 -2
  5. cognee/api/v1/cognify/cognify.py +29 -9
  6. cognee/api/v1/cognify/routers/get_cognify_router.py +2 -1
  7. cognee/api/v1/datasets/datasets.py +11 -0
  8. cognee/api/v1/datasets/routers/get_datasets_router.py +8 -0
  9. cognee/api/v1/delete/routers/get_delete_router.py +2 -0
  10. cognee/api/v1/memify/routers/get_memify_router.py +2 -1
  11. cognee/api/v1/permissions/routers/get_permissions_router.py +6 -0
  12. cognee/api/v1/responses/default_tools.py +0 -1
  13. cognee/api/v1/responses/dispatch_function.py +1 -1
  14. cognee/api/v1/responses/routers/default_tools.py +0 -1
  15. cognee/api/v1/search/routers/get_search_router.py +3 -3
  16. cognee/api/v1/search/search.py +11 -9
  17. cognee/api/v1/settings/routers/get_settings_router.py +7 -1
  18. cognee/api/v1/sync/routers/get_sync_router.py +3 -0
  19. cognee/api/v1/ui/ui.py +45 -16
  20. cognee/api/v1/update/routers/get_update_router.py +3 -1
  21. cognee/api/v1/update/update.py +3 -3
  22. cognee/api/v1/users/routers/get_visualize_router.py +2 -0
  23. cognee/cli/_cognee.py +61 -10
  24. cognee/cli/commands/add_command.py +3 -3
  25. cognee/cli/commands/cognify_command.py +3 -3
  26. cognee/cli/commands/config_command.py +9 -7
  27. cognee/cli/commands/delete_command.py +3 -3
  28. cognee/cli/commands/search_command.py +3 -7
  29. cognee/cli/config.py +0 -1
  30. cognee/context_global_variables.py +5 -0
  31. cognee/exceptions/exceptions.py +1 -1
  32. cognee/infrastructure/databases/cache/__init__.py +2 -0
  33. cognee/infrastructure/databases/cache/cache_db_interface.py +79 -0
  34. cognee/infrastructure/databases/cache/config.py +44 -0
  35. cognee/infrastructure/databases/cache/get_cache_engine.py +67 -0
  36. cognee/infrastructure/databases/cache/redis/RedisAdapter.py +243 -0
  37. cognee/infrastructure/databases/exceptions/__init__.py +1 -0
  38. cognee/infrastructure/databases/exceptions/exceptions.py +18 -2
  39. cognee/infrastructure/databases/graph/get_graph_engine.py +1 -1
  40. cognee/infrastructure/databases/graph/graph_db_interface.py +5 -0
  41. cognee/infrastructure/databases/graph/kuzu/adapter.py +76 -47
  42. cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +13 -3
  43. cognee/infrastructure/databases/graph/neo4j_driver/deadlock_retry.py +1 -1
  44. cognee/infrastructure/databases/graph/neptune_driver/neptune_utils.py +1 -1
  45. cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +1 -1
  46. cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +21 -3
  47. cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +17 -10
  48. cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +17 -4
  49. cognee/infrastructure/databases/vector/embeddings/config.py +2 -3
  50. cognee/infrastructure/databases/vector/exceptions/exceptions.py +1 -1
  51. cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +0 -1
  52. cognee/infrastructure/files/exceptions.py +1 -1
  53. cognee/infrastructure/files/storage/LocalFileStorage.py +9 -9
  54. cognee/infrastructure/files/storage/S3FileStorage.py +11 -11
  55. cognee/infrastructure/files/utils/guess_file_type.py +6 -0
  56. cognee/infrastructure/llm/prompts/feedback_reaction_prompt.txt +14 -0
  57. cognee/infrastructure/llm/prompts/feedback_report_prompt.txt +13 -0
  58. cognee/infrastructure/llm/prompts/feedback_user_context_prompt.txt +5 -0
  59. cognee/infrastructure/llm/prompts/search_type_selector_prompt.txt +0 -5
  60. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py +19 -9
  61. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py +17 -5
  62. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py +17 -5
  63. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py +32 -0
  64. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/__init__.py +0 -0
  65. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/adapter.py +109 -0
  66. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py +33 -8
  67. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +40 -18
  68. cognee/infrastructure/loaders/LoaderEngine.py +27 -7
  69. cognee/infrastructure/loaders/external/__init__.py +7 -0
  70. cognee/infrastructure/loaders/external/advanced_pdf_loader.py +2 -8
  71. cognee/infrastructure/loaders/external/beautiful_soup_loader.py +310 -0
  72. cognee/infrastructure/loaders/supported_loaders.py +7 -0
  73. cognee/modules/data/exceptions/exceptions.py +1 -1
  74. cognee/modules/data/methods/__init__.py +3 -0
  75. cognee/modules/data/methods/get_dataset_data.py +4 -1
  76. cognee/modules/data/methods/has_dataset_data.py +21 -0
  77. cognee/modules/engine/models/TableRow.py +0 -1
  78. cognee/modules/ingestion/save_data_to_file.py +9 -2
  79. cognee/modules/pipelines/exceptions/exceptions.py +1 -1
  80. cognee/modules/pipelines/operations/pipeline.py +12 -1
  81. cognee/modules/pipelines/operations/run_tasks.py +25 -197
  82. cognee/modules/pipelines/operations/run_tasks_base.py +7 -0
  83. cognee/modules/pipelines/operations/run_tasks_data_item.py +260 -0
  84. cognee/modules/pipelines/operations/run_tasks_distributed.py +121 -38
  85. cognee/modules/pipelines/operations/run_tasks_with_telemetry.py +9 -1
  86. cognee/modules/retrieval/EntityCompletionRetriever.py +48 -8
  87. cognee/modules/retrieval/base_graph_retriever.py +3 -1
  88. cognee/modules/retrieval/base_retriever.py +3 -1
  89. cognee/modules/retrieval/chunks_retriever.py +5 -1
  90. cognee/modules/retrieval/code_retriever.py +20 -2
  91. cognee/modules/retrieval/completion_retriever.py +50 -9
  92. cognee/modules/retrieval/cypher_search_retriever.py +11 -1
  93. cognee/modules/retrieval/graph_completion_context_extension_retriever.py +47 -8
  94. cognee/modules/retrieval/graph_completion_cot_retriever.py +152 -22
  95. cognee/modules/retrieval/graph_completion_retriever.py +54 -10
  96. cognee/modules/retrieval/lexical_retriever.py +20 -2
  97. cognee/modules/retrieval/natural_language_retriever.py +10 -1
  98. cognee/modules/retrieval/summaries_retriever.py +5 -1
  99. cognee/modules/retrieval/temporal_retriever.py +62 -10
  100. cognee/modules/retrieval/user_qa_feedback.py +3 -2
  101. cognee/modules/retrieval/utils/completion.py +30 -4
  102. cognee/modules/retrieval/utils/description_to_codepart_search.py +1 -1
  103. cognee/modules/retrieval/utils/session_cache.py +156 -0
  104. cognee/modules/search/methods/get_search_type_tools.py +0 -5
  105. cognee/modules/search/methods/no_access_control_search.py +12 -1
  106. cognee/modules/search/methods/search.py +51 -5
  107. cognee/modules/search/types/SearchType.py +0 -1
  108. cognee/modules/settings/get_settings.py +23 -0
  109. cognee/modules/users/methods/get_authenticated_user.py +3 -1
  110. cognee/modules/users/methods/get_default_user.py +1 -6
  111. cognee/modules/users/roles/methods/create_role.py +2 -2
  112. cognee/modules/users/tenants/methods/create_tenant.py +2 -2
  113. cognee/shared/exceptions/exceptions.py +1 -1
  114. cognee/shared/logging_utils.py +18 -11
  115. cognee/shared/utils.py +24 -2
  116. cognee/tasks/codingagents/coding_rule_associations.py +1 -2
  117. cognee/tasks/documents/exceptions/exceptions.py +1 -1
  118. cognee/tasks/feedback/__init__.py +13 -0
  119. cognee/tasks/feedback/create_enrichments.py +84 -0
  120. cognee/tasks/feedback/extract_feedback_interactions.py +230 -0
  121. cognee/tasks/feedback/generate_improved_answers.py +130 -0
  122. cognee/tasks/feedback/link_enrichments_to_feedback.py +67 -0
  123. cognee/tasks/feedback/models.py +26 -0
  124. cognee/tasks/graph/extract_graph_from_data.py +2 -0
  125. cognee/tasks/ingestion/data_item_to_text_file.py +3 -3
  126. cognee/tasks/ingestion/ingest_data.py +11 -5
  127. cognee/tasks/ingestion/save_data_item_to_storage.py +12 -1
  128. cognee/tasks/storage/add_data_points.py +3 -10
  129. cognee/tasks/storage/index_data_points.py +19 -14
  130. cognee/tasks/storage/index_graph_edges.py +25 -11
  131. cognee/tasks/web_scraper/__init__.py +34 -0
  132. cognee/tasks/web_scraper/config.py +26 -0
  133. cognee/tasks/web_scraper/default_url_crawler.py +446 -0
  134. cognee/tasks/web_scraper/models.py +46 -0
  135. cognee/tasks/web_scraper/types.py +4 -0
  136. cognee/tasks/web_scraper/utils.py +142 -0
  137. cognee/tasks/web_scraper/web_scraper_task.py +396 -0
  138. cognee/tests/cli_tests/cli_unit_tests/test_cli_utils.py +0 -1
  139. cognee/tests/integration/web_url_crawler/test_default_url_crawler.py +13 -0
  140. cognee/tests/integration/web_url_crawler/test_tavily_crawler.py +19 -0
  141. cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py +344 -0
  142. cognee/tests/subprocesses/reader.py +25 -0
  143. cognee/tests/subprocesses/simple_cognify_1.py +31 -0
  144. cognee/tests/subprocesses/simple_cognify_2.py +31 -0
  145. cognee/tests/subprocesses/writer.py +32 -0
  146. cognee/tests/tasks/descriptive_metrics/metrics_test_utils.py +0 -2
  147. cognee/tests/tasks/descriptive_metrics/neo4j_metrics_test.py +8 -3
  148. cognee/tests/tasks/entity_extraction/entity_extraction_test.py +89 -0
  149. cognee/tests/tasks/web_scraping/web_scraping_test.py +172 -0
  150. cognee/tests/test_add_docling_document.py +56 -0
  151. cognee/tests/test_chromadb.py +7 -11
  152. cognee/tests/test_concurrent_subprocess_access.py +76 -0
  153. cognee/tests/test_conversation_history.py +240 -0
  154. cognee/tests/test_feedback_enrichment.py +174 -0
  155. cognee/tests/test_kuzu.py +27 -15
  156. cognee/tests/test_lancedb.py +7 -11
  157. cognee/tests/test_library.py +32 -2
  158. cognee/tests/test_neo4j.py +24 -16
  159. cognee/tests/test_neptune_analytics_vector.py +7 -11
  160. cognee/tests/test_permissions.py +9 -13
  161. cognee/tests/test_pgvector.py +4 -4
  162. cognee/tests/test_remote_kuzu.py +8 -11
  163. cognee/tests/test_s3_file_storage.py +1 -1
  164. cognee/tests/test_search_db.py +6 -8
  165. cognee/tests/unit/infrastructure/databases/cache/test_cache_config.py +89 -0
  166. cognee/tests/unit/modules/retrieval/conversation_history_test.py +154 -0
  167. cognee/tests/unit/modules/retrieval/graph_completion_retriever_cot_test.py +51 -0
  168. {cognee-0.3.6.dist-info → cognee-0.3.7.dev1.dist-info}/METADATA +21 -6
  169. {cognee-0.3.6.dist-info → cognee-0.3.7.dev1.dist-info}/RECORD +178 -139
  170. {cognee-0.3.6.dist-info → cognee-0.3.7.dev1.dist-info}/entry_points.txt +1 -0
  171. distributed/Dockerfile +0 -3
  172. distributed/entrypoint.py +21 -9
  173. distributed/signal.py +5 -0
  174. distributed/workers/data_point_saving_worker.py +64 -34
  175. distributed/workers/graph_saving_worker.py +71 -47
  176. cognee/infrastructure/databases/graph/memgraph/memgraph_adapter.py +0 -1116
  177. cognee/modules/retrieval/insights_retriever.py +0 -133
  178. cognee/tests/test_memgraph.py +0 -109
  179. cognee/tests/unit/modules/retrieval/insights_retriever_test.py +0 -251
  180. {cognee-0.3.6.dist-info → cognee-0.3.7.dev1.dist-info}/WHEEL +0 -0
  181. {cognee-0.3.6.dist-info → cognee-0.3.7.dev1.dist-info}/licenses/LICENSE +0 -0
  182. {cognee-0.3.6.dist-info → cognee-0.3.7.dev1.dist-info}/licenses/NOTICE.md +0 -0
cognee/tasks/storage/add_data_points.py
@@ -10,9 +10,7 @@ from cognee.tasks.storage.exceptions import (
 )
 
 
-async def add_data_points(
-    data_points: List[DataPoint], update_edge_collection: bool = True
-) -> List[DataPoint]:
+async def add_data_points(data_points: List[DataPoint]) -> List[DataPoint]:
     """
     Add a batch of data points to the graph database by extracting nodes and edges,
     deduplicating them, and indexing them for retrieval.
@@ -25,9 +23,6 @@ async def add_data_points(
     Args:
         data_points (List[DataPoint]):
             A list of data points to process and insert into the graph.
-        update_edge_collection (bool, optional):
-            Whether to update the edge index after adding edges.
-            Defaults to True.
 
     Returns:
         List[DataPoint]:
@@ -73,12 +68,10 @@ async def add_data_points(
 
     graph_engine = await get_graph_engine()
 
+    await graph_engine.add_nodes(nodes)
     await index_data_points(nodes)
 
-    await graph_engine.add_nodes(nodes)
     await graph_engine.add_edges(edges)
-
-    if update_edge_collection:
-        await index_graph_edges()
+    await index_graph_edges(edges)
 
     return data_points
cognee/tasks/storage/index_data_points.py
@@ -1,6 +1,6 @@
-from cognee.shared.logging_utils import get_logger
+import asyncio
 
-from cognee.infrastructure.databases.exceptions import EmbeddingException
+from cognee.shared.logging_utils import get_logger
 from cognee.infrastructure.databases.vector import get_vector_engine
 from cognee.infrastructure.engine import DataPoint
 
@@ -33,18 +33,23 @@ async def index_data_points(data_points: list[DataPoint]):
             indexed_data_point.metadata["index_fields"] = [field_name]
             index_points[index_name].append(indexed_data_point)
 
-    for index_name_and_field, indexable_points in index_points.items():
-        first_occurence = index_name_and_field.index("_")
-        index_name = index_name_and_field[:first_occurence]
-        field_name = index_name_and_field[first_occurence + 1 :]
-        try:
-            # In case the amount of indexable points is too large we need to send them in batches
-            batch_size = vector_engine.embedding_engine.get_batch_size()
-            for i in range(0, len(indexable_points), batch_size):
-                batch = indexable_points[i : i + batch_size]
-                await vector_engine.index_data_points(index_name, field_name, batch)
-        except EmbeddingException as e:
-            logger.warning(f"Failed to index data points for {index_name}.{field_name}: {e}")
+    tasks: list[asyncio.Task] = []
+    batch_size = vector_engine.embedding_engine.get_batch_size()
+
+    for index_name_and_field, points in index_points.items():
+        first = index_name_and_field.index("_")
+        index_name = index_name_and_field[:first]
+        field_name = index_name_and_field[first + 1 :]
+
+        # Create embedding requests per batch to run in parallel later
+        for i in range(0, len(points), batch_size):
+            batch = points[i : i + batch_size]
+            tasks.append(
+                asyncio.create_task(vector_engine.index_data_points(index_name, field_name, batch))
+            )
+
+    # Run all embedding requests in parallel
+    await asyncio.gather(*tasks)
 
     return data_points
 
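The rewritten index_data_points above groups points by the embedding engine's batch size and schedules every batch as its own task before awaiting them all at once. A minimal sketch of that fan-out pattern, assuming a hypothetical embed_batch coroutine standing in for vector_engine.index_data_points and a points_by_index dict keyed the same way ("<index>_<field>"):

    import asyncio
    from typing import Any, Dict, List


    async def embed_batch(index_name: str, field_name: str, batch: List[Any]) -> None:
        """Stand-in for vector_engine.index_data_points(index_name, field_name, batch)."""
        await asyncio.sleep(0)


    async def index_in_parallel(points_by_index: Dict[str, List[Any]], batch_size: int) -> None:
        tasks = []
        for key, points in points_by_index.items():
            index_name, _, field_name = key.partition("_")  # split on the first underscore only
            for start in range(0, len(points), batch_size):
                batch = points[start : start + batch_size]
                tasks.append(asyncio.create_task(embed_batch(index_name, field_name, batch)))
        # All embedding requests run concurrently; gather surfaces the first failure.
        await asyncio.gather(*tasks)

Note that the per-index EmbeddingException handling is removed in this version, so a failed batch now propagates out of asyncio.gather instead of being logged and skipped.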
cognee/tasks/storage/index_graph_edges.py
@@ -1,15 +1,20 @@
+import asyncio
+
 from cognee.modules.engine.utils.generate_edge_id import generate_edge_id
-from cognee.shared.logging_utils import get_logger, ERROR
+from cognee.shared.logging_utils import get_logger
 from collections import Counter
-
+from typing import Optional, Dict, Any, List, Tuple, Union
 from cognee.infrastructure.databases.vector import get_vector_engine
 from cognee.infrastructure.databases.graph import get_graph_engine
 from cognee.modules.graph.models.EdgeType import EdgeType
+from cognee.infrastructure.databases.graph.graph_db_interface import EdgeData
 
-logger = get_logger(level=ERROR)
+logger = get_logger()
 
 
-async def index_graph_edges():
+async def index_graph_edges(
+    edges_data: Union[List[EdgeData], List[Tuple[str, str, str, Optional[Dict[str, Any]]]]] = None,
+):
     """
     Indexes graph edges by creating and managing vector indexes for relationship types.
 
@@ -35,13 +40,17 @@ async def index_graph_edges():
         index_points = {}
 
         vector_engine = get_vector_engine()
-        graph_engine = await get_graph_engine()
+
+        if edges_data is None:
+            graph_engine = await get_graph_engine()
+            _, edges_data = await graph_engine.get_graph_data()
+            logger.warning(
+                "Your graph edge embedding is deprecated, please pass edges to the index_graph_edges directly."
+            )
     except Exception as e:
         logger.error("Failed to initialize engines: %s", e)
         raise RuntimeError("Initialization error") from e
 
-    _, edges_data = await graph_engine.get_graph_data()
-
     edge_types = Counter(
         item.get("relationship_name")
         for edge in edges_data
@@ -69,15 +78,20 @@ async def index_graph_edges():
             indexed_data_point.metadata["index_fields"] = [field_name]
             index_points[index_name].append(indexed_data_point)
 
+    # Get maximum batch size for embedding model
+    batch_size = vector_engine.embedding_engine.get_batch_size()
+    tasks: list[asyncio.Task] = []
+
     for index_name, indexable_points in index_points.items():
         index_name, field_name = index_name.split(".")
 
-        # Get maximum batch size for embedding model
-        batch_size = vector_engine.embedding_engine.get_batch_size()
-        # We save the data in batches of {batch_size} to not put a lot of pressure on the database
+        # Create embedding tasks to run in parallel later
        for start in range(0, len(indexable_points), batch_size):
            batch = indexable_points[start : start + batch_size]
 
-            await vector_engine.index_data_points(index_name, field_name, batch)
+            tasks.append(vector_engine.index_data_points(index_name, field_name, batch))
+
+    # Start all embedding tasks and wait for completion
+    await asyncio.gather(*tasks)
 
     return None
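Taken together with the add_data_points change above, callers are now expected to hand the freshly created edges to the indexer; calling index_graph_edges() with no argument still works, but it reloads the whole graph via graph_engine.get_graph_data() and logs a deprecation warning. A sketch of the two call styles, assuming edges is the list produced during add_data_points:

    # Preferred: index only the edges that were just added.
    await index_graph_edges(edges)

    # Deprecated fallback: reloads every edge from the graph store and warns.
    await index_graph_edges()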
cognee/tasks/web_scraper/__init__.py (new file)
@@ -0,0 +1,34 @@
+"""Web scraping module for cognee.
+
+This module provides tools for scraping web content, managing scraping jobs, and storing
+data in a graph database. It includes classes and functions for crawling web pages using
+BeautifulSoup or Tavily, defining data models, and handling scraping configurations.
+"""
+
+from .utils import fetch_page_content
+from .default_url_crawler import DefaultUrlCrawler
+
+# Lazy import for web_scraper_task to avoid requiring apscheduler
+# Import these directly if needed: from cognee.tasks.web_scraper.web_scraper_task import ...
+
+
+def __getattr__(name):
+    """Lazy load web scraper task functions that require apscheduler."""
+    if name == "cron_web_scraper_task":
+        from .web_scraper_task import cron_web_scraper_task
+
+        return cron_web_scraper_task
+    elif name == "web_scraper_task":
+        from .web_scraper_task import web_scraper_task
+
+        return web_scraper_task
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+
+
+__all__ = [
+    "BeautifulSoupCrawler",
+    "fetch_page_content",
+    "cron_web_scraper_task",
+    "web_scraper_task",
+    "DefaultUrlCrawler",
+]
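The module-level __getattr__ above (PEP 562) defers importing web_scraper_task until one of its symbols is actually requested, so apscheduler is only needed on that path. Assuming the package is installed, the difference looks like this:

    # Imported eagerly; no apscheduler required.
    from cognee.tasks.web_scraper import DefaultUrlCrawler, fetch_page_content

    # Resolved through __getattr__, which triggers the lazy web_scraper_task import
    # (and therefore requires apscheduler) only at this point.
    from cognee.tasks.web_scraper import web_scraper_task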
cognee/tasks/web_scraper/config.py (new file)
@@ -0,0 +1,26 @@
+from pydantic import BaseModel, Field
+from typing import Any, Dict, Optional, Literal
+import os
+
+
+class TavilyConfig(BaseModel):
+    api_key: Optional[str] = os.getenv("TAVILY_API_KEY")
+    extract_depth: Literal["basic", "advanced"] = "basic"
+    proxies: Optional[Dict[str, str]] = None
+    timeout: Optional[int] = Field(default=10, ge=1, le=60)
+
+
+class DefaultCrawlerConfig(BaseModel):
+    concurrency: int = 5
+    crawl_delay: float = 0.5
+    max_crawl_delay: Optional[float] = (
+        10.0  # Maximum crawl delay to respect from robots.txt (None = no limit)
+    )
+    timeout: float = 15.0
+    max_retries: int = 2
+    retry_delay_factor: float = 0.5
+    headers: Optional[Dict[str, str]] = None
+    use_playwright: bool = False
+    playwright_js_wait: float = 0.8
+    robots_cache_ttl: float = 3600.0
+    join_all_matches: bool = False
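Both classes are plain pydantic models, so any field can be overridden at construction time, and TavilyConfig picks up its api_key default from the TAVILY_API_KEY environment variable. A minimal usage sketch, assuming the module path shown in the file list above:

    from cognee.tasks.web_scraper.config import DefaultCrawlerConfig, TavilyConfig

    crawler_config = DefaultCrawlerConfig(concurrency=2, use_playwright=True)
    tavily_config = TavilyConfig(extract_depth="advanced", timeout=30)  # api_key defaults from TAVILY_API_KEY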
cognee/tasks/web_scraper/default_url_crawler.py (new file)
@@ -0,0 +1,446 @@
+import asyncio
+from dataclasses import dataclass, field
+from functools import lru_cache
+import time
+from typing import Any, Union, List, Dict, Optional
+from urllib.parse import urlparse
+import httpx
+
+from cognee.shared.logging_utils import get_logger
+from cognee.tasks.web_scraper.types import UrlsToHtmls
+
+logger = get_logger()
+
+try:
+    from protego import Protego
+except ImportError:
+    logger.warning("Failed to import protego, make sure to install using pip install protego>=0.1")
+    Protego = None
+
+try:
+    from playwright.async_api import async_playwright
+except ImportError:
+    logger.warning(
+        "Failed to import playwright, make sure to install using pip install playwright>=1.9.0"
+    )
+    async_playwright = None
+
+
+@dataclass
+class RobotsTxtCache:
+    """Cache for robots.txt data.
+
+    Attributes:
+        protego: Parsed robots.txt object (Protego instance).
+        crawl_delay: Delay between requests (in seconds).
+        timestamp: Time when the cache entry was created.
+    """
+
+    protego: Any
+    crawl_delay: float
+    timestamp: float = field(default_factory=time.time)
+
+
+class DefaultUrlCrawler:
+    def __init__(
+        self,
+        *,
+        concurrency: int = 5,
+        crawl_delay: float = 0.5,
+        max_crawl_delay: Optional[float] = 10.0,
+        timeout: float = 15.0,
+        max_retries: int = 2,
+        retry_delay_factor: float = 0.5,
+        headers: Optional[Dict[str, str]] = None,
+        robots_cache_ttl: float = 3600.0,
+    ):
+        """Initialize the BeautifulSoupCrawler.
+
+        Args:
+            concurrency: Number of concurrent requests allowed.
+            crawl_delay: Minimum seconds between requests to the same domain.
+            max_crawl_delay: Maximum crawl delay to respect from robots.txt (None = no limit).
+            timeout: Per-request timeout in seconds.
+            max_retries: Number of retries for failed requests.
+            retry_delay_factor: Multiplier for exponential backoff on retries.
+            headers: HTTP headers for requests (defaults to User-Agent: Cognee-Scraper/1.0).
+            robots_cache_ttl: Time-to-live for robots.txt cache in seconds.
+        """
+        self.concurrency = concurrency
+        self._sem = asyncio.Semaphore(concurrency)
+        self.crawl_delay = crawl_delay
+        self.max_crawl_delay = max_crawl_delay
+        self.timeout = timeout
+        self.max_retries = max_retries
+        self.retry_delay_factor = retry_delay_factor
+        self.headers = headers or {"User-Agent": "Cognee-Scraper/1.0"}
+        self.robots_cache_ttl = robots_cache_ttl
+        self._last_request_time_per_domain: Dict[str, float] = {}
+        self._robots_cache: Dict[str, RobotsTxtCache] = {}
+        self._client: Optional[httpx.AsyncClient] = None
+        self._robots_lock = asyncio.Lock()
+
+    async def _ensure_client(self):
+        """Initialize the HTTP client if not already created."""
+        if self._client is None:
+            self._client = httpx.AsyncClient(timeout=self.timeout, headers=self.headers)
+
+    async def close(self):
+        """Close the HTTP client."""
+        if self._client:
+            await self._client.aclose()
+            self._client = None
+
+    async def __aenter__(self):
+        """Enter the context manager, initializing the HTTP client."""
+        await self._ensure_client()
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        """Exit the context manager, closing the HTTP client."""
+        await self.close()
+
+    @lru_cache(maxsize=1024)
+    def _domain_from_url(self, url: str) -> str:
+        """Extract the domain (netloc) from a URL.
+
+        Args:
+            url: The URL to parse.
+
+        Returns:
+            str: The domain (netloc) of the URL.
+        """
+        try:
+            return urlparse(url).netloc
+        except Exception:
+            return url
+
+    @lru_cache(maxsize=1024)
+    def _get_domain_root(self, url: str) -> str:
+        """Get the root URL (scheme and netloc) from a URL.
+
+        Args:
+            url: The URL to parse.
+
+        Returns:
+            str: The root URL (e.g., "https://example.com").
+        """
+        parsed = urlparse(url)
+        return f"{parsed.scheme}://{parsed.netloc}"
+
+    async def _respect_rate_limit(self, url: str, crawl_delay: Optional[float] = None):
+        """Enforce rate limiting for requests to the same domain.
+
+        Args:
+            url: The URL to check.
+            crawl_delay: Custom crawl delay in seconds (if any).
+        """
+        domain = self._domain_from_url(url)
+        last = self._last_request_time_per_domain.get(domain)
+        delay = crawl_delay if crawl_delay is not None else self.crawl_delay
+
+        if last is None:
+            self._last_request_time_per_domain[domain] = time.time()
+            return
+
+        elapsed = time.time() - last
+        wait_for = delay - elapsed
+        if wait_for > 0:
+            logger.info(
+                f"Rate limiting: waiting {wait_for:.2f}s before requesting {url} (crawl_delay={delay}s from robots.txt)"
+            )
+            await asyncio.sleep(wait_for)
+            logger.info(f"Rate limit wait completed for {url}")
+        self._last_request_time_per_domain[domain] = time.time()
+
+    async def _get_robots_cache(self, domain_root: str) -> Optional[RobotsTxtCache]:
+        """Get cached robots.txt data if valid.
+
+        Args:
+            domain_root: The root URL (e.g., "https://example.com").
+
+        Returns:
+            Optional[RobotsTxtCache]: Cached robots.txt data or None if expired or not found.
+        """
+        if Protego is None:
+            return None
+
+        cached = self._robots_cache.get(domain_root)
+        if cached and (time.time() - cached.timestamp) < self.robots_cache_ttl:
+            return cached
+        return None
+
+    async def _fetch_and_cache_robots(self, domain_root: str) -> RobotsTxtCache:
+        """Fetch and cache robots.txt data.
+
+        Args:
+            domain_root: The root URL (e.g., "https://example.com").
+
+        Returns:
+            RobotsTxtCache: Cached robots.txt data with crawl delay.
+
+        Raises:
+            Exception: If fetching robots.txt fails.
+        """
+        async with self._robots_lock:
+            cached = await self._get_robots_cache(domain_root)
+            if cached:
+                return cached
+
+            robots_url = f"{domain_root}/robots.txt"
+            try:
+                await self._ensure_client()
+                await self._respect_rate_limit(robots_url, self.crawl_delay)
+                resp = await self._client.get(robots_url, timeout=5.0)
+                content = resp.text if resp.status_code == 200 else ""
+            except Exception as e:
+                logger.debug(f"Failed to fetch robots.txt from {domain_root}: {e}")
+                content = ""
+
+            protego = Protego.parse(content) if content.strip() else None
+            agent = next((v for k, v in self.headers.items() if k.lower() == "user-agent"), "*")
+
+            crawl_delay = self.crawl_delay
+            if protego:
+                delay = protego.crawl_delay(agent) or protego.crawl_delay("*")
+                if delay:
+                    # Apply max_crawl_delay cap if configured
+                    if self.max_crawl_delay is not None and delay > self.max_crawl_delay:
+                        logger.warning(
+                            f"robots.txt specifies crawl_delay={delay}s for {domain_root}, "
+                            f"capping to max_crawl_delay={self.max_crawl_delay}s"
+                        )
+                        crawl_delay = self.max_crawl_delay
+                    else:
+                        crawl_delay = delay
+
+            cache_entry = RobotsTxtCache(protego=protego, crawl_delay=crawl_delay)
+            self._robots_cache[domain_root] = cache_entry
+            return cache_entry
+
+    async def _is_url_allowed(self, url: str) -> bool:
+        """Check if a URL is allowed by robots.txt.
+
+        Args:
+            url: The URL to check.
+
+        Returns:
+            bool: True if the URL is allowed, False otherwise.
+        """
+        if Protego is None:
+            return True
+
+        try:
+            domain_root = self._get_domain_root(url)
+            cache = await self._get_robots_cache(domain_root)
+            if cache is None:
+                cache = await self._fetch_and_cache_robots(domain_root)
+
+            if cache.protego is None:
+                return True
+
+            agent = next((v for k, v in self.headers.items() if k.lower() == "user-agent"), "*")
+            return cache.protego.can_fetch(agent, url) or cache.protego.can_fetch("*", url)
+        except Exception as e:
+            logger.debug(f"Error checking robots.txt for {url}: {e}")
+            return True
+
+    async def _get_crawl_delay(self, url: str) -> float:
+        """Get the crawl delay for a URL from robots.txt.
+
+        Args:
+            url: The URL to check.
+
+        Returns:
+            float: Crawl delay in seconds.
+        """
+        if Protego is None:
+            return self.crawl_delay
+
+        try:
+            domain_root = self._get_domain_root(url)
+            cache = await self._get_robots_cache(domain_root)
+            if cache is None:
+                cache = await self._fetch_and_cache_robots(domain_root)
+            return cache.crawl_delay
+        except Exception:
+            return self.crawl_delay
+
+    async def _fetch_httpx(self, url: str) -> str:
+        """Fetch a URL using HTTPX with retries.
+
+        Args:
+            url: The URL to fetch.
+
+        Returns:
+            str: The HTML content of the page.
+
+        Raises:
+            Exception: If all retry attempts fail.
+        """
+        await self._ensure_client()
+        assert self._client is not None, "HTTP client not initialized"
+
+        attempt = 0
+        crawl_delay = await self._get_crawl_delay(url)
+        logger.info(f"Fetching URL with httpx (crawl_delay={crawl_delay}s): {url}")
+
+        while True:
+            try:
+                await self._respect_rate_limit(url, crawl_delay)
+                resp = await self._client.get(url)
+                resp.raise_for_status()
+                logger.info(
+                    f"Successfully fetched {url} (status={resp.status_code}, size={len(resp.text)} bytes)"
+                )
+                return resp.text
+            except Exception as exc:
+                attempt += 1
+                if attempt > self.max_retries:
+                    logger.error(f"Fetch failed for {url} after {attempt} attempts: {exc}")
+                    raise
+
+                delay = self.retry_delay_factor * (2 ** (attempt - 1))
+                logger.warning(
+                    f"Retrying {url} after {delay:.2f}s (attempt {attempt}) due to {exc}"
+                )
+                await asyncio.sleep(delay)
+
+    async def _render_with_playwright(
+        self, url: str, js_wait: float = 1.0, timeout: Optional[float] = None
+    ) -> str:
+        """Fetch and render a URL using Playwright for JavaScript content.
+
+        Args:
+            url: The URL to fetch.
+            js_wait: Seconds to wait for JavaScript to load.
+            timeout: Timeout for the request (in seconds, defaults to instance timeout).
+
+        Returns:
+            str: The rendered HTML content.
+
+        Raises:
+            RuntimeError: If Playwright is not installed.
+            Exception: If all retry attempts fail.
+        """
+        if async_playwright is None:
+            raise RuntimeError(
+                "Playwright is not installed. Install with `pip install playwright` and run `playwright install`."
+            )
+
+        timeout_val = timeout or self.timeout
+        logger.info(
+            f"Rendering URL with Playwright (js_wait={js_wait}s, timeout={timeout_val}s): {url}"
+        )
+
+        attempt = 0
+        while True:
+            try:
+                async with async_playwright() as p:
+                    logger.info(f"Launching headless Chromium browser for {url}")
+                    browser = await p.chromium.launch(headless=True)
+                    try:
+                        context = await browser.new_context()
+                        page = await context.new_page()
+                        logger.info(f"Navigating to {url} and waiting for network idle")
+                        await page.goto(
+                            url,
+                            wait_until="networkidle",
+                            timeout=int(timeout_val * 1000),
+                        )
+                        if js_wait:
+                            logger.info(f"Waiting {js_wait}s for JavaScript to execute")
+                            await asyncio.sleep(js_wait)
+                        content = await page.content()
+                        logger.info(
+                            f"Successfully rendered {url} with Playwright (size={len(content)} bytes)"
+                        )
+                        return content
+                    finally:
+                        await browser.close()
+            except Exception as exc:
+                attempt += 1
+                if attempt > self.max_retries:
+                    logger.error(f"Playwright fetch failed for {url}: {exc}")
+                    raise
+                backoff = self.retry_delay_factor * (2 ** (attempt - 1))
+                logger.warning(
+                    f"Retrying playwright fetch {url} after {backoff:.2f}s (attempt {attempt})"
+                )
+                await asyncio.sleep(backoff)
+
+    async def fetch_urls(
+        self,
+        urls: Union[str, List[str]],
+        *,
+        use_playwright: bool = False,
+        playwright_js_wait: float = 0.8,
+    ) -> UrlsToHtmls:
+        """Fetch and extract content from URLs using BeautifulSoup or Playwright.
+
+        Args:
+            urls: A single URL, list of URLs, or dict mapping URLs to extraction rules.
+            extraction_rules: Default extraction rules for string or list URLs.
+            use_playwright: If True, use Playwright for JavaScript rendering.
+            playwright_js_wait: Seconds to wait for JavaScript to load.
+            join_all_matches: If True, extract all matching elements for each rule.
+
+        Returns:
+            Dict[str, str]: A dictionary mapping URLs to their extracted content.
+
+        Raises:
+            ValueError: If extraction_rules are missing when required or if urls is invalid.
+            Exception: If fetching or extraction fails.
+        """
+        if isinstance(urls, str):
+            urls = [urls]
+        else:
+            raise ValueError(f"Invalid urls type: {type(urls)}")
+
+        async def _task(url: str):
+            async with self._sem:
+                try:
+                    logger.info(f"Processing URL: {url}")
+
+                    # Check robots.txt
+                    allowed = await self._is_url_allowed(url)
+                    if not allowed:
+                        logger.warning(f"URL disallowed by robots.txt: {url}")
+                        return url, ""
+
+                    logger.info(f"Robots.txt check passed for {url}")
+
+                    # Fetch HTML
+                    if use_playwright:
+                        logger.info(
+                            f"Rendering {url} with Playwright (JS wait: {playwright_js_wait}s)"
+                        )
+                        html = await self._render_with_playwright(
+                            url, js_wait=playwright_js_wait, timeout=self.timeout
+                        )
+                    else:
+                        logger.info(f"Fetching {url} with httpx")
+                        html = await self._fetch_httpx(url)
+
+                    logger.info(f"Successfully fetched HTML from {url} ({len(html)} bytes)")
+
+                    return url, html
+
+                except Exception as e:
+                    logger.error(f"Error processing {url}: {e}")
+                    return url, ""
+
+        logger.info(f"Creating {len(urls)} async tasks for concurrent fetching")
+        tasks = [asyncio.create_task(_task(u)) for u in urls]
+        results = {}
+        completed = 0
+        total = len(tasks)
+
+        for coro in asyncio.as_completed(tasks):
+            url, html = await coro
+            results[url] = html
+            completed += 1
+            logger.info(f"Progress: {completed}/{total} URLs processed")
+
+        logger.info(f"Completed fetching all {len(results)} URL(s)")
+        return results
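Since DefaultUrlCrawler owns an httpx.AsyncClient and implements __aenter__/__aexit__, the natural way to drive it is as an async context manager. A hedged usage sketch (fetch_urls, as shown above, normalizes a single URL string into a list; failed or robots.txt-disallowed URLs come back as empty strings):

    import asyncio

    from cognee.tasks.web_scraper.default_url_crawler import DefaultUrlCrawler


    async def main() -> None:
        async with DefaultUrlCrawler(concurrency=2, crawl_delay=1.0) as crawler:
            # Maps each URL to its fetched HTML ("" on failure or robots.txt denial).
            pages = await crawler.fetch_urls("https://example.com")
            for url, html in pages.items():
                print(url, len(html))


    asyncio.run(main())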