cognee 0.3.6__py3-none-any.whl → 0.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cognee/__init__.py +1 -0
- cognee/api/health.py +2 -12
- cognee/api/v1/add/add.py +46 -6
- cognee/api/v1/add/routers/get_add_router.py +5 -1
- cognee/api/v1/cognify/cognify.py +29 -9
- cognee/api/v1/datasets/datasets.py +11 -0
- cognee/api/v1/responses/default_tools.py +0 -1
- cognee/api/v1/responses/dispatch_function.py +1 -1
- cognee/api/v1/responses/routers/default_tools.py +0 -1
- cognee/api/v1/search/search.py +11 -9
- cognee/api/v1/settings/routers/get_settings_router.py +7 -1
- cognee/api/v1/ui/ui.py +47 -16
- cognee/api/v1/update/routers/get_update_router.py +1 -1
- cognee/api/v1/update/update.py +3 -3
- cognee/cli/_cognee.py +61 -10
- cognee/cli/commands/add_command.py +3 -3
- cognee/cli/commands/cognify_command.py +3 -3
- cognee/cli/commands/config_command.py +9 -7
- cognee/cli/commands/delete_command.py +3 -3
- cognee/cli/commands/search_command.py +3 -7
- cognee/cli/config.py +0 -1
- cognee/context_global_variables.py +5 -0
- cognee/exceptions/exceptions.py +1 -1
- cognee/infrastructure/databases/cache/__init__.py +2 -0
- cognee/infrastructure/databases/cache/cache_db_interface.py +79 -0
- cognee/infrastructure/databases/cache/config.py +44 -0
- cognee/infrastructure/databases/cache/get_cache_engine.py +67 -0
- cognee/infrastructure/databases/cache/redis/RedisAdapter.py +243 -0
- cognee/infrastructure/databases/exceptions/__init__.py +1 -0
- cognee/infrastructure/databases/exceptions/exceptions.py +18 -2
- cognee/infrastructure/databases/graph/get_graph_engine.py +1 -1
- cognee/infrastructure/databases/graph/graph_db_interface.py +5 -0
- cognee/infrastructure/databases/graph/kuzu/adapter.py +67 -44
- cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +13 -3
- cognee/infrastructure/databases/graph/neo4j_driver/deadlock_retry.py +1 -1
- cognee/infrastructure/databases/graph/neptune_driver/neptune_utils.py +1 -1
- cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +1 -1
- cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +21 -3
- cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +17 -10
- cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +17 -4
- cognee/infrastructure/databases/vector/embeddings/config.py +2 -3
- cognee/infrastructure/databases/vector/exceptions/exceptions.py +1 -1
- cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +0 -1
- cognee/infrastructure/files/exceptions.py +1 -1
- cognee/infrastructure/files/storage/LocalFileStorage.py +9 -9
- cognee/infrastructure/files/storage/S3FileStorage.py +11 -11
- cognee/infrastructure/files/utils/guess_file_type.py +6 -0
- cognee/infrastructure/llm/prompts/search_type_selector_prompt.txt +0 -5
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py +19 -9
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py +17 -5
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py +17 -5
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py +32 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/__init__.py +0 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/adapter.py +109 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py +33 -8
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +40 -18
- cognee/infrastructure/loaders/LoaderEngine.py +27 -7
- cognee/infrastructure/loaders/external/__init__.py +7 -0
- cognee/infrastructure/loaders/external/advanced_pdf_loader.py +2 -8
- cognee/infrastructure/loaders/external/beautiful_soup_loader.py +310 -0
- cognee/infrastructure/loaders/supported_loaders.py +7 -0
- cognee/modules/data/exceptions/exceptions.py +1 -1
- cognee/modules/data/methods/__init__.py +3 -0
- cognee/modules/data/methods/get_dataset_data.py +4 -1
- cognee/modules/data/methods/has_dataset_data.py +21 -0
- cognee/modules/engine/models/TableRow.py +0 -1
- cognee/modules/ingestion/save_data_to_file.py +9 -2
- cognee/modules/pipelines/exceptions/exceptions.py +1 -1
- cognee/modules/pipelines/operations/pipeline.py +12 -1
- cognee/modules/pipelines/operations/run_tasks.py +25 -197
- cognee/modules/pipelines/operations/run_tasks_data_item.py +260 -0
- cognee/modules/pipelines/operations/run_tasks_distributed.py +121 -38
- cognee/modules/retrieval/EntityCompletionRetriever.py +48 -8
- cognee/modules/retrieval/base_graph_retriever.py +3 -1
- cognee/modules/retrieval/base_retriever.py +3 -1
- cognee/modules/retrieval/chunks_retriever.py +5 -1
- cognee/modules/retrieval/code_retriever.py +20 -2
- cognee/modules/retrieval/completion_retriever.py +50 -9
- cognee/modules/retrieval/cypher_search_retriever.py +11 -1
- cognee/modules/retrieval/graph_completion_context_extension_retriever.py +47 -8
- cognee/modules/retrieval/graph_completion_cot_retriever.py +32 -1
- cognee/modules/retrieval/graph_completion_retriever.py +54 -10
- cognee/modules/retrieval/lexical_retriever.py +20 -2
- cognee/modules/retrieval/natural_language_retriever.py +10 -1
- cognee/modules/retrieval/summaries_retriever.py +5 -1
- cognee/modules/retrieval/temporal_retriever.py +62 -10
- cognee/modules/retrieval/user_qa_feedback.py +3 -2
- cognee/modules/retrieval/utils/completion.py +5 -0
- cognee/modules/retrieval/utils/description_to_codepart_search.py +1 -1
- cognee/modules/retrieval/utils/session_cache.py +156 -0
- cognee/modules/search/methods/get_search_type_tools.py +0 -5
- cognee/modules/search/methods/no_access_control_search.py +12 -1
- cognee/modules/search/methods/search.py +34 -2
- cognee/modules/search/types/SearchType.py +0 -1
- cognee/modules/settings/get_settings.py +23 -0
- cognee/modules/users/methods/get_authenticated_user.py +3 -1
- cognee/modules/users/methods/get_default_user.py +1 -6
- cognee/modules/users/roles/methods/create_role.py +2 -2
- cognee/modules/users/tenants/methods/create_tenant.py +2 -2
- cognee/shared/exceptions/exceptions.py +1 -1
- cognee/tasks/codingagents/coding_rule_associations.py +1 -2
- cognee/tasks/documents/exceptions/exceptions.py +1 -1
- cognee/tasks/graph/extract_graph_from_data.py +2 -0
- cognee/tasks/ingestion/data_item_to_text_file.py +3 -3
- cognee/tasks/ingestion/ingest_data.py +11 -5
- cognee/tasks/ingestion/save_data_item_to_storage.py +12 -1
- cognee/tasks/storage/add_data_points.py +3 -10
- cognee/tasks/storage/index_data_points.py +19 -14
- cognee/tasks/storage/index_graph_edges.py +25 -11
- cognee/tasks/web_scraper/__init__.py +34 -0
- cognee/tasks/web_scraper/config.py +26 -0
- cognee/tasks/web_scraper/default_url_crawler.py +446 -0
- cognee/tasks/web_scraper/models.py +46 -0
- cognee/tasks/web_scraper/types.py +4 -0
- cognee/tasks/web_scraper/utils.py +142 -0
- cognee/tasks/web_scraper/web_scraper_task.py +396 -0
- cognee/tests/cli_tests/cli_unit_tests/test_cli_utils.py +0 -1
- cognee/tests/integration/web_url_crawler/test_default_url_crawler.py +13 -0
- cognee/tests/integration/web_url_crawler/test_tavily_crawler.py +19 -0
- cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py +344 -0
- cognee/tests/subprocesses/reader.py +25 -0
- cognee/tests/subprocesses/simple_cognify_1.py +31 -0
- cognee/tests/subprocesses/simple_cognify_2.py +31 -0
- cognee/tests/subprocesses/writer.py +32 -0
- cognee/tests/tasks/descriptive_metrics/metrics_test_utils.py +0 -2
- cognee/tests/tasks/descriptive_metrics/neo4j_metrics_test.py +8 -3
- cognee/tests/tasks/entity_extraction/entity_extraction_test.py +89 -0
- cognee/tests/tasks/web_scraping/web_scraping_test.py +172 -0
- cognee/tests/test_add_docling_document.py +56 -0
- cognee/tests/test_chromadb.py +7 -11
- cognee/tests/test_concurrent_subprocess_access.py +76 -0
- cognee/tests/test_conversation_history.py +240 -0
- cognee/tests/test_kuzu.py +27 -15
- cognee/tests/test_lancedb.py +7 -11
- cognee/tests/test_library.py +32 -2
- cognee/tests/test_neo4j.py +24 -16
- cognee/tests/test_neptune_analytics_vector.py +7 -11
- cognee/tests/test_permissions.py +9 -13
- cognee/tests/test_pgvector.py +4 -4
- cognee/tests/test_remote_kuzu.py +8 -11
- cognee/tests/test_s3_file_storage.py +1 -1
- cognee/tests/test_search_db.py +6 -8
- cognee/tests/unit/infrastructure/databases/cache/test_cache_config.py +89 -0
- cognee/tests/unit/modules/retrieval/conversation_history_test.py +154 -0
- {cognee-0.3.6.dist-info → cognee-0.3.7.dist-info}/METADATA +21 -6
- {cognee-0.3.6.dist-info → cognee-0.3.7.dist-info}/RECORD +155 -126
- {cognee-0.3.6.dist-info → cognee-0.3.7.dist-info}/entry_points.txt +1 -0
- distributed/Dockerfile +0 -3
- distributed/entrypoint.py +21 -9
- distributed/signal.py +5 -0
- distributed/workers/data_point_saving_worker.py +64 -34
- distributed/workers/graph_saving_worker.py +71 -47
- cognee/infrastructure/databases/graph/memgraph/memgraph_adapter.py +0 -1116
- cognee/modules/retrieval/insights_retriever.py +0 -133
- cognee/tests/test_memgraph.py +0 -109
- cognee/tests/unit/modules/retrieval/insights_retriever_test.py +0 -251
- {cognee-0.3.6.dist-info → cognee-0.3.7.dist-info}/WHEEL +0 -0
- {cognee-0.3.6.dist-info → cognee-0.3.7.dist-info}/licenses/LICENSE +0 -0
- {cognee-0.3.6.dist-info → cognee-0.3.7.dist-info}/licenses/NOTICE.md +0 -0
cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py
@@ -0,0 +1,344 @@
+import os
+import pytest
+import cognee
+from cognee.infrastructure.files.utils.get_data_file_path import get_data_file_path
+from cognee.infrastructure.loaders.LoaderEngine import LoaderEngine
+from cognee.infrastructure.loaders.external.beautiful_soup_loader import BeautifulSoupLoader
+from cognee.tasks.ingestion import save_data_item_to_storage
+from pathlib import Path
+
+
+@pytest.mark.asyncio
+async def test_url_saves_as_html_file():
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)
+
+    try:
+        original_file_path = await save_data_item_to_storage(
+            "https://en.wikipedia.org/wiki/Large_language_model"
+        )
+        file_path = get_data_file_path(original_file_path)
+        assert file_path.endswith(".html")
+        file = Path(file_path)
+        assert file.exists()
+        assert file.stat().st_size > 0
+    except Exception as e:
+        pytest.fail(f"Failed to save data item to storage: {e}")
+
+
+skip_for_tavily = pytest.mark.skipif(
+    os.getenv("TAVILY_API_KEY") is not None,
+    reason="Skipping as Tavily already handles parsing and outputs text",
+)
+
+
+@skip_for_tavily
+@pytest.mark.asyncio
+async def test_saved_html_is_valid():
+    try:
+        from bs4 import BeautifulSoup
+    except ImportError:
+        pytest.fail("Test case requires bs4 installed")
+
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)
+
+    try:
+        original_file_path = await save_data_item_to_storage(
+            "https://en.wikipedia.org/wiki/Large_language_model"
+        )
+        file_path = get_data_file_path(original_file_path)
+        content = Path(file_path).read_text()
+
+        soup = BeautifulSoup(content, "html.parser")
+        assert soup.find() is not None, "File should contain parseable HTML"
+
+        has_html_elements = any(
+            [
+                soup.find("html"),
+                soup.find("head"),
+                soup.find("body"),
+                soup.find("div"),
+                soup.find("p"),
+            ]
+        )
+        assert has_html_elements, "File should contain common HTML elements"
+    except Exception as e:
+        pytest.fail(f"Failed to save data item to storage: {e}")
+
+
+@pytest.mark.asyncio
+async def test_add_url():
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)
+
+    await cognee.add("https://en.wikipedia.org/wiki/Large_language_model")
+
+
+skip_in_ci = pytest.mark.skipif(
+    os.getenv("GITHUB_ACTIONS") == "true",
+    reason="Skipping in Github for now - before we get TAVILY_API_KEY",
+)
+
+
+@skip_in_ci
+@pytest.mark.asyncio
+async def test_add_url_with_tavily():
+    assert os.getenv("TAVILY_API_KEY") is not None
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)
+
+    await cognee.add("https://en.wikipedia.org/wiki/Large_language_model")
+
+
+@pytest.mark.asyncio
+async def test_add_url_without_incremental_loading():
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)
+
+    try:
+        await cognee.add(
+            "https://en.wikipedia.org/wiki/Large_language_model",
+            incremental_loading=False,
+        )
+    except Exception as e:
+        pytest.fail(f"Failed to add url: {e}")
+
+
+@pytest.mark.asyncio
+async def test_add_url_with_incremental_loading():
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)
+
+    try:
+        await cognee.add(
+            "https://en.wikipedia.org/wiki/Large_language_model",
+            incremental_loading=True,
+        )
+    except Exception as e:
+        pytest.fail(f"Failed to add url: {e}")
+
+
+@pytest.mark.asyncio
+async def test_add_url_can_define_preferred_loader_as_list_of_str():
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)
+
+    await cognee.add(
+        "https://en.wikipedia.org/wiki/Large_language_model",
+        preferred_loaders=["beautiful_soup_loader"],
+    )
+
+
+@pytest.mark.asyncio
+async def test_add_url_with_extraction_rules():
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)
+
+    extraction_rules = {
+        "title": {"selector": "title"},
+        "headings": {"selector": "h1, h2, h3", "all": True},
+        "links": {"selector": "a", "attr": "href", "all": True},
+        "paragraphs": {"selector": "p", "all": True},
+    }
+
+    try:
+        await cognee.add(
+            "https://en.wikipedia.org/wiki/Large_language_model",
+            preferred_loaders={"beautiful_soup_loader": {"extraction_rules": extraction_rules}},
+        )
+    except Exception as e:
+        pytest.fail(f"Failed to add url: {e}")
+
+
+@pytest.mark.asyncio
+async def test_loader_is_none_by_default():
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)
+    extraction_rules = {
+        "title": {"selector": "title"},
+        "headings": {"selector": "h1, h2, h3", "all": True},
+        "links": {"selector": "a", "attr": "href", "all": True},
+        "paragraphs": {"selector": "p", "all": True},
+    }
+
+    try:
+        original_file_path = await save_data_item_to_storage(
+            "https://en.wikipedia.org/wiki/Large_language_model"
+        )
+        file_path = get_data_file_path(original_file_path)
+        assert file_path.endswith(".html")
+        file = Path(file_path)
+        assert file.exists()
+        assert file.stat().st_size > 0
+
+        loader_engine = LoaderEngine()
+        preferred_loaders = {"beautiful_soup_loader": {"extraction_rules": extraction_rules}}
+        loader = loader_engine.get_loader(
+            file_path,
+            preferred_loaders=preferred_loaders,
+        )
+
+        assert loader is None
+    except Exception as e:
+        pytest.fail(f"Failed to save data item to storage: {e}")
+
+
+@pytest.mark.asyncio
+async def test_beautiful_soup_loader_is_selected_loader_if_preferred_loader_provided():
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)
+    extraction_rules = {
+        "title": {"selector": "title"},
+        "headings": {"selector": "h1, h2, h3", "all": True},
+        "links": {"selector": "a", "attr": "href", "all": True},
+        "paragraphs": {"selector": "p", "all": True},
+    }
+
+    try:
+        original_file_path = await save_data_item_to_storage(
+            "https://en.wikipedia.org/wiki/Large_language_model"
+        )
+        file_path = get_data_file_path(original_file_path)
+        assert file_path.endswith(".html")
+        file = Path(file_path)
+        assert file.exists()
+        assert file.stat().st_size > 0
+
+        loader_engine = LoaderEngine()
+        bs_loader = BeautifulSoupLoader()
+        loader_engine.register_loader(bs_loader)
+        preferred_loaders = {"beautiful_soup_loader": {"extraction_rules": extraction_rules}}
+        loader = loader_engine.get_loader(
+            file_path,
+            preferred_loaders=preferred_loaders,
+        )
+
+        assert loader == bs_loader
+    except Exception as e:
+        pytest.fail(f"Failed to save data item to storage: {e}")
+
+
+@pytest.mark.asyncio
+async def test_beautiful_soup_loader_works_with_and_without_arguments():
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)
+
+    try:
+        original_file_path = await save_data_item_to_storage(
+            "https://en.wikipedia.org/wiki/Large_language_model"
+        )
+        file_path = get_data_file_path(original_file_path)
+        assert file_path.endswith(".html")
+        file = Path(file_path)
+        assert file.exists()
+        assert file.stat().st_size > 0
+
+        loader_engine = LoaderEngine()
+        bs_loader = BeautifulSoupLoader()
+        loader_engine.register_loader(bs_loader)
+        preferred_loaders = {"beautiful_soup_loader": {}}
+        await loader_engine.load_file(
+            file_path,
+            preferred_loaders=preferred_loaders,
+        )
+        extraction_rules = {
+            "title": {"selector": "title"},
+            "headings": {"selector": "h1, h2, h3", "all": True},
+            "links": {"selector": "a", "attr": "href", "all": True},
+            "paragraphs": {"selector": "p", "all": True},
+        }
+        preferred_loaders = {"beautiful_soup_loader": {"extraction_rules": extraction_rules}}
+        await loader_engine.load_file(
+            file_path,
+            preferred_loaders=preferred_loaders,
+        )
+    except Exception as e:
+        pytest.fail(f"Failed to save data item to storage: {e}")
+
+
+@pytest.mark.asyncio
+async def test_beautiful_soup_loader_successfully_loads_file_if_required_args_present():
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)
+
+    try:
+        original_file_path = await save_data_item_to_storage(
+            "https://en.wikipedia.org/wiki/Large_language_model"
+        )
+        file_path = get_data_file_path(original_file_path)
+        assert file_path.endswith(".html")
+        file = Path(file_path)
+        assert file.exists()
+        assert file.stat().st_size > 0
+
+        loader_engine = LoaderEngine()
+        bs_loader = BeautifulSoupLoader()
+        loader_engine.register_loader(bs_loader)
+        extraction_rules = {
+            "title": {"selector": "title"},
+            "headings": {"selector": "h1, h2, h3", "all": True},
+            "links": {"selector": "a", "attr": "href", "all": True},
+            "paragraphs": {"selector": "p", "all": True},
+        }
+        preferred_loaders = {"beautiful_soup_loader": {"extraction_rules": extraction_rules}}
+        await loader_engine.load_file(
+            file_path,
+            preferred_loaders=preferred_loaders,
+        )
+    except Exception as e:
+        pytest.fail(f"Failed to save data item to storage: {e}")
+
+
+@pytest.mark.asyncio
+async def test_beautiful_soup_loads_file_successfully():
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)
+    extraction_rules = {
+        "title": {"selector": "title"},
+        "headings": {"selector": "h1, h2, h3", "all": True},
+        "links": {"selector": "a", "attr": "href", "all": True},
+        "paragraphs": {"selector": "p", "all": True},
+    }
+
+    try:
+        original_file_path = await save_data_item_to_storage(
+            "https://en.wikipedia.org/wiki/Large_language_model"
+        )
+        file_path = get_data_file_path(original_file_path)
+        assert file_path.endswith(".html")
+        original_file = Path(file_path)
+        assert original_file.exists()
+        assert original_file.stat().st_size > 0
+
+        loader_engine = LoaderEngine()
+        bs_loader = BeautifulSoupLoader()
+        loader_engine.register_loader(bs_loader)
+        preferred_loaders = {"beautiful_soup_loader": {"extraction_rules": extraction_rules}}
+        loader = loader_engine.get_loader(
+            file_path,
+            preferred_loaders=preferred_loaders,
+        )
+
+        assert loader == bs_loader
+
+        cognee_loaded_txt_path = await loader_engine.load_file(
+            file_path=file_path, preferred_loaders=preferred_loaders
+        )
+
+        cognee_loaded_txt_path = get_data_file_path(cognee_loaded_txt_path)
+
+        assert cognee_loaded_txt_path.endswith(".txt")
+
+        extracted_file = Path(cognee_loaded_txt_path)
+
+        assert extracted_file.exists()
+        assert extracted_file.stat().st_size > 0
+
+        original_basename = original_file.stem
+        extracted_basename = extracted_file.stem
+        assert original_basename == extracted_basename, (
+            f"Expected same base name: {original_basename} vs {extracted_basename}"
+        )
+    except Exception as e:
+        pytest.fail(f"Failed to save data item to storage: {e}")
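The extraction_rules dictionaries repeated throughout these tests share one shape: each field maps to a CSS "selector", an optional "attr" to read an attribute instead of element text, and an optional "all" flag to collect every match rather than the first. The loader's implementation is not shown in this diff, so the following is only a minimal sketch of how such rules could be applied with BeautifulSoup; apply_extraction_rules is a hypothetical helper for illustration, not part of cognee.

from bs4 import BeautifulSoup


def apply_extraction_rules(html: str, rules: dict) -> dict:
    # Hypothetical illustration of the rule shape used in the tests above;
    # the real BeautifulSoupLoader may behave differently.
    soup = BeautifulSoup(html, "html.parser")
    extracted = {}
    for field, rule in rules.items():
        attr = rule.get("attr")  # e.g. "href" pulls an attribute instead of text
        if rule.get("all"):
            elements = soup.select(rule["selector"])  # every match for the CSS selector
            extracted[field] = [
                el.get(attr) if attr else el.get_text(strip=True) for el in elements
            ]
        else:
            el = soup.select_one(rule["selector"])  # first match only
            extracted[field] = (el.get(attr) if attr else el.get_text(strip=True)) if el else None
    return extracted


rules = {"title": {"selector": "title"}, "links": {"selector": "a", "attr": "href", "all": True}}
print(apply_extraction_rules("<html><title>Hi</title><a href='/x'>x</a></html>", rules))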
cognee/tests/subprocesses/reader.py
@@ -0,0 +1,25 @@
+import asyncio
+import time
+from cognee.infrastructure.databases.graph.kuzu.adapter import KuzuAdapter
+
+# This will create the test.db if it doesn't exist
+
+
+async def main():
+    adapter = KuzuAdapter("test.db")
+    result = await adapter.query("MATCH (n:Node) RETURN COUNT(n)")
+    print(f"Reader: Found {result[0][0]} nodes")
+    result = await adapter.query("MATCH (n:Node) RETURN COUNT(n)")
+    print(f"Reader: Found {result[0][0]} nodes")
+    result = await adapter.query("MATCH (n:Node) RETURN COUNT(n)")
+    print(f"Reader: Found {result[0][0]} nodes")
+    result = await adapter.query("MATCH (n:Node) RETURN COUNT(n)")
+    print(f"Reader: Found {result[0][0]} nodes")
+    result = await adapter.query("MATCH (n:Node) RETURN COUNT(n)")
+    print(f"Reader: Found {result} nodes")
+    result = await adapter.query("MATCH (n:Node) RETURN COUNT(n)")
+    print(f"Reader: Found {result[0][0]} nodes")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
cognee/tests/subprocesses/simple_cognify_1.py
@@ -0,0 +1,31 @@
+import asyncio
+import cognee
+from cognee.shared.logging_utils import setup_logging, INFO
+from cognee.api.v1.search import SearchType
+
+
+async def main():
+    await cognee.cognify(datasets=["first_cognify_dataset"])
+
+    query_text = (
+        "Tell me what is in the context. Additionally write out 'FIRST_COGNIFY' before your answer"
+    )
+    search_results = await cognee.search(
+        query_type=SearchType.GRAPH_COMPLETION,
+        query_text=query_text,
+        datasets=["first_cognify_dataset"],
+    )
+
+    print("Search results:")
+    for result_text in search_results:
+        print(result_text)
+
+
+if __name__ == "__main__":
+    logger = setup_logging(log_level=INFO)
+    loop = asyncio.new_event_loop()
+    asyncio.set_event_loop(loop)
+    try:
+        loop.run_until_complete(main())
+    finally:
+        loop.run_until_complete(loop.shutdown_asyncgens())
cognee/tests/subprocesses/simple_cognify_2.py
@@ -0,0 +1,31 @@
+import asyncio
+import cognee
+from cognee.shared.logging_utils import setup_logging, INFO
+from cognee.api.v1.search import SearchType
+
+
+async def main():
+    await cognee.cognify(datasets=["second_cognify_dataset"])
+
+    query_text = (
+        "Tell me what is in the context. Additionally write out 'SECOND_COGNIFY' before your answer"
+    )
+    search_results = await cognee.search(
+        query_type=SearchType.GRAPH_COMPLETION,
+        query_text=query_text,
+        datasets=["second_cognify_dataset"],
+    )
+
+    print("Search results:")
+    for result_text in search_results:
+        print(result_text)
+
+
+if __name__ == "__main__":
+    logger = setup_logging(log_level=INFO)
+    loop = asyncio.new_event_loop()
+    asyncio.set_event_loop(loop)
+    try:
+        loop.run_until_complete(main())
+    finally:
+        loop.run_until_complete(loop.shutdown_asyncgens())
cognee/tests/subprocesses/writer.py
@@ -0,0 +1,32 @@
+import asyncio
+import time
+import uuid
+from cognee.modules.data.processing.document_types import PdfDocument
+from cognee.infrastructure.databases.graph.kuzu.adapter import KuzuAdapter
+
+
+def create_node(name):
+    document = PdfDocument(
+        id=uuid.uuid4(),
+        name=name,
+        raw_data_location=name,
+        external_metadata="test_external_metadata",
+        mime_type="test_mime",
+    )
+    return document
+
+
+async def main():
+    adapter = KuzuAdapter("test.db")
+    nodes = [create_node(f"Node{i}") for i in range(5)]
+
+    print("Writer: Starting...")
+    await adapter.add_nodes(nodes)
+
+    print("writer finished...")
+
+    time.sleep(10)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
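The reader.py and writer.py scripts above open the same test.db from separate processes, which is presumably what the new cognee/tests/test_concurrent_subprocess_access.py (+76 in the listing above) exercises. That test's contents are not shown in this diff, so the following is only a minimal sketch of how the two scripts might be driven concurrently, assuming the repository root as the working directory:

import subprocess
import sys

# Launch the writer and reader as separate OS processes against the same
# Kuzu database file ("test.db").
writer = subprocess.Popen([sys.executable, "cognee/tests/subprocesses/writer.py"])
reader = subprocess.Popen([sys.executable, "cognee/tests/subprocesses/reader.py"])

assert writer.wait() == 0, "writer subprocess failed"
assert reader.wait() == 0, "reader subprocess failed"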
cognee/tests/tasks/descriptive_metrics/metrics_test_utils.py
@@ -1,7 +1,6 @@
 from typing import List
 from cognee.infrastructure.engine import DataPoint
 from cognee.tasks.storage.add_data_points import add_data_points
-from cognee.infrastructure.databases.graph.get_graph_engine import create_graph_engine
 import cognee
 from cognee.infrastructure.databases.graph import get_graph_engine
 import json
@@ -64,7 +63,6 @@ async def create_connected_test_graph():
 
 
 async def get_metrics(provider: str, include_optional=True):
-    create_graph_engine.cache_clear()
     cognee.config.set_graph_database_provider(provider)
     graph_engine = await get_graph_engine()
     await graph_engine.delete_graph()
cognee/tests/tasks/descriptive_metrics/neo4j_metrics_test.py
@@ -1,7 +1,12 @@
-from cognee.tests.tasks.descriptive_metrics.metrics_test_utils import assert_metrics
 import asyncio
 
 
+async def main():
+    from cognee.tests.tasks.descriptive_metrics.metrics_test_utils import assert_metrics
+
+    await assert_metrics(provider="neo4j", include_optional=False)
+    await assert_metrics(provider="neo4j", include_optional=True)
+
+
 if __name__ == "__main__":
-    asyncio.run(
-    asyncio.run(assert_metrics(provider="neo4j", include_optional=True))
+    asyncio.run(main())
cognee/tests/tasks/entity_extraction/entity_extraction_test.py
@@ -0,0 +1,89 @@
+import os
+import pathlib
+import asyncio
+
+import cognee
+import cognee.modules.ingestion as ingestion
+from cognee.infrastructure.llm import get_max_chunk_tokens
+from cognee.infrastructure.llm.extraction import extract_content_graph
+from cognee.modules.chunking.TextChunker import TextChunker
+from cognee.modules.data.processing.document_types import TextDocument
+from cognee.modules.users.methods import get_default_user
+from cognee.shared.data_models import KnowledgeGraph
+from cognee.tasks.documents import extract_chunks_from_documents
+from cognee.tasks.ingestion import save_data_item_to_storage
+from cognee.infrastructure.files.utils.open_data_file import open_data_file
+
+
+async def extract_graphs(document_chunks):
+    """
+    Extract graph, and check if entities are present
+    """
+
+    extraction_results = await asyncio.gather(
+        *[extract_content_graph(chunk.text, KnowledgeGraph) for chunk in document_chunks]
+    )
+
+    return all(
+        any(
+            term in node.name.lower()
+            for extraction_result in extraction_results
+            for node in extraction_result.nodes
+        )
+        for term in ("qubit", "algorithm", "superposition")
+    )
+
+
+async def main():
+    """
+    Test how well the entity extraction works. Repeat graph generation a few times.
+    If 80% or more graphs are correctly generated, the test passes.
+    """
+
+    file_path = os.path.join(
+        pathlib.Path(__file__).parent.parent.parent, "test_data/Quantum_computers.txt"
+    )
+
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)
+
+    await cognee.add("NLP is a subfield of computer science.")
+
+    original_file_path = await save_data_item_to_storage(file_path)
+
+    async with open_data_file(original_file_path) as file:
+        classified_data = ingestion.classify(file)
+
+    # data_id is the hash of original file contents + owner id to avoid duplicate data
+    data_id = ingestion.identify(classified_data, await get_default_user())
+
+    await cognee.add(file_path)
+
+    text_document = TextDocument(
+        id=data_id,
+        type="text",
+        mime_type="text/plain",
+        name="quantum_text",
+        raw_data_location=file_path,
+        external_metadata=None,
+    )
+
+    document_chunks = []
+    async for chunk in extract_chunks_from_documents(
+        [text_document], max_chunk_size=get_max_chunk_tokens(), chunker=TextChunker
+    ):
+        document_chunks.append(chunk)
+
+    number_of_reps = 5
+
+    graph_results = await asyncio.gather(
+        *[extract_graphs(document_chunks) for _ in range(number_of_reps)]
+    )
+
+    correct_graphs = [result for result in graph_results if result]
+
+    assert len(correct_graphs) >= 0.8 * number_of_reps
+
+
+if __name__ == "__main__":
+    asyncio.run(main())