cognee 0.3.5__py3-none-any.whl → 0.3.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (161) hide show
  1. cognee/__init__.py +1 -0
  2. cognee/api/health.py +2 -12
  3. cognee/api/v1/add/add.py +46 -6
  4. cognee/api/v1/add/routers/get_add_router.py +5 -1
  5. cognee/api/v1/cognify/cognify.py +29 -9
  6. cognee/api/v1/datasets/datasets.py +11 -0
  7. cognee/api/v1/responses/default_tools.py +0 -1
  8. cognee/api/v1/responses/dispatch_function.py +1 -1
  9. cognee/api/v1/responses/routers/default_tools.py +0 -1
  10. cognee/api/v1/search/search.py +11 -9
  11. cognee/api/v1/settings/routers/get_settings_router.py +7 -1
  12. cognee/api/v1/ui/ui.py +47 -16
  13. cognee/api/v1/update/routers/get_update_router.py +1 -1
  14. cognee/api/v1/update/update.py +3 -3
  15. cognee/cli/_cognee.py +61 -10
  16. cognee/cli/commands/add_command.py +3 -3
  17. cognee/cli/commands/cognify_command.py +3 -3
  18. cognee/cli/commands/config_command.py +9 -7
  19. cognee/cli/commands/delete_command.py +3 -3
  20. cognee/cli/commands/search_command.py +3 -7
  21. cognee/cli/config.py +0 -1
  22. cognee/context_global_variables.py +5 -0
  23. cognee/exceptions/exceptions.py +1 -1
  24. cognee/infrastructure/databases/cache/__init__.py +2 -0
  25. cognee/infrastructure/databases/cache/cache_db_interface.py +79 -0
  26. cognee/infrastructure/databases/cache/config.py +44 -0
  27. cognee/infrastructure/databases/cache/get_cache_engine.py +67 -0
  28. cognee/infrastructure/databases/cache/redis/RedisAdapter.py +243 -0
  29. cognee/infrastructure/databases/exceptions/__init__.py +1 -0
  30. cognee/infrastructure/databases/exceptions/exceptions.py +18 -2
  31. cognee/infrastructure/databases/graph/get_graph_engine.py +1 -1
  32. cognee/infrastructure/databases/graph/graph_db_interface.py +5 -0
  33. cognee/infrastructure/databases/graph/kuzu/adapter.py +67 -44
  34. cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +13 -3
  35. cognee/infrastructure/databases/graph/neo4j_driver/deadlock_retry.py +1 -1
  36. cognee/infrastructure/databases/graph/neptune_driver/neptune_utils.py +1 -1
  37. cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +1 -1
  38. cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +21 -3
  39. cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +17 -10
  40. cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +17 -4
  41. cognee/infrastructure/databases/vector/embeddings/config.py +2 -3
  42. cognee/infrastructure/databases/vector/exceptions/exceptions.py +1 -1
  43. cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +0 -1
  44. cognee/infrastructure/files/exceptions.py +1 -1
  45. cognee/infrastructure/files/storage/LocalFileStorage.py +9 -9
  46. cognee/infrastructure/files/storage/S3FileStorage.py +11 -11
  47. cognee/infrastructure/files/utils/guess_file_type.py +6 -0
  48. cognee/infrastructure/llm/prompts/search_type_selector_prompt.txt +0 -5
  49. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py +19 -9
  50. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py +17 -5
  51. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py +17 -5
  52. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py +32 -0
  53. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/__init__.py +0 -0
  54. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/adapter.py +109 -0
  55. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py +33 -8
  56. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +40 -18
  57. cognee/infrastructure/loaders/LoaderEngine.py +27 -7
  58. cognee/infrastructure/loaders/external/__init__.py +7 -0
  59. cognee/infrastructure/loaders/external/advanced_pdf_loader.py +2 -8
  60. cognee/infrastructure/loaders/external/beautiful_soup_loader.py +310 -0
  61. cognee/infrastructure/loaders/supported_loaders.py +7 -0
  62. cognee/modules/data/exceptions/exceptions.py +1 -1
  63. cognee/modules/data/methods/__init__.py +3 -0
  64. cognee/modules/data/methods/get_dataset_data.py +4 -1
  65. cognee/modules/data/methods/has_dataset_data.py +21 -0
  66. cognee/modules/engine/models/TableRow.py +0 -1
  67. cognee/modules/ingestion/save_data_to_file.py +9 -2
  68. cognee/modules/pipelines/exceptions/exceptions.py +1 -1
  69. cognee/modules/pipelines/operations/pipeline.py +12 -1
  70. cognee/modules/pipelines/operations/run_tasks.py +25 -197
  71. cognee/modules/pipelines/operations/run_tasks_data_item.py +260 -0
  72. cognee/modules/pipelines/operations/run_tasks_distributed.py +121 -38
  73. cognee/modules/retrieval/EntityCompletionRetriever.py +48 -8
  74. cognee/modules/retrieval/base_graph_retriever.py +3 -1
  75. cognee/modules/retrieval/base_retriever.py +3 -1
  76. cognee/modules/retrieval/chunks_retriever.py +5 -1
  77. cognee/modules/retrieval/code_retriever.py +20 -2
  78. cognee/modules/retrieval/completion_retriever.py +50 -9
  79. cognee/modules/retrieval/cypher_search_retriever.py +11 -1
  80. cognee/modules/retrieval/graph_completion_context_extension_retriever.py +47 -8
  81. cognee/modules/retrieval/graph_completion_cot_retriever.py +32 -1
  82. cognee/modules/retrieval/graph_completion_retriever.py +54 -10
  83. cognee/modules/retrieval/lexical_retriever.py +20 -2
  84. cognee/modules/retrieval/natural_language_retriever.py +10 -1
  85. cognee/modules/retrieval/summaries_retriever.py +5 -1
  86. cognee/modules/retrieval/temporal_retriever.py +62 -10
  87. cognee/modules/retrieval/user_qa_feedback.py +3 -2
  88. cognee/modules/retrieval/utils/completion.py +5 -0
  89. cognee/modules/retrieval/utils/description_to_codepart_search.py +1 -1
  90. cognee/modules/retrieval/utils/session_cache.py +156 -0
  91. cognee/modules/search/methods/get_search_type_tools.py +0 -5
  92. cognee/modules/search/methods/no_access_control_search.py +12 -1
  93. cognee/modules/search/methods/search.py +34 -2
  94. cognee/modules/search/types/SearchType.py +0 -1
  95. cognee/modules/settings/get_settings.py +23 -0
  96. cognee/modules/users/methods/get_authenticated_user.py +3 -1
  97. cognee/modules/users/methods/get_default_user.py +1 -6
  98. cognee/modules/users/roles/methods/create_role.py +2 -2
  99. cognee/modules/users/tenants/methods/create_tenant.py +2 -2
  100. cognee/shared/exceptions/exceptions.py +1 -1
  101. cognee/tasks/codingagents/coding_rule_associations.py +1 -2
  102. cognee/tasks/documents/exceptions/exceptions.py +1 -1
  103. cognee/tasks/graph/extract_graph_from_data.py +2 -0
  104. cognee/tasks/ingestion/data_item_to_text_file.py +3 -3
  105. cognee/tasks/ingestion/ingest_data.py +11 -5
  106. cognee/tasks/ingestion/save_data_item_to_storage.py +12 -1
  107. cognee/tasks/storage/add_data_points.py +3 -10
  108. cognee/tasks/storage/index_data_points.py +19 -14
  109. cognee/tasks/storage/index_graph_edges.py +25 -11
  110. cognee/tasks/web_scraper/__init__.py +34 -0
  111. cognee/tasks/web_scraper/config.py +26 -0
  112. cognee/tasks/web_scraper/default_url_crawler.py +446 -0
  113. cognee/tasks/web_scraper/models.py +46 -0
  114. cognee/tasks/web_scraper/types.py +4 -0
  115. cognee/tasks/web_scraper/utils.py +142 -0
  116. cognee/tasks/web_scraper/web_scraper_task.py +396 -0
  117. cognee/tests/cli_tests/cli_unit_tests/test_cli_utils.py +0 -1
  118. cognee/tests/integration/web_url_crawler/test_default_url_crawler.py +13 -0
  119. cognee/tests/integration/web_url_crawler/test_tavily_crawler.py +19 -0
  120. cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py +344 -0
  121. cognee/tests/subprocesses/reader.py +25 -0
  122. cognee/tests/subprocesses/simple_cognify_1.py +31 -0
  123. cognee/tests/subprocesses/simple_cognify_2.py +31 -0
  124. cognee/tests/subprocesses/writer.py +32 -0
  125. cognee/tests/tasks/descriptive_metrics/metrics_test_utils.py +0 -2
  126. cognee/tests/tasks/descriptive_metrics/neo4j_metrics_test.py +8 -3
  127. cognee/tests/tasks/entity_extraction/entity_extraction_test.py +89 -0
  128. cognee/tests/tasks/web_scraping/web_scraping_test.py +172 -0
  129. cognee/tests/test_add_docling_document.py +56 -0
  130. cognee/tests/test_chromadb.py +7 -11
  131. cognee/tests/test_concurrent_subprocess_access.py +76 -0
  132. cognee/tests/test_conversation_history.py +240 -0
  133. cognee/tests/test_kuzu.py +27 -15
  134. cognee/tests/test_lancedb.py +7 -11
  135. cognee/tests/test_library.py +32 -2
  136. cognee/tests/test_neo4j.py +24 -16
  137. cognee/tests/test_neptune_analytics_vector.py +7 -11
  138. cognee/tests/test_permissions.py +9 -13
  139. cognee/tests/test_pgvector.py +4 -4
  140. cognee/tests/test_remote_kuzu.py +8 -11
  141. cognee/tests/test_s3_file_storage.py +1 -1
  142. cognee/tests/test_search_db.py +6 -8
  143. cognee/tests/unit/infrastructure/databases/cache/test_cache_config.py +89 -0
  144. cognee/tests/unit/modules/retrieval/conversation_history_test.py +154 -0
  145. {cognee-0.3.5.dist-info → cognee-0.3.7.dist-info}/METADATA +22 -7
  146. {cognee-0.3.5.dist-info → cognee-0.3.7.dist-info}/RECORD +155 -128
  147. {cognee-0.3.5.dist-info → cognee-0.3.7.dist-info}/entry_points.txt +1 -0
  148. distributed/Dockerfile +0 -3
  149. distributed/entrypoint.py +21 -9
  150. distributed/signal.py +5 -0
  151. distributed/workers/data_point_saving_worker.py +64 -34
  152. distributed/workers/graph_saving_worker.py +71 -47
  153. cognee/infrastructure/databases/graph/memgraph/memgraph_adapter.py +0 -1116
  154. cognee/modules/retrieval/insights_retriever.py +0 -133
  155. cognee/tests/test_memgraph.py +0 -109
  156. cognee/tests/unit/modules/retrieval/insights_retriever_test.py +0 -251
  157. distributed/poetry.lock +0 -12238
  158. distributed/pyproject.toml +0 -185
  159. {cognee-0.3.5.dist-info → cognee-0.3.7.dist-info}/WHEEL +0 -0
  160. {cognee-0.3.5.dist-info → cognee-0.3.7.dist-info}/licenses/LICENSE +0 -0
  161. {cognee-0.3.5.dist-info → cognee-0.3.7.dist-info}/licenses/NOTICE.md +0 -0
@@ -0,0 +1,344 @@
1
+ import os
2
+ import pytest
3
+ import cognee
4
+ from cognee.infrastructure.files.utils.get_data_file_path import get_data_file_path
5
+ from cognee.infrastructure.loaders.LoaderEngine import LoaderEngine
6
+ from cognee.infrastructure.loaders.external.beautiful_soup_loader import BeautifulSoupLoader
7
+ from cognee.tasks.ingestion import save_data_item_to_storage
8
+ from pathlib import Path
9
+
10
+
11
@pytest.mark.asyncio
async def test_url_saves_as_html_file():
    """A URL data item should be persisted to storage as a non-empty .html file."""
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)

    try:
        stored_path = await save_data_item_to_storage(
            "https://en.wikipedia.org/wiki/Large_language_model"
        )
        resolved_path = get_data_file_path(stored_path)
        assert resolved_path.endswith(".html")

        saved_file = Path(resolved_path)
        assert saved_file.exists()
        assert saved_file.stat().st_size > 0
    except Exception as e:
        pytest.fail(f"Failed to save data item to storage: {e}")
27
+
28
+
29
# When a Tavily API key is configured the crawler returns parsed text instead
# of raw HTML, so HTML-validity checks do not apply.
skip_for_tavily = pytest.mark.skipif(
    "TAVILY_API_KEY" in os.environ,
    reason="Skipping as Tavily already handles parsing and outputs text",
)
33
+
34
+
35
@skip_for_tavily
@pytest.mark.asyncio
async def test_saved_html_is_valid():
    """The stored .html file must contain parseable HTML with common elements."""
    try:
        from bs4 import BeautifulSoup
    except ImportError:
        pytest.fail("Test case requires bs4 installed")

    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)

    try:
        original_file_path = await save_data_item_to_storage(
            "https://en.wikipedia.org/wiki/Large_language_model"
        )
        file_path = get_data_file_path(original_file_path)
        content = Path(file_path).read_text()

        soup = BeautifulSoup(content, "html.parser")
        assert soup.find() is not None, "File should contain parseable HTML"

        # find() returns None for a missing tag, so any() over the tag names
        # is equivalent to the original list of individual lookups.
        has_html_elements = any(
            soup.find(tag) is not None for tag in ("html", "head", "body", "div", "p")
        )
        assert has_html_elements, "File should contain common HTML elements"
    except Exception as e:
        # Fix: the previous message blamed storage even when parsing or the
        # HTML assertions were what failed.
        pytest.fail(f"Failed to save or parse data item: {e}")
68
+
69
+
70
@pytest.mark.asyncio
async def test_add_url():
    """cognee.add should accept a plain URL with default settings."""
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)

    url = "https://en.wikipedia.org/wiki/Large_language_model"
    await cognee.add(url)
76
+
77
+
78
# CI does not yet have a TAVILY_API_KEY secret configured.
skip_in_ci = pytest.mark.skipif(
    os.environ.get("GITHUB_ACTIONS") == "true",
    reason="Skipping in Github for now - before we get TAVILY_API_KEY",
)
82
+
83
+
84
@skip_in_ci
@pytest.mark.asyncio
async def test_add_url_with_tavily():
    """Adding a URL should work through the Tavily crawler when a key is configured."""
    assert os.getenv("TAVILY_API_KEY") is not None

    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)

    url = "https://en.wikipedia.org/wiki/Large_language_model"
    await cognee.add(url)
92
+
93
+
94
@pytest.mark.asyncio
async def test_add_url_without_incremental_loading():
    """Adding a URL must succeed when incremental loading is explicitly disabled."""
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)

    url = "https://en.wikipedia.org/wiki/Large_language_model"
    try:
        await cognee.add(url, incremental_loading=False)
    except Exception as e:
        pytest.fail(f"Failed to add url: {e}")
106
+
107
+
108
@pytest.mark.asyncio
async def test_add_url_with_incremental_loading():
    """Adding a URL must succeed when incremental loading is explicitly enabled."""
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)

    url = "https://en.wikipedia.org/wiki/Large_language_model"
    try:
        await cognee.add(url, incremental_loading=True)
    except Exception as e:
        pytest.fail(f"Failed to add url: {e}")
120
+
121
+
122
@pytest.mark.asyncio
async def test_add_url_can_define_preferred_loader_as_list_of_str():
    """preferred_loaders accepts a plain list of loader names (no per-loader config)."""
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)

    url = "https://en.wikipedia.org/wiki/Large_language_model"
    await cognee.add(url, preferred_loaders=["beautiful_soup_loader"])
131
+
132
+
133
@pytest.mark.asyncio
async def test_add_url_with_extraction_rules():
    """preferred_loaders accepts per-loader config, including CSS extraction rules."""
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)

    extraction_rules = {
        "title": {"selector": "title"},
        "headings": {"selector": "h1, h2, h3", "all": True},
        "links": {"selector": "a", "attr": "href", "all": True},
        "paragraphs": {"selector": "p", "all": True},
    }

    url = "https://en.wikipedia.org/wiki/Large_language_model"
    try:
        await cognee.add(
            url,
            preferred_loaders={"beautiful_soup_loader": {"extraction_rules": extraction_rules}},
        )
    except Exception as e:
        pytest.fail(f"Failed to add url: {e}")
152
+
153
+
154
@pytest.mark.asyncio
async def test_loader_is_none_by_default():
    """Without registering BeautifulSoupLoader, the engine resolves no loader for HTML."""
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)

    extraction_rules = {
        "title": {"selector": "title"},
        "headings": {"selector": "h1, h2, h3", "all": True},
        "links": {"selector": "a", "attr": "href", "all": True},
        "paragraphs": {"selector": "p", "all": True},
    }

    try:
        original_file_path = await save_data_item_to_storage(
            "https://en.wikipedia.org/wiki/Large_language_model"
        )
        file_path = get_data_file_path(original_file_path)
        assert file_path.endswith(".html")
        file = Path(file_path)
        assert file.exists()
        assert file.stat().st_size > 0

        # A fresh engine without the BS loader registered must not resolve it.
        loader_engine = LoaderEngine()
        preferred_loaders = {"beautiful_soup_loader": {"extraction_rules": extraction_rules}}
        loader = loader_engine.get_loader(
            file_path,
            preferred_loaders=preferred_loaders,
        )

        assert loader is None
    except Exception as e:
        # Fix: the previous message blamed storage even when the
        # loader-resolution assertion was what failed.
        pytest.fail(f"Failed to save data item or resolve loader: {e}")
185
+
186
+
187
@pytest.mark.asyncio
async def test_beautiful_soup_loader_is_selected_loader_if_preferred_loader_provided():
    """After registration, the engine selects BeautifulSoupLoader for the HTML file."""
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)

    extraction_rules = {
        "title": {"selector": "title"},
        "headings": {"selector": "h1, h2, h3", "all": True},
        "links": {"selector": "a", "attr": "href", "all": True},
        "paragraphs": {"selector": "p", "all": True},
    }

    try:
        original_file_path = await save_data_item_to_storage(
            "https://en.wikipedia.org/wiki/Large_language_model"
        )
        file_path = get_data_file_path(original_file_path)
        assert file_path.endswith(".html")
        file = Path(file_path)
        assert file.exists()
        assert file.stat().st_size > 0

        loader_engine = LoaderEngine()
        bs_loader = BeautifulSoupLoader()
        loader_engine.register_loader(bs_loader)
        preferred_loaders = {"beautiful_soup_loader": {"extraction_rules": extraction_rules}}
        loader = loader_engine.get_loader(
            file_path,
            preferred_loaders=preferred_loaders,
        )

        assert loader == bs_loader
    except Exception as e:
        # Fix: the previous message blamed storage even when loader selection
        # was what failed.
        pytest.fail(f"Failed to save data item or select loader: {e}")
220
+
221
+
222
@pytest.mark.asyncio
async def test_beautiful_soup_loader_works_with_and_without_arguments():
    """load_file succeeds both with an empty loader config and with extraction rules."""
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)

    try:
        original_file_path = await save_data_item_to_storage(
            "https://en.wikipedia.org/wiki/Large_language_model"
        )
        file_path = get_data_file_path(original_file_path)
        assert file_path.endswith(".html")
        file = Path(file_path)
        assert file.exists()
        assert file.stat().st_size > 0

        loader_engine = LoaderEngine()
        bs_loader = BeautifulSoupLoader()
        loader_engine.register_loader(bs_loader)

        # Empty config: the loader should fall back to its defaults.
        await loader_engine.load_file(
            file_path,
            preferred_loaders={"beautiful_soup_loader": {}},
        )

        # Explicit extraction rules.
        extraction_rules = {
            "title": {"selector": "title"},
            "headings": {"selector": "h1, h2, h3", "all": True},
            "links": {"selector": "a", "attr": "href", "all": True},
            "paragraphs": {"selector": "p", "all": True},
        }
        await loader_engine.load_file(
            file_path,
            preferred_loaders={"beautiful_soup_loader": {"extraction_rules": extraction_rules}},
        )
    except Exception as e:
        # Fix: the previous message blamed storage even when load_file failed.
        pytest.fail(f"Failed to save or load data item: {e}")
258
+
259
+
260
@pytest.mark.asyncio
async def test_beautiful_soup_loader_successfully_loads_file_if_required_args_present():
    """load_file succeeds when the extraction_rules argument is supplied."""
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)

    try:
        original_file_path = await save_data_item_to_storage(
            "https://en.wikipedia.org/wiki/Large_language_model"
        )
        file_path = get_data_file_path(original_file_path)
        assert file_path.endswith(".html")
        file = Path(file_path)
        assert file.exists()
        assert file.stat().st_size > 0

        loader_engine = LoaderEngine()
        bs_loader = BeautifulSoupLoader()
        loader_engine.register_loader(bs_loader)
        extraction_rules = {
            "title": {"selector": "title"},
            "headings": {"selector": "h1, h2, h3", "all": True},
            "links": {"selector": "a", "attr": "href", "all": True},
            "paragraphs": {"selector": "p", "all": True},
        }
        preferred_loaders = {"beautiful_soup_loader": {"extraction_rules": extraction_rules}}
        await loader_engine.load_file(
            file_path,
            preferred_loaders=preferred_loaders,
        )
    except Exception as e:
        # Fix: the previous message blamed storage even when load_file failed.
        pytest.fail(f"Failed to save or load data item: {e}")
291
+
292
+
293
@pytest.mark.asyncio
async def test_beautiful_soup_loads_file_successfully():
    """End-to-end: a saved HTML page is extracted to a non-empty .txt with the same stem."""
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)

    extraction_rules = {
        "title": {"selector": "title"},
        "headings": {"selector": "h1, h2, h3", "all": True},
        "links": {"selector": "a", "attr": "href", "all": True},
        "paragraphs": {"selector": "p", "all": True},
    }

    try:
        original_file_path = await save_data_item_to_storage(
            "https://en.wikipedia.org/wiki/Large_language_model"
        )
        file_path = get_data_file_path(original_file_path)
        assert file_path.endswith(".html")
        original_file = Path(file_path)
        assert original_file.exists()
        assert original_file.stat().st_size > 0

        loader_engine = LoaderEngine()
        bs_loader = BeautifulSoupLoader()
        loader_engine.register_loader(bs_loader)
        preferred_loaders = {"beautiful_soup_loader": {"extraction_rules": extraction_rules}}
        loader = loader_engine.get_loader(
            file_path,
            preferred_loaders=preferred_loaders,
        )
        assert loader == bs_loader

        cognee_loaded_txt_path = await loader_engine.load_file(
            file_path=file_path, preferred_loaders=preferred_loaders
        )
        cognee_loaded_txt_path = get_data_file_path(cognee_loaded_txt_path)
        assert cognee_loaded_txt_path.endswith(".txt")

        extracted_file = Path(cognee_loaded_txt_path)
        assert extracted_file.exists()
        assert extracted_file.stat().st_size > 0

        # The extracted text file should share the original HTML file's stem.
        original_basename = original_file.stem
        extracted_basename = extracted_file.stem
        assert original_basename == extracted_basename, (
            f"Expected same base name: {original_basename} vs {extracted_basename}"
        )
    except Exception as e:
        # Fix: the previous message blamed storage even when loading or the
        # basename assertions failed.
        pytest.fail(f"Failed to save and load url content: {e}")
@@ -0,0 +1,25 @@
1
import asyncio

from cognee.infrastructure.databases.graph.kuzu.adapter import KuzuAdapter

# This will create the test.db if it doesn't exist


async def main():
    """Repeatedly count nodes in the shared Kuzu database and print the result.

    Intended to run as a subprocess alongside writer.py to exercise
    concurrent access to the same database file.
    """
    adapter = KuzuAdapter("test.db")
    # Fix: one of the six copy-pasted prints used {result} instead of
    # {result[0][0]}, printing the raw result object rather than the count.
    # A loop removes the duplication and the inconsistency; the unused `time`
    # import is dropped as well.
    for _ in range(6):
        result = await adapter.query("MATCH (n:Node) RETURN COUNT(n)")
        print(f"Reader: Found {result[0][0]} nodes")


if __name__ == "__main__":
    asyncio.run(main())
@@ -0,0 +1,31 @@
1
import asyncio

import cognee
from cognee.shared.logging_utils import setup_logging, INFO
from cognee.api.v1.search import SearchType


async def main():
    """Cognify the first dataset and run a graph-completion search against it."""
    await cognee.cognify(datasets=["first_cognify_dataset"])

    query_text = (
        "Tell me what is in the context. Additionally write out 'FIRST_COGNIFY' before your answer"
    )
    search_results = await cognee.search(
        query_type=SearchType.GRAPH_COMPLETION,
        query_text=query_text,
        datasets=["first_cognify_dataset"],
    )

    print("Search results:")
    for result_text in search_results:
        print(result_text)


if __name__ == "__main__":
    setup_logging(log_level=INFO)
    # Fix: the manual new_event_loop/run_until_complete pattern never closed
    # the loop; asyncio.run() shuts down async generators AND closes it.
    asyncio.run(main())
@@ -0,0 +1,31 @@
1
import asyncio

import cognee
from cognee.shared.logging_utils import setup_logging, INFO
from cognee.api.v1.search import SearchType


async def main():
    """Cognify the second dataset and run a graph-completion search against it."""
    await cognee.cognify(datasets=["second_cognify_dataset"])

    query_text = (
        "Tell me what is in the context. Additionally write out 'SECOND_COGNIFY' before your answer"
    )
    search_results = await cognee.search(
        query_type=SearchType.GRAPH_COMPLETION,
        query_text=query_text,
        datasets=["second_cognify_dataset"],
    )

    print("Search results:")
    for result_text in search_results:
        print(result_text)


if __name__ == "__main__":
    setup_logging(log_level=INFO)
    # Fix: the manual new_event_loop/run_until_complete pattern never closed
    # the loop; asyncio.run() shuts down async generators AND closes it.
    asyncio.run(main())
@@ -0,0 +1,32 @@
1
+ import asyncio
2
+ import time
3
+ import uuid
4
+ from cognee.modules.data.processing.document_types import PdfDocument
5
+ from cognee.infrastructure.databases.graph.kuzu.adapter import KuzuAdapter
6
+
7
+
8
def create_node(name):
    """Build a PdfDocument test node whose name doubles as its raw data location."""
    return PdfDocument(
        id=uuid.uuid4(),
        name=name,
        raw_data_location=name,
        external_metadata="test_external_metadata",
        mime_type="test_mime",
    )
17
+
18
+
19
async def main():
    """Insert five test nodes into the shared Kuzu database, then linger.

    The pause keeps the process (and its database handle) alive so a
    concurrently running reader subprocess can observe it.
    """
    adapter = KuzuAdapter("test.db")
    nodes = [create_node(f"Node{i}") for i in range(5)]

    print("Writer: Starting...")
    await adapter.add_nodes(nodes)

    print("writer finished...")

    # Fix: time.sleep() blocks the event loop from inside a coroutine;
    # asyncio.sleep() yields control instead.
    await asyncio.sleep(10)


if __name__ == "__main__":
    asyncio.run(main())
@@ -1,7 +1,6 @@
1
1
  from typing import List
2
2
  from cognee.infrastructure.engine import DataPoint
3
3
  from cognee.tasks.storage.add_data_points import add_data_points
4
- from cognee.infrastructure.databases.graph.get_graph_engine import create_graph_engine
5
4
  import cognee
6
5
  from cognee.infrastructure.databases.graph import get_graph_engine
7
6
  import json
@@ -64,7 +63,6 @@ async def create_connected_test_graph():
64
63
 
65
64
 
66
65
  async def get_metrics(provider: str, include_optional=True):
67
- create_graph_engine.cache_clear()
68
66
  cognee.config.set_graph_database_provider(provider)
69
67
  graph_engine = await get_graph_engine()
70
68
  await graph_engine.delete_graph()
@@ -1,7 +1,12 @@
1
- from cognee.tests.tasks.descriptive_metrics.metrics_test_utils import assert_metrics
2
1
  import asyncio
3
2
 
4
3
 
4
async def main():
    """Run the Neo4j descriptive-metrics assertions with and without optional metrics."""
    # Imported lazily so merely importing this module has no side effects.
    from cognee.tests.tasks.descriptive_metrics.metrics_test_utils import assert_metrics

    await assert_metrics(provider="neo4j", include_optional=False)
    await assert_metrics(provider="neo4j", include_optional=True)


if __name__ == "__main__":
    asyncio.run(main())
@@ -0,0 +1,89 @@
1
+ import os
2
+ import pathlib
3
+ import asyncio
4
+
5
+ import cognee
6
+ import cognee.modules.ingestion as ingestion
7
+ from cognee.infrastructure.llm import get_max_chunk_tokens
8
+ from cognee.infrastructure.llm.extraction import extract_content_graph
9
+ from cognee.modules.chunking.TextChunker import TextChunker
10
+ from cognee.modules.data.processing.document_types import TextDocument
11
+ from cognee.modules.users.methods import get_default_user
12
+ from cognee.shared.data_models import KnowledgeGraph
13
+ from cognee.tasks.documents import extract_chunks_from_documents
14
+ from cognee.tasks.ingestion import save_data_item_to_storage
15
+ from cognee.infrastructure.files.utils.open_data_file import open_data_file
16
+
17
+
18
+ async def extract_graphs(document_chunks):
19
+ """
20
+ Extract graph, and check if entities are present
21
+ """
22
+
23
+ extraction_results = await asyncio.gather(
24
+ *[extract_content_graph(chunk.text, KnowledgeGraph) for chunk in document_chunks]
25
+ )
26
+
27
+ return all(
28
+ any(
29
+ term in node.name.lower()
30
+ for extraction_result in extraction_results
31
+ for node in extraction_result.nodes
32
+ )
33
+ for term in ("qubit", "algorithm", "superposition")
34
+ )
35
+
36
+
37
+ async def main():
38
+ """
39
+ Test how well the entity extraction works. Repeat graph generation a few times.
40
+ If 80% or more graphs are correctly generated, the test passes.
41
+ """
42
+
43
+ file_path = os.path.join(
44
+ pathlib.Path(__file__).parent.parent.parent, "test_data/Quantum_computers.txt"
45
+ )
46
+
47
+ await cognee.prune.prune_data()
48
+ await cognee.prune.prune_system(metadata=True)
49
+
50
+ await cognee.add("NLP is a subfield of computer science.")
51
+
52
+ original_file_path = await save_data_item_to_storage(file_path)
53
+
54
+ async with open_data_file(original_file_path) as file:
55
+ classified_data = ingestion.classify(file)
56
+
57
+ # data_id is the hash of original file contents + owner id to avoid duplicate data
58
+ data_id = ingestion.identify(classified_data, await get_default_user())
59
+
60
+ await cognee.add(file_path)
61
+
62
+ text_document = TextDocument(
63
+ id=data_id,
64
+ type="text",
65
+ mime_type="text/plain",
66
+ name="quantum_text",
67
+ raw_data_location=file_path,
68
+ external_metadata=None,
69
+ )
70
+
71
+ document_chunks = []
72
+ async for chunk in extract_chunks_from_documents(
73
+ [text_document], max_chunk_size=get_max_chunk_tokens(), chunker=TextChunker
74
+ ):
75
+ document_chunks.append(chunk)
76
+
77
+ number_of_reps = 5
78
+
79
+ graph_results = await asyncio.gather(
80
+ *[extract_graphs(document_chunks) for _ in range(number_of_reps)]
81
+ )
82
+
83
+ correct_graphs = [result for result in graph_results if result]
84
+
85
+ assert len(correct_graphs) >= 0.8 * number_of_reps
86
+
87
+
88
+ if __name__ == "__main__":
89
+ asyncio.run(main())