cognee 0.3.0.dev0__py3-none-any.whl → 0.3.2__py3-none-any.whl
This diff shows the changes between publicly released versions of the package, as published to their respective public registries. It is provided for informational purposes only.
- cognee/__init__.py +1 -0
- cognee/api/v1/save/save.py +335 -0
- cognee/api/v1/search/routers/get_search_router.py +3 -3
- cognee/api/v1/ui/__init__.py +1 -0
- cognee/api/v1/ui/ui.py +624 -0
- cognee/cli/_cognee.py +102 -0
- cognee/modules/retrieval/graph_completion_context_extension_retriever.py +1 -1
- cognee/modules/retrieval/graph_completion_cot_retriever.py +1 -1
- cognee/modules/retrieval/graph_completion_retriever.py +1 -1
- cognee/modules/retrieval/insights_retriever.py +12 -11
- cognee/modules/retrieval/temporal_retriever.py +1 -1
- cognee/modules/search/methods/search.py +31 -8
- cognee/tests/test_permissions.py +3 -3
- cognee/tests/test_relational_db_migration.py +3 -5
- cognee/tests/test_save_export_path.py +116 -0
- cognee/tests/test_search_db.py +10 -7
- cognee/tests/unit/modules/retrieval/graph_completion_retriever_context_extension_test.py +12 -6
- cognee/tests/unit/modules/retrieval/graph_completion_retriever_cot_test.py +12 -6
- cognee/tests/unit/modules/retrieval/insights_retriever_test.py +2 -4
- {cognee-0.3.0.dev0.dist-info → cognee-0.3.2.dist-info}/METADATA +2 -2
- {cognee-0.3.0.dev0.dist-info → cognee-0.3.2.dist-info}/RECORD +34 -30
- distributed/pyproject.toml +1 -1
- /cognee/tests/{integration/cli → cli_tests/cli_integration_tests}/__init__.py +0 -0
- /cognee/tests/{integration/cli → cli_tests/cli_integration_tests}/test_cli_integration.py +0 -0
- /cognee/tests/{unit/cli → cli_tests/cli_unit_tests}/__init__.py +0 -0
- /cognee/tests/{unit/cli → cli_tests/cli_unit_tests}/test_cli_commands.py +0 -0
- /cognee/tests/{unit/cli → cli_tests/cli_unit_tests}/test_cli_edge_cases.py +0 -0
- /cognee/tests/{unit/cli → cli_tests/cli_unit_tests}/test_cli_main.py +0 -0
- /cognee/tests/{unit/cli → cli_tests/cli_unit_tests}/test_cli_runner.py +0 -0
- /cognee/tests/{unit/cli → cli_tests/cli_unit_tests}/test_cli_utils.py +0 -0
- {cognee-0.3.0.dev0.dist-info → cognee-0.3.2.dist-info}/WHEEL +0 -0
- {cognee-0.3.0.dev0.dist-info → cognee-0.3.2.dist-info}/entry_points.txt +0 -0
- {cognee-0.3.0.dev0.dist-info → cognee-0.3.2.dist-info}/licenses/LICENSE +0 -0
- {cognee-0.3.0.dev0.dist-info → cognee-0.3.2.dist-info}/licenses/NOTICE.md +0 -0
cognee/__init__.py
CHANGED
@@ -27,6 +27,7 @@ from .api.v1.visualize import visualize_graph, start_visualization_server
 from cognee.modules.visualization.cognee_network_visualization import (
     cognee_network_visualization,
 )
+from .api.v1.ui import start_ui

 # Pipelines
 from .modules import pipelines
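The `__init__.py` change above re-exports `start_ui` at the package root. A minimal sketch of the resulting import surface (everything here is taken from the diff; no arguments are shown because the function's signature does not appear in this listing):

```python
# Both import paths resolve to the same function after this change.
import cognee
from cognee.api.v1.ui import start_ui

assert cognee.start_ui is start_ui
```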
cognee/api/v1/save/save.py
ADDED
@@ -0,0 +1,335 @@
+import os
+import asyncio
+import json
+from typing import Optional, Union, List, Dict
+from uuid import UUID
+
+from pydantic import BaseModel
+
+from cognee.base_config import get_base_config
+from cognee.modules.users.models import User
+from cognee.modules.users.methods import get_default_user
+from cognee.modules.data.methods import get_authorized_existing_datasets, get_dataset_data
+from cognee.infrastructure.files.utils.get_data_file_path import get_data_file_path
+from cognee.infrastructure.llm.LLMGateway import LLMGateway
+from cognee.shared.logging_utils import get_logger
+from cognee.api.v1.search import search
+from cognee.modules.search.types import SearchType
+
+
+logger = get_logger("save")
+
+
+class QuestionsModel(BaseModel):
+    questions: List[str]
+
+
+def _sanitize_filename(name: str) -> str:
+    safe = "".join(c if c.isalnum() or c in ("-", "_", ".", " ") else "_" for c in name)
+    return safe.strip().replace(" ", "_")
+
+
+def _dataset_dir_name(dataset) -> str:
+    # Prefer readable dataset name when available, fallback to id
+    if getattr(dataset, "name", None):
+        return _sanitize_filename(str(dataset.name))
+    return str(dataset.id)
+
+
+def _file_markdown_name(data_item, used_names: set[str]) -> str:
+    # Use original file name if present, else data.name
+    name = getattr(data_item, "name", None) or "file"
+    base = _sanitize_filename(str(name))
+    filename = f"{base}.md"
+    if filename in used_names:
+        short_id = str(getattr(data_item, "id", ""))[:8]
+        filename = f"{base}__{short_id}.md"
+    used_names.add(filename)
+    return filename
+
+
+def _ascii_path_tree(path_str: str) -> str:
+    if not path_str:
+        return "(no path)"
+
+    # Normalize special schemes but keep segments readable
+    try:
+        normalized = get_data_file_path(path_str)
+    except Exception:
+        normalized = path_str
+
+    # Keep the path compact – show last 5 segments
+    parts = [p for p in normalized.replace("\\", "/").split("/") if p]
+    if len(parts) > 6:
+        display = ["…"] + parts[-5:]
+    else:
+        display = parts
+
+    # Render a single-branch tree
+    lines = []
+    for idx, seg in enumerate(display):
+        prefix = "└── " if idx == 0 else (" " * idx + "└── ")
+        lines.append(f"{prefix}{seg}")
+    return "\n".join(lines)
+
+
+async def _get_summary_via_summaries(query_text: str, dataset_id: UUID, top_k: int) -> str:
+    try:
+        results = await search(
+            query_text=query_text,
+            query_type=SearchType.SUMMARIES,
+            dataset_ids=[dataset_id],
+            top_k=top_k,
+        )
+        if not results:
+            return ""
+        texts: List[str] = []
+        for r in results[:top_k]:
+            texts.append(str(r))
+        return "\n\n".join(texts)
+    except Exception as e:
+        logger.error(
+            "SUMMARIES search failed for '%s' in dataset %s: %s",
+            query_text,
+            str(dataset_id),
+            str(e),
+        )
+        return ""
+
+
+async def _generate_questions(file_name: str, summary_text: str) -> List[str]:
+    prompt = (
+        "You are an expert analyst. Given a file and its summary, propose 10 diverse, high-signal "
+        "questions to further explore the file's content, implications, relationships, and gaps. "
+        "Avoid duplicates; vary depth and angle (overview, details, cross-references, temporal, quality).\n\n"
+        f"File: {file_name}\n\nSummary:\n{summary_text[:4000]}"
+    )
+
+    model = await LLMGateway.acreate_structured_output(
+        text_input=prompt,
+        system_prompt="Return strictly a JSON with key 'questions' and value as an array of 10 concise strings.",
+        response_model=QuestionsModel,
+    )
+
+    # model can be either pydantic model or dict-like, normalize
+    try:
+        questions = list(getattr(model, "questions", []))
+    except Exception:
+        questions = []
+
+    # Fallback if the tool returned a dict-like
+    if not questions and isinstance(model, dict):
+        questions = list(model.get("questions", []) or [])
+
+    # Enforce 10 max
+    return questions[:10]
+
+
+async def _run_searches_for_question(
+    question: str, dataset_id: UUID, search_types: List[SearchType], top_k: int
+) -> Dict[str, Union[str, List[dict], List[str]]]:
+    async def run_one(st: SearchType):
+        try:
+            result = await search(
+                query_text=question,
+                query_type=st,
+                dataset_ids=[dataset_id],
+                top_k=top_k,
+            )
+            return st.value, result
+        except Exception as e:
+            logger.error("Search failed for type %s: %s", st.value, str(e))
+            return st.value, [f"Error: {str(e)}"]
+
+    pairs = await asyncio.gather(*[run_one(st) for st in search_types])
+    return {k: v for k, v in pairs}
+
+
+def _format_results_md(results: Dict[str, Union[str, List[dict], List[str]]]) -> str:
+    lines: List[str] = []
+    for st, payload in results.items():
+        lines.append(f"#### {st}")
+        if isinstance(payload, list):
+            # Printed as bullet items; stringify dicts
+            for item in payload[:5]:
+                if isinstance(item, dict):
+                    # compact representation
+                    snippet = json.dumps(item, ensure_ascii=False)[:800]
+                    lines.append(f"- {snippet}")
+                else:
+                    text = str(item)
+                    lines.append(f"- {text[:800]}")
+        else:
+            lines.append(str(payload))
+        lines.append("")
+    return "\n".join(lines)
+
+
+async def save(
+    datasets: Optional[Union[List[str], List[UUID]]] = None,
+    export_root_directory: Optional[str] = None,
+    user: Optional[User] = None,
+    # Configurable knobs
+    max_questions: int = 10,
+    search_types: Optional[List[Union[str, SearchType]]] = None,
+    top_k: int = 5,
+    include_summary: bool = True,
+    include_ascii_tree: bool = True,
+    concurrency: int = 4,
+    timeout: Optional[float] = None,
+) -> Dict[str, str]:
+    """
+    Export per-dataset markdown summaries and search insights for each ingested file.
+
+    For every dataset the user can read:
+    - Create a folder under export_root_directory (or data_root_directory/exports)
+    - For each data item (file), create a .md containing:
+      - Summary of the file (from existing TextSummary nodes)
+      - A small ASCII path tree showing its folder position
+      - Up to N LLM-generated question ideas (configurable)
+      - Results of configured Cognee searches per question
+    Also creates an index.md per dataset with links to files and an optional dataset summary.
+
+    Returns a mapping of dataset_id -> export_directory path.
+    """
+    base_config = get_base_config()
+    export_root = export_root_directory or os.path.join(
+        base_config.data_root_directory, "memory_export"
+    )
+    os.makedirs(export_root, exist_ok=True)
+
+    if user is None:
+        user = await get_default_user()
+
+    datasets_list = await get_authorized_existing_datasets(datasets, "read", user)
+    results: Dict[str, str] = {}
+
+    for dataset in datasets_list:
+        ds_dir = os.path.join(export_root, _dataset_dir_name(dataset))
+        os.makedirs(ds_dir, exist_ok=True)
+        results[str(dataset.id)] = ds_dir
+
+        data_items = await get_dataset_data(dataset.id)
+
+        # Normalize search types
+        if not search_types:
+            effective_search_types = [
+                SearchType.GRAPH_COMPLETION,
+                SearchType.INSIGHTS,
+                SearchType.CHUNKS,
+            ]
+        else:
+            effective_search_types = []
+            for st in search_types:
+                if isinstance(st, SearchType):
+                    effective_search_types.append(st)
+                else:
+                    try:
+                        effective_search_types.append(SearchType[str(st)])
+                    except Exception:
+                        logger.warning("Unknown search type '%s', skipping", str(st))
+
+        sem = asyncio.Semaphore(max(1, int(concurrency)))
+        used_names: set[str] = set()
+        index_entries: List[tuple[str, str]] = []
+
+        async def process_one(data_item):
+            async with sem:
+                file_label = getattr(data_item, "name", str(data_item.id))
+                original_path = getattr(data_item, "original_data_location", None)
+
+                ascii_tree = (
+                    _ascii_path_tree(original_path or file_label) if include_ascii_tree else ""
+                )
+
+                summary_text = ""
+                if include_summary:
+                    # Use SUMMARIES search scoped to dataset to derive file summary
+                    file_query = getattr(data_item, "name", str(data_item.id)) or "file"
+                    summary_text = await _get_summary_via_summaries(file_query, dataset.id, top_k)
+                    if not summary_text:
+                        summary_text = "Summary not available."
+
+                if max_questions == 0:
+                    questions = []
+                else:
+                    questions = await _generate_questions(file_label, summary_text)
+                    if max_questions is not None and max_questions >= 0:
+                        questions = questions[:max_questions]
+
+                async def searches_for_question(q: str):
+                    return await _run_searches_for_question(
+                        q, dataset.id, effective_search_types, top_k
+                    )
+
+                # Run per-question searches concurrently
+                per_q_results = await asyncio.gather(*[searches_for_question(q) for q in questions])
+
+                # Build markdown content
+                md_lines = [f"# {file_label}", ""]
+                if include_ascii_tree:
+                    md_lines.extend(["## Location", "", "```", ascii_tree, "```", ""])
+                if include_summary:
+                    md_lines.extend(["## Summary", "", summary_text, ""])
+
+                md_lines.append("## Question ideas")
+                for idx, q in enumerate(questions, start=1):
+                    md_lines.append(f"- {idx}. {q}")
+                md_lines.append("")
+
+                md_lines.append("## Searches")
+                md_lines.append("")
+                for q, per_type in zip(questions, per_q_results):
+                    md_lines.append(f"### Q: {q}")
+                    md_lines.append(_format_results_md(per_type))
+                    md_lines.append("")
+
+                # Write to file (collision-safe)
+                md_filename = _file_markdown_name(data_item, used_names)
+                export_path = os.path.join(ds_dir, md_filename)
+                tmp_path = export_path + ".tmp"
+                with open(tmp_path, "w", encoding="utf-8") as f:
+                    f.write("\n".join(md_lines))
+                os.replace(tmp_path, export_path)
+
+                index_entries.append((file_label, md_filename))
+
+        tasks = [asyncio.create_task(process_one(item)) for item in data_items]
+
+        if timeout and timeout > 0:
+            try:
+                await asyncio.wait_for(asyncio.gather(*tasks, return_exceptions=True), timeout)
+            except asyncio.TimeoutError:
+                logger.error("Save timed out for dataset %s", str(dataset.id))
+        else:
+            await asyncio.gather(*tasks, return_exceptions=True)

+        # Build dataset index.md with TOC and optional dataset summary via SUMMARIES
+        try:
+            index_lines = [f"# Dataset: {_dataset_dir_name(dataset)}", "", "## Files", ""]
+            for display, fname in sorted(index_entries, key=lambda x: x[0].lower()):
+                index_lines.append(f"- [{display}]({fname})")
+
+            # Dataset summary section
+            try:
+                summaries = await search(
+                    query_text="dataset overview",
+                    query_type=SearchType.SUMMARIES,
+                    dataset_ids=[dataset.id],
+                    top_k=top_k,
+                )
+            except Exception as e:
+                logger.error("Dataset summary search failed: %s", str(e))
+                summaries = []
+
+            if summaries:
+                index_lines.extend(["", "## Dataset summary (top summaries)", ""])
+                for s in summaries[:top_k]:
+                    index_lines.append(f"- {str(s)[:800]}")
+
+            with open(os.path.join(ds_dir, "index.md"), "w", encoding="utf-8") as f:
+                f.write("\n".join(index_lines))
+        except Exception as e:
+            logger.error("Failed to write dataset index for %s: %s", str(dataset.id), str(e))
+
+    return results
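Taken together, `save()` walks every dataset the user is authorized to read and writes one markdown report per file plus an `index.md` per dataset. A minimal usage sketch based on the signature above; the import path follows the file listing (`cognee/api/v1/save/save.py`), while the dataset name, export directory, and knob values are placeholders:

```python
import asyncio

from cognee.api.v1.save.save import save  # module path per the file listing above
from cognee.modules.search.types import SearchType


async def main():
    # Export markdown reports for one dataset; every keyword argument mirrors
    # a parameter shown in the diff and can be omitted to use its default.
    exported = await save(
        datasets=["my_dataset"],           # placeholder dataset name
        export_root_directory="./exports", # placeholder; defaults to <data_root>/memory_export
        max_questions=5,                   # cap LLM-generated question ideas per file
        search_types=[SearchType.GRAPH_COMPLETION, SearchType.CHUNKS],
        top_k=3,
    )
    # Mapping of dataset_id -> export directory, as documented in the docstring.
    for dataset_id, path in exported.items():
        print(dataset_id, "->", path)


asyncio.run(main())
```

Each per-file report follows the `md_lines` construction above: a `## Location` ASCII tree, a `## Summary` section, the `## Question ideas` list, and a `## Searches` section with one `####` block per search type.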
cognee/api/v1/search/routers/get_search_router.py
CHANGED
@@ -1,12 +1,12 @@
 from uuid import UUID
-from typing import Optional
+from typing import Optional, Union, List, Any
 from datetime import datetime
 from pydantic import Field
 from fastapi import Depends, APIRouter
 from fastapi.responses import JSONResponse
 from fastapi.encoders import jsonable_encoder

-from cognee.modules.search.types import SearchType
+from cognee.modules.search.types import SearchType, SearchResult, CombinedSearchResult
 from cognee.api.DTO import InDTO, OutDTO
 from cognee.modules.users.exceptions.exceptions import PermissionDeniedError
 from cognee.modules.users.models import User
@@ -73,7 +73,7 @@ def get_search_router() -> APIRouter:
         except Exception as error:
             return JSONResponse(status_code=500, content={"error": str(error)})

-    @router.post("", response_model=
+    @router.post("", response_model=Union[List[SearchResult], CombinedSearchResult, List])
     async def search(payload: SearchPayloadDTO, user: User = Depends(get_authenticated_user)):
         """
         Search for nodes in the graph database.
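With the widened `response_model`, the search endpoint can now return either a list of `SearchResult` items or a single `CombinedSearchResult` object, so clients should branch on the JSON shape. A rough client-side sketch; the base URL, endpoint path, payload fields, and auth header are illustrative assumptions, not taken from this diff:

```python
import requests

# Assumed local server URL, payload shape, and auth header for illustration only.
response = requests.post(
    "http://localhost:8000/api/v1/search",
    json={"searchType": "GRAPH_COMPLETION", "query": "What changed in 0.3.2?"},
    headers={"Authorization": "Bearer <token>"},
)
payload = response.json()

if isinstance(payload, list):
    # List[SearchResult] (or a bare list): iterate the individual hits.
    for result in payload:
        print(result)
else:
    # CombinedSearchResult: a single aggregated object.
    print(payload)
```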
cognee/api/v1/ui/__init__.py
ADDED
@@ -0,0 +1 @@
+from .ui import start_ui, stop_ui, ui