cognee 0.2.1.dev7__py3-none-any.whl → 0.2.2.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cognee/api/client.py +3 -1
- cognee/api/v1/datasets/routers/get_datasets_router.py +2 -2
- cognee/infrastructure/databases/graph/kuzu/adapter.py +31 -9
- cognee/infrastructure/databases/graph/kuzu/kuzu_migrate.py +281 -0
- cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +103 -64
- cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +10 -3
- cognee/infrastructure/databases/vector/create_vector_engine.py +3 -11
- cognee/modules/data/models/Data.py +2 -2
- cognee/modules/data/processing/document_types/UnstructuredDocument.py +2 -5
- cognee/modules/graph/cognee_graph/CogneeGraph.py +39 -20
- cognee/modules/graph/methods/get_formatted_graph_data.py +1 -1
- cognee/modules/pipelines/operations/run_tasks.py +1 -1
- cognee/modules/pipelines/operations/run_tasks_distributed.py +1 -1
- cognee/modules/retrieval/chunks_retriever.py +23 -1
- cognee/modules/retrieval/code_retriever.py +64 -5
- cognee/modules/retrieval/completion_retriever.py +12 -10
- cognee/modules/retrieval/graph_completion_retriever.py +1 -1
- cognee/modules/retrieval/insights_retriever.py +4 -0
- cognee/modules/retrieval/natural_language_retriever.py +6 -10
- cognee/modules/retrieval/summaries_retriever.py +23 -1
- cognee/modules/retrieval/utils/brute_force_triplet_search.py +23 -4
- cognee/modules/settings/get_settings.py +0 -4
- cognee/modules/settings/save_vector_db_config.py +1 -1
- cognee/tests/unit/modules/retrieval/graph_completion_retriever_test.py +84 -9
- {cognee-0.2.1.dev7.dist-info → cognee-0.2.2.dev0.dist-info}/METADATA +5 -7
- {cognee-0.2.1.dev7.dist-info → cognee-0.2.2.dev0.dist-info}/RECORD +29 -29
- cognee/tests/test_weaviate.py +0 -94
- {cognee-0.2.1.dev7.dist-info → cognee-0.2.2.dev0.dist-info}/WHEEL +0 -0
- {cognee-0.2.1.dev7.dist-info → cognee-0.2.2.dev0.dist-info}/licenses/LICENSE +0 -0
- {cognee-0.2.1.dev7.dist-info → cognee-0.2.2.dev0.dist-info}/licenses/NOTICE.md +0 -0
cognee/api/client.py
CHANGED

@@ -74,7 +74,9 @@ if CORS_ALLOWED_ORIGINS:
         origin.strip() for origin in CORS_ALLOWED_ORIGINS.split(",") if origin.strip()
     ]
 else:
-    allowed_origins = [
+    allowed_origins = [
+        "http://localhost:3000",
+    ]  # Block all except explicitly set origins
 
 app.add_middleware(
     CORSMiddleware,
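Note on the CORS change: when CORS_ALLOWED_ORIGINS is unset, the API now falls back to a single allowed origin instead of permitting everything. A minimal sketch of the parsing behaviour in the hunk above; the environment-variable value and printed result are illustrative only:

import os

# Hypothetical deployment value; the parsing below mirrors the hunk above.
os.environ["CORS_ALLOWED_ORIGINS"] = "https://app.example.com, https://admin.example.com"

CORS_ALLOWED_ORIGINS = os.environ.get("CORS_ALLOWED_ORIGINS")
if CORS_ALLOWED_ORIGINS:
    allowed_origins = [
        origin.strip() for origin in CORS_ALLOWED_ORIGINS.split(",") if origin.strip()
    ]
else:
    allowed_origins = [
        "http://localhost:3000",
    ]  # Block all except explicitly set origins

print(allowed_origins)  # ['https://app.example.com', 'https://admin.example.com']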
cognee/api/v1/datasets/routers/get_datasets_router.py
CHANGED

@@ -290,7 +290,7 @@ def get_datasets_router() -> APIRouter:
         if dataset is None:
             raise DatasetNotFoundError(message=f"Dataset ({str(dataset_id)}) not found.")
 
-        graph_data = await get_formatted_graph_data(dataset)
+        graph_data = await get_formatted_graph_data(dataset.id, user.id)
 
         return graph_data
 
@@ -353,7 +353,7 @@ def get_datasets_router() -> APIRouter:
 
     @router.get("/status", response_model=dict[str, PipelineRunStatus])
     async def get_dataset_status(
-        datasets: Annotated[List[UUID], Query(alias="dataset")] =
+        datasets: Annotated[List[UUID], Query(alias="dataset")] = [],
        user: User = Depends(get_authenticated_user),
     ):
         """
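With the new [] default, calling the status endpoint without any dataset query parameter is valid and simply reports on zero datasets. A sketch of how the Query(alias="dataset") list surfaces to clients; the host and route prefix are assumptions for illustration:

import httpx

# The alias means the query key is "dataset", repeated once per UUID.
response = httpx.get(
    "http://localhost:8000/api/v1/datasets/status",  # hypothetical deployment URL
    params=[
        ("dataset", "11111111-1111-1111-1111-111111111111"),
        ("dataset", "22222222-2222-2222-2222-222222222222"),
    ],
)
print(response.json())  # maps each dataset id to its PipelineRunStatus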
cognee/infrastructure/databases/graph/kuzu/adapter.py
CHANGED

@@ -72,11 +72,36 @@ class KuzuAdapter(GraphDBInterface):
 
         run_sync(file_storage.ensure_directory_exists())
 
-
-        self.
-
-
-
+        try:
+            self.db = Database(
+                self.db_path,
+                buffer_pool_size=2048 * 1024 * 1024,  # 2048MB buffer pool
+                max_db_size=4096 * 1024 * 1024,
+            )
+        except RuntimeError:
+            from .kuzu_migrate import read_kuzu_storage_version
+            import kuzu
+
+            kuzu_db_version = read_kuzu_storage_version(self.db_path)
+            if (
+                kuzu_db_version == "0.9.0" or kuzu_db_version == "0.8.2"
+            ) and kuzu_db_version != kuzu.__version__:
+                # Try to migrate kuzu database to latest version
+                from .kuzu_migrate import kuzu_migration
+
+                kuzu_migration(
+                    new_db=self.db_path + "_new",
+                    old_db=self.db_path,
+                    new_version=kuzu.__version__,
+                    old_version=kuzu_db_version,
+                    overwrite=True,
+                )
+
+                self.db = Database(
+                    self.db_path,
+                    buffer_pool_size=2048 * 1024 * 1024,  # 2048MB buffer pool
+                    max_db_size=4096 * 1024 * 1024,
+                )
 
         self.db.init_database()
         self.connection = Connection(self.db)

@@ -1438,11 +1463,8 @@ class KuzuAdapter(GraphDBInterface):
         It raises exceptions for failures occurring during deletion processes.
         """
         try:
-            # Use DETACH DELETE to remove both nodes and their relationships in one operation
-            await self.query("MATCH (n:Node) DETACH DELETE n")
-            logger.info("Cleared all data from graph while preserving structure")
-
             if self.connection:
+                self.connection.close()
                 self.connection = None
             if self.db:
                 self.db.close()
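Taken together, the adapter now treats a RuntimeError from Database(...) as a possible storage-version mismatch: it reads the version stamped in the database files, migrates 0.8.2/0.9.0 stores up to the installed Kuzu release, and reopens the database. A minimal sketch of driving the same migration by hand with the helpers from the new kuzu_migrate module (shown in full below); the database path is hypothetical:

import kuzu

from cognee.infrastructure.databases.graph.kuzu.kuzu_migrate import (
    kuzu_migration,
    read_kuzu_storage_version,
)

db_path = "/data/cognee/graph_db"  # hypothetical on-disk database location

on_disk_version = read_kuzu_storage_version(db_path)
if on_disk_version != kuzu.__version__:
    kuzu_migration(
        new_db=db_path + "_new",  # imported here, then moved over the old path
        old_db=db_path,
        new_version=kuzu.__version__,
        old_version=on_disk_version,
        overwrite=True,  # keep the old database as a *_old_<version> backup
    )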
cognee/infrastructure/databases/graph/kuzu/kuzu_migrate.py
ADDED

@@ -0,0 +1,281 @@
+#!/usr/bin/env python3
+"""
+Kuzu Database Migration Script
+
+This script migrates Kuzu databases between different versions by:
+1. Setting up isolated Python environments for each Kuzu version
+2. Exporting data from the source database using the old version
+3. Importing data into the target database using the new version
+4. If overwrite is enabled target database will replace source database and source database will have the prefix _old
+5. If delete-old is enabled target database will be renamed to source database and source database will be deleted
+
+The script automatically handles:
+- Environment setup (creates virtual environments as needed)
+- Export/import validation
+- Error handling and reporting
+
+Usage Examples:
+    # Basic migration from 0.9.0 to 0.11.0
+    python kuzu_migrate.py --old-version 0.9.0 --new-version 0.11.0 --old-db /path/to/old/database --new-db /path/to/new/database
+
+Requirements:
+    - Python 3.7+
+    - Internet connection (to download Kuzu packages)
+    - Sufficient disk space for virtual environments and temporary exports
+
+Notes:
+    - Can only be used to migrate to newer Kuzu versions, from 0.11.0 onwards
+"""
+
+import tempfile
+import sys
+import struct
+import shutil
+import subprocess
+import argparse
+import os
+
+
+kuzu_version_mapping = {
+    34: "0.7.0",
+    35: "0.7.1",
+    36: "0.8.2",
+    37: "0.9.0",
+    38: "0.10.1",
+    39: "0.11.0",
+}
+
+
+def read_kuzu_storage_version(kuzu_db_path: str) -> int:
+    """
+    Reads the Kùzu storage version code from the first catalog.bin file bytes.
+
+    :param kuzu_db_path: Path to the Kuzu database file/directory.
+    :return: Storage version code as an integer.
+    """
+    if os.path.isdir(kuzu_db_path):
+        kuzu_version_file_path = os.path.join(kuzu_db_path, "catalog.kz")
+        if not os.path.isfile(kuzu_version_file_path):
+            raise FileExistsError("Kuzu catalog.kz file does not exist")
+    else:
+        kuzu_version_file_path = kuzu_db_path
+
+    with open(kuzu_version_file_path, "rb") as f:
+        # Skip the 3-byte magic "KUZ" and one byte of padding
+        f.seek(4)
+        # Read the next 8 bytes as a little-endian unsigned 64-bit integer
+        data = f.read(8)
+        if len(data) < 8:
+            raise ValueError(
+                f"File '{kuzu_version_file_path}' does not contain a storage version code."
+            )
+        version_code = struct.unpack("<Q", data)[0]
+
+    if kuzu_version_mapping.get(version_code):
+        return kuzu_version_mapping[version_code]
+    else:
+        ValueError("Could not map version_code to proper Kuzu version.")
+
+
+def ensure_env(version: str, export_dir) -> str:
+    """
+    Create (if needed) a venv at .kuzu_envs/{version} and install kuzu=={version}.
+    Returns the path to the venv's python executable.
+    """
+    # Use temp directory to create venv
+    kuzu_envs_dir = os.path.join(export_dir, ".kuzu_envs")
+
+    # venv base under the script directory
+    base = os.path.join(kuzu_envs_dir, version)
+    py_bin = os.path.join(base, "bin", "python")
+    # If environment already exists clean it
+    if os.path.isfile(py_bin):
+        shutil.rmtree(base)
+
+    print(f"→ Setting up venv for Kùzu {version}...", file=sys.stderr)
+    # Create venv
+    # NOTE: Running python in debug mode can cause issues with creating a virtual environment from that python instance
+    subprocess.run([sys.executable, "-m", "venv", base], check=True)
+    # Install the specific Kùzu version
+    subprocess.run([py_bin, "-m", "pip", "install", "--upgrade", "pip"], check=True)
+    subprocess.run([py_bin, "-m", "pip", "install", f"kuzu=={version}"], check=True)
+    return py_bin
+
+
+def run_migration_step(python_exe: str, db_path: str, cypher: str):
+    """
+    Uses the given python_exe to execute a short snippet that
+    connects to the Kùzu database and runs a Cypher command.
+    """
+    snippet = f"""
+import kuzu
+db = kuzu.Database(r"{db_path}")
+conn = kuzu.Connection(db)
+conn.execute(r\"\"\"{cypher}\"\"\")
+"""
+    proc = subprocess.run([python_exe, "-c", snippet], capture_output=True, text=True)
+    if proc.returncode != 0:
+        print(f"[ERROR] {cypher} failed:\n{proc.stderr}", file=sys.stderr)
+        sys.exit(proc.returncode)
+
+
+def kuzu_migration(new_db, old_db, new_version, old_version=None, overwrite=None, delete_old=None):
+    """
+    Main migration function that handles the complete migration process.
+    """
+    print(f"🔄 Migrating Kuzu database from {old_version} to {new_version}", file=sys.stderr)
+    print(f"📂 Source: {old_db}", file=sys.stderr)
+    print("", file=sys.stderr)
+
+    # If version of old kuzu db is not provided try to determine it based on file info
+    if not old_version:
+        old_version = read_kuzu_storage_version(old_db)
+
+    # Check if old database exists
+    if not os.path.exists(old_db):
+        print(f"Source database '{old_db}' does not exist.", file=sys.stderr)
+        sys.exit(1)
+
+    # Prepare target - ensure parent directory exists but remove target if it exists
+    parent_dir = os.path.dirname(new_db)
+    if parent_dir:
+        os.makedirs(parent_dir, exist_ok=True)
+
+    if os.path.exists(new_db):
+        raise FileExistsError(
+            "File already exists at new database location, remove file or change new database file path to continue"
+        )
+
+    # Use temp directory for all processing, it will be cleaned up after with statement
+    with tempfile.TemporaryDirectory() as export_dir:
+        # Set up environments
+        print(f"Setting up Kuzu {old_version} environment...", file=sys.stderr)
+        old_py = ensure_env(old_version, export_dir)
+        print(f"Setting up Kuzu {new_version} environment...", file=sys.stderr)
+        new_py = ensure_env(new_version, export_dir)
+
+        export_file = os.path.join(export_dir, "kuzu_export")
+        print(f"Exporting old DB → {export_dir}", file=sys.stderr)
+        run_migration_step(old_py, old_db, f"EXPORT DATABASE '{export_file}'")
+        print("Export complete.", file=sys.stderr)
+
+        # Check if export files were created and have content
+        schema_file = os.path.join(export_file, "schema.cypher")
+        if not os.path.exists(schema_file) or os.path.getsize(schema_file) == 0:
+            raise ValueError(f"Schema file not found: {schema_file}")
+
+        print(f"Importing into new DB at {new_db}", file=sys.stderr)
+        run_migration_step(new_py, new_db, f"IMPORT DATABASE '{export_file}'")
+        print("Import complete.", file=sys.stderr)
+
+    # Rename new kuzu database to old kuzu database name if enabled
+    if overwrite or delete_old:
+        # Remove kuzu lock from migrated DB
+        lock_file = new_db + ".lock"
+        if os.path.exists(lock_file):
+            os.remove(lock_file)
+        rename_databases(old_db, old_version, new_db, delete_old)
+
+    print("✅ Kuzu graph database migration finished successfully!")
+
+
+def rename_databases(old_db: str, old_version: str, new_db: str, delete_old: bool):
+    """
+    When overwrite is enabled, back up the original old_db (file with .lock and .wal or directory)
+    by renaming it to *_old, and replace it with the newly imported new_db files.
+
+    When delete_old is enabled replace the old database with the new one and delete old database
+    """
+    base_dir = os.path.dirname(old_db)
+    name = os.path.basename(old_db.rstrip(os.sep))
+    # Add _old_ and version info to backup graph database
+    backup_database_name = f"{name}_old_" + old_version.replace(".", "_")
+    backup_base = os.path.join(base_dir, backup_database_name)
+
+    if os.path.isfile(old_db):
+        # File-based database: handle main file and accompanying lock/WAL
+        for ext in ["", ".wal"]:
+            src = old_db + ext
+            dst = backup_base + ext
+            if os.path.exists(src):
+                if delete_old:
+                    os.remove(src)
+                else:
+                    os.rename(src, dst)
+                    print(f"Renamed '{src}' to '{dst}'", file=sys.stderr)
+    elif os.path.isdir(old_db):
+        # Directory-based Kuzu database
+        backup_dir = backup_base
+        if delete_old:
+            shutil.rmtree(old_db)
+        else:
+            os.rename(old_db, backup_dir)
+            print(f"Renamed directory '{old_db}' to '{backup_dir}'", file=sys.stderr)
+    else:
+        print(f"Original database path '{old_db}' not found for renaming.", file=sys.stderr)
+        sys.exit(1)
+
+    # Now move new files into place
+    for ext in ["", ".wal"]:
+        src_new = new_db + ext
+        dst_new = os.path.join(base_dir, name + ext)
+        if os.path.exists(src_new):
+            os.rename(src_new, dst_new)
+            print(f"Renamed '{src_new}' to '{dst_new}'", file=sys.stderr)
+
+
+def main():
+    p = argparse.ArgumentParser(
+        description="Migrate Kùzu DB via PyPI versions",
+        epilog="""
+Examples:
+  %(prog)s --old-version 0.9.0 --new-version 0.11.0 \\
+    --old-db /path/to/old/db --new-db /path/to/new/db --overwrite
+
+Note: This script will create temporary virtual environments in .kuzu_envs/ directory
+to isolate different Kuzu versions.
+        """,
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    p.add_argument(
+        "--old-version",
+        required=False,
+        default=None,
+        help="Source Kuzu version (e.g., 0.9.0). If not provided automatic kuzu version detection will be attempted.",
+    )
+    p.add_argument("--new-version", required=True, help="Target Kuzu version (e.g., 0.11.0)")
+    p.add_argument("--old-db", required=True, help="Path to source database directory")
+    p.add_argument(
+        "--new-db",
+        required=True,
+        help="Path to target database directory, it can't be the same path as the old database. Use the overwrite flag if you want to replace the old database with the new one.",
+    )
+    p.add_argument(
+        "--overwrite",
+        required=False,
+        action="store_true",
+        default=False,
+        help="Rename new-db to the old-db name and location, keeps old-db as backup if delete-old is not True",
+    )
+    p.add_argument(
+        "--delete-old",
+        required=False,
+        action="store_true",
+        default=False,
+        help="When overwrite and delete-old is True old-db will not be stored as backup",
+    )
+
+    args = p.parse_args()
+
+    kuzu_migration(
+        new_db=args.new_db,
+        old_db=args.old_db,
+        new_version=args.new_version,
+        old_version=args.old_version,
+        overwrite=args.overwrite,
+        delete_old=args.delete_old,
+    )
+
+
+if __name__ == "__main__":
+    main()
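Beyond the programmatic call made by KuzuAdapter, the script also runs standalone. Hedged usage sketches assembled from the argparse flags above; all paths are hypothetical, and --old-version may be omitted to let read_kuzu_storage_version detect it from the database files:

# Migrate in place, keeping the source as a *_old_0_9_0 backup:
python kuzu_migrate.py --old-version 0.9.0 --new-version 0.11.0 \
    --old-db /data/cognee/graph_db --new-db /data/cognee/graph_db_new --overwrite

# Same, but discard the source database once the import succeeds:
python kuzu_migrate.py --new-version 0.11.0 \
    --old-db /data/cognee/graph_db --new-db /data/cognee/graph_db_new --overwrite --delete-old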
cognee/infrastructure/databases/graph/neo4j_driver/adapter.py
CHANGED

@@ -33,7 +33,7 @@ from .neo4j_metrics_utils import (
 from .deadlock_retry import deadlock_retry
 
 
-logger = get_logger("Neo4jAdapter"
+logger = get_logger("Neo4jAdapter")
 
 BASE_LABEL = "__Node__"
 

@@ -870,34 +870,52 @@ class Neo4jAdapter(GraphDBInterface):
 
         A tuple containing two lists: nodes and edges with their properties.
         """
-
+        import time
 
-
+        start_time = time.time()
 
-
-
-
-
+        try:
+            # Retrieve nodes
+            query = "MATCH (n) RETURN ID(n) AS id, labels(n) AS labels, properties(n) AS properties"
+            result = await self.query(query)
+
+            nodes = []
+            for record in result:
+                nodes.append(
+                    (
+                        record["properties"]["id"],
+                        record["properties"],
+                    )
+                )
+
+            # Retrieve edges
+            query = """
+            MATCH (n)-[r]->(m)
+            RETURN ID(n) AS source, ID(m) AS target, TYPE(r) AS type, properties(r) AS properties
+            """
+            result = await self.query(query)
+
+            edges = []
+            for record in result:
+                edges.append(
+                    (
+                        record["properties"]["source_node_id"],
+                        record["properties"]["target_node_id"],
+                        record["type"],
+                        record["properties"],
+                    )
+                )
+
+            retrieval_time = time.time() - start_time
+            logger.info(
+                f"Retrieved {len(nodes)} nodes and {len(edges)} edges in {retrieval_time:.2f} seconds"
             )
-            for record in result
-        ]
 
-
-            MATCH (n)-[r]->(m)
-            RETURN ID(n) AS source, ID(m) AS target, TYPE(r) AS type, properties(r) AS properties
-            """
-        result = await self.query(query)
-        edges = [
-            (
-                record["properties"]["source_node_id"],
-                record["properties"]["target_node_id"],
-                record["type"],
-                record["properties"],
-            )
-            for record in result
-        ]
+            return (nodes, edges)
 
-
+        except Exception as e:
+            logger.error(f"Error during graph data retrieval: {str(e)}")
+            raise
 
     async def get_nodeset_subgraph(
         self, node_type: Type[Any], node_name: List[str]

@@ -918,50 +936,71 @@ class Neo4jAdapter(GraphDBInterface):
         - Tuple[List[Tuple[int, dict]], List[Tuple[int, int, str, dict]]}: A tuple
           containing nodes and edges in the requested subgraph.
         """
-
+        import time
 
-
-        UNWIND $names AS wantedName
-        MATCH (n:`{label}`)
-        WHERE n.name = wantedName
-        WITH collect(DISTINCT n) AS primary
-        UNWIND primary AS p
-        OPTIONAL MATCH (p)--(nbr)
-        WITH primary, collect(DISTINCT nbr) AS nbrs
-        WITH primary + nbrs AS nodelist
-        UNWIND nodelist AS node
-        WITH collect(DISTINCT node) AS nodes
-        MATCH (a)-[r]-(b)
-        WHERE a IN nodes AND b IN nodes
-        WITH nodes, collect(DISTINCT r) AS rels
-        RETURN
-          [n IN nodes |
-             {{ id: n.id,
-                properties: properties(n) }}] AS rawNodes,
-          [r IN rels |
-             {{ type: type(r),
-                properties: properties(r) }}] AS rawRels
-        """
+        start_time = time.time()
 
-
-
-        return [], []
+        try:
+            label = node_type.__name__
 
-
-
+            query = f"""
+            UNWIND $names AS wantedName
+            MATCH (n:`{label}`)
+            WHERE n.name = wantedName
+            WITH collect(DISTINCT n) AS primary
+            UNWIND primary AS p
+            OPTIONAL MATCH (p)--(nbr)
+            WITH primary, collect(DISTINCT nbr) AS nbrs
+            WITH primary + nbrs AS nodelist
+            UNWIND nodelist AS node
+            WITH collect(DISTINCT node) AS nodes
+            MATCH (a)-[r]-(b)
+            WHERE a IN nodes AND b IN nodes
+            WITH nodes, collect(DISTINCT r) AS rels
+            RETURN
+              [n IN nodes |
+                 {{ id: n.id,
+                    properties: properties(n) }}] AS rawNodes,
+              [r IN rels |
+                 {{ type: type(r),
+                    properties: properties(r) }}] AS rawRels
+            """
 
-
-
-
-
-
-
+            result = await self.query(query, {"names": node_name})
+
+            if not result:
+                return [], []
+
+            raw_nodes = result[0]["rawNodes"]
+            raw_rels = result[0]["rawRels"]
+
+            # Process nodes
+            nodes = []
+            for n in raw_nodes:
+                nodes.append((n["properties"]["id"], n["properties"]))
+
+            # Process edges
+            edges = []
+            for r in raw_rels:
+                edges.append(
+                    (
+                        r["properties"]["source_node_id"],
+                        r["properties"]["target_node_id"],
+                        r["type"],
+                        r["properties"],
+                    )
+                )
+
+            retrieval_time = time.time() - start_time
+            logger.info(
+                f"Retrieved {len(nodes)} nodes and {len(edges)} edges for {node_type.__name__} in {retrieval_time:.2f} seconds"
             )
-            for r in raw_rels
-        ]
 
-
+            return nodes, edges
+
+        except Exception as e:
+            logger.error(f"Error during nodeset subgraph retrieval: {str(e)}")
+            raise
 
     async def get_filtered_graph_data(self, attribute_filters):
         """

@@ -1011,8 +1050,8 @@ class Neo4jAdapter(GraphDBInterface):
 
         edges = [
             (
-                record["
-                record["
+                record["properties"]["source_node_id"],
+                record["properties"]["target_node_id"],
                 record["type"],
                 record["properties"],
             )
cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py
CHANGED

@@ -49,9 +49,16 @@ class SQLAlchemyAdapter:
 
             run_sync(self.pull_from_s3())
 
-
-
-
+        if "sqlite" in connection_string:
+            self.engine = create_async_engine(
+                connection_string,
+                poolclass=NullPool,
+            )
+        else:
+            self.engine = create_async_engine(
+                connection_string, pool_size=12, max_overflow=12, poolclass=None
+            )
+
         self.sessionmaker = async_sessionmaker(bind=self.engine, expire_on_commit=False)
 
     async def push_to_s3(self) -> None:
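The pooling split here is deliberate: pooled async connections to a file-backed SQLite database can hold the file lock and go stale between uses, so NullPool opens and closes a connection per checkout, while server databases benefit from a sized pool. A minimal sketch of the same split, assuming SQLAlchemy 2.x async support with the aiosqlite and asyncpg drivers installed; both connection strings are hypothetical:

from sqlalchemy.ext.asyncio import create_async_engine
from sqlalchemy.pool import NullPool

# SQLite: no pooling; each checkout opens a fresh connection and releases the file lock.
sqlite_engine = create_async_engine(
    "sqlite+aiosqlite:///./cognee.db",
    poolclass=NullPool,
)

# Server databases: keep a steady pool, with limited overflow for short bursts.
postgres_engine = create_async_engine(
    "postgresql+asyncpg://user:password@localhost/cognee",
    pool_size=12,
    max_overflow=12,
)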
cognee/infrastructure/databases/vector/create_vector_engine.py
CHANGED

@@ -19,7 +19,7 @@ def create_vector_engine(
     for each provider, raising an EnvironmentError if any are missing, or ImportError if the
     ChromaDB package is not installed.
 
-    Supported providers include:
+    Supported providers include: Qdrant, pgvector, FalkorDB, ChromaDB, and
     LanceDB.
 
     Parameters:

@@ -30,7 +30,7 @@ def create_vector_engine(
       providers.
     - vector_db_key (str): The API key or access token for the vector database instance.
     - vector_db_provider (str): The name of the vector database provider to use (e.g.,
-      '
+      'qdrant', 'pgvector').
 
     Returns:
     --------

@@ -48,15 +48,7 @@ def create_vector_engine(
             embedding_engine=embedding_engine,
         )
 
-    if vector_db_provider == "
-        from .weaviate_db import WeaviateAdapter
-
-        if not (vector_db_url and vector_db_key):
-            raise EnvironmentError("Missing requred Weaviate credentials!")
-
-        return WeaviateAdapter(vector_db_url, vector_db_key, embedding_engine=embedding_engine)
-
-    elif vector_db_provider == "qdrant":
+    if vector_db_provider == "qdrant":
         if not (vector_db_url and vector_db_key):
             raise EnvironmentError("Missing requred Qdrant credentials!")
 
cognee/modules/data/models/Data.py
CHANGED

@@ -18,12 +18,12 @@ class Data(Base):
     mime_type = Column(String)
     raw_data_location = Column(String)
     owner_id = Column(UUID, index=True)
-    tenant_id = Column(UUID, index=True,
+    tenant_id = Column(UUID, index=True, nullable=True)
     content_hash = Column(String)
     external_metadata = Column(JSON)
     node_set = Column(JSON, nullable=True)  # Store NodeSet as JSON list of strings
     token_count = Column(Integer)
-    data_size = Column(Integer)  # File size in bytes
+    data_size = Column(Integer, nullable=True)  # File size in bytes
     created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc))
     updated_at = Column(DateTime(timezone=True), onupdate=lambda: datetime.now(timezone.utc))
 
|
@@ -18,11 +18,8 @@ class UnstructuredDocument(Document):
|
|
|
18
18
|
except ModuleNotFoundError:
|
|
19
19
|
raise UnstructuredLibraryImportError
|
|
20
20
|
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
elements = partition(file=f, content_type=self.mime_type)
|
|
24
|
-
else:
|
|
25
|
-
elements = partition(self.raw_data_location, content_type=self.mime_type)
|
|
21
|
+
async with open_data_file(self.raw_data_location, mode="rb") as f:
|
|
22
|
+
elements = partition(file=f, content_type=self.mime_type)
|
|
26
23
|
|
|
27
24
|
in_memory_file = StringIO("\n\n".join([str(el) for el in elements]))
|
|
28
25
|
in_memory_file.seek(0)
|