ltcai 3.6.0 → 4.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +39 -31
- package/docs/CHANGELOG.md +64 -0
- package/docs/REALTIME_COLLABORATION.md +3 -3
- package/docs/V3_FRONTEND.md +9 -8
- package/docs/V4_BRAIN_ARCHITECTURE.md +322 -0
- package/docs/V4_DIGITAL_BRAIN_RECOVERY.md +552 -0
- package/docs/V4_IMPLEMENTATION_PLAN.md +470 -0
- package/docs/kg-schema.md +51 -53
- package/docs/spec-vs-impl.md +10 -10
- package/kg_schema.py +2 -520
- package/knowledge_graph.py +37 -4629
- package/knowledge_graph_api.py +11 -127
- package/latticeai/__init__.py +1 -1
- package/latticeai/api/admin.py +16 -17
- package/latticeai/api/agents.py +20 -7
- package/latticeai/api/auth.py +46 -15
- package/latticeai/api/chat.py +112 -76
- package/latticeai/api/health.py +1 -1
- package/latticeai/api/hooks.py +1 -1
- package/latticeai/api/invitations.py +100 -0
- package/latticeai/api/knowledge_graph.py +139 -0
- package/latticeai/api/local_files.py +1 -1
- package/latticeai/api/mcp.py +23 -11
- package/latticeai/api/memory.py +1 -1
- package/latticeai/api/models.py +1 -1
- package/latticeai/api/network.py +81 -0
- package/latticeai/api/plugins.py +3 -6
- package/latticeai/api/realtime.py +5 -8
- package/latticeai/api/search.py +26 -2
- package/latticeai/api/security_dashboard.py +2 -3
- package/latticeai/api/setup.py +2 -2
- package/latticeai/api/static_routes.py +11 -16
- package/latticeai/api/tools.py +3 -0
- package/latticeai/api/ui_redirects.py +26 -0
- package/latticeai/api/workflow_designer.py +85 -6
- package/latticeai/api/workspace.py +93 -57
- package/latticeai/app_factory.py +1781 -0
- package/latticeai/brain/__init__.py +18 -0
- package/latticeai/brain/_kg_common.py +1123 -0
- package/latticeai/brain/context.py +213 -0
- package/latticeai/brain/conversations.py +236 -0
- package/latticeai/brain/discovery.py +1455 -0
- package/latticeai/brain/documents.py +218 -0
- package/latticeai/brain/identity.py +175 -0
- package/latticeai/brain/ingest.py +644 -0
- package/latticeai/brain/memory.py +102 -0
- package/latticeai/brain/network.py +205 -0
- package/latticeai/brain/projection.py +561 -0
- package/latticeai/brain/provenance.py +401 -0
- package/latticeai/brain/retrieval.py +1316 -0
- package/latticeai/brain/schema.py +640 -0
- package/latticeai/brain/store.py +216 -0
- package/latticeai/brain/write_master.py +225 -0
- package/latticeai/core/agent.py +31 -7
- package/latticeai/core/audit.py +0 -7
- package/latticeai/core/config.py +1 -1
- package/latticeai/core/context_builder.py +1 -2
- package/latticeai/core/enterprise.py +1 -1
- package/latticeai/core/graph_curator.py +2 -2
- package/latticeai/core/invitations.py +131 -0
- package/latticeai/core/marketplace.py +1 -1
- package/latticeai/core/mcp_registry.py +791 -0
- package/latticeai/core/model_compat.py +1 -1
- package/latticeai/core/model_resolution.py +0 -1
- package/latticeai/core/multi_agent.py +238 -4
- package/latticeai/core/policy.py +54 -0
- package/latticeai/core/realtime.py +65 -44
- package/latticeai/core/security.py +1 -1
- package/latticeai/core/sessions.py +66 -10
- package/latticeai/core/users.py +147 -0
- package/latticeai/core/workflow_engine.py +114 -2
- package/latticeai/core/workspace_os.py +477 -29
- package/latticeai/models/__init__.py +7 -0
- package/latticeai/models/router.py +779 -0
- package/latticeai/server_app.py +29 -1536
- package/latticeai/services/agent_runtime.py +243 -4
- package/latticeai/services/app_context.py +75 -14
- package/latticeai/services/ingestion.py +47 -0
- package/latticeai/services/kg_portability.py +33 -3
- package/latticeai/services/memory_service.py +39 -11
- package/latticeai/services/model_runtime.py +2 -5
- package/latticeai/services/platform_runtime.py +100 -23
- package/latticeai/services/run_executor.py +328 -0
- package/latticeai/services/search_service.py +17 -8
- package/latticeai/services/tool_dispatch.py +12 -2
- package/latticeai/services/triggers.py +241 -0
- package/latticeai/services/upload_service.py +37 -12
- package/latticeai/services/workspace_service.py +55 -16
- package/llm_router.py +29 -772
- package/ltcai_cli.py +1 -2
- package/mcp_registry.py +25 -788
- package/p_reinforce.py +124 -14
- package/package.json +10 -20
- package/scripts/bump_version.py +99 -0
- package/scripts/generate_diagrams.py +0 -1
- package/scripts/lint_v3.mjs +105 -18
- package/scripts/validate_release_artifacts.py +0 -1
- package/scripts/wheel_smoke.py +142 -0
- package/server.py +11 -7
- package/setup_wizard.py +1142 -0
- package/static/sw.js +81 -52
- package/static/v3/asset-manifest.json +33 -25
- package/static/v3/css/{lattice.base.e4cdd05d.css → lattice.base.49deefb5.css} +1 -1
- package/static/v3/css/lattice.base.css +1 -1
- package/static/v3/css/{lattice.components.9b49d614.css → lattice.components.cde18231.css} +1 -1
- package/static/v3/css/lattice.components.css +1 -1
- package/static/v3/css/{lattice.shell.8fcc9d33.css → lattice.shell.29d36d85.css} +1 -1
- package/static/v3/css/lattice.shell.css +1 -1
- package/static/v3/css/{lattice.tokens.e7018963.css → lattice.tokens.304cbc40.css} +3 -0
- package/static/v3/css/lattice.tokens.css +3 -0
- package/static/v3/css/{lattice.views.22f69117.css → lattice.views.0a18b6c5.css} +2 -2
- package/static/v3/css/lattice.views.css +2 -2
- package/static/v3/index.html +3 -4
- package/static/v3/js/{app.c541f955.js → app.c5c80c46.js} +1 -1
- package/static/v3/js/core/{api.33d6320e.js → api.ba0fbf14.js} +58 -1
- package/static/v3/js/core/api.js +57 -0
- package/static/v3/js/core/i18n.880e1fec.js +575 -0
- package/static/v3/js/core/i18n.js +575 -0
- package/static/v3/js/core/routes.37522821.js +101 -0
- package/static/v3/js/core/routes.js +71 -63
- package/static/v3/js/core/{shell.8c163e0e.js → shell.e3f6bbfa.js} +68 -39
- package/static/v3/js/core/shell.js +66 -37
- package/static/v3/js/core/{store.34ebd5e6.js → store.7b2aa044.js} +11 -1
- package/static/v3/js/core/store.js +11 -1
- package/static/v3/js/views/account.eff40715.js +143 -0
- package/static/v3/js/views/account.js +143 -0
- package/static/v3/js/views/activity.0d271ef9.js +67 -0
- package/static/v3/js/views/activity.js +67 -0
- package/static/v3/js/views/{admin-users.03bac88c.js → admin-users.f7ac7b43.js} +4 -6
- package/static/v3/js/views/admin-users.js +4 -6
- package/static/v3/js/views/{agents.014d0b74.js → agents.17c5288d.js} +35 -12
- package/static/v3/js/views/agents.js +35 -12
- package/static/v3/js/views/{chat.e6dd7dd0.js → chat.e250e2cc.js} +23 -0
- package/static/v3/js/views/chat.js +23 -0
- package/static/v3/js/views/graph-canvas.17c15d65.js +509 -0
- package/static/v3/js/views/graph-canvas.js +509 -0
- package/static/v3/js/views/{hybrid-search.b22b97e0.js → hybrid-search.2fb63ed9.js} +1 -2
- package/static/v3/js/views/hybrid-search.js +1 -2
- package/static/v3/js/views/{knowledge-graph.a96040a5.js → knowledge-graph.4d09c537.js} +60 -44
- package/static/v3/js/views/knowledge-graph.js +60 -44
- package/static/v3/js/views/network.52a4f181.js +97 -0
- package/static/v3/js/views/network.js +97 -0
- package/static/v3/js/views/{planning.9ac3e313.js → planning.4876fd77.js} +26 -5
- package/static/v3/js/views/planning.js +26 -5
- package/static/v3/js/views/runs.b63b2afa.js +144 -0
- package/static/v3/js/views/runs.js +144 -0
- package/static/v3/js/views/{settings.8631fa5e.js → settings.b7140634.js} +7 -8
- package/static/v3/js/views/settings.js +7 -8
- package/static/v3/js/views/snapshots.6f5db095.js +135 -0
- package/static/v3/js/views/snapshots.js +135 -0
- package/static/v3/js/views/{workflows.26c57290.js → workflows.7752225a.js} +87 -2
- package/static/v3/js/views/workflows.js +87 -2
- package/static/v3/js/views/workspace-admin.c466029b.js +156 -0
- package/static/v3/js/views/workspace-admin.js +156 -0
- package/static/vendor/chart.umd.min.js +20 -0
- package/static/vendor/fonts/inter-latin-300-normal.woff2 +0 -0
- package/static/vendor/fonts/inter-latin-400-normal.woff2 +0 -0
- package/static/vendor/fonts/inter-latin-500-normal.woff2 +0 -0
- package/static/vendor/fonts/inter-latin-600-normal.woff2 +0 -0
- package/static/vendor/fonts/inter-latin-700-normal.woff2 +0 -0
- package/static/vendor/fonts/inter-latin-800-normal.woff2 +0 -0
- package/static/vendor/fonts/inter.css +44 -0
- package/static/vendor/icons/tabler-icons.min.css +4 -0
- package/static/vendor/icons/tabler-icons.woff2 +0 -0
- package/static/vendor/marked.min.js +69 -0
- package/telegram_bot.py +1 -2
- package/tools/commands.py +4 -2
- package/tools/computer.py +1 -1
- package/tools/documents.py +1 -3
- package/tools/filesystem.py +0 -4
- package/tools/knowledge.py +1 -3
- package/tools/network.py +1 -3
- package/codex_telegram_bot.py +0 -195
- package/docs/assets/v3.4.0/agent-run.png +0 -0
- package/docs/assets/v3.4.0/agents.png +0 -0
- package/docs/assets/v3.4.0/before/chat-before.png +0 -0
- package/docs/assets/v3.4.0/before/files-before.png +0 -0
- package/docs/assets/v3.4.0/chat.png +0 -0
- package/docs/assets/v3.4.0/connect-folder.png +0 -0
- package/docs/assets/v3.4.0/files.png +0 -0
- package/docs/assets/v3.4.0/home.png +0 -0
- package/docs/assets/v3.4.0/hooks-dispatch.png +0 -0
- package/docs/assets/v3.4.0/knowledge-graph.png +0 -0
- package/docs/assets/v3.4.0/local-agent.png +0 -0
- package/docs/assets/v3.4.0/memory.png +0 -0
- package/docs/assets/v3.4.0/settings.png +0 -0
- package/docs/assets/v3.4.0/vision-input.png +0 -0
- package/docs/assets/v3.4.0/workflows.png +0 -0
- package/docs/assets/v3.4.1/e2e_runtime_log.txt +0 -42
- package/docs/assets/v3.4.1/hooks-dispatch.png +0 -0
- package/docs/assets/v3.4.1/local-agent.png +0 -0
- package/docs/images/admin-dashboard.png +0 -0
- package/docs/images/architecture.png +0 -0
- package/docs/images/enterprise.png +0 -0
- package/docs/images/graph.png +0 -0
- package/docs/images/hero.gif +0 -0
- package/docs/images/knowledge-graph.png +0 -0
- package/docs/images/lattice-ai-demo.gif +0 -0
- package/docs/images/lattice-ai-hero.png +0 -0
- package/docs/images/logo.svg +0 -33
- package/docs/images/mobile-responsive.png +0 -0
- package/docs/images/model-recommendation.png +0 -0
- package/docs/images/onboarding.png +0 -0
- package/docs/images/organization.png +0 -0
- package/docs/images/pipeline.png +0 -0
- package/docs/images/screenshot-admin.png +0 -0
- package/docs/images/screenshot-chat.png +0 -0
- package/docs/images/screenshot-graph.png +0 -0
- package/docs/images/skills.png +0 -0
- package/docs/images/workspace-dark.png +0 -0
- package/docs/images/workspace-light.png +0 -0
- package/docs/images/workspace.png +0 -0
- package/requirements.txt +0 -16
- package/static/account.html +0 -115
- package/static/activity.html +0 -73
- package/static/admin.html +0 -488
- package/static/agents.html +0 -139
- package/static/chat.html +0 -844
- package/static/css/reference/account.css +0 -439
- package/static/css/reference/admin.css +0 -610
- package/static/css/reference/base.css +0 -1661
- package/static/css/reference/chat.css +0 -4623
- package/static/css/reference/graph.css +0 -1016
- package/static/css/responsive.css +0 -861
- package/static/graph.html +0 -124
- package/static/platform.css +0 -104
- package/static/plugins.html +0 -136
- package/static/scripts/account.js +0 -238
- package/static/scripts/admin.js +0 -1614
- package/static/scripts/chat.js +0 -5081
- package/static/scripts/graph.js +0 -1804
- package/static/scripts/platform.js +0 -64
- package/static/scripts/ux.js +0 -167
- package/static/scripts/workspace.js +0 -948
- package/static/v3/js/core/routes.2ce3815a.js +0 -93
- package/static/workflows.html +0 -146
- package/static/workspace.css +0 -1121
- package/static/workspace.html +0 -357
package/knowledge_graph.py
CHANGED
|
@@ -1,4633 +1,41 @@
|
|
|
1
|
-
"""
|
|
2
|
-
SQLite knowledge graph for Lattice AI workspace memory.
|
|
1
|
+
"""Compatibility shim for the v4 brain store.
|
|
3
2
|
|
|
4
|
-
The
|
|
5
|
-
|
|
6
|
-
the ingestion contract.
|
|
3
|
+
The implementation now lives under :mod:`latticeai.brain`. Root imports are
|
|
4
|
+
kept for older integrations and tests.
|
|
7
5
|
"""
|
|
8
6
|
|
|
9
|
-
import
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
except Exception: # pragma: no cover - v2 schema is optional at import time
|
|
29
|
-
KGStoreV2 = None # type: ignore[assignment]
|
|
30
|
-
NodeType = None # type: ignore[assignment]
|
|
31
|
-
EdgeType = None # type: ignore[assignment]
|
|
32
|
-
_exec_script = None # type: ignore[assignment]
|
|
33
|
-
|
|
34
|
-
from latticeai.core.local_embeddings import LocalEmbeddingModel
|
|
35
|
-
|
|
36
|
-
# Default read source for the graph queries: v2 reconstruction views.
|
|
37
|
-
# Override with LATTICEAI_KG_READ_V2=0 to fall back to the legacy tables.
|
|
38
|
-
_READ_FROM_V2_DEFAULT = os.getenv("LATTICEAI_KG_READ_V2", "1") != "0"
|
|
39
|
-
|
|
40
|
-
# Bump when the v2 projection layout changes (columns, normalization rules).
|
|
41
|
-
# On init, a stale projection is dropped and rebuilt from the authoritative
|
|
42
|
-
# legacy tables — safe because nodes_v2/edges_v2 only ever hold a derived view.
|
|
43
|
-
# v4: summary nullable + verbatim (byte-faithful) projection of legacy values.
|
|
44
|
-
_PROJECTION_VERSION = 4
|
|
45
|
-
|
|
46
|
-
_llm_router_ref = None
|
|
47
|
-
|
|
48
|
-
def set_llm_router(router_instance):
|
|
49
|
-
global _llm_router_ref
|
|
50
|
-
_llm_router_ref = router_instance
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
GRAPH_SCHEMA_VERSION = 1
|
|
54
|
-
|
|
55
|
-
LOCAL_TEXT_EXTENSIONS = {".txt", ".md"}
|
|
56
|
-
LOCAL_CODE_EXTENSIONS = {
|
|
57
|
-
".py", ".js", ".ts", ".tsx", ".jsx", ".html", ".css", ".json",
|
|
58
|
-
".yaml", ".yml", ".xml", ".sql", ".sh", ".zsh", ".toml", ".ini",
|
|
59
|
-
}
|
|
60
|
-
LOCAL_DOCUMENT_EXTENSIONS = {".pdf", ".docx"}
|
|
61
|
-
LOCAL_SPREADSHEET_EXTENSIONS = {".xlsx", ".csv"}
|
|
62
|
-
LOCAL_SLIDE_EXTENSIONS = {".pptx"}
|
|
63
|
-
LOCAL_IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".webp"}
|
|
64
|
-
LOCAL_SUPPORTED_EXTENSIONS = (
|
|
65
|
-
LOCAL_TEXT_EXTENSIONS
|
|
66
|
-
| LOCAL_CODE_EXTENSIONS
|
|
67
|
-
| LOCAL_DOCUMENT_EXTENSIONS
|
|
68
|
-
| LOCAL_SPREADSHEET_EXTENSIONS
|
|
69
|
-
| LOCAL_SLIDE_EXTENSIONS
|
|
70
|
-
| LOCAL_IMAGE_EXTENSIONS
|
|
7
|
+
from latticeai.brain._kg_common import ( # noqa: F401
|
|
8
|
+
EDGE_VERB,
|
|
9
|
+
GRAPH_SCHEMA_VERSION,
|
|
10
|
+
LOCAL_CODE_EXTENSIONS,
|
|
11
|
+
LOCAL_DOCUMENT_EXTENSIONS,
|
|
12
|
+
LOCAL_IMAGE_EXTENSIONS,
|
|
13
|
+
LOCAL_SIZE_LIMITS,
|
|
14
|
+
LOCAL_SLIDE_EXTENSIONS,
|
|
15
|
+
LOCAL_SPREADSHEET_EXTENSIONS,
|
|
16
|
+
LOCAL_SUPPORTED_EXTENSIONS,
|
|
17
|
+
LOCAL_TEXT_EXTENSIONS,
|
|
18
|
+
_KG_DB_FORMAT_VERSION,
|
|
19
|
+
_PROJECTION_VERSION,
|
|
20
|
+
_extract_concepts,
|
|
21
|
+
_extract_concepts_rules,
|
|
22
|
+
_extract_triples,
|
|
23
|
+
_extract_triples_rules,
|
|
24
|
+
_slug,
|
|
25
|
+
set_llm_router,
|
|
71
26
|
)
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
"
|
|
76
|
-
"
|
|
77
|
-
"
|
|
78
|
-
"
|
|
79
|
-
"
|
|
80
|
-
"
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
"
|
|
85
|
-
"
|
|
86
|
-
|
|
87
|
-
".config", ".ssh", ".gnupg", ".docker", ".kube", ".aws", ".azure",
|
|
88
|
-
".npm", ".pnpm-store", ".yarn", ".bun", ".cargo", ".rustup", ".pyenv",
|
|
89
|
-
".conda", ".local", ".claude", ".codex", ".cursor", ".copilot",
|
|
90
|
-
".antigravity", ".antigravity-ide",
|
|
91
|
-
}
|
|
92
|
-
|
|
93
|
-
COMMON_EXCLUDED_FILE_NAMES = {
|
|
94
|
-
".env", ".env.local", ".env.production", ".env.development",
|
|
95
|
-
"id_rsa", "id_ed25519", "authorized_keys", "known_hosts",
|
|
96
|
-
"credentials.json", "service-account.json", "token.json", "secrets.json",
|
|
97
|
-
"cookies", "login data", "history", "web data", ".ds_store", "thumbs.db",
|
|
98
|
-
}
|
|
99
|
-
COMMON_EXCLUDED_FILE_SUFFIXES = {
|
|
100
|
-
".pem", ".key", ".p12", ".pfx", ".kdbx", ".wallet", ".sqlite", ".db",
|
|
101
|
-
".exe", ".dll", ".sys", ".msi", ".dmg", ".pkg", ".app", ".zip", ".tar",
|
|
102
|
-
".gz", ".7z", ".rar", ".mp4", ".mov", ".mp3", ".wav", ".tmp", ".bak",
|
|
103
|
-
".lock",
|
|
104
|
-
}
|
|
105
|
-
SENSITIVE_PATH_KEYWORDS = {
|
|
106
|
-
"secret", "secrets", "token", "password", "passwd", "credential",
|
|
107
|
-
"credentials", "private", "key", "wallet", "recovery", "seed",
|
|
108
|
-
"mnemonic", "cookie", "session", "auth", "oauth", "certificate",
|
|
109
|
-
"cert", "api_key", "apikey",
|
|
110
|
-
}
|
|
111
|
-
|
|
112
|
-
MACOS_EXCLUDED_PREFIXES = (
|
|
113
|
-
"/System", "/Library", "/Applications", "/private", "/tmp", "/var",
|
|
114
|
-
)
|
|
115
|
-
WINDOWS_EXCLUDED_NAMES = {
|
|
116
|
-
"windows", "program files", "program files (x86)", "programdata", "appdata",
|
|
117
|
-
"$recycle.bin", "system volume information", "recovery", "perflogs",
|
|
118
|
-
"intel", "amd", "nvidia",
|
|
119
|
-
}
|
|
120
|
-
LINUX_EXCLUDED_PREFIXES = (
|
|
121
|
-
"/bin", "/boot", "/dev", "/etc", "/lib", "/lib64", "/proc", "/root",
|
|
122
|
-
"/run", "/sbin", "/sys", "/tmp", "/usr", "/var", "/snap", "/lost+found",
|
|
123
|
-
)
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
def _now() -> str:
|
|
127
|
-
return datetime.now().isoformat()
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
def _parse_iso(raw: Optional[str]) -> Optional[datetime]:
|
|
131
|
-
if not raw:
|
|
132
|
-
return None
|
|
133
|
-
try:
|
|
134
|
-
return datetime.fromisoformat(str(raw))
|
|
135
|
-
except (TypeError, ValueError):
|
|
136
|
-
return None
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
def _recency_score(updated_at: Optional[str], *, now: Optional[datetime] = None, half_life_days: float = 14.0) -> float:
|
|
140
|
-
stamp = _parse_iso(updated_at)
|
|
141
|
-
if not stamp:
|
|
142
|
-
return 0.0
|
|
143
|
-
now = now or datetime.now()
|
|
144
|
-
age_days = max(0.0, (now - stamp).total_seconds() / 86400.0)
|
|
145
|
-
decay = math.log(2) / max(0.1, half_life_days)
|
|
146
|
-
return math.exp(-decay * age_days)
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
def _json(data: Optional[Dict[str, Any]]) -> str:
|
|
150
|
-
return json.dumps(data or {}, ensure_ascii=False, sort_keys=True)
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
def _safe_loads(raw: Optional[str]) -> Dict[str, Any]:
|
|
154
|
-
"""Tolerantly parse a metadata_json column — returns {} on corrupt rows."""
|
|
155
|
-
if not raw:
|
|
156
|
-
return {}
|
|
157
|
-
try:
|
|
158
|
-
value = json.loads(raw)
|
|
159
|
-
return value if isinstance(value, dict) else {}
|
|
160
|
-
except (json.JSONDecodeError, TypeError) as e:
|
|
161
|
-
logging.warning("knowledge_graph: corrupt metadata_json (%s) — using empty dict", e)
|
|
162
|
-
return {}
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
def _slug(text: str, max_len: int = 96) -> str:
|
|
166
|
-
value = re.sub(r"\s+", " ", str(text or "")).strip().lower()
|
|
167
|
-
value = re.sub(r"[^0-9a-zA-Z가-힣._:@/-]+", "-", value).strip("-")
|
|
168
|
-
return (value or "untitled")[:max_len]
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
def _sha256_bytes(data: bytes) -> str:
|
|
172
|
-
return hashlib.sha256(data).hexdigest()
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
def _sha256_text(text: str) -> str:
|
|
176
|
-
return hashlib.sha256(text.encode("utf-8", errors="replace")).hexdigest()
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
def _safe_iso_from_stat_mtime(mtime: float) -> str:
|
|
180
|
-
try:
|
|
181
|
-
return datetime.fromtimestamp(float(mtime)).isoformat()
|
|
182
|
-
except (TypeError, ValueError, OSError):
|
|
183
|
-
return ""
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
def _path_fingerprint(path: Path) -> str:
|
|
187
|
-
return _sha256_text(str(path.expanduser().resolve()))[:24]
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
def _is_relative_to(path: Path, base: Path) -> bool:
|
|
191
|
-
try:
|
|
192
|
-
path.relative_to(base)
|
|
193
|
-
return True
|
|
194
|
-
except ValueError:
|
|
195
|
-
return False
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
def _path_parts_lower(path: Path) -> List[str]:
|
|
199
|
-
return [part.lower() for part in path.parts if part and part not in {os.sep, path.anchor}]
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
def _current_os_type() -> str:
|
|
203
|
-
system = platform.system().lower()
|
|
204
|
-
if system.startswith("darwin"):
|
|
205
|
-
return "macos"
|
|
206
|
-
if system.startswith("windows"):
|
|
207
|
-
return "windows"
|
|
208
|
-
if system.startswith("linux"):
|
|
209
|
-
return "linux"
|
|
210
|
-
return system or "unknown"
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
def _drive_id_for_path(path: Path) -> str:
|
|
214
|
-
resolved = path.expanduser().resolve()
|
|
215
|
-
if resolved.drive:
|
|
216
|
-
return resolved.drive.upper()
|
|
217
|
-
parts = resolved.parts
|
|
218
|
-
if len(parts) >= 3 and parts[1] == "Volumes":
|
|
219
|
-
return f"/Volumes/{parts[2]}"
|
|
220
|
-
if len(parts) >= 3 and parts[1] == "media":
|
|
221
|
-
return f"/media/{parts[2]}"
|
|
222
|
-
if len(parts) >= 3 and parts[1] == "mnt":
|
|
223
|
-
return f"/mnt/{parts[2]}"
|
|
224
|
-
return resolved.anchor or "/"
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
def _file_category(ext: str) -> str:
|
|
228
|
-
ext = (ext or "").lower()
|
|
229
|
-
if ext in LOCAL_CODE_EXTENSIONS:
|
|
230
|
-
return "code"
|
|
231
|
-
if ext in LOCAL_TEXT_EXTENSIONS:
|
|
232
|
-
return "text"
|
|
233
|
-
if ext == ".pdf":
|
|
234
|
-
return "pdf"
|
|
235
|
-
if ext in LOCAL_DOCUMENT_EXTENSIONS:
|
|
236
|
-
return "document"
|
|
237
|
-
if ext in LOCAL_SPREADSHEET_EXTENSIONS:
|
|
238
|
-
return "spreadsheet"
|
|
239
|
-
if ext in LOCAL_SLIDE_EXTENSIONS:
|
|
240
|
-
return "slide_deck"
|
|
241
|
-
if ext in LOCAL_IMAGE_EXTENSIONS:
|
|
242
|
-
return "image"
|
|
243
|
-
return "unsupported"
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
def _node_type_for_category(category: str) -> str:
|
|
247
|
-
return {
|
|
248
|
-
"code": "CodeFile",
|
|
249
|
-
"spreadsheet": "Spreadsheet",
|
|
250
|
-
"slide_deck": "SlideDeck",
|
|
251
|
-
"image": "Image",
|
|
252
|
-
"unsupported": "File",
|
|
253
|
-
}.get(category, "Document")
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
def _parser_type_for_category(category: str, ext: str) -> str:
|
|
257
|
-
if category in {"text", "code"}:
|
|
258
|
-
return "plain_text"
|
|
259
|
-
if category == "spreadsheet" and ext == ".csv":
|
|
260
|
-
return "csv_text"
|
|
261
|
-
if category == "image":
|
|
262
|
-
return "image_ocr"
|
|
263
|
-
return ext.lstrip(".") or category
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
def _size_limit_for_category(category: str) -> int:
|
|
267
|
-
return LOCAL_SIZE_LIMITS.get(category, LOCAL_SIZE_LIMITS["document"])
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
def _is_hidden_path(path: Path, root: Optional[Path] = None) -> bool:
|
|
271
|
-
parts: Iterable[str]
|
|
272
|
-
if root is not None:
|
|
273
|
-
try:
|
|
274
|
-
parts = path.relative_to(root).parts
|
|
275
|
-
except ValueError:
|
|
276
|
-
parts = path.parts
|
|
277
|
-
else:
|
|
278
|
-
parts = path.parts
|
|
279
|
-
return any(part.startswith(".") and part not in {".", ".."} for part in parts)
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
def _excluded_directory_reason(path: Path, *, root: Optional[Path] = None, os_type: Optional[str] = None) -> Optional[str]:
|
|
283
|
-
os_type = os_type or _current_os_type()
|
|
284
|
-
name = path.name.lower()
|
|
285
|
-
if name in COMMON_EXCLUDED_DIRS:
|
|
286
|
-
return "excluded_folder"
|
|
287
|
-
if _is_hidden_path(path, root):
|
|
288
|
-
return "hidden_folder"
|
|
289
|
-
parts = _path_parts_lower(path)
|
|
290
|
-
if os_type == "windows" and any(part in WINDOWS_EXCLUDED_NAMES for part in parts):
|
|
291
|
-
return "system_folder"
|
|
292
|
-
normalized = path.as_posix()
|
|
293
|
-
root_normalized = root.as_posix() if root else ""
|
|
294
|
-
|
|
295
|
-
def _prefix_blocks(prefixes: Tuple[str, ...]) -> bool:
|
|
296
|
-
for prefix in prefixes:
|
|
297
|
-
path_under_prefix = normalized == prefix or normalized.startswith(f"{prefix}/")
|
|
298
|
-
root_under_prefix = bool(root_normalized) and (
|
|
299
|
-
root_normalized == prefix or root_normalized.startswith(f"{prefix}/")
|
|
300
|
-
)
|
|
301
|
-
if path_under_prefix and not root_under_prefix:
|
|
302
|
-
return True
|
|
303
|
-
return False
|
|
304
|
-
|
|
305
|
-
if os_type == "macos":
|
|
306
|
-
home_library = Path.home() / "Library"
|
|
307
|
-
try:
|
|
308
|
-
root_is_library = bool(root) and _is_relative_to(root.expanduser().resolve(), home_library.expanduser().resolve())
|
|
309
|
-
if _is_relative_to(path.expanduser().resolve(), home_library.expanduser().resolve()) and not root_is_library:
|
|
310
|
-
return "user_library"
|
|
311
|
-
except OSError:
|
|
312
|
-
pass
|
|
313
|
-
if _prefix_blocks(MACOS_EXCLUDED_PREFIXES):
|
|
314
|
-
return "system_folder"
|
|
315
|
-
if os_type == "linux":
|
|
316
|
-
if _prefix_blocks(LINUX_EXCLUDED_PREFIXES):
|
|
317
|
-
return "system_folder"
|
|
318
|
-
return None
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
def _sensitive_file_reason(path: Path, *, root: Optional[Path] = None) -> Optional[str]:
|
|
322
|
-
name = path.name.lower()
|
|
323
|
-
suffix = path.suffix.lower()
|
|
324
|
-
if name in COMMON_EXCLUDED_FILE_NAMES or suffix in COMMON_EXCLUDED_FILE_SUFFIXES:
|
|
325
|
-
return "sensitive_or_excluded_file"
|
|
326
|
-
try:
|
|
327
|
-
rel_text = path.relative_to(root).as_posix().lower() if root else path.as_posix().lower()
|
|
328
|
-
except ValueError:
|
|
329
|
-
rel_text = path.as_posix().lower()
|
|
330
|
-
tokens = re.split(r"[^0-9a-zA-Z_가-힣]+", rel_text)
|
|
331
|
-
if any(token in SENSITIVE_PATH_KEYWORDS for token in tokens):
|
|
332
|
-
return "sensitive_name"
|
|
333
|
-
return None
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
def _root_warning(path: Path, os_type: str) -> Optional[str]:
|
|
337
|
-
resolved = path.expanduser().resolve()
|
|
338
|
-
home = Path.home().expanduser().resolve()
|
|
339
|
-
if os_type == "macos" and resolved == home:
|
|
340
|
-
return "홈 전체에는 설정/숨김 폴더가 포함될 수 있습니다. 문서, 데스크탑, 다운로드, 프로젝트 폴더부터 추가하는 것을 권장합니다."
|
|
341
|
-
if os_type == "linux" and resolved.as_posix() == "/":
|
|
342
|
-
return "루트 디렉터리에는 시스템 파일이 포함되어 있습니다. 일반 사용자 폴더나 마운트된 데이터 폴더를 권장합니다."
|
|
343
|
-
if os_type == "windows" and str(resolved).rstrip("\\/").upper() in {"C:", "C:\\"}:
|
|
344
|
-
return "C드라이브에는 Windows 시스템 파일과 앱 설정 파일이 포함되어 있습니다. 하위 폴더를 선택하는 것을 권장합니다."
|
|
345
|
-
return None
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
def _sample_file(path: Path, root: Path, status: str, reason: str = "") -> Dict[str, Any]:
|
|
349
|
-
try:
|
|
350
|
-
rel = path.relative_to(root).as_posix()
|
|
351
|
-
except ValueError:
|
|
352
|
-
rel = path.name
|
|
353
|
-
try:
|
|
354
|
-
stat = path.stat()
|
|
355
|
-
size = stat.st_size if path.is_file() else None
|
|
356
|
-
modified_at = _safe_iso_from_stat_mtime(stat.st_mtime)
|
|
357
|
-
except OSError:
|
|
358
|
-
size = None
|
|
359
|
-
modified_at = ""
|
|
360
|
-
return {
|
|
361
|
-
"path": str(path),
|
|
362
|
-
"relative_path": rel,
|
|
363
|
-
"name": path.name,
|
|
364
|
-
"extension": path.suffix.lower(),
|
|
365
|
-
"status": status,
|
|
366
|
-
"reason": reason,
|
|
367
|
-
"size_bytes": size,
|
|
368
|
-
"modified_at": modified_at,
|
|
369
|
-
}
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
def _clean_text(text: str) -> str:
|
|
373
|
-
return re.sub(r"\s+", " ", str(text or "")).strip()
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
def _chunks(text: str, size: int = 1200, overlap: int = 160) -> List[str]:
|
|
377
|
-
cleaned = str(text or "").strip()
|
|
378
|
-
if not cleaned:
|
|
379
|
-
return []
|
|
380
|
-
chunks: List[str] = []
|
|
381
|
-
start = 0
|
|
382
|
-
while start < len(cleaned):
|
|
383
|
-
end = min(len(cleaned), start + size)
|
|
384
|
-
chunks.append(cleaned[start:end])
|
|
385
|
-
if end >= len(cleaned):
|
|
386
|
-
break
|
|
387
|
-
start = max(0, end - overlap)
|
|
388
|
-
return chunks
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
_LLM_EXTRACT_CONCEPT_PROMPT = """Extract the key concepts from the following text.
|
|
392
|
-
Return ONLY a JSON array of objects, each with "concept" (string) and "importance" (float 0-1).
|
|
393
|
-
Extract up to {limit} concepts. Focus on named entities, technical terms, and domain-specific nouns.
|
|
394
|
-
Do NOT include common words, stop words, or generic terms.
|
|
395
|
-
|
|
396
|
-
Text:
|
|
397
|
-
{text}
|
|
398
|
-
|
|
399
|
-
JSON:"""
|
|
400
|
-
|
|
401
|
-
_LLM_EXTRACT_TRIPLE_PROMPT = """Extract relationship triples from the following text.
|
|
402
|
-
Return ONLY a JSON array of objects, each with:
|
|
403
|
-
- "subject": source concept (string)
|
|
404
|
-
- "relation": relationship verb (string, Korean or English)
|
|
405
|
-
- "object": target concept (string)
|
|
406
|
-
- "evidence": the sentence supporting this triple (string, max 240 chars)
|
|
407
|
-
- "confidence": how confident you are (float 0-1)
|
|
408
|
-
|
|
409
|
-
Extract up to {limit} triples. Focus on meaningful semantic relationships.
|
|
410
|
-
|
|
411
|
-
Text:
|
|
412
|
-
{text}
|
|
413
|
-
|
|
414
|
-
Concepts already identified: {concepts}
|
|
415
|
-
|
|
416
|
-
JSON:"""
|
|
417
|
-
|
|
418
|
-
ENABLE_LLM_EXTRACTION = os.getenv("LATTICEAI_LLM_EXTRACTION", "true").lower() in ("1", "true", "yes")
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
def _llm_extract_concepts(text: str, limit: int = 12) -> Optional[List[str]]:
|
|
422
|
-
if not ENABLE_LLM_EXTRACTION or not _llm_router_ref:
|
|
423
|
-
return None
|
|
424
|
-
if not _llm_router_ref.current_model_id:
|
|
425
|
-
return None
|
|
426
|
-
prompt = _LLM_EXTRACT_CONCEPT_PROMPT.format(text=text[:3000], limit=limit)
|
|
427
|
-
try:
|
|
428
|
-
loop = asyncio.get_event_loop()
|
|
429
|
-
if loop.is_running():
|
|
430
|
-
import concurrent.futures
|
|
431
|
-
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
|
|
432
|
-
future = pool.submit(asyncio.run, _llm_router_ref.generate(prompt, max_tokens=1024, temperature=0.1))
|
|
433
|
-
raw = future.result(timeout=30)
|
|
434
|
-
else:
|
|
435
|
-
raw = asyncio.run(_llm_router_ref.generate(prompt, max_tokens=1024, temperature=0.1))
|
|
436
|
-
raw = raw.strip()
|
|
437
|
-
if raw.startswith("```"):
|
|
438
|
-
raw = re.sub(r"^```(?:json)?\s*", "", raw)
|
|
439
|
-
raw = re.sub(r"\s*```$", "", raw)
|
|
440
|
-
parsed = json.loads(raw)
|
|
441
|
-
if isinstance(parsed, list):
|
|
442
|
-
concepts = []
|
|
443
|
-
for item in parsed[:limit]:
|
|
444
|
-
if isinstance(item, dict) and "concept" in item:
|
|
445
|
-
concepts.append(item["concept"])
|
|
446
|
-
elif isinstance(item, str):
|
|
447
|
-
concepts.append(item)
|
|
448
|
-
return concepts if concepts else None
|
|
449
|
-
except Exception as e:
|
|
450
|
-
logging.debug("LLM concept extraction failed (falling back to rules): %s", e)
|
|
451
|
-
return None
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
def _llm_extract_triples(text: str, concepts: List[str], limit: int = 20) -> Optional[List[Dict[str, str]]]:
    """Ask the configured LLM router for (subject, relation, object) triples.

    Returns ``None`` so the caller can fall back to rule-based extraction
    whenever LLM extraction is disabled, no router or current model is set,
    the call fails or times out, the reply is not a JSON list, or the list
    holds no usable item.

    Args:
        text: Source text; only the first 3000 characters go into the prompt.
        concepts: Known concepts; the first 15 are listed in the prompt.
        limit: Maximum number of reply items that are inspected.

    Returns:
        A non-empty list of dicts with the keys ``subject``, ``relation``,
        ``object``, ``context`` (at most 240 characters) and ``confidence``
        (a float), or ``None``.
    """
    if not ENABLE_LLM_EXTRACTION or not _llm_router_ref:
        return None
    if not _llm_router_ref.current_model_id:
        return None
    prompt = _LLM_EXTRACT_TRIPLE_PROMPT.format(
        text=text[:3000], limit=limit,
        concepts=", ".join(concepts[:15]),
    )
    try:
        # get_running_loop() is the supported way to ask "am I inside a loop?".
        # get_event_loop() is deprecated for this and raises in worker threads
        # that have no loop, which silently disabled the LLM path there.
        try:
            asyncio.get_running_loop()
            inside_loop = True
        except RuntimeError:
            inside_loop = False

        coro = _llm_router_ref.generate(prompt, max_tokens=2048, temperature=0.1)
        if inside_loop:
            # asyncio.run() cannot be nested in a running loop, so the
            # coroutine is driven on a helper thread with its own loop.
            import concurrent.futures
            pool = concurrent.futures.ThreadPoolExecutor(max_workers=1)
            try:
                raw = pool.submit(asyncio.run, coro).result(timeout=30)
            finally:
                # wait=False: a `with` block would join the worker on exit and
                # make the 30 s timeout above ineffective.
                pool.shutdown(wait=False)
        else:
            raw = asyncio.run(coro)

        raw = raw.strip()
        if raw.startswith("```"):
            # Strip a Markdown code fence around the JSON payload.
            raw = re.sub(r"^```(?:json)?\s*", "", raw)
            raw = re.sub(r"\s*```$", "", raw)
        parsed = json.loads(raw)
        if isinstance(parsed, list):
            triples = []
            for item in parsed[:limit]:
                if not (isinstance(item, dict) and "subject" in item and "object" in item):
                    continue
                # One malformed confidence must not discard every other triple.
                try:
                    confidence = float(item.get("confidence", 0.8))
                except (TypeError, ValueError):
                    confidence = 0.8
                triples.append({
                    "subject": str(item["subject"]),
                    "relation": str(item.get("relation", "관련됨")),
                    "object": str(item["object"]),
                    "context": str(item.get("evidence", ""))[:240],
                    "confidence": confidence,
                })
            return triples if triples else None
    except Exception as e:
        # Best-effort path: any failure means "use the rule-based extractor".
        logging.debug("LLM triple extraction failed (falling back to rules): %s", e)
    return None
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
# Lowercase terms that are never accepted as concepts. Candidates are
# lowercased before being checked against this set (see the `_add` helper in
# `_extract_concepts_rules` and the token loop in `_topic_candidates`).
# "are" is listed twice below; that is harmless in a set literal.
_CONCEPT_STOP: set = {
    # English stop words
    "the", "and", "for", "with", "this", "that", "from", "into", "which",
    "are", "was", "were", "has", "have", "had", "can", "will", "would",
    "could", "should", "may", "might", "must", "shall", "being", "been",
    "also", "just", "then", "than", "when", "where", "what", "how", "why",
    "its", "their", "your", "our", "you", "they", "them", "these", "those",
    "use", "used", "using", "based", "like", "such", "via", "per", "let",
    "yes", "not", "but", "are", "all", "any", "out", "new", "get", "set",
    # Korean stop words
    "사용자", "내용", "파일", "채팅", "답변", "입니다", "그리고", "처럼",
    "있어", "없어", "이야", "이다", "한다", "하다", "되다", "됩니다",
    "경우", "방법", "부분", "상태", "정도", "결과", "이후", "이전",
    "그것", "이것", "저것", "여기", "거기", "저기", "우리", "저희",
    "기능", "서버", "모델", "설정", "설명", "버전", "지원", "사용", "실행",
    # Annotation markers (English and Korean)
    "todo", "fixme", "note", "참고", "주의", "warning",
}
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
def _extract_concepts(text: str, limit: int = 12) -> List[str]:
    """Return concepts from the LLM extractor, else from the rule-based one.

    The rule-based extractor runs whenever the LLM path yields nothing
    (``None`` or an empty list).
    """
    return _llm_extract_concepts(text, limit) or _extract_concepts_rules(text, limit)
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
def _extract_concepts_rules(text: str, limit: int = 12) -> List[str]:
    """Extract meaningful named concepts from text (rule-based).

    Candidates are collected in the order below. The first spelling seen for
    a given lowercase key wins, and the result keeps collection order:

      1. Backtick-quoted terms (skipped when they contain brackets or parens)
      2. Double-quoted terms
      3. Multi-word proper nouns (Lattice AI, VS Code, MCP Server)
      4. Single capitalized words (Claude, Python, FastAPI), kept when they
         occur at least twice or were not captured as a sentence-initial word
      5. Korean words ending in a technical suffix, and Korean words that are
         followed by a particle
      6. Hyphenated / versioned identifiers (gpt-4o, mlx-vlm, gemma-4)

    Afterwards a shorter concept is dropped when it is a word-prefix of a
    longer one and never occurs in the text without that continuation
    ("Lattice" is dropped if every occurrence is "Lattice AI").

    Args:
        text: Source text; a falsy value is treated as the empty string.
        limit: Maximum number of concepts returned.

    Returns:
        Up to ``limit`` concepts in their original spelling.
    """
    text = str(text or "")
    seen: dict = {}  # concept_lower → original form

    def _add(term: str) -> None:
        # Reject stop words, pure numbers and one-character terms.
        key = term.strip().lower()
        if (
            key
            and key not in _CONCEPT_STOP
            and not key.isdigit()
            and len(key) >= 2
        ):
            seen.setdefault(key, term.strip())

    # 1. Backtick-quoted code/term (highest confidence)
    for m in re.findall(r'`([^`]{2,40})`', text):
        if not re.search(r'[\(\)\[\]{}]', m):  # skip code expressions
            _add(m)

    # 2. Double-quoted terms
    for m in re.findall(r'"([^"]{2,40})"', text):
        _add(m)

    # 3. Multi-word English proper nouns (Title Case or ALL-CAPS first word).
    #    Pattern A: mixed-case first word — "Lattice AI", "Tool Use", "Graph RAG"
    for m in re.findall(
        r'([A-Z][a-z]{1,20}(?:\s+(?:[A-Z]{2,10}|[A-Z][a-z0-9]{1,20}|\d[\w.]{0,6})){1,3})',
        text,
    ):
        _add(m)
    #    Pattern B: ALL-CAPS first word — "VS Code", "MCP Server"
    for m in re.findall(
        r'([A-Z]{2,6}(?:\s+(?:[A-Z]{2,10}|[A-Z][a-z0-9]{1,20})){1,2})',
        text,
    ):
        _add(m)

    # 4. Single capitalized proper noun.
    #    ASCII-boundary lookarounds are used instead of \b so that a Korean
    #    particle directly after an English word does not block the match.
    capitalized_words = re.findall(
        r'(?<![A-Za-z0-9])([A-Z][A-Za-z0-9]{2,24})(?![A-Za-z0-9])', text
    )
    freq: Dict[str, int] = {}
    for w in capitalized_words:
        freq[w] = freq.get(w, 0) + 1
    sentence_starts = set(re.findall(r'(?:^|(?<=[.!?])\s+)([A-Z][a-z]+)', text))
    for m, cnt in freq.items():
        if m.lower() in _CONCEPT_STOP:
            continue
        if cnt >= 2 or m not in sentence_starts:
            _add(m)

    # 5. Korean words ending in a technical suffix
    for m in re.findall(r'[가-힣]{2,12}(?:AI|LLM|API|UI|RAG|bot|Bot|기능|모델|서버|에이전트|파이프라인|워크플로)', text):
        _add(m)
    #    Korean words that are followed by a particle
    for m in re.findall(r'([가-힣]{2,12})(?:은|는|이|가|을|를|의|에서|으로|와|과)', text):
        if m.lower() not in _CONCEPT_STOP and len(m) >= 2:
            # Only add if non-trivial: 3+ chars, or it appears multiple times.
            if len(m) >= 3 or text.count(m) >= 2:
                _add(m)

    # 6. Hyphenated / versioned identifiers (gpt-4o, gemma-4, mlx-vlm)
    for m in re.findall(r'\b([a-zA-Z][a-zA-Z0-9]*(?:-[a-zA-Z0-9.]+)+)\b', text):
        if len(m) >= 4:
            _add(m)

    # De-duplicate: drop the shorter term when every occurrence of it in the
    # source text is followed by the suffix that forms the longer concept.
    values = list(seen.values())
    values_lower = [v.lower() for v in values]
    keep = set(range(len(values)))
    for i, v in enumerate(values):
        vl = values_lower[i]
        for j, wl in enumerate(values_lower):
            if i == j or j not in keep:
                continue
            # vl must be a word-prefix of wl (followed by whitespace or "-").
            suffix = wl[len(vl):]
            if not (wl.startswith(vl) and re.match(r'^[\s\-]', suffix)):
                continue
            suffix_stripped = suffix.lstrip(" -")
            # Matches an occurrence of v that is NOT continued by the suffix.
            pattern_alone = re.escape(v) + r'(?![\s\-]*' + re.escape(suffix_stripped) + r')'
            if not re.search(pattern_alone, text, re.IGNORECASE):
                # Shorter term never appears alone → safe to remove.
                keep.discard(i)
                break

    final = [v for i, v in enumerate(values) if i in keep]
    return final[:limit]
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
# ──────────────────────────────────────────────────────────────────────────────
|
|
629
|
-
# Node type taxonomy (점 = 명사)
|
|
630
|
-
# ──────────────────────────────────────────────────────────────────────────────
|
|
631
|
-
# Chat — 대화 세션
|
|
632
|
-
# Document — 파일 (PDF·PPT·Word·Excel·이미지 등)
|
|
633
|
-
# Concept — 개념·아이디어·기술 용어
|
|
634
|
-
# Person — 사람 (사용자, 언급된 인물)
|
|
635
|
-
# Error — 오류·버그·예외
|
|
636
|
-
# Code — 코드 스니펫·함수·클래스
|
|
637
|
-
# Feature — 소프트웨어 기능
|
|
638
|
-
# Task — 할 일·액션 아이템
|
|
639
|
-
# Decision — 결정 사항
|
|
640
|
-
|
|
641
|
-
# Edge type vocabulary (선 = 동사 — 과거형 서술어)
|
|
642
|
-
# Maps a relation label (key) to a regex alternation (value) of Korean and
# English cue words. `_infer_edge` runs `re.search` with each pattern against
# the lowercased sentence and returns the first label that matches, so the
# order of the entries is the match priority.
# NOTE(review): the patterns are unanchored substrings, so short alternatives
# also match inside longer words (e.g. "use" inside "because"); confirm this
# is intended before reordering or extending the table.
EDGE_VERB = {
    "언급함": r"언급|mention|refer|cited",
    "포함함": r"포함|include|consist|구성|탑재|contains",
    "해결함": r"해결|resolv|fix|수정|고쳤|closed",
    "의존함": r"의존|depend|require|필요|based on",
    "설명함": r"설명|explain|describe|정의|란|이란|means",
    "비교함": r"비교|versus|vs\.?|차이|다르|compare",
    "사용함": r"사용|use|활용|이용|apply",
    "연결함": r"연결|connect|통합|integrate|연동|link",
    "확장함": r"확장|extend|플러그인|plugin|addon",
    "생성함": r"생성|만들|create|generate|build|produced",
    "대체함": r"대체|replace|instead|alternative",
    "지원함": r"지원|support|제공|provide|offer",
    "발생함": r"발생|occur|throw|raise|triggered",
    "관련됨": r"관련|related|associated|연관",
}
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
def _infer_edge(sentence: str) -> str:
    """Pick the relation label whose pattern first matches the sentence.

    The sentence is lowercased and tested against ``EDGE_VERB`` in that
    mapping's iteration order; "관련됨" is returned when nothing matches.
    """
    lowered = sentence.lower()
    matching_labels = (
        label for label, pattern in EDGE_VERB.items() if re.search(pattern, lowered)
    )
    return next(matching_labels, "관련됨")
|
|
667
|
-
|
|
668
|
-
|
|
669
|
-
# Technical words that cannot be person names.
# `_classify_node_type` checks the lowercased words of a "First Last"-shaped
# concept against this set and refuses the "Person" label if any word is here.
_NOT_PERSON_WORDS: set = {
    "use", "api", "rag", "sdk", "ide", "cli", "llm", "mcp", "ui", "ux",
    "new", "old", "get", "set", "run", "add", "fix", "tool", "code",
    "base", "core", "data", "file", "test", "type", "mode", "view",
}
|
|
675
|
-
|
|
676
|
-
|
|
677
|
-
def _classify_node_type(concept: str, text: str) -> str:
|
|
678
|
-
"""Classify a concept into the node taxonomy.
|
|
679
|
-
|
|
680
|
-
Term-level signals take priority; then a tight ±60-char window is used
|
|
681
|
-
so distant keywords don't cause mis-classification.
|
|
682
|
-
"""
|
|
683
|
-
term = concept.lower()
|
|
684
|
-
|
|
685
|
-
# ── Term-level signals (highest confidence) ───────────────────────────
|
|
686
|
-
if re.search(r'(?:error|exception|traceback|오류|에러|버그)$', term, re.I):
|
|
687
|
-
return "Error"
|
|
688
|
-
if re.search(r'error|exception|err\b', term, re.I) and len(concept) < 30:
|
|
689
|
-
return "Error"
|
|
690
|
-
if re.search(r'\(\)|\.py$|\.js$|\.ts$|\.go$|::\w', term):
|
|
691
|
-
return "Code"
|
|
692
|
-
|
|
693
|
-
# Person: "First Last" pattern, neither word is a known technical term
|
|
694
|
-
if re.match(r'^[A-Z][a-z]{1,15} [A-Z][a-z]{1,15}$', concept):
|
|
695
|
-
words = term.split()
|
|
696
|
-
if not any(w in _NOT_PERSON_WORDS for w in words):
|
|
697
|
-
return "Person"
|
|
698
|
-
|
|
699
|
-
# ── Windowed context (±60 chars) — NOT used for Error to avoid false positives
|
|
700
|
-
idx = text.lower().find(term)
|
|
701
|
-
if idx >= 0:
|
|
702
|
-
win = text[max(0, idx - 60): idx + len(concept) + 60].lower()
|
|
703
|
-
if re.search(r'def |class |function|함수|클래스|메서드|import', win):
|
|
704
|
-
return "Code"
|
|
705
|
-
# Feature: concept appears DIRECTLY adjacent to 기능/feature keyword
|
|
706
|
-
if (
|
|
707
|
-
len(concept) <= 12
|
|
708
|
-
and re.search(
|
|
709
|
-
rf'{re.escape(term)}.{{0,8}}(?:기능|feature)|(?:기능|feature).{{0,8}}{re.escape(term)}',
|
|
710
|
-
win,
|
|
711
|
-
)
|
|
712
|
-
):
|
|
713
|
-
return "Feature"
|
|
714
|
-
|
|
715
|
-
return "Concept"
|
|
716
|
-
|
|
717
|
-
|
|
718
|
-
def _extract_triples(
    text: str,
    concepts: List[str],
    limit: int = 20,
) -> List[Dict[str, str]]:
    """Return triples from the LLM extractor, else from the rule-based one.

    The rule-based extractor runs whenever the LLM path yields nothing
    (``None`` or an empty list).
    """
    return (
        _llm_extract_triples(text, concepts, limit)
        or _extract_triples_rules(text, concepts, limit)
    )
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
def _extract_triples_rules(
|
|
731
|
-
text: str,
|
|
732
|
-
concepts: List[str],
|
|
733
|
-
limit: int = 20,
|
|
734
|
-
) -> List[Dict[str, str]]:
|
|
735
|
-
"""Extract (subject, verb-edge, object, context) triples from text (rule-based).
|
|
736
|
-
|
|
737
|
-
For each sentence containing ≥2 concepts, infer the verb-form edge label
|
|
738
|
-
from surrounding context and create a directed triple.
|
|
739
|
-
"""
|
|
740
|
-
if len(concepts) < 2:
|
|
741
|
-
return []
|
|
742
|
-
|
|
743
|
-
concept_lower = {c.lower(): c for c in concepts}
|
|
744
|
-
triples: List[Dict[str, str]] = []
|
|
745
|
-
seen_pairs: set = set()
|
|
746
|
-
|
|
747
|
-
# Split on sentence boundaries
|
|
748
|
-
sentences = re.split(r'(?<=[.!?\n])\s+|\n{2,}', text)
|
|
749
|
-
for sent in sentences:
|
|
750
|
-
sent = sent.strip()
|
|
751
|
-
if len(sent) < 8:
|
|
752
|
-
continue
|
|
753
|
-
sent_lower = sent.lower()
|
|
754
|
-
|
|
755
|
-
present = [concept_lower[k] for k in concept_lower if k in sent_lower]
|
|
756
|
-
if len(present) < 2:
|
|
757
|
-
continue
|
|
758
|
-
|
|
759
|
-
edge = _infer_edge(sent)
|
|
760
|
-
|
|
761
|
-
for i in range(len(present) - 1):
|
|
762
|
-
subj, obj = present[i], present[i + 1]
|
|
763
|
-
# Deduplicate by (subj, obj) regardless of direction for same edge
|
|
764
|
-
pair_key = tuple(sorted([subj.lower(), obj.lower()])) + (edge,)
|
|
765
|
-
if pair_key in seen_pairs:
|
|
766
|
-
continue
|
|
767
|
-
seen_pairs.add(pair_key)
|
|
768
|
-
triples.append({
|
|
769
|
-
"subject": subj,
|
|
770
|
-
"relation": edge, # verb form (동사)
|
|
771
|
-
"object": obj,
|
|
772
|
-
"context": sent[:240],
|
|
773
|
-
})
|
|
774
|
-
if len(triples) >= limit:
|
|
775
|
-
return triples
|
|
776
|
-
|
|
777
|
-
return triples
|
|
778
|
-
|
|
779
|
-
|
|
780
|
-
def _semantic_items(text: str) -> List[Dict[str, str]]:
    """Collect explicit decision / task lines from text.

    Each line is cleaned, ignored when shorter than 6 characters, and matched
    against the decision cues and then the task cues; one line can produce
    both a "Decision" and a "Task" item. At most 8 items are returned.
    """
    cue_rules = (
        ("Decision", r"(결정|확정|하기로|decided|decision)"),
        ("Task", r"(todo|해야|하자|진행|구현|수정|확인|next|task|\[ \])"),
    )
    found: List[Dict[str, str]] = []
    for raw_line in str(text or "").splitlines():
        line = _clean_text(raw_line)
        if len(line) < 6:
            continue
        lowered = line.lower()
        found.extend(
            {"type": kind, "title": line[:120], "summary": line[:500]}
            for kind, pattern in cue_rules
            if re.search(pattern, lowered)
        )
    return found[:8]
|
|
793
|
-
|
|
794
|
-
|
|
795
|
-
def _topic_candidates(text: str, limit: int = 8) -> List[str]:
    """Return compact keyword candidates for fallback graph search.

    Extracted concepts are preferred. When there are none, plain word tokens
    (ASCII identifiers of 3+ characters or Hangul runs of 2-12 characters) are
    used instead, skipping stop words and keeping the first spelling of each
    lowercase key.
    """
    concepts = _extract_concepts(text, limit=limit)
    if concepts:
        return concepts[:limit]

    unique: Dict[str, str] = {}
    token_pattern = r"[A-Za-z][A-Za-z0-9_.:-]{2,}|[가-힣]{2,12}"
    for match in re.finditer(token_pattern, str(text or "")):
        token = match.group(0)
        lowered = token.lower()
        if lowered.isdigit() or lowered in _CONCEPT_STOP:
            continue
        if lowered not in unique:
            unique[lowered] = token
        if len(unique) >= limit:
            break
    return list(unique.values())[:limit]
|
|
809
|
-
|
|
810
|
-
|
|
811
|
-
class KnowledgeGraphStore:
|
|
812
|
-
def __init__(self, db_path: Path, blob_dir: Path, embedder: Any = None):
    """Create the store, its directories and its database schema.

    Args:
        db_path: Location of the SQLite database file.
        blob_dir: Directory for blobs; created if missing.
        embedder: Optional embedding backend. When ``None`` a
            ``LocalEmbeddingModel`` is used.
    """
    self.db_path = Path(db_path)
    self.blob_dir = Path(blob_dir)
    for directory in (self.db_path.parent, self.blob_dir):
        directory.mkdir(parents=True, exist_ok=True)

    # The embedder is swappable behind a fixed interface
    # (model_id/dim/embed/encode/decode/similarity). The default is the
    # deterministic, offline hash model so the store works with no config;
    # server_app injects a provider-backed embedder from Config.
    if embedder is None:
        embedder = LocalEmbeddingModel()
    self._embedding_model = embedder

    self._init_db()

    # Read graph queries from the v2 projection (kgv2_* views) when available.
    # Toggle off (e.g. in tests) to compare against the legacy tables.
    self._read_from_v2 = KGStoreV2 is not None and _READ_FROM_V2_DEFAULT
|
|
826
|
-
|
|
827
|
-
def _read_tables(self) -> tuple:
|
|
828
|
-
"""Return (nodes_table, edges_table) for read queries.
|
|
829
|
-
|
|
830
|
-
Same read code runs against the legacy tables or the v2 reconstruction
|
|
831
|
-
views, so the two paths are equivalent by construction.
|
|
832
|
-
"""
|
|
833
|
-
if self._read_from_v2:
|
|
834
|
-
return ("kgv2_nodes", "kgv2_edges")
|
|
835
|
-
return ("nodes", "edges")
|
|
836
|
-
|
|
837
|
-
def _connect(self) -> sqlite3.Connection:
|
|
838
|
-
conn = sqlite3.connect(str(self.db_path))
|
|
839
|
-
conn.row_factory = sqlite3.Row
|
|
840
|
-
conn.execute("PRAGMA journal_mode=WAL")
|
|
841
|
-
conn.execute("PRAGMA foreign_keys=ON")
|
|
842
|
-
return conn
|
|
843
|
-
|
|
844
|
-
def _init_db(self) -> None:
    """Create the legacy tables and indexes, stamp the schema version, then
    initialize the v2 projection.

    Every statement uses ``IF NOT EXISTS`` and the version row is written
    with ``INSERT OR REPLACE``, so running this on an existing database is
    safe.
    """
    from contextlib import closing

    # sqlite3's own context manager only commits or rolls back; closing()
    # makes sure the connection is also closed before the v2 initialization
    # opens its own connection.
    with closing(self._connect()) as conn, conn:
        conn.executescript(
            """
            CREATE TABLE IF NOT EXISTS graph_meta (
                key TEXT PRIMARY KEY,
                value TEXT NOT NULL
            );
            CREATE TABLE IF NOT EXISTS nodes (
                id TEXT PRIMARY KEY,
                type TEXT NOT NULL,
                title TEXT NOT NULL,
                summary TEXT,
                metadata_json TEXT NOT NULL CHECK (json_valid(metadata_json)),
                raw_json TEXT NOT NULL CHECK (json_valid(raw_json)),
                created_at TEXT NOT NULL,
                updated_at TEXT NOT NULL
            );
            CREATE TABLE IF NOT EXISTS edges (
                id TEXT PRIMARY KEY,
                from_node TEXT NOT NULL,
                to_node TEXT NOT NULL,
                type TEXT NOT NULL,
                weight REAL NOT NULL DEFAULT 1.0,
                metadata_json TEXT NOT NULL CHECK (json_valid(metadata_json)),
                created_at TEXT NOT NULL,
                UNIQUE(from_node, to_node, type),
                FOREIGN KEY(from_node) REFERENCES nodes(id) ON DELETE CASCADE,
                FOREIGN KEY(to_node) REFERENCES nodes(id) ON DELETE CASCADE
            );
            CREATE TABLE IF NOT EXISTS chunks (
                id TEXT PRIMARY KEY,
                source_node TEXT NOT NULL,
                text TEXT NOT NULL,
                metadata_json TEXT NOT NULL CHECK (json_valid(metadata_json)),
                created_at TEXT NOT NULL,
                FOREIGN KEY(source_node) REFERENCES nodes(id) ON DELETE CASCADE
            );
            CREATE TABLE IF NOT EXISTS knowledge_sources (
                id TEXT PRIMARY KEY,
                root_path TEXT NOT NULL UNIQUE,
                os_type TEXT NOT NULL,
                drive_id TEXT,
                label TEXT,
                status TEXT NOT NULL,
                include_ocr INTEGER NOT NULL DEFAULT 0,
                watch_enabled INTEGER NOT NULL DEFAULT 0,
                consent_json TEXT NOT NULL CHECK (json_valid(consent_json)),
                created_at TEXT NOT NULL,
                updated_at TEXT NOT NULL,
                last_scanned_at TEXT
            );
            CREATE TABLE IF NOT EXISTS local_file_index (
                id TEXT PRIMARY KEY,
                source_id TEXT NOT NULL,
                os_type TEXT NOT NULL,
                drive_id TEXT,
                root_path TEXT NOT NULL,
                file_path TEXT NOT NULL,
                relative_path TEXT NOT NULL,
                file_name TEXT NOT NULL,
                extension TEXT NOT NULL,
                size_bytes INTEGER,
                modified_at TEXT,
                sha256 TEXT,
                last_scanned_at TEXT,
                last_indexed_at TEXT,
                parser_type TEXT,
                status TEXT NOT NULL,
                error_message TEXT,
                graph_node_id TEXT,
                deleted INTEGER NOT NULL DEFAULT 0,
                metadata_json TEXT NOT NULL CHECK (json_valid(metadata_json)),
                UNIQUE(source_id, relative_path),
                FOREIGN KEY(source_id) REFERENCES knowledge_sources(id) ON DELETE CASCADE
            );
            CREATE TABLE IF NOT EXISTS vector_embeddings (
                item_id TEXT PRIMARY KEY,
                item_type TEXT NOT NULL,
                source_node TEXT NOT NULL,
                text_hash TEXT NOT NULL,
                embedding BLOB NOT NULL,
                embedding_dim INTEGER NOT NULL,
                embedding_model TEXT NOT NULL,
                metadata_json TEXT NOT NULL CHECK (json_valid(metadata_json)),
                indexed_at TEXT NOT NULL,
                FOREIGN KEY(source_node) REFERENCES nodes(id) ON DELETE CASCADE
            );
            CREATE TABLE IF NOT EXISTS vector_index_operations (
                id TEXT PRIMARY KEY,
                operation TEXT NOT NULL,
                status TEXT NOT NULL,
                requested_at TEXT NOT NULL,
                started_at TEXT,
                completed_at TEXT,
                items_total INTEGER NOT NULL DEFAULT 0,
                items_indexed INTEGER NOT NULL DEFAULT 0,
                items_skipped INTEGER NOT NULL DEFAULT 0,
                error_message TEXT,
                metadata_json TEXT NOT NULL CHECK (json_valid(metadata_json))
            );
            -- v3.6.0 Knowledge Graph First: per-ingestion provenance trail.
            -- Append-only audit of where every graph node came from, when it
            -- was captured, how it was processed, and whether it was embedded /
            -- linked / used by an agent. get_provenance() returns the latest row.
            CREATE TABLE IF NOT EXISTS ingestion_provenance (
                id TEXT PRIMARY KEY,
                node_id TEXT NOT NULL,
                source_type TEXT NOT NULL,
                source_uri TEXT,
                content_hash TEXT,
                title TEXT,
                pipeline TEXT NOT NULL,
                owner TEXT,
                workspace_id TEXT,
                captured_at TEXT,
                modified_at TEXT,
                embedded INTEGER NOT NULL DEFAULT 0,
                linked INTEGER NOT NULL DEFAULT 0,
                duplicate INTEGER NOT NULL DEFAULT 0,
                agent_used TEXT,
                chunk_count INTEGER NOT NULL DEFAULT 0,
                permissions_json TEXT NOT NULL DEFAULT '{}' CHECK (json_valid(permissions_json)),
                metadata_json TEXT NOT NULL DEFAULT '{}' CHECK (json_valid(metadata_json)),
                created_at TEXT NOT NULL
            );
            CREATE INDEX IF NOT EXISTS idx_nodes_type ON nodes(type);
            CREATE INDEX IF NOT EXISTS idx_edges_from ON edges(from_node);
            CREATE INDEX IF NOT EXISTS idx_edges_to ON edges(to_node);
            CREATE INDEX IF NOT EXISTS idx_chunks_source ON chunks(source_node);
            CREATE INDEX IF NOT EXISTS idx_knowledge_sources_root ON knowledge_sources(root_path);
            CREATE INDEX IF NOT EXISTS idx_local_file_index_source ON local_file_index(source_id);
            CREATE INDEX IF NOT EXISTS idx_local_file_index_status ON local_file_index(status);
            CREATE INDEX IF NOT EXISTS idx_local_file_index_graph_node ON local_file_index(graph_node_id);
            CREATE INDEX IF NOT EXISTS idx_vector_embeddings_type ON vector_embeddings(item_type);
            CREATE INDEX IF NOT EXISTS idx_vector_embeddings_source ON vector_embeddings(source_node);
            CREATE INDEX IF NOT EXISTS idx_vector_embeddings_model ON vector_embeddings(embedding_model);
            CREATE INDEX IF NOT EXISTS idx_vector_index_operations_requested ON vector_index_operations(requested_at);
            CREATE INDEX IF NOT EXISTS idx_provenance_node ON ingestion_provenance(node_id);
            CREATE INDEX IF NOT EXISTS idx_provenance_source_type ON ingestion_provenance(source_type);
            CREATE INDEX IF NOT EXISTS idx_provenance_hash ON ingestion_provenance(content_hash);
            CREATE INDEX IF NOT EXISTS idx_provenance_created ON ingestion_provenance(created_at);
            """
        )
        conn.execute(
            "INSERT OR REPLACE INTO graph_meta(key, value) VALUES (?, ?)",
            ("schema_version", str(GRAPH_SCHEMA_VERSION)),
        )
    self._init_v2_schema()
|
|
993
|
-
|
|
994
|
-
# SQL views that rebuild the legacy column names on top of the normalized v2
# tables, so the read methods run unchanged against either source. The
# projection keeps the raw legacy type string in ``legacy_type`` (the views
# fall back to ``type`` when it is NULL) and stores summary + metadata as
# first-class columns (no more ``attrs._kg`` passthrough / ``evidence`` abuse).
# Column mapping: label → title, attrs → metadata_json (nodes);
# source/target → from_node/to_node, metadata → metadata_json (edges).
# NOTE(review): the legacy ``nodes`` table also has a ``raw_json`` column that
# ``kgv2_nodes`` does not expose — confirm no read path selects it via the view.
_V2_VIEWS_SQL = """
CREATE VIEW IF NOT EXISTS kgv2_nodes AS
SELECT id,
COALESCE(legacy_type, type) AS type,
label AS title,
summary,
attrs AS metadata_json,
created_at, updated_at
FROM nodes_v2;
CREATE VIEW IF NOT EXISTS kgv2_edges AS
SELECT id, source AS from_node, target AS to_node,
COALESCE(legacy_type, type) AS type,
weight,
metadata AS metadata_json,
created_at
FROM edges_v2;
"""
|
|
1017
|
-
|
|
1018
|
-
def _init_v2_schema(self) -> None:
    """Initialize the normalized v2 tables + reconstruction views, migrating
    the projection layout when it is stale — **atomically**.

    The entire DROP → CREATE → VIEWS → BACKFILL → version-stamp sequence runs
    in a single transaction on one connection: on any failure it rolls back,
    leaving the prior projection untouched and the version unchanged, so the
    next startup simply retries. The migration only ever touches the v2
    tables/views and the ``projection_version`` key — never the authoritative
    legacy ``nodes``/``edges`` — so legacy data cannot be corrupted even if
    the rebuild fails midway.

    Does nothing when the v2 store or the script helper is unavailable. Any
    exception is logged as a warning and swallowed.
    """
    if KGStoreV2 is None or _exec_script is None:
        return
    from contextlib import closing

    try:
        # closing() releases the connection; the inner `conn` context commits
        # on success and rolls back on error (it does not close by itself).
        with closing(self._connect()) as conn, conn:
            conn.execute("BEGIN")
            stale = self._projection_version(conn) != _PROJECTION_VERSION
            if stale:
                # The projection is non-authoritative; drop it so init_schema
                # recreates the tables with the current normalized columns.
                for stmt in (
                    "DROP VIEW IF EXISTS kgv2_edges",
                    "DROP VIEW IF EXISTS kgv2_nodes",
                    "DROP TABLE IF EXISTS edges_v2",
                    "DROP TABLE IF EXISTS nodes_v2",
                ):
                    conn.execute(stmt)
            # init_schema(conn=...) joins this transaction (no implicit commit)
            KGStoreV2(self.db_path).init_schema(conn=conn)
            _exec_script(conn, self._V2_VIEWS_SQL)
            self._backfill_v2_on(conn, force=stale)
            # version stamp commits together with the backfill — never stranded
            conn.execute(
                "INSERT OR REPLACE INTO kg_meta(key, value) VALUES ('projection_version', ?)",
                (str(_PROJECTION_VERSION),),
            )
    except Exception as e:
        logging.warning("knowledge_graph: v2 schema init/backfill skipped: %s", e)
|
|
1057
|
-
|
|
1058
|
-
def _projection_version(self, conn: sqlite3.Connection) -> int:
|
|
1059
|
-
"""Return the stored v2 projection layout version (0 if unknown).
|
|
1060
|
-
|
|
1061
|
-
A fresh DB (kg_meta absent) raises ``sqlite3.OperationalError`` here and
|
|
1062
|
-
is correctly treated as version 0 → rebuild. Only sqlite errors are
|
|
1063
|
-
swallowed so a real bug doesn't masquerade as a stale projection.
|
|
1064
|
-
"""
|
|
1065
|
-
try:
|
|
1066
|
-
row = conn.execute(
|
|
1067
|
-
"SELECT value FROM kg_meta WHERE key='projection_version'"
|
|
1068
|
-
).fetchone()
|
|
1069
|
-
return int(row["value"]) if row and row["value"] is not None else 0
|
|
1070
|
-
except sqlite3.Error:
|
|
1071
|
-
return 0
|
|
1072
|
-
|
|
1073
|
-
def _backfill_v2_if_needed(self, *, force: bool = False) -> None:
|
|
1074
|
-
"""Project legacy nodes/edges into v2 on a fresh transaction.
|
|
1075
|
-
|
|
1076
|
-
Thin wrapper around :meth:`_backfill_v2_on` for callers (tests, ad-hoc
|
|
1077
|
-
re-sync) that aren't already inside the migration transaction.
|
|
1078
|
-
"""
|
|
1079
|
-
try:
|
|
1080
|
-
with self._connect() as conn:
|
|
1081
|
-
self._backfill_v2_on(conn, force=force)
|
|
1082
|
-
except Exception as ex:
|
|
1083
|
-
logging.warning("knowledge_graph: v2 backfill skipped: %s", ex)
|
|
1084
|
-
|
|
1085
|
-
def _backfill_v2_on(self, conn: sqlite3.Connection, *, force: bool = False) -> None:
|
|
1086
|
-
"""Project legacy nodes/edges into the normalized v2 tables on ``conn``.
|
|
1087
|
-
|
|
1088
|
-
Non-destructive to legacy. ``force`` rebuilds unconditionally (used after
|
|
1089
|
-
a layout migration); otherwise it only projects when v2 is empty. The v2
|
|
1090
|
-
graph is a derived projection, so clearing + rebuilding it is always safe.
|
|
1091
|
-
Idempotent: no-ops once v2 carries the current projection. Copies the
|
|
1092
|
-
legacy column values **verbatim** so the kgv2_* views are byte-faithful.
|
|
1093
|
-
"""
|
|
1094
|
-
legacy_nodes = conn.execute("SELECT COUNT(*) FROM nodes").fetchone()[0]
|
|
1095
|
-
if legacy_nodes == 0:
|
|
1096
|
-
return
|
|
1097
|
-
v2_nodes = conn.execute("SELECT COUNT(*) FROM nodes_v2").fetchone()[0]
|
|
1098
|
-
if v2_nodes > 0 and not force:
|
|
1099
|
-
return # current projection already present
|
|
1100
|
-
# (re)project: clear v2 graph (not authoritative) and rebuild
|
|
1101
|
-
conn.execute("DELETE FROM edges_v2")
|
|
1102
|
-
conn.execute("DELETE FROM nodes_v2")
|
|
1103
|
-
n = e = 0
|
|
1104
|
-
for r in conn.execute(
|
|
1105
|
-
"SELECT id, type, title, summary, metadata_json, created_at, updated_at FROM nodes"
|
|
1106
|
-
).fetchall():
|
|
1107
|
-
self._v2_project_node(
|
|
1108
|
-
conn, r["id"], r["type"], r["title"], r["summary"], r["metadata_json"],
|
|
1109
|
-
created_at=r["created_at"], updated_at=r["updated_at"],
|
|
1110
|
-
)
|
|
1111
|
-
n += 1
|
|
1112
|
-
for r in conn.execute(
|
|
1113
|
-
"SELECT id, from_node, to_node, type, weight, metadata_json, created_at FROM edges"
|
|
1114
|
-
).fetchall():
|
|
1115
|
-
self._v2_project_edge(
|
|
1116
|
-
conn, r["from_node"], r["to_node"], r["type"], float(r["weight"] or 1.0),
|
|
1117
|
-
r["metadata_json"], edge_id=r["id"], created_at=r["created_at"],
|
|
1118
|
-
)
|
|
1119
|
-
e += 1
|
|
1120
|
-
logging.info("knowledge_graph: projected legacy → v2 (%d nodes, %d edges)", n, e)
|
|
1121
|
-
|
|
1122
|
-
# ── v2 dual-write projection (normalized type, byte-faithful legacy values) ──
|
|
1123
|
-
# The projection stores the legacy ``title``/``summary``/``metadata_json``
|
|
1124
|
-
# values it is handed VERBATIM (no truncation or JSON re-encoding) so the
|
|
1125
|
-
# kgv2_* views reproduce the legacy rows exactly. Callers (_upsert_* and the
|
|
1126
|
-
# backfill) pass the already-canonical legacy column values.
|
|
1127
|
-
def _v2_project_node(
    self, conn: sqlite3.Connection, node_id: str, node_type: str, title: str,
    summary: Optional[str], metadata_json: Optional[str],
    *, created_at: Optional[str] = None, updated_at: Optional[str] = None,
) -> None:
    """Upsert one legacy node into ``nodes_v2`` on ``conn``.

    The legacy type string is stored in ``legacy_type`` next to its
    normalized form; title, summary and metadata are written as handed in
    (a ``None`` metadata becomes ``"{}"``). On an id conflict everything but
    ``created_at`` is refreshed. Insert failures are logged at debug level and
    swallowed. Does nothing when the v2 store is unavailable.
    """
    if KGStoreV2 is None:
        return

    stamp = updated_at or _now()
    if NodeType is not None:
        normalized = NodeType.from_legacy(node_type).value
    else:
        normalized = node_type
    attrs = "{}" if metadata_json is None else metadata_json
    params = (
        node_id, normalized, node_type, title, summary,
        attrs,
        created_at or stamp, stamp,
    )

    try:
        conn.execute(
            """
            INSERT INTO nodes_v2(id, type, legacy_type, label, summary, attrs,
            owner_id, visibility, created_at, updated_at,
            importance_score)
            VALUES (?, ?, ?, ?, ?, ?, NULL, 'private', ?, ?, 0.0)
            ON CONFLICT(id) DO UPDATE SET
            type=excluded.type, legacy_type=excluded.legacy_type,
            label=excluded.label, summary=excluded.summary,
            attrs=excluded.attrs, updated_at=excluded.updated_at
            """,
            params,
        )
    except Exception as ex:
        logging.debug("knowledge_graph: v2 node projection skipped (%s): %s", node_id, ex)
|
|
1154
|
-
|
|
1155
|
-
def _v2_project_edge(
    self, conn: sqlite3.Connection, from_node: str, to_node: str, edge_type: str,
    weight: float, metadata_json: Optional[str],
    *, edge_id: Optional[str] = None, created_at: Optional[str] = None,
) -> None:
    """Upsert one edge row into ``edges_v2`` using the caller's connection.

    ``metadata_json`` is stored as given (``"{}"`` when ``None``). The legacy
    ``edge_type`` is kept in ``legacy_type`` and ``type`` receives
    ``EdgeType.from_legacy(edge_type).value`` when ``EdgeType`` is available.
    When ``edge_id`` is omitted the id is derived from a hash of
    ``from_node|edge_type|to_node``.

    ``confidence`` is read from the metadata's ``"confidence"`` key and
    defaults to ``1.0``; the same default is used when the metadata cannot be
    read as a mapping or the value is not numeric, so a malformed payload
    cannot raise out of this best-effort projection.

    On a ``(source, target, legacy_type)`` conflict the row keeps the larger
    weight and takes the new type, confidence and metadata. The method is a
    no-op when ``KGStoreV2`` is ``None``. A failing statement is logged at
    debug level and not re-raised.
    """
    if KGStoreV2 is None:
        return
    eid = edge_id or f"edge:{_sha256_text(f'{from_node}|{edge_type}|{to_node}')[:24]}"
    norm_type = EdgeType.from_legacy(edge_type).value if EdgeType is not None else edge_type
    meta_str = metadata_json if metadata_json is not None else "{}"
    try:
        confidence = float(_safe_loads(meta_str).get("confidence", 1.0))
    except (AttributeError, TypeError, ValueError):
        # Non-dict metadata, a null or a non-numeric confidence: fall back to
        # the default rather than aborting the caller's write.
        confidence = 1.0
    try:
        conn.execute(
            """
            INSERT INTO edges_v2(id, source, target, type, legacy_type, weight,
                                 confidence, evidence, metadata, created_by, created_at)
            VALUES (?, ?, ?, ?, ?, ?, ?, '[]', ?, 'legacy', ?)
            ON CONFLICT(source, target, legacy_type) DO UPDATE SET
                type=excluded.type,
                weight=max(edges_v2.weight, excluded.weight),
                confidence=excluded.confidence,
                metadata=excluded.metadata
            """,
            (eid, from_node, to_node, norm_type, edge_type, float(weight),
             confidence, meta_str, created_at or _now()),
        )
    except Exception as ex:
        # Best-effort mirror: never let the projection break the legacy write.
        logging.debug("knowledge_graph: v2 edge projection skipped (%s->%s): %s", from_node, to_node, ex)
|
|
1183
|
-
|
|
1184
|
-
def _v2_delete_nodes(self, conn: sqlite3.Connection, ids) -> None:
    """Mirror legacy node deletions into ``nodes_v2``.

    ``ids`` may be any iterable of node ids; it is materialised once. The
    delete runs in batches so a large id list cannot exceed SQLite's limit on
    bound parameters per statement (which would otherwise fail and be
    swallowed below, leaving stale v2 rows). The method is a no-op when
    ``KGStoreV2`` is ``None`` or ``ids`` is empty. A failing statement is
    logged at debug level and not re-raised.
    """
    if KGStoreV2 is None:
        return
    ids = list(ids)
    if not ids:
        return
    # Comfortably below the historical default of 999 bound variables.
    batch_size = 500
    try:
        for start in range(0, len(ids), batch_size):
            batch = ids[start:start + batch_size]
            ph = ",".join("?" * len(batch))
            conn.execute(f"DELETE FROM nodes_v2 WHERE id IN ({ph})", batch)
    except Exception as ex:
        logging.debug("knowledge_graph: v2 node delete mirror skipped: %s", ex)
|
|
1196
|
-
|
|
1197
|
-
def _v2_delete_edges_from(self, conn: sqlite3.Connection, node_id: str) -> None:
    """Delete every ``edges_v2`` row whose ``source`` is ``node_id``.

    Counterpart of the legacy ``DELETE FROM edges WHERE from_node=?``. Skipped
    entirely when ``KGStoreV2`` is ``None``; a failing statement is logged at
    debug level and not re-raised.
    """
    if KGStoreV2 is not None:
        try:
            conn.execute("DELETE FROM edges_v2 WHERE source=?", (node_id,))
        except Exception as ex:
            logging.debug("knowledge_graph: v2 edge delete mirror skipped: %s", ex)
|
|
1205
|
-
|
|
1206
|
-
def _v2_sync_report(self) -> Dict[str, Any]:
    """Compare legacy and v2 id sets and report any drift between them.

    Returns ``{"available": False, "in_sync": True}`` when ``KGStoreV2`` is
    ``None``. Otherwise the result carries the four row counts, sorted lists
    of ids missing from / extra in v2 for nodes and edges, and ``in_sync``,
    which is true only when both the node id sets and the edge id sets are
    equal.
    """
    if KGStoreV2 is None:
        return {"available": False, "in_sync": True}

    id_queries = (
        "SELECT id FROM nodes",
        "SELECT id FROM nodes_v2",
        "SELECT id FROM edges",
        "SELECT id FROM edges_v2",
    )
    with self._connect() as conn:
        legacy_nodes, v2_nodes, legacy_edges, v2_edges = (
            {row[0] for row in conn.execute(sql)} for sql in id_queries
        )

    nodes_match = legacy_nodes == v2_nodes
    report: Dict[str, Any] = {
        "available": True,
        "in_sync": nodes_match and legacy_edges == v2_edges,
        "nodes_legacy": len(legacy_nodes),
        "nodes_v2": len(v2_nodes),
        "edges_legacy": len(legacy_edges),
        "edges_v2": len(v2_edges),
        "nodes_missing_from_v2": sorted(legacy_nodes - v2_nodes),
        "nodes_extra_in_v2": sorted(v2_nodes - legacy_nodes),
        "edges_missing_from_v2": sorted(legacy_edges - v2_edges),
        "edges_extra_in_v2": sorted(v2_edges - legacy_edges),
    }
    return report
|
|
1234
|
-
|
|
1235
|
-
def _upsert_node(
    self,
    conn: sqlite3.Connection,
    node_id: str,
    node_type: str,
    title: str,
    summary: str = "",
    metadata: Optional[Dict[str, Any]] = None,
    raw: Optional[Dict[str, Any]] = None,
) -> str:
    """Insert or update a legacy node, mirror it to v2 and refresh its vector.

    The title is cut to 240 characters and the summary to 1000; those
    truncated strings and the serialised metadata are computed once and
    handed unchanged to ``_v2_project_node`` so both stores hold the same
    values. On an id conflict the legacy row's title, summary, metadata_json,
    raw_json and updated_at are overwritten. Every node whose type is not
    ``"Chunk"`` also gets a vector item built from title, summary and
    metadata.

    Returns ``node_id``.
    """
    stamp = _now()
    stored_title = title[:240]
    stored_summary = summary[:1000]
    stored_meta = _json(metadata)

    row = (
        node_id,
        node_type,
        stored_title,
        stored_summary,
        stored_meta,
        _json(raw),
        stamp,
        stamp,
    )
    conn.execute(
        """
        INSERT INTO nodes(id, type, title, summary, metadata_json, raw_json, created_at, updated_at)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
        ON CONFLICT(id) DO UPDATE SET
            title=excluded.title,
            summary=excluded.summary,
            metadata_json=excluded.metadata_json,
            raw_json=excluded.raw_json,
            updated_at=excluded.updated_at
        """,
        row,
    )

    # Dual-write on the same connection so both stores move together.
    self._v2_project_node(
        conn, node_id, node_type, stored_title, stored_summary, stored_meta,
        created_at=stamp, updated_at=stamp,
    )

    if node_type == "Chunk":
        return node_id

    vector_text = self._vector_text_for_node(
        title=stored_title, summary=stored_summary, metadata=metadata,
    )
    vector_meta = {"node_type": node_type, **(metadata or {})}
    self._upsert_vector_item(
        conn,
        item_id=node_id,
        item_type="node",
        source_node=node_id,
        text=vector_text,
        metadata=vector_meta,
    )
    return node_id
|
|
1277
|
-
|
|
1278
|
-
def _upsert_edge(
    self,
    conn: sqlite3.Connection,
    from_node: str,
    to_node: str,
    edge_type: str,
    weight: float = 1.0,
    metadata: Optional[Dict[str, Any]] = None,
) -> str:
    """Insert or update a legacy edge and mirror it into the v2 graph.

    The edge id is ``edge:`` plus the first 24 characters of the hash of
    ``from_node|edge_type|to_node``. On a ``(from_node, to_node, type)``
    conflict the stored weight becomes the larger of the old and new values
    and the metadata is replaced. The serialised metadata string is shared
    with ``_v2_project_edge``.

    Returns the edge id.
    """
    digest = _sha256_text(f"{from_node}|{edge_type}|{to_node}")
    edge_id = f"edge:{digest[:24]}"
    stamp = _now()
    stored_meta = _json(metadata)
    stored_weight = float(weight)

    conn.execute(
        """
        INSERT INTO edges(id, from_node, to_node, type, weight, metadata_json, created_at)
        VALUES (?, ?, ?, ?, ?, ?, ?)
        ON CONFLICT(from_node, to_node, type) DO UPDATE SET
            weight=max(edges.weight, excluded.weight),
            metadata_json=excluded.metadata_json
        """,
        (edge_id, from_node, to_node, edge_type, stored_weight, stored_meta, stamp),
    )

    # Dual-write on the same connection so both stores move together.
    self._v2_project_edge(
        conn, from_node, to_node, edge_type, stored_weight, stored_meta,
        edge_id=edge_id, created_at=stamp,
    )
    return edge_id
|
|
1304
|
-
|
|
1305
|
-
def _vector_text_for_node(
    self,
    *,
    title: str,
    summary: str = "",
    metadata: Optional[Dict[str, Any]] = None,
) -> str:
    """Build the text that gets embedded for a node.

    The result is three newline-separated parts passed through
    ``_clean_text``: the title, the summary, and a space-joined list of the
    truthy values found under a fixed set of metadata keys (in the order
    listed below).
    """
    metadata = metadata or {}
    searchable_keys = (
        "filename", "relative_path", "file_path", "conversation_id", "source",
        "category", "ext", "role",
    )
    candidates = (metadata.get(key) for key in searchable_keys)
    meta_line = " ".join(str(value) for value in candidates if value)
    sections = [str(title or ""), str(summary or ""), meta_line]
    return _clean_text("\n".join(sections))
|
|
1322
|
-
|
|
1323
|
-
def _upsert_vector_item(
    self,
    conn: sqlite3.Connection,
    *,
    item_id: str,
    item_type: str,
    source_node: str,
    text: str,
    metadata: Optional[Dict[str, Any]] = None,
) -> bool:
    """Create or refresh the stored embedding for one item.

    The text is cleaned first. If fewer than two characters remain, any
    existing embedding row for ``item_id`` is deleted and ``False`` is
    returned. If a row already exists with the same text hash, embedding
    dimension and model id, nothing is written and ``False`` is returned.
    Otherwise the first 50,000 characters are embedded and the row is
    inserted or overwritten.

    Returns ``True`` only when an embedding row was written.
    """
    text = _clean_text(text)
    if len(text) < 2:
        conn.execute("DELETE FROM vector_embeddings WHERE item_id=?", (item_id,))
        return False

    text_hash = _sha256_text(text)
    existing = conn.execute(
        """
        SELECT text_hash, embedding_dim, embedding_model
        FROM vector_embeddings
        WHERE item_id=?
        """,
        (item_id,),
    ).fetchone()

    model = self._embedding_model
    if existing:
        same_text = existing["text_hash"] == text_hash
        if (
            same_text
            and existing["embedding_dim"] == model.dim
            and existing["embedding_model"] == model.model_id
        ):
            # Already indexed with this text and this model.
            return False

    vector = model.embed(text[:50_000])
    embedding = model.encode(vector)
    row = (
        item_id,
        item_type,
        source_node,
        text_hash,
        embedding,
        model.dim,
        model.model_id,
        _json(metadata),
        _now(),
    )
    conn.execute(
        """
        INSERT INTO vector_embeddings(
            item_id, item_type, source_node, text_hash, embedding,
            embedding_dim, embedding_model, metadata_json, indexed_at
        )
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
        ON CONFLICT(item_id) DO UPDATE SET
            item_type=excluded.item_type,
            source_node=excluded.source_node,
            text_hash=excluded.text_hash,
            embedding=excluded.embedding,
            embedding_dim=excluded.embedding_dim,
            embedding_model=excluded.embedding_model,
            metadata_json=excluded.metadata_json,
            indexed_at=excluded.indexed_at
        """,
        row,
    )
    return True
|
|
1384
|
-
|
|
1385
|
-
def _upsert_chunk(
    self,
    conn: sqlite3.Connection,
    *,
    chunk_id: str,
    source_node: str,
    text: str,
    metadata: Optional[Dict[str, Any]] = None,
) -> None:
    """Store a text chunk and index it for vector search.

    The chunk row is written with ``INSERT OR REPLACE``. Its vector item uses
    the chunk id as both ``item_id`` and ``source_node``; the owning node is
    recorded in the vector metadata under ``parent_source_node``.
    """
    chunk_meta = metadata or {}
    insert_sql = (
        "INSERT OR REPLACE INTO chunks(id, source_node, text, metadata_json, created_at) "
        "VALUES (?, ?, ?, ?, ?)"
    )
    conn.execute(insert_sql, (chunk_id, source_node, text, _json(chunk_meta), _now()))

    vector_meta = {**chunk_meta, "parent_source_node": source_node}
    self._upsert_vector_item(
        conn,
        item_id=chunk_id,
        item_type="chunk",
        source_node=chunk_id,
        text=text,
        metadata=vector_meta,
    )
|
|
1408
|
-
|
|
1409
|
-
# ── Local folder sources → Graph RAG ──────────────────────────────────
|
|
1410
|
-
|
|
1411
|
-
def discover_local_roots(self) -> Dict[str, Any]:
    """List candidate starting folders for browsing on this machine.

    Always offers the home directory and a fixed set of folders directly
    under it. Depending on the detected OS it then adds entries under
    ``/Volumes`` (macOS), existing drive letters plus the OneDrive folders
    named by environment variables (Windows), or ``/mnt`` and ``/media`` with
    their direct children (Linux). Paths that do not exist, or that resolve
    to a path already listed, are left out.

    Returns a dict with ``os_type``, ``computer``, ``roots`` and
    ``privacy_notice``.
    """
    os_type = _current_os_type()
    home = Path.home().expanduser()
    roots: List[Dict[str, Any]] = []
    seen: set = set()

    def register(
        label: str,
        path: Path,
        kind: str,
        *,
        recommended: bool = True,
        warning: Optional[str] = None,
    ) -> None:
        # Fall back to the unresolved path when resolution fails.
        try:
            resolved = path.expanduser().resolve()
        except OSError:
            resolved = path.expanduser()
        key = str(resolved)
        if key in seen:
            return
        if not resolved.exists():
            return
        seen.add(key)
        entry = {
            "id": f"{kind}:{_path_fingerprint(resolved)}",
            "label": label,
            "path": key,
            "kind": kind,
            "recommended": recommended,
            "warning": warning or _root_warning(resolved, os_type),
        }
        roots.append(entry)

    def by_name(entry: Path) -> str:
        return entry.name.lower()

    register("홈", home, "home", warning=_root_warning(home, os_type))

    home_folders = (
        ("Documents", "문서"),
        ("Desktop", "데스크탑"),
        ("Downloads", "다운로드"),
        ("Pictures", "사진"),
        ("Projects", "프로젝트"),
    )
    for folder_name, folder_label in home_folders:
        register(folder_label, home / folder_name, folder_name.lower())

    if os_type == "macos":
        volumes = Path("/Volumes")
        if volumes.exists():
            try:
                for volume in sorted(volumes.iterdir(), key=by_name):
                    register(volume.name, volume, "volume", recommended=False)
            except OSError:
                pass
    elif os_type == "windows":
        for letter in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
            drive = Path(f"{letter}:\\")
            if not drive.exists():
                continue
            register(f"{letter}: 드라이브", drive, "drive", recommended=(letter != "C"))
        cloud_env = (("OneDrive", "OneDrive"), ("OneDriveCommercial", "OneDrive"))
        for env_name, cloud_label in cloud_env:
            location = os.environ.get(env_name)
            if location:
                register(cloud_label, Path(location), "cloud", recommended=False)
    elif os_type == "linux":
        for base in (Path("/mnt"), Path("/media")):
            register(str(base), base, "mounts", recommended=False)
            try:
                if base.exists():
                    for mounted in sorted(base.iterdir(), key=by_name):
                        register(mounted.name, mounted, "volume", recommended=False)
            except OSError:
                pass

    return {
        "os_type": os_type,
        "computer": platform.node() or "local",
        "roots": roots,
        "privacy_notice": "처음에는 드라이브와 폴더 구조만 확인하며, 파일 내용은 사용자가 동의한 뒤에만 읽습니다.",
    }
|
|
1479
|
-
|
|
1480
|
-
def preview_local_tree(self, path: Path, *, max_items: int = 200) -> Dict[str, Any]:
    """List one folder level using metadata only; file contents are not read.

    Args:
        path: Folder to list. It is user-expanded and resolved first.
        max_items: Maximum number of children to report, clamped to 1..1000
            (a falsy value means 200).

    Returns:
        A dict with the resolved ``path``, ``os_type``, one ``items`` entry
        per reported child (directories first, then by lower-cased name),
        ``truncated``, the ``inaccessible`` count, ``warning`` and
        ``privacy_notice``. A child whose metadata cannot be read is listed
        with ``accessible: False``. If the folder itself cannot be listed
        because of a permission error, a reduced dict with ``path``, empty
        ``items``, ``error`` and ``privacy_notice`` is returned instead.

    Raises:
        ValueError: If the path does not exist or is not a directory.
    """
    root = Path(path).expanduser().resolve()
    if not root.exists():
        raise ValueError(f"경로가 존재하지 않습니다: {path}")
    if not root.is_dir():
        raise ValueError(f"폴더가 아닙니다: {path}")

    os_type = _current_os_type()
    max_items = max(1, min(int(max_items or 200), 1000))
    items: List[Dict[str, Any]] = []
    inaccessible = 0

    def _dirs_first(entry: Path):
        # A child that cannot be stat'ed must not abort the whole listing;
        # sort it with the files and let the loop below mark it inaccessible.
        try:
            entry_is_dir = entry.is_dir()
        except OSError:
            entry_is_dir = False
        return (not entry_is_dir, entry.name.lower())

    try:
        children = sorted(root.iterdir(), key=_dirs_first)
    except PermissionError as exc:
        return {
            "path": str(root),
            "items": [],
            "error": f"접근 권한 없음: {exc}",
            "privacy_notice": "현재 단계에서는 파일 내용을 읽지 않고, 폴더와 파일의 이름/크기/수정일만 확인합니다.",
        }

    for child in children[:max_items]:
        try:
            is_dir = child.is_dir()
            stat = child.stat()
            reason = _excluded_directory_reason(child, root=root, os_type=os_type) if is_dir else _sensitive_file_reason(child, root=root)
            items.append({
                "name": child.name,
                "path": str(child),
                "type": "directory" if is_dir else "file",
                "extension": "" if is_dir else child.suffix.lower(),
                "size_bytes": None if is_dir else stat.st_size,
                "modified_at": _safe_iso_from_stat_mtime(stat.st_mtime),
                "hidden": _is_hidden_path(child, root),
                "accessible": True,
                "excluded_reason": reason,
            })
        except PermissionError:
            inaccessible += 1
            items.append({
                "name": child.name,
                "path": str(child),
                "type": "unknown",
                "accessible": False,
                "excluded_reason": "permission_denied",
            })
        except OSError as exc:
            inaccessible += 1
            items.append({
                "name": child.name,
                "path": str(child),
                "type": "unknown",
                "accessible": False,
                "excluded_reason": str(exc),
            })

    return {
        "path": str(root),
        "os_type": os_type,
        "items": items,
        "truncated": len(children) > max_items,
        "inaccessible": inaccessible,
        "warning": _root_warning(root, os_type),
        "privacy_notice": "현재 단계에서는 파일 내용을 읽지 않고, 폴더와 파일의 이름/크기/수정일만 확인합니다.",
    }
|
|
1546
|
-
|
|
1547
|
-
def _iter_local_scan_entries(self, root: Path, *, max_files: int) -> Iterable[Dict[str, Any]]:
    """Walk ``root`` depth-first and yield one event dict per entry found.

    Every event has ``kind`` and ``path``; all kinds except ``"file"`` also
    carry ``reason``. Kinds are:

    * ``"file"`` (with ``stat``) for a regular file,
    * ``"excluded"`` for a symlink or a non-regular file,
    * ``"excluded_dir"`` for a directory rejected by
      ``_excluded_directory_reason`` (it is not descended into),
    * ``"inaccessible_dir"`` / ``"inaccessible_file"`` when listing a folder
      or inspecting an entry fails with an ``OSError``,
    * ``"limit_reached"`` once more than ``max_files`` regular files have
      been seen; the generator stops after yielding it.

    Symlinks are never followed. Within a folder, directories sort before
    files and both by lower-cased name.
    """
    os_type = _current_os_type()

    def _dirs_first(entry: Path):
        # An unreadable child must not make its whole parent look
        # inaccessible; the per-entry handling below reports it instead.
        try:
            entry_is_dir = entry.is_dir()
        except OSError:
            entry_is_dir = False
        return (not entry_is_dir, entry.name.lower())

    stack = [root]
    files_seen = 0
    while stack:
        current = stack.pop()
        try:
            children = sorted(current.iterdir(), key=_dirs_first)
        except PermissionError as exc:
            yield {"kind": "inaccessible_dir", "path": current, "reason": f"permission_denied: {exc}"}
            continue
        except OSError as exc:
            yield {"kind": "inaccessible_dir", "path": current, "reason": str(exc)}
            continue

        for child in children:
            try:
                # is_symlink() can itself fail with an OSError, so it belongs
                # inside the same guard as the other filesystem probes.
                if child.is_symlink():
                    yield {"kind": "excluded", "path": child, "reason": "symlink"}
                    continue
                if child.is_dir():
                    reason = _excluded_directory_reason(child, root=root, os_type=os_type)
                    if reason:
                        yield {"kind": "excluded_dir", "path": child, "reason": reason}
                    else:
                        stack.append(child)
                    continue
                if not child.is_file():
                    yield {"kind": "excluded", "path": child, "reason": "not_regular_file"}
                    continue
                stat = child.stat()
            except PermissionError as exc:
                yield {"kind": "inaccessible_file", "path": child, "reason": f"permission_denied: {exc}"}
                continue
            except OSError as exc:
                yield {"kind": "inaccessible_file", "path": child, "reason": str(exc)}
                continue

            files_seen += 1
            if files_seen > max_files:
                yield {"kind": "limit_reached", "path": child, "reason": "max_files"}
                return
            yield {"kind": "file", "path": child, "stat": stat}
|
|
1590
|
-
|
|
1591
|
-
def _local_file_decision(self, path: Path, root: Path, stat: os.stat_result) -> Dict[str, Any]:
    """Decide from metadata alone whether a file may be indexed.

    Checks run in this order and the first match wins: sensitive file
    (``sensitive_blocked``), unsupported category (``unsupported``), size
    above the category limit (``too_large``). A file that passes all three is
    ``pending`` and the only case with ``indexable`` set to ``True``.

    Returns a dict with ``status``, ``reason``, ``category``, ``parser_type``
    and ``indexable``.
    """
    ext = path.suffix.lower()
    category = _file_category(ext)
    parser_type = _parser_type_for_category(category, ext)

    def verdict(status: str, reason: str, *, indexable: bool = False) -> Dict[str, Any]:
        return {
            "status": status,
            "reason": reason,
            "category": category,
            "parser_type": parser_type,
            "indexable": indexable,
        }

    sensitive_reason = _sensitive_file_reason(path, root=root)
    if sensitive_reason:
        return verdict("sensitive_blocked", sensitive_reason)

    if category == "unsupported":
        return verdict("unsupported", "unsupported_extension")

    limit = _size_limit_for_category(category)
    if stat.st_size > limit:
        return verdict("too_large", f"size>{limit}")

    return verdict("pending", "", indexable=True)
|
|
1628
|
-
|
|
1629
|
-
def audit_local_folder(self, path: Path, *, include_ocr: bool = False, max_files: int = 50_000) -> Dict[str, Any]:
    """Summarise what indexing a folder would involve, from metadata only.

    Args:
        path: Folder to audit. It is user-expanded and resolved first.
        include_ocr: Whether image OCR is requested; it only changes the time
            estimate and the echoed ``include_ocr_requested`` flag.
        max_files: Scan budget, clamped to 1..200000 (a falsy value means
            50000).

    Returns:
        A dict with source identification, a ``summary`` of counts and the
        estimated duration, per-status / per-category / per-extension
        breakdowns (extensions limited to the 40 most common), up to 25
        allowed and 25 excluded samples, and the ``consent_required`` flags.

    Raises:
        ValueError: If the path does not exist or is not a directory.
    """
    root = Path(path).expanduser().resolve()
    if not root.exists():
        raise ValueError(f"경로가 존재하지 않습니다: {path}")
    if not root.is_dir():
        raise ValueError(f"폴더가 아닙니다: {path}")

    os_type = _current_os_type()
    max_files = max(1, min(int(max_files or 50_000), 200_000))

    sample_cap = 25
    status_counts: Counter = Counter()
    category_counts: Counter = Counter()
    extension_counts: Counter = Counter()
    allowed_samples: List[Dict[str, Any]] = []
    excluded_samples: List[Dict[str, Any]] = []
    total_files = 0
    readable_files = 0
    inaccessible = 0
    excluded_dirs = 0
    limit_reached = False

    def remember_excluded(target: Path, status: str, reason: str) -> None:
        # Keep only the first few examples for the report.
        if len(excluded_samples) < sample_cap:
            excluded_samples.append(_sample_file(target, root, status, reason))

    for entry in self._iter_local_scan_entries(root, max_files=max_files):
        kind = entry["kind"]
        target = entry["path"]

        if kind == "limit_reached":
            limit_reached = True
            break

        if kind == "excluded_dir":
            excluded_dirs += 1
            remember_excluded(target, "excluded", entry.get("reason", ""))
        elif kind in {"inaccessible_dir", "inaccessible_file"}:
            inaccessible += 1
            status_counts["failed"] += 1
            remember_excluded(target, "failed", entry.get("reason", ""))
        elif kind == "excluded":
            status_counts["excluded"] += 1
            remember_excluded(target, "excluded", entry.get("reason", ""))
        elif kind == "file":
            total_files += 1
            decision = self._local_file_decision(target, root, entry["stat"])
            status = decision["status"]
            category = decision["category"]
            ext = target.suffix.lower() or "(none)"
            category_counts[category] += 1
            extension_counts[ext] += 1
            if decision["indexable"]:
                readable_files += 1
                status_counts["readable"] += 1
                if len(allowed_samples) < sample_cap:
                    allowed_samples.append(_sample_file(target, root, "readable"))
            else:
                status_counts[status] += 1
                remember_excluded(target, status, decision["reason"])

    # Rough per-category cost model for the duration estimate.
    doc_weight = category_counts["pdf"] * 1.4 + category_counts["document"] * 0.9 + category_counts["slide_deck"] * 1.0
    sheet_weight = category_counts["spreadsheet"] * 0.6
    ocr_weight = category_counts["image"] * (1.8 if include_ocr else 0.1)
    estimated_seconds = round(readable_files * 0.04 + doc_weight + sheet_weight + ocr_weight, 1)

    excluded_total = int(
        status_counts["excluded"]
        + status_counts["sensitive_blocked"]
        + status_counts["too_large"]
        + status_counts["unsupported"]
    )
    summary = {
        "total_files": total_files,
        "readable_files": readable_files,
        "excluded_files": excluded_total,
        "sensitive_files": int(status_counts["sensitive_blocked"]),
        "too_large_files": int(status_counts["too_large"]),
        "unsupported_files": int(status_counts["unsupported"]),
        "image_ocr_candidates": int(category_counts["image"]),
        "inaccessible_items": inaccessible,
        "excluded_dirs": excluded_dirs,
        "estimated_seconds": estimated_seconds,
        "storage_root": str(self.db_path.parent),
        "limit_reached": limit_reached,
    }
    consent_required = {
        "knowledge_source": True,
        "image_ocr": bool(category_counts["image"]),
        "watch": True,
        "sensitive_files_default_excluded": True,
    }

    return {
        "path": str(root),
        "source_id": f"source:{_path_fingerprint(root)}",
        "os_type": os_type,
        "drive_id": _drive_id_for_path(root),
        "warning": _root_warning(root, os_type),
        "privacy_notice": "현재 단계에서는 파일 내용을 읽지 않고, 폴더와 파일의 이름/크기/수정일만 확인합니다.",
        "include_ocr_requested": bool(include_ocr),
        "summary": summary,
        "by_status": dict(status_counts),
        "by_category": dict(category_counts),
        "by_extension": dict(extension_counts.most_common(40)),
        "allowed_samples": allowed_samples,
        "excluded_samples": excluded_samples,
        "consent_required": consent_required,
    }
|
|
1737
|
-
|
|
1738
|
-
def local_sources(self) -> Dict[str, Any]:
    """Return every registered knowledge source with its file-status counts.

    Sources come from ``knowledge_sources`` ordered by ``updated_at``
    descending, then ``id``. Each one gets a ``file_status`` mapping of
    status to row count taken from ``local_file_index`` (empty when the
    source has no indexed files).

    Returns ``{"sources": [...]}``.
    """

    def as_source(row) -> Dict[str, Any]:
        return {
            "id": row["id"],
            "root_path": row["root_path"],
            "os_type": row["os_type"],
            "drive_id": row["drive_id"],
            "label": row["label"],
            "status": row["status"],
            "include_ocr": bool(row["include_ocr"]),
            "watch_enabled": bool(row["watch_enabled"]),
            "consent": _safe_loads(row["consent_json"]),
            "created_at": row["created_at"],
            "updated_at": row["updated_at"],
            "last_scanned_at": row["last_scanned_at"],
        }

    with self._connect() as conn:
        source_cursor = conn.execute(
            """
            SELECT id, root_path, os_type, drive_id, label, status, include_ocr,
                   watch_enabled, consent_json, created_at, updated_at, last_scanned_at
            FROM knowledge_sources
            ORDER BY updated_at DESC, id ASC
            """
        )
        sources = [as_source(row) for row in source_cursor]
        status_rows = conn.execute(
            "SELECT source_id, status, COUNT(*) AS count FROM local_file_index GROUP BY source_id, status"
        ).fetchall()

    counts: Dict[str, Dict[str, int]] = {}
    for row in status_rows:
        per_source = counts.setdefault(row["source_id"], {})
        per_source[row["status"]] = row["count"]

    for source in sources:
        source["file_status"] = counts.get(source["id"], {})
    return {"sources": sources}
|
|
1773
|
-
|
|
1774
|
-
def set_local_source_watch(self, source_id: str, enabled: bool) -> Dict[str, Any]:
    """Turn watching on or off for one registered knowledge source.

    ``source_id`` is converted to a string and stripped. The source's
    ``watch_enabled`` flag is stored as 1 or 0 and its ``updated_at`` is
    refreshed.

    Returns ``{"source_id": ..., "watch_enabled": ...}``.

    Raises:
        ValueError: If ``source_id`` is empty or no such source exists.
    """
    source_id = str(source_id or "").strip()
    if not source_id:
        raise ValueError("source_id required")

    with self._connect() as conn:
        found = conn.execute(
            "SELECT id FROM knowledge_sources WHERE id=?",
            (source_id,),
        ).fetchone()
        if not found:
            raise ValueError(f"knowledge source not found: {source_id}")
        flag = 1 if enabled else 0
        conn.execute(
            "UPDATE knowledge_sources SET watch_enabled=?, updated_at=? WHERE id=?",
            (flag, _now(), source_id),
        )

    return {"source_id": source_id, "watch_enabled": bool(enabled)}
|
|
1790
|
-
|
|
1791
|
-
def remove_local_source(self, source_id: str) -> Dict[str, Any]:
    """Forget one registered local source and the graph data built from it.

    For every indexed file of the source that has a graph node,
    ``_delete_local_file_graph`` is called; then the source's
    ``local_file_index`` rows and its ``knowledge_sources`` row are deleted
    and ``_cleanup_local_graph_orphans`` runs. Only database rows are
    affected here; nothing in this method touches the folder on disk.

    Returns ``source_id``, the source's ``root_path`` and
    ``removed_graph_nodes`` (the number of file graph nodes handed to
    ``_delete_local_file_graph``).

    Raises:
        ValueError: If ``source_id`` is empty or no such source exists.
    """
    source_id = str(source_id or "").strip()
    if not source_id:
        raise ValueError("source_id required")

    removed = 0
    with self._connect() as conn:
        source = conn.execute(
            "SELECT id, root_path FROM knowledge_sources WHERE id=?",
            (source_id,),
        ).fetchone()
        if not source:
            raise ValueError(f"knowledge source not found: {source_id}")

        indexed = conn.execute(
            "SELECT graph_node_id FROM local_file_index WHERE source_id=? AND graph_node_id IS NOT NULL",
            (source_id,),
        ).fetchall()
        for row in indexed:
            graph_node_id = row["graph_node_id"]
            if not graph_node_id:
                continue
            self._delete_local_file_graph(conn, graph_node_id)
            removed += 1

        conn.execute("DELETE FROM local_file_index WHERE source_id=?", (source_id,))
        conn.execute("DELETE FROM knowledge_sources WHERE id=?", (source_id,))
        self._cleanup_local_graph_orphans(conn, source_id)
        root_path = source["root_path"]

    return {
        "source_id": source_id,
        "root_path": root_path,
        "removed_graph_nodes": removed,
    }
|
|
1823
|
-
|
|
1824
|
-
def _extract_local_file_text(self, path: Path, category: str, *, include_ocr: bool) -> Tuple[str, Dict[str, Any]]:
    """Read one local file and return its text plus parser metadata.

    The branch is chosen from ``category`` and the lower-cased extension:

    * ``text`` / ``code`` categories and ``.csv``: decoded as UTF-8 with
      undecodable bytes replaced.
    * ``.pdf``: page texts joined by blank lines; ``meta["pages"]``.
    * ``.docx``: non-blank paragraphs followed by tab-joined table rows;
      ``meta`` gets paragraph, table and table-row counts.
    * ``.xlsx``: tab-joined non-empty rows under a ``[Sheet: title]`` header
      per sheet, stopping once about 200,000 characters are collected;
      ``meta`` gets sheet, row and cell counts. The workbook is opened
      read-only and closed again before returning.
    * ``.pptx``: text frames per slide under a ``[Slide n]`` header;
      ``meta`` gets slide counts.
    * ``image`` category: image dimensions/format in ``meta``; OCR text only
      when ``include_ocr`` is true. An OCR failure is recorded in
      ``meta["ocr_error"]`` and yields empty text.

    Anything else yields empty text. The parsing libraries are imported
    lazily inside the branch that needs them.

    Returns:
        ``(text, meta)`` where ``text`` is cut to 200,000 characters and
        ``meta`` always contains ``"parser"``.
    """
    ext = path.suffix.lower()
    meta: Dict[str, Any] = {"parser": _parser_type_for_category(category, ext)}
    text = ""
    if category in {"text", "code"} or ext == ".csv":
        text = path.read_text(encoding="utf-8", errors="replace")
    elif ext == ".pdf":
        import pdfplumber
        with pdfplumber.open(str(path)) as pdf:
            meta["pages"] = len(pdf.pages)
            text = "\n\n".join((page.extract_text() or "") for page in pdf.pages)
    elif ext == ".docx":
        from docx import Document
        doc = Document(str(path))
        paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
        table_lines = []
        for table in doc.tables:
            for row in table.rows:
                cells = [_clean_text(cell.text) for cell in row.cells]
                if any(cells):
                    table_lines.append("\t".join(cells))
        meta["paragraphs"] = len(paragraphs)
        meta["tables"] = len(doc.tables)
        meta["table_rows"] = len(table_lines)
        text = "\n\n".join([*paragraphs, *table_lines])
    elif ext == ".xlsx":
        from openpyxl import load_workbook
        wb = load_workbook(str(path), read_only=True, data_only=True)
        try:
            rows_all = []
            non_empty_rows = 0
            non_empty_cells = 0
            char_count = 0
            for ws in wb.worksheets:
                sheet_rows = []
                for row in ws.iter_rows(values_only=True):
                    cells = [str(cell).strip() if cell is not None else "" for cell in row]
                    if not any(cells):
                        continue
                    line = "\t".join(cells)
                    non_empty_rows += 1
                    non_empty_cells += sum(1 for cell in cells if cell)
                    sheet_rows.append(line)
                    char_count += len(line) + 1
                    if char_count > 200_000:
                        break
                if sheet_rows:
                    rows_all.append(f"[Sheet: {ws.title}]")
                    rows_all.extend(sheet_rows)
                if char_count > 200_000:
                    break
            meta["sheets"] = len(wb.worksheets)
            meta["rows"] = non_empty_rows
            meta["cells"] = non_empty_cells
            text = "\n".join(rows_all)
        finally:
            # A read-only workbook keeps the file open until closed explicitly.
            wb.close()
    elif ext == ".pptx":
        from pptx import Presentation
        prs = Presentation(str(path))
        slides_text = []
        for index, slide in enumerate(prs.slides, 1):
            parts = []
            for shape in slide.shapes:
                if getattr(shape, "has_text_frame", False):
                    slide_text = shape.text_frame.text.strip()
                    if slide_text:
                        parts.append(slide_text)
            if parts:
                slides_text.append(f"[Slide {index}]\n" + "\n".join(parts))
        meta["slides"] = len(prs.slides)
        meta["text_slides"] = len(slides_text)
        text = "\n\n".join(slides_text)
    elif category == "image":
        from PIL import Image
        with Image.open(str(path)) as image:
            meta.update({
                "width": image.width,
                "height": image.height,
                "format": image.format,
                "mode": image.mode,
                "ocr_enabled": bool(include_ocr),
            })
            if include_ocr:
                try:
                    import pytesseract
                    text = pytesseract.image_to_string(image)
                    meta["ocr_chars"] = len(text)
                except Exception as exc:  # pragma: no cover - depends on local OCR runtime
                    # OCR is optional: record the failure instead of failing the file.
                    meta["ocr_error"] = str(exc)
                    text = ""
    return text[:200_000], meta
|
|
1913
|
-
|
|
1914
|
-
def _ensure_local_hierarchy(
    self,
    conn: sqlite3.Connection,
    *,
    source_id: str,
    root: Path,
    file_path: Path,
    os_type: str,
    drive_id: str,
) -> str:
    """Upsert the Computer -> Drive -> Folder chain leading to ``file_path``.

    A Computer node (labelled with ``platform.node()``), a Drive node and a
    root Folder node are upserted and linked, then one Folder node is
    upserted for every directory level between ``root`` and the file's
    parent directory.

    Returns:
        The id of the Folder node that directly contains ``file_path``. This
        is the root folder id when the file sits directly in ``root`` or when
        its parent is not located under ``root``.
    """

    def contain(parent: str, child: str) -> None:
        # Every hierarchy link shares the same relation label and provenance.
        self._upsert_edge(conn, parent, child, "포함함", metadata={"source": "local_scan"})

    host_name = platform.node() or "내 컴퓨터"
    host_node = f"computer:{_slug(host_name)}"
    drive_node = f"drive:{_sha256_text(f'{os_type}:{drive_id}')[:24]}"
    root_node = f"folder:{_sha256_text(f'{source_id}:root')[:24]}"

    self._upsert_node(conn, host_node, "Computer", host_name, metadata={"os_type": os_type})
    self._upsert_node(
        conn,
        drive_node,
        "Drive",
        drive_id,
        metadata={"os_type": os_type, "drive_id": drive_id},
    )
    contain(host_node, drive_node)

    root_text = str(root)
    self._upsert_node(
        conn,
        root_node,
        "Folder",
        root.name or root_text,
        summary=root_text,
        metadata={"source_id": source_id, "path": root_text, "root": True},
    )
    contain(drive_node, root_node)

    # Directory levels between the root and the file; empty when the file is
    # directly in the root or cannot be expressed relative to it.
    try:
        segments = file_path.parent.relative_to(root).parts
    except ValueError:
        segments = Path().parts

    containing_folder = root_node
    walked = root
    for segment in segments:
        walked = walked / segment
        level_node = f"folder:{_sha256_text(f'{source_id}:{walked.as_posix()}')[:24]}"
        self._upsert_node(
            conn,
            level_node,
            "Folder",
            segment,
            summary=str(walked),
            metadata={"source_id": source_id, "path": str(walked), "root": False},
        )
        contain(containing_folder, level_node)
        containing_folder = level_node
    return containing_folder
|
|
1961
|
-
|
|
1962
|
-
def _upsert_local_file_index(
    self,
    conn: sqlite3.Connection,
    *,
    source_id: str,
    root: Path,
    file_path: Path,
    stat: Optional[os.stat_result],
    os_type: str,
    drive_id: str,
    status: str,
    parser_type: str,
    sha256: Optional[str] = None,
    graph_node_id: Optional[str] = None,
    error_message: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> str:
    """Insert or refresh the ``local_file_index`` row for one scanned file.

    The row is keyed by ``(source_id, relative_path)``; on conflict every
    other column is overwritten with the values supplied here.
    ``last_indexed_at`` is only populated when ``status`` is ``"indexed"``
    and the ``deleted`` flag is 1 only when ``status`` is ``"deleted"``.

    Returns:
        The row id, derived from ``source_id`` and the file's path relative
        to ``root`` (or just the file name when it is not under ``root``).
    """
    try:
        rel = file_path.relative_to(root).as_posix()
    except ValueError:
        rel = file_path.name
    row_id = f"local-index:{_sha256_text(f'{source_id}:{rel}')[:24]}"
    scanned_at = _now()

    if stat:
        byte_size = stat.st_size
        mtime_iso = _safe_iso_from_stat_mtime(stat.st_mtime)
    else:
        byte_size = None
        mtime_iso = ""

    indexed_at = scanned_at if status == "indexed" else None
    deleted_flag = 1 if status == "deleted" else 0

    statement = """
        INSERT INTO local_file_index(
        id, source_id, os_type, drive_id, root_path, file_path, relative_path,
        file_name, extension, size_bytes, modified_at, sha256, last_scanned_at,
        last_indexed_at, parser_type, status, error_message, graph_node_id,
        deleted, metadata_json
        )
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        ON CONFLICT(source_id, relative_path) DO UPDATE SET
        os_type=excluded.os_type,
        drive_id=excluded.drive_id,
        root_path=excluded.root_path,
        file_path=excluded.file_path,
        file_name=excluded.file_name,
        extension=excluded.extension,
        size_bytes=excluded.size_bytes,
        modified_at=excluded.modified_at,
        sha256=excluded.sha256,
        last_scanned_at=excluded.last_scanned_at,
        last_indexed_at=excluded.last_indexed_at,
        parser_type=excluded.parser_type,
        status=excluded.status,
        error_message=excluded.error_message,
        graph_node_id=excluded.graph_node_id,
        deleted=excluded.deleted,
        metadata_json=excluded.metadata_json
        """
    # Values are listed in the same order as the column list above.
    values = (
        row_id,
        source_id,
        os_type,
        drive_id,
        str(root),
        str(file_path),
        rel,
        file_path.name,
        file_path.suffix.lower(),
        byte_size,
        mtime_iso,
        sha256,
        scanned_at,
        indexed_at,
        parser_type,
        status,
        error_message,
        graph_node_id,
        deleted_flag,
        _json(metadata),
    )
    conn.execute(statement, values)
    return row_id
|
|
2023
|
-
|
|
2024
|
-
def _delete_local_file_graph(self, conn: sqlite3.Connection, file_node_id: Optional[str]) -> None:
    """Remove a local file node and the graph content that depends on it.

    Steps, in order:
      1. Classify the nodes the file points to: nodes of type Chunk /
         ImageText / Section, or whose metadata names this file as
         ``source_node``, are file-owned; auto-extracted ``local_folder``
         nodes are candidates for removal.
      2. Delete the file's chunks, edges and node, then the file-owned nodes.
      3. Delete candidates whose remaining edges (if any) only connect
         candidates to each other.
      4. If the file node recorded a ``source_id``, run the orphan cleanup
         for that source.

    Does nothing when ``file_node_id`` is empty.
    """
    if not file_node_id:
        return

    node_row = conn.execute(
        "SELECT metadata_json FROM nodes WHERE id=?",
        (file_node_id,),
    ).fetchone()
    owning_source = _safe_loads(node_row["metadata_json"]).get("source_id") if node_row else None

    neighbours = conn.execute(
        """
        SELECT n.id, n.type, n.metadata_json
        FROM edges e
        JOIN nodes n ON n.id=e.to_node
        WHERE e.from_node=?
        """,
        (file_node_id,),
    ).fetchall()

    file_owned: set = set()
    shared_candidates: set = set()
    for neighbour in neighbours:
        info = _safe_loads(neighbour["metadata_json"])
        belongs_to_file = (
            neighbour["type"] in {"Chunk", "ImageText", "Section"}
            or info.get("source_node") == file_node_id
        )
        if belongs_to_file:
            file_owned.add(neighbour["id"])
            continue
        if info.get("auto_extracted") and info.get("source") == "local_folder":
            shared_candidates.add(neighbour["id"])

    def purge(ids: set) -> None:
        # Bulk removal of nodes together with their chunks and edges.
        if not ids:
            return
        marks = ",".join("?" * len(ids))
        bound = list(ids)
        conn.execute(f"DELETE FROM chunks WHERE source_node IN ({marks})", bound)
        conn.execute(f"DELETE FROM edges WHERE from_node IN ({marks}) OR to_node IN ({marks})", bound * 2)
        conn.execute(f"DELETE FROM nodes WHERE id IN ({marks})", bound)
        self._v2_delete_nodes(conn, bound)

    def only_linked_to_candidates(candidate: str) -> bool:
        # True when no remaining edge ties the candidate to a non-candidate.
        touching = conn.execute(
            "SELECT from_node, to_node FROM edges WHERE from_node=? OR to_node=?",
            (candidate, candidate),
        ).fetchall()
        for edge in touching:
            if edge["from_node"] not in shared_candidates or edge["to_node"] not in shared_candidates:
                return False
        return True

    # The file itself goes first so its edges no longer count below.
    conn.execute("DELETE FROM chunks WHERE source_node=?", (file_node_id,))
    conn.execute("DELETE FROM edges WHERE from_node=? OR to_node=?", (file_node_id, file_node_id))
    conn.execute("DELETE FROM nodes WHERE id=?", (file_node_id,))
    self._v2_delete_nodes(conn, [file_node_id])

    purge(file_owned)
    purge({candidate for candidate in shared_candidates if only_linked_to_candidates(candidate)})

    if owning_source:
        self._cleanup_local_graph_orphans(conn, str(owning_source))
|
|
2085
|
-
|
|
2086
|
-
def _cleanup_local_graph_orphans(self, conn: sqlite3.Connection, source_id: str) -> None:
    """Prune hierarchy nodes that no longer have any outgoing edge.

    Folder nodes whose metadata carries ``source_id`` are removed repeatedly
    until none is left without an outgoing edge, so removing a leaf folder
    can expose its parent on the next pass. Afterwards every Drive node, and
    then every Computer node, without an outgoing edge is removed as well
    (these two passes are not filtered by source).
    """

    def has_outgoing(node_id: str) -> bool:
        found = conn.execute(
            "SELECT 1 FROM edges WHERE from_node=? LIMIT 1",
            (node_id,),
        ).fetchone()
        return bool(found)

    def drop(ids: list) -> None:
        # Remove the nodes plus every edge touching them, in both stores.
        marks = ",".join("?" * len(ids))
        conn.execute(f"DELETE FROM edges WHERE from_node IN ({marks}) OR to_node IN ({marks})", ids * 2)
        conn.execute(f"DELETE FROM nodes WHERE id IN ({marks})", ids)
        self._v2_delete_nodes(conn, ids)

    while True:
        folders = conn.execute(
            "SELECT id, metadata_json FROM nodes WHERE type='Folder'"
        ).fetchall()
        empty_folders = [
            folder["id"]
            for folder in folders
            if _safe_loads(folder["metadata_json"]).get("source_id") == source_id
            and not has_outgoing(folder["id"])
        ]
        if not empty_folders:
            break
        drop(empty_folders)

    # Drives are handled before computers so a computer emptied by the drive
    # pass is picked up in the same call.
    for kind in ("Drive", "Computer"):
        candidates = conn.execute("SELECT id FROM nodes WHERE type=?", (kind,)).fetchall()
        childless = [item["id"] for item in candidates if not has_outgoing(item["id"])]
        if childless:
            drop(childless)
|
|
2124
|
-
|
|
2125
|
-
def _local_file_index_has_extracted_text(self, row: sqlite3.Row) -> bool:
    """Tell whether an index row recorded a positive ``extracted_chars`` count.

    The count is read from ``metadata_json -> "parser" -> "extracted_chars"``.
    Missing, malformed or non-numeric values all yield ``False``.
    """
    payload = _safe_loads(row["metadata_json"])
    parser_info = payload.get("parser") if isinstance(payload, dict) else {}
    if isinstance(parser_info, dict):
        try:
            extracted = int(parser_info.get("extracted_chars") or 0)
        except (TypeError, ValueError):
            return False
        return extracted > 0
    return False
|
|
2134
|
-
|
|
2135
|
-
def _upsert_local_file_node(
    self,
    conn: sqlite3.Connection,
    *,
    source_id: str,
    root: Path,
    file_path: Path,
    stat: os.stat_result,
    os_type: str,
    drive_id: str,
    sha256: str,
    category: str,
    parser_type: str,
    text: str,
    parser_meta: Dict[str, Any],
) -> str:
    """Create or refresh the graph representation of one local file.

    The file node is linked under its folder hierarchy, then its derived
    content is rebuilt from ``text``: an ImageText node for images, one
    Chunk node (plus chunk row) per text chunk, auto-extracted concept nodes
    with relation edges between them, and file-specific semantic nodes.

    Before rebuilding, every node the file previously owned is removed. A
    node counts as owned when it is a Chunk / ImageText / Section child or
    when its metadata names this file as ``source_node`` -- the same
    ownership rule ``_delete_local_file_graph`` applies. Without the second
    clause, semantic nodes that vanish from the file's content would be left
    behind without any edge after a re-index.

    Returns:
        The id of the file node.

    Raises:
        ValueError: If ``text`` is empty after cleaning.
    """
    text = _clean_text(text)
    if not text:
        raise ValueError("텍스트 추출 결과가 비어 있습니다.")
    try:
        relative_path = file_path.relative_to(root).as_posix()
    except ValueError:
        relative_path = file_path.name
    file_node_id = f"local-file:{_sha256_text(f'{source_id}:{relative_path}')[:24]}"
    parent_folder_id = self._ensure_local_hierarchy(
        conn,
        source_id=source_id,
        root=root,
        file_path=file_path,
        os_type=os_type,
        drive_id=drive_id,
    )

    # Collect everything the previous version of this file owned.
    child_rows = conn.execute(
        """
        SELECT n.id AS id, n.type AS type, n.metadata_json AS metadata_json
        FROM edges e
        JOIN nodes n ON n.id=e.to_node
        WHERE e.from_node=?
        """,
        (file_node_id,),
    ).fetchall()
    child_ids = [
        row["id"]
        for row in child_rows
        if row["type"] in {"Chunk", "ImageText", "Section"}
        or _safe_loads(row["metadata_json"]).get("source_node") == file_node_id
    ]
    conn.execute("DELETE FROM chunks WHERE source_node=?", (file_node_id,))
    if child_ids:
        placeholders = ",".join("?" * len(child_ids))
        conn.execute(f"DELETE FROM nodes WHERE id IN ({placeholders})", child_ids)
        self._v2_delete_nodes(conn, child_ids)
    conn.execute("DELETE FROM edges WHERE from_node=?", (file_node_id,))
    self._v2_delete_edges_from(conn, file_node_id)

    metadata = {
        "source": "local_folder",
        "source_id": source_id,
        "root_path": str(root),
        "file_path": str(file_path),
        "relative_path": relative_path,
        "filename": file_path.name,
        "ext": file_path.suffix.lower(),
        "category": category,
        "parser_type": parser_type,
        "bytes": stat.st_size,
        "modified_at": _safe_iso_from_stat_mtime(stat.st_mtime),
        "sha256": sha256,
        "parser": parser_meta,
    }
    self._upsert_node(
        conn,
        file_node_id,
        _node_type_for_category(category),
        file_path.name,
        summary=text[:700],
        metadata=metadata,
        raw=metadata,
    )
    self._upsert_edge(conn, parent_folder_id, file_node_id, "포함함", weight=1.0, metadata={"source": "local_scan"})

    target_for_concepts = text
    if category == "image" and text:
        # For images the extracted text is also exposed as its own node.
        image_text_id = f"imagetext:{_sha256_text(f'{file_node_id}:ocr')[:24]}"
        self._upsert_node(
            conn,
            image_text_id,
            "ImageText",
            f"{file_path.name} OCR",
            summary=_clean_text(text)[:700],
            metadata={"source_node": file_node_id, "source_id": source_id, "chars": len(text)},
        )
        self._upsert_edge(conn, file_node_id, image_text_id, "포함함", weight=0.8, metadata={"source": "ocr"})

    for index, chunk in enumerate(_chunks(text)):
        chunk_id = f"chunk:{_sha256_text(f'{file_node_id}:{index}:{chunk}')[:24]}"
        self._upsert_node(
            conn,
            chunk_id,
            "Chunk",
            f"{file_path.name} chunk {index + 1}",
            summary=chunk[:500],
            metadata={"index": index, "source_node": file_node_id, "source_id": source_id},
        )
        self._upsert_chunk(
            conn,
            chunk_id=chunk_id,
            source_node=file_node_id,
            text=chunk,
            metadata={"index": index, "source_node": file_node_id, "source_id": source_id},
        )
        self._upsert_edge(conn, file_node_id, chunk_id, "포함함", weight=0.7, metadata={"source": "local_scan"})

    # Concept ids depend only on type and slug, so the same concept node can
    # be reached from several files.
    concepts = _extract_concepts(target_for_concepts, limit=18)
    concept_ids: Dict[str, str] = {}
    for concept in concepts:
        node_t = _classify_node_type(concept, target_for_concepts)
        concept_id = f"{node_t.lower()}:{_slug(concept)}"
        concept_ids[concept.lower()] = concept_id
        self._upsert_node(
            conn,
            concept_id,
            node_t,
            concept,
            metadata={"auto_extracted": True, "source": "local_folder", "source_id": source_id},
        )
        self._upsert_edge(conn, file_node_id, concept_id, "언급함", weight=0.75, metadata={"source": "local_scan"})

    # Only triples whose two ends are distinct known concepts become edges.
    for triple in _extract_triples(target_for_concepts, concepts, limit=20):
        subj_id = concept_ids.get(triple["subject"].lower())
        obj_id = concept_ids.get(triple["object"].lower())
        if subj_id and obj_id and subj_id != obj_id:
            self._upsert_edge(
                conn,
                subj_id,
                obj_id,
                triple["relation"],
                weight=0.9,
                metadata={"context": triple.get("context", "")[:240], "source_id": source_id},
            )

    # Semantic node ids include the file node id, so these nodes are
    # file-specific and are the ones the ownership cleanup above reclaims.
    for item in _semantic_items(target_for_concepts):
        sem_type = item["type"]
        sem_title = item["title"]
        sem_id = f"{sem_type.lower()}:{_sha256_text(f'{file_node_id}:{sem_type}:{sem_title}')[:24]}"
        self._upsert_node(
            conn,
            sem_id,
            sem_type,
            sem_title,
            summary=item["summary"],
            metadata={"auto_extracted": True, "source_node": file_node_id, "filename": file_path.name},
            raw=item,
        )
        self._upsert_edge(conn, file_node_id, sem_id, "포함함", weight=0.9)

    return file_node_id
|
|
2287
|
-
|
|
2288
|
-
def index_local_folder(
    self,
    path: Path,
    *,
    include_ocr: bool = False,
    watch_enabled: bool = False,
    user_email: Optional[str] = None,
    consent: Optional[Dict[str, Any]] = None,
    max_files: int = 5_000,
) -> Dict[str, Any]:
    """Read approved files from a local folder and connect them to Graph RAG.

    The folder is registered (or refreshed) as a knowledge source, then every
    entry yielded by the local scan iterator is handled as follows:

    * non-indexable files are recorded with the decision's status and any
      previous graph content for them is removed;
    * files whose size and modification time, or whose SHA-256, match an
      already indexed row that has extracted text are skipped;
    * remaining files are parsed and written to the graph; empty extractions
      and failures are recorded in the index and in ``errors``.

    When the scan was not cut short by ``max_files``, files that have an
    index row still marked live but were not seen in this scan are marked
    deleted and their graph content is removed. Rows already marked deleted
    by an earlier scan are ignored, so ``counts["deleted"]`` reports only the
    deletions detected by this scan.

    Args:
        path: Folder to index; expanded and resolved before use.
        include_ocr: Passed to text extraction and stored on the source.
        watch_enabled: Stored on the source record.
        user_email: Recorded as the approver in the consent payload.
        consent: Extra consent fields; these override the defaults.
        max_files: Scan limit, clamped to the range 1..50_000.

    Returns:
        A summary dict with the source description, per-status counts, up to
        100 indexed node ids, up to 50 errors and a user-facing notice.

    Raises:
        ValueError: If ``path`` does not exist or is not a directory.
    """
    root = Path(path).expanduser().resolve()
    if not root.exists():
        raise ValueError(f"경로가 존재하지 않습니다: {path}")
    if not root.is_dir():
        raise ValueError(f"폴더가 아닙니다: {path}")

    os_type = _current_os_type()
    drive_id = _drive_id_for_path(root)
    source_id = f"source:{_path_fingerprint(root)}"
    now = _now()
    max_files = max(1, min(int(max_files or 5_000), 50_000))
    consent_payload = {
        "approved_at": now,
        "approved_by": user_email,
        "knowledge_source": True,
        "include_ocr": bool(include_ocr),
        "watch_enabled": bool(watch_enabled),
        "sensitive_files_default_excluded": True,
        **(consent or {}),
    }
    counts: Counter = Counter()
    seen_relative_paths: set = set()
    indexed_nodes: List[str] = []
    errors: List[Dict[str, str]] = []
    limit_reached = False

    with self._connect() as conn:

        def record(file_path: Path, stat: Any, parser_type: str, status: str, **fields: Any) -> None:
            # Write the index row for one file; the source-level arguments
            # are identical for every file of this scan.
            self._upsert_local_file_index(
                conn,
                source_id=source_id,
                root=root,
                file_path=file_path,
                stat=stat,
                os_type=os_type,
                drive_id=drive_id,
                status=status,
                parser_type=parser_type,
                **fields,
            )

        def drop_previous_graph(existing: Any) -> None:
            # Remove graph content left over from an earlier successful index.
            if existing and existing["graph_node_id"]:
                self._delete_local_file_graph(conn, existing["graph_node_id"])

        conn.execute(
            """
            INSERT INTO knowledge_sources(
            id, root_path, os_type, drive_id, label, status, include_ocr,
            watch_enabled, consent_json, created_at, updated_at, last_scanned_at
            )
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT(id) DO UPDATE SET
            root_path=excluded.root_path,
            os_type=excluded.os_type,
            drive_id=excluded.drive_id,
            label=excluded.label,
            status=excluded.status,
            include_ocr=excluded.include_ocr,
            watch_enabled=excluded.watch_enabled,
            consent_json=excluded.consent_json,
            updated_at=excluded.updated_at,
            last_scanned_at=excluded.last_scanned_at
            """,
            (
                source_id, str(root), os_type, drive_id, root.name or str(root), "scanning",
                1 if include_ocr else 0, 1 if watch_enabled else 0, _json(consent_payload),
                now, now, now,
            ),
        )

        for entry in self._iter_local_scan_entries(root, max_files=max_files):
            kind = entry["kind"]
            file_path = entry["path"]
            if kind == "limit_reached":
                counts["limit_reached"] += 1
                limit_reached = True
                break
            if kind in {"excluded_dir", "excluded"}:
                counts["excluded"] += 1
                continue
            if kind in {"inaccessible_dir", "inaccessible_file"}:
                counts["failed"] += 1
                errors.append({"path": str(file_path), "error": entry.get("reason", "inaccessible")})
                continue
            if kind != "file":
                continue

            stat = entry["stat"]
            try:
                relative_path = file_path.relative_to(root).as_posix()
            except ValueError:
                relative_path = file_path.name
            seen_relative_paths.add(relative_path)
            modified_at = _safe_iso_from_stat_mtime(stat.st_mtime)
            existing = conn.execute(
                """
                SELECT size_bytes, modified_at, sha256, graph_node_id, status, metadata_json
                FROM local_file_index
                WHERE source_id=? AND relative_path=?
                """,
                (source_id, relative_path),
            ).fetchone()
            decision = self._local_file_decision(file_path, root, stat)
            parser_type = decision["parser_type"]
            category = decision["category"]

            if not decision["indexable"]:
                counts[decision["status"]] += 1
                drop_previous_graph(existing)
                record(
                    file_path, stat, parser_type, decision["status"],
                    metadata={"reason": decision["reason"], "category": category},
                )
                continue

            # Cheap check first: same size and mtime as the indexed row.
            if (
                existing
                and existing["status"] == "indexed"
                and existing["graph_node_id"]
                and self._local_file_index_has_extracted_text(existing)
                and existing["size_bytes"] == stat.st_size
                and existing["modified_at"] == modified_at
            ):
                counts["skipped_unchanged"] += 1
                record(
                    file_path, stat, parser_type, "indexed",
                    sha256=existing["sha256"],
                    graph_node_id=existing["graph_node_id"],
                    metadata={**_safe_loads(existing["metadata_json"]), "category": category, "unchanged": True},
                )
                continue

            try:
                data = file_path.read_bytes()
                digest = _sha256_bytes(data)
            except Exception as exc:
                counts["failed"] += 1
                errors.append({"path": str(file_path), "error": str(exc)})
                drop_previous_graph(existing)
                record(
                    file_path, stat, parser_type, "failed",
                    error_message=str(exc),
                    metadata={"category": category},
                )
                continue

            # Content check: metadata changed but the bytes did not.
            if (
                existing
                and existing["sha256"] == digest
                and existing["graph_node_id"]
                and self._local_file_index_has_extracted_text(existing)
            ):
                counts["skipped_unchanged"] += 1
                record(
                    file_path, stat, parser_type, "indexed",
                    sha256=digest,
                    graph_node_id=existing["graph_node_id"],
                    metadata={**_safe_loads(existing["metadata_json"]), "category": category, "sha256_unchanged": True},
                )
                continue

            try:
                text, parser_meta = self._extract_local_file_text(
                    file_path,
                    category,
                    include_ocr=include_ocr,
                )
                text = _clean_text(text)
                parser_meta = {**parser_meta, "extracted_chars": len(text)}
                if not text:
                    counts["skipped_empty_text"] += 1
                    drop_previous_graph(existing)
                    record(
                        file_path, stat, parser_type, "skipped_empty_text",
                        sha256=digest,
                        error_message="텍스트 추출 결과가 비어 있습니다.",
                        metadata={"category": category, "parser": parser_meta},
                    )
                    continue
                graph_node_id = self._upsert_local_file_node(
                    conn,
                    source_id=source_id,
                    root=root,
                    file_path=file_path,
                    stat=stat,
                    os_type=os_type,
                    drive_id=drive_id,
                    sha256=digest,
                    category=category,
                    parser_type=parser_type,
                    text=text,
                    parser_meta=parser_meta,
                )
                record(
                    file_path, stat, parser_type, "indexed",
                    sha256=digest,
                    graph_node_id=graph_node_id,
                    metadata={"category": category, "parser": parser_meta},
                )
                counts["indexed"] += 1
                indexed_nodes.append(graph_node_id)
            except Exception as exc:
                # One unreadable or unparsable file must not abort the scan.
                counts["failed"] += 1
                errors.append({"path": str(file_path), "error": str(exc)})
                drop_previous_graph(existing)
                record(
                    file_path, stat, parser_type, "failed",
                    sha256=digest,
                    error_message=str(exc),
                    metadata={"category": category},
                )

        if not limit_reached:
            # A truncated scan cannot tell "deleted" from "not reached", so
            # the sweep only runs after a complete scan. Rows already flagged
            # deleted are excluded; otherwise every later scan would report
            # and rewrite the same historical deletions again.
            live_rows = {
                row["relative_path"]: row["graph_node_id"]
                for row in conn.execute(
                    "SELECT relative_path, graph_node_id FROM local_file_index "
                    "WHERE source_id=? AND COALESCE(deleted, 0)=0",
                    (source_id,),
                )
            }
            deleted_paths = set(live_rows) - seen_relative_paths
            for relative_path in deleted_paths:
                self._delete_local_file_graph(conn, live_rows.get(relative_path))
                conn.execute(
                    """
                    UPDATE local_file_index
                    SET status='deleted', deleted=1, last_scanned_at=?, error_message=NULL, graph_node_id=NULL
                    WHERE source_id=? AND relative_path=?
                    """,
                    (_now(), source_id, relative_path),
                )
            counts["deleted"] = len(deleted_paths)

        # NOTE(review): indentation was lost in the rendering this was taken
        # from; this update is assumed to run whether or not the limit was
        # reached, so the source never stays in 'scanning' -- confirm.
        finished_at = _now()
        conn.execute(
            """
            UPDATE knowledge_sources
            SET status='active', updated_at=?, last_scanned_at=?
            WHERE id=?
            """,
            (finished_at, finished_at, source_id),
        )

    return {
        "status": "ok",
        "source": {
            "id": source_id,
            "root_path": str(root),
            "os_type": os_type,
            "drive_id": drive_id,
            "include_ocr": bool(include_ocr),
            "watch_enabled": bool(watch_enabled),
        },
        "counts": dict(counts),
        "indexed_nodes": indexed_nodes[:100],
        "errors": errors[:50],
        "notice": "Lattice AI는 사용자가 선택한 폴더만 AI 지식으로 변환합니다.",
    }
|
|
2596
|
-
|
|
2597
|
-
def ingest_message(
    self,
    role: str,
    content: str,
    *,
    user_email: Optional[str] = None,
    user_nickname: Optional[str] = None,
    source: Optional[str] = None,
    conversation_id: Optional[str] = None,
    raw: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """Store one chat message and everything extracted from it.

    Inside a single connection this upserts, in order: the conversation
    ``Chat`` node, an optional ``Person`` node, the message node
    (``AIResponse`` when ``role == "assistant"``, otherwise ``Message``),
    one ``Chunk`` node/row per text chunk, one node per extracted concept,
    concept-to-concept edges from the extracted triples, and one node per
    extracted semantic item.

    The message node id is a 24-character hash prefix of
    ``role|content|conversation_id|user_email``, so the same message in the
    same conversation maps to the same node id.

    Returns:
        ``{"node_id": <message node id>, "type": <message node type>}``.
    """
    content = str(content or "")
    identity = "|".join([role or "", content, conversation_id or "", user_email or ""])
    node_type = "AIResponse" if role == "assistant" else "Message"
    node_id = f"{node_type.lower()}:{_sha256_text(identity)[:24]}"
    conv_id = f"conversation:{_slug(conversation_id or 'default')}"
    metadata = {
        "role": role,
        "source": source,
        "conversation_id": conversation_id,
        "user_email": user_email,
        "user_nickname": user_nickname,
        "chars": len(content),
    }
    concepts = _extract_concepts(content)
    triples = _extract_triples(content, concepts)
    semantic = _semantic_items(content)

    with self._connect() as conn:
        cleaned = _clean_text(content)

        # 1. Chat node (a "noun" in the graph: one per conversation id).
        # Title and summary are taken from the message handled by THIS call.
        self._upsert_node(
            conn,
            conv_id,
            "Chat",
            cleaned[:80] or (conversation_id or "대화"),
            summary=cleaned[:400],
            metadata={"source": source, "conversation_id": conversation_id},
        )

        # 2. Person node plus the verb-style edge Person -> Chat
        #    ("작성함" = "authored").
        if user_email or user_nickname:
            author_id = f"person:{_slug(user_email or user_nickname or 'unknown')}"
            self._upsert_node(
                conn,
                author_id,
                "Person",
                user_nickname or user_email or "Unknown",
                metadata={"email": user_email, "nickname": user_nickname},
            )
            self._upsert_edge(
                conn, author_id, conv_id, "작성함",
                weight=1.0, metadata={"role": role},
            )

        # 3. Raw message node (original note: used for RAG retrieval and
        #    hidden in the graph view), linked Chat -> message
        #    ("포함함" = "contains").
        self._upsert_node(
            conn,
            node_id,
            node_type,
            cleaned[:80] or role,
            summary=cleaned[:500],
            metadata=metadata,
            raw=raw or metadata,
        )
        self._upsert_edge(
            conn, conv_id, node_id, "포함함",
            weight=0.3, metadata={"role": role},
        )

        # 4. RAG chunks: a Chunk node, a chunk row and a message -> chunk edge.
        for position, piece in enumerate(_chunks(content)):
            chunk_id = "chunk:" + _sha256_text(f"{node_id}:{position}:{piece}")[:24]
            self._upsert_node(
                conn,
                chunk_id,
                "Chunk",
                f"chunk {position + 1}",
                summary=piece[:500],
                metadata={"index": position, "source_node": node_id},
            )
            self._upsert_chunk(
                conn,
                chunk_id=chunk_id,
                source_node=node_id,
                text=piece,
                metadata={"index": position, "source_node": node_id},
            )
            self._upsert_edge(conn, node_id, chunk_id, "포함함")

        # 5. Concept-like nodes; the node type comes from _classify_node_type.
        #    Chat -> concept edge label "언급함" = "mentions".
        concept_ids: Dict[str, str] = {}
        for concept in concepts:
            kind = _classify_node_type(concept, content)
            concept_node = f"{kind.lower()}:{_slug(concept)}"
            concept_ids[concept.lower()] = concept_node
            self._upsert_node(
                conn,
                concept_node,
                kind,
                concept,
                metadata={"auto_extracted": True, "source": source},
            )
            self._upsert_edge(
                conn, conv_id, concept_node, "언급함",
                weight=0.7, metadata={"source": source},
            )

        # 6. Concept -> concept edges, labelled with the triple's relation.
        #    Triples whose ends are unknown or identical are skipped.
        for triple in triples:
            head = concept_ids.get(triple["subject"].lower())
            tail = concept_ids.get(triple["object"].lower())
            if not (head and tail) or head == tail:
                continue
            self._upsert_edge(
                conn,
                head,
                tail,
                triple["relation"],
                weight=1.0,
                metadata={"context": triple.get("context", "")[:240]},
            )

        # 7. Semantic items (original note: Task / Decision nodes).
        #    Chat -> item uses "생성함" ("created"); each item is also linked
        #    to at most the first three concept nodes with "언급함".
        first_concepts = list(concept_ids.values())[:3]
        for item in semantic:
            sem_type = item["type"]
            sem_title = item["title"]
            sem_hash = _sha256_text(f"{conv_id}:{sem_type}:{sem_title}")[:24]
            sem_id = f"{sem_type.lower()}:{sem_hash}"
            self._upsert_node(
                conn,
                sem_id,
                sem_type,
                sem_title,
                summary=item["summary"],
                metadata={"auto_extracted": True, "source_node": node_id},
                raw=item,
            )
            self._upsert_edge(conn, conv_id, sem_id, "생성함", weight=0.9)
            for concept_node in first_concepts:
                self._upsert_edge(conn, sem_id, concept_node, "언급함", weight=0.6)

    return {"node_id": node_id, "type": node_type}
|
|
2725
|
-
|
|
2726
|
-
def ingest_document(
    self,
    path: Path,
    *,
    original_filename: Optional[str] = None,
    mime_type: Optional[str] = None,
    uploader: Optional[str] = None,
    conversation_id: Optional[str] = None,
    extracted: Optional[Dict[str, Any]] = None,
    source_type: Optional[str] = None,
    source_uri: Optional[str] = None,
    captured_at: Optional[str] = None,
    modified_at: Optional[str] = None,
    owner: Optional[str] = None,
    workspace_id: Optional[str] = None,
    permissions: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """Ingest a file from disk as a ``Document`` node with chunks and concepts.

    The file is read fully, hashed, and copied into
    ``self.blob_dir/<first two hash chars>/<hash><ext>`` unless that blob
    already exists. The document node id is ``file:`` plus the first 24
    characters of the content hash, so identical bytes map to the same node.
    Text used for chunking and extraction comes from ``extracted["content"]``
    or, failing that, ``extracted["preview"]``.

    A ``Source`` node is attached only when ``source_type`` is given.

    Returns:
        A dict with the document node id, hashes, ``source_node_id`` (``None``
        when no source node was attached), the chunk ids and count,
        ``duplicate`` (whether the node existed before this call),
        ``captured_at`` and the stored metadata.
    """
    path = Path(path)
    payload = path.read_bytes()
    digest = _sha256_bytes(payload)
    ext = path.suffix.lower()
    filename = original_filename or path.name
    captured_at = captured_at or _now()

    # Content-addressed blob copy; skipped when the blob is already there.
    blob_path = self.blob_dir / digest[:2] / f"{digest}{ext}"
    blob_path.parent.mkdir(parents=True, exist_ok=True)
    if not blob_path.exists():
        shutil.copyfile(path, blob_path)

    doc_meta = self._document_structure(path, ext)
    extracted_info = extracted or {}
    text = str(extracted_info.get("content") or extracted_info.get("preview") or "")
    file_id = f"file:{digest[:24]}"
    effective_owner = owner or uploader
    origin_uri = source_uri or str(path)
    metadata = {
        "filename": filename,
        "ext": ext,
        "mime_type": mime_type,
        "bytes": len(payload),
        "sha256": digest,
        "content_hash": digest,
        "blob_path": str(blob_path),
        "uploader": uploader,
        "owner": effective_owner,
        "workspace_id": workspace_id,
        "permissions": permissions or {},
        "source_type": source_type or "file",
        "source_uri": origin_uri,
        "captured_at": captured_at,
        "modified_at": modified_at,
        "conversation_id": conversation_id,
        # The full text is not duplicated into metadata.
        "extracted": {
            key: value for key, value in extracted_info.items() if key != "content"
        },
        "structure": doc_meta,
    }
    full_text = f"{filename}\n{text}"
    concepts = _extract_concepts(full_text, limit=15)
    triples = _extract_triples(full_text, concepts)
    chunk_ids: List[str] = []
    source_node_id: Optional[str] = None

    with self._connect() as conn:
        # Checked before the upsert so it reflects the state prior to this call.
        duplicate = self._node_exists(conn, file_id)

        # Document node (a "noun": the file itself) and its structure nodes.
        self._upsert_node(
            conn,
            file_id,
            "Document",
            filename,
            summary=(text or filename)[:500],
            metadata=metadata,
            raw=metadata,
        )
        self._ingest_structure_nodes(conn, file_id, filename, doc_meta)

        # Source node + indexed_from edge, only when a source_type was given.
        if source_type:
            source_node_id = self._attach_source_node(
                conn,
                file_id,
                source_type=source_type,
                source_uri=origin_uri,
                title=filename,
                content_hash=digest,
                captured_at=captured_at,
                extra={
                    "owner": effective_owner,
                    "workspace_id": workspace_id,
                    "ext": ext,
                },
            )

        # Person -> Document edge ("업로드함" = "uploaded").
        if uploader:
            person_id = f"person:{_slug(uploader)}"
            self._upsert_node(
                conn, person_id, "Person", uploader, metadata={"email": uploader}
            )
            self._upsert_edge(conn, person_id, file_id, "업로드함", weight=1.0)

        # Chat -> Document edge ("언급함" = "mentions").
        if conversation_id:
            conv_id = f"conversation:{_slug(conversation_id)}"
            self._upsert_node(conn, conv_id, "Chat", conversation_id)
            self._upsert_edge(conn, conv_id, file_id, "언급함", weight=0.8)

        # RAG chunks (original note: for retrieval, not shown in the graph).
        for position, piece in enumerate(_chunks(text)):
            chunk_id = "chunk:" + _sha256_text(f"{file_id}:{position}:{piece}")[:24]
            chunk_ids.append(chunk_id)
            self._upsert_node(
                conn,
                chunk_id,
                "Chunk",
                f"{filename} chunk {position + 1}",
                summary=piece[:500],
                metadata={"index": position, "source_node": file_id},
            )
            self._upsert_chunk(
                conn,
                chunk_id=chunk_id,
                source_node=file_id,
                text=piece,
                metadata={"index": position, "source_node": file_id},
            )
            self._upsert_edge(conn, file_id, chunk_id, "포함함")

        # Concept-like nodes; Document -> concept edge ("포함함" = "contains").
        concept_ids: Dict[str, str] = {}
        for concept in concepts:
            kind = _classify_node_type(concept, full_text)
            concept_node = f"{kind.lower()}:{_slug(concept)}"
            concept_ids[concept.lower()] = concept_node
            self._upsert_node(
                conn,
                concept_node,
                kind,
                concept,
                metadata={"auto_extracted": True, "source_file": filename},
            )
            self._upsert_edge(conn, file_id, concept_node, "포함함", weight=0.8)

        # Concept -> concept edges labelled with the triple's relation.
        for triple in triples:
            head = concept_ids.get(triple["subject"].lower())
            tail = concept_ids.get(triple["object"].lower())
            if not (head and tail) or head == tail:
                continue
            self._upsert_edge(
                conn,
                head,
                tail,
                triple["relation"],
                weight=1.0,
                metadata={"context": triple.get("context", "")[:240]},
            )

        # Semantic items (original note: Task / Decision nodes), extracted
        # from the body text only (the filename is not included here).
        for item in _semantic_items(text):
            sem_type = item["type"]
            sem_title = item["title"]
            sem_hash = _sha256_text(f"{file_id}:{sem_type}:{sem_title}")[:24]
            sem_id = f"{sem_type.lower()}:{sem_hash}"
            self._upsert_node(
                conn,
                sem_id,
                sem_type,
                sem_title,
                summary=item["summary"],
                metadata={
                    "auto_extracted": True,
                    "source_node": file_id,
                    "filename": filename,
                },
                raw=item,
            )
            self._upsert_edge(conn, file_id, sem_id, "포함함", weight=0.9)

    return {
        "node_id": file_id,
        "type": "Document",
        "sha256": digest,
        "content_hash": digest,
        "source_node_id": source_node_id,
        "chunk_ids": chunk_ids,
        "chunk_count": len(chunk_ids),
        "duplicate": duplicate,
        "captured_at": captured_at,
        "metadata": metadata,
    }
|
|
2889
|
-
|
|
2890
|
-
def ingest_event(
    self,
    event_type: str,
    title: str,
    *,
    user_email: Optional[str] = None,
    user_nickname: Optional[str] = None,
    source: Optional[str] = None,
    conversation_id: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """Record an event node and link it to its conversation and user.

    The event node's type is ``event_type`` itself (``"Event"`` when empty).
    Its id is a 24-character hash prefix of the JSON payload, which includes
    the ``_now()`` timestamp taken during this call.

    Returns:
        ``{"node_id": <event node id>, "type": <event type>}``.
    """
    event_type = str(event_type or "Event")
    title = str(title or event_type)
    payload = {
        "event_type": event_type,
        "title": title,
        "user_email": user_email,
        "user_nickname": user_nickname,
        "source": source,
        "conversation_id": conversation_id,
        "metadata": metadata or {},
        "timestamp": _now(),
    }
    event_id = "event:" + _sha256_text(_json(payload))[:24]
    conv_id = "conversation:" + _slug(conversation_id or "default")
    has_user = bool(user_email or user_nickname)

    with self._connect() as conn:
        self._upsert_node(
            conn, event_id, event_type, title,
            summary=title, metadata=payload, raw=payload,
        )
        # NOTE(review): ingest_message upserts this same "conversation:<slug>"
        # id with node type "Chat"; here the type is "Conversation". Confirm
        # whether the difference is intentional.
        self._upsert_node(
            conn,
            conv_id,
            "Conversation",
            conversation_id or "Default conversation",
            metadata={"source": source},
        )
        self._upsert_edge(
            conn, conv_id, event_id, "has_event", metadata={"source": source}
        )
        if has_user:
            person_id = "person:" + _slug(user_email or user_nickname or "unknown")
            self._upsert_node(
                conn,
                person_id,
                "Person",
                user_nickname or user_email or "Unknown user",
                metadata={"email": user_email},
            )
            self._upsert_edge(
                conn, person_id, event_id, "triggered",
                metadata={"event_type": event_type},
            )

    return {"node_id": event_id, "type": event_type}
|
|
2925
|
-
|
|
2926
|
-
# ── v3.6.0 Knowledge Graph First: unified source ingestion + provenance ──────
|
|
2927
|
-
def _node_exists(self, conn: sqlite3.Connection, node_id: str) -> bool:
    """Return True when the ``nodes`` table has a row whose id is ``node_id``."""
    found = conn.execute("SELECT 1 FROM nodes WHERE id = ?", (node_id,)).fetchone()
    return found is not None
|
|
2930
|
-
|
|
2931
|
-
def node_is_embedded(self, node_id: str) -> bool:
    """Return True when ``vector_embeddings`` has a row with this ``item_id``."""
    query = "SELECT 1 FROM vector_embeddings WHERE item_id = ? LIMIT 1"
    with self._connect() as conn:
        hit = conn.execute(query, (node_id,)).fetchone()
    return hit is not None
|
|
2939
|
-
|
|
2940
|
-
def _attach_source_node(
    self,
    conn: sqlite3.Connection,
    content_node_id: str,
    *,
    source_type: str,
    source_uri: Optional[str] = None,
    title: Optional[str] = None,
    content_hash: Optional[str] = None,
    captured_at: Optional[str] = None,
    extra: Optional[Dict[str, Any]] = None,
) -> str:
    """Upsert the ``Source`` node for a content node and link the two.

    The source id is ``source:`` plus a 24-character hash prefix of
    ``source_type|key``, where ``key`` is the first non-empty value among
    ``source_uri``, ``content_hash`` and ``content_node_id``. Re-ingesting
    the same origin therefore resolves to the same source id.

    Entries in ``extra`` are merged last into the node metadata, so they
    override the built-in keys on a name clash.

    Returns:
        The id of the ``Source`` node.
    """
    origin_key = source_uri or content_hash or content_node_id
    source_id = "source:" + _sha256_text(f"{source_type}|{origin_key}")[:24]

    node_meta = {
        "source_type": source_type,
        "source_uri": source_uri,
        "content_hash": content_hash,
        "captured_at": captured_at or _now(),
    }
    node_meta.update(extra or {})

    self._upsert_node(
        conn,
        source_id,
        "Source",
        title or source_uri or source_type,
        summary=str(source_uri or title or source_type)[:400],
        metadata=node_meta,
    )
    # Edge direction: content node -> Source ("indexed_from").
    self._upsert_edge(
        conn,
        content_node_id,
        source_id,
        "indexed_from",
        weight=1.0,
        metadata={"source_type": source_type},
    )
    return source_id
|
|
2978
|
-
|
|
2979
|
-
def ingest_source(
    self,
    *,
    source_type: str,
    title: str,
    text: str,
    source_uri: Optional[str] = None,
    owner: Optional[str] = None,
    workspace_id: Optional[str] = None,
    permissions: Optional[Dict[str, Any]] = None,
    captured_at: Optional[str] = None,
    modified_at: Optional[str] = None,
    conversation_id: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """Ingest plain text from a non-file origin as a ``Document`` node.

    The content hash covers ``source_type``, ``source_uri`` and ``text``; the
    document node id is ``webdoc:`` plus its first 24 characters, so the same
    text from the same origin maps to the same node. A ``Source`` node is
    always attached via ``indexed_from``. Chunks, concept nodes, triple edges
    and semantic-item nodes are created the same way as in
    ``ingest_document``.

    Entries in ``metadata`` are merged last into the node metadata and so
    override the built-in keys on a name clash.

    Returns:
        A dict with the document node id, ``source_node_id``,
        ``content_hash``, the chunk ids and count, ``duplicate`` (whether the
        node existed before this call) and ``captured_at``.
    """
    source_type = str(source_type or "text")
    text = str(text or "")
    title = _clean_text(str(title or source_uri or source_type))[:240] or source_type
    captured_at = captured_at or _now()
    content_hash = _sha256_text(f"{source_type}|{source_uri or ''}|{text}")
    content_id = "webdoc:" + content_hash[:24]
    full_text = f"{title}\n{text}"

    node_meta = {
        "source_type": source_type,
        "source_uri": source_uri,
        "content_hash": content_hash,
        "title": title,
        "captured_at": captured_at,
        "modified_at": modified_at,
        "owner": owner,
        "workspace_id": workspace_id,
        "permissions": permissions or {},
        "chars": len(text),
    }
    node_meta.update(metadata or {})

    concepts = _extract_concepts(full_text, limit=15)
    triples = _extract_triples(full_text, concepts)
    chunk_ids: List[str] = []

    with self._connect() as conn:
        # Checked before the upsert so it reflects the state prior to this call.
        duplicate = self._node_exists(conn, content_id)

        # Content node (a "noun": the document).
        self._upsert_node(
            conn,
            content_id,
            "Document",
            title,
            summary=(text or title)[:500],
            metadata=node_meta,
            raw=node_meta,
        )

        # Source node + indexed_from edge (origin tracking).
        source_node_id = self._attach_source_node(
            conn,
            content_id,
            source_type=source_type,
            source_uri=source_uri,
            title=title,
            content_hash=content_hash,
            captured_at=captured_at,
            extra={"owner": owner, "workspace_id": workspace_id},
        )

        # Owner as a Person node; edge label "업로드함" = "uploaded".
        if owner:
            person_id = f"person:{_slug(owner)}"
            self._upsert_node(conn, person_id, "Person", owner, metadata={"email": owner})
            self._upsert_edge(conn, person_id, content_id, "업로드함", weight=1.0)

        # Conversation link; edge label "언급함" = "mentions".
        if conversation_id:
            conv_id = f"conversation:{_slug(conversation_id)}"
            self._upsert_node(conn, conv_id, "Chat", conversation_id)
            self._upsert_edge(conn, conv_id, content_id, "언급함", weight=0.8)

        # RAG chunks; edge label "포함함" = "contains".
        for position, piece in enumerate(_chunks(text)):
            chunk_id = "chunk:" + _sha256_text(f"{content_id}:{position}:{piece}")[:24]
            chunk_ids.append(chunk_id)
            self._upsert_node(
                conn,
                chunk_id,
                "Chunk",
                f"{title} chunk {position + 1}",
                summary=piece[:500],
                metadata={"index": position, "source_node": content_id},
            )
            self._upsert_chunk(
                conn,
                chunk_id=chunk_id,
                source_node=content_id,
                text=piece,
                metadata={"index": position, "source_node": content_id},
            )
            self._upsert_edge(conn, content_id, chunk_id, "포함함")

        # Concept-like nodes and Document -> concept edges.
        concept_ids: Dict[str, str] = {}
        for concept in concepts:
            kind = _classify_node_type(concept, full_text)
            concept_node = f"{kind.lower()}:{_slug(concept)}"
            concept_ids[concept.lower()] = concept_node
            self._upsert_node(
                conn,
                concept_node,
                kind,
                concept,
                metadata={"auto_extracted": True, "source_type": source_type},
            )
            self._upsert_edge(conn, content_id, concept_node, "포함함", weight=0.8)

        # Concept -> concept edges labelled with the triple's relation.
        for triple in triples:
            head = concept_ids.get(triple["subject"].lower())
            tail = concept_ids.get(triple["object"].lower())
            if not (head and tail) or head == tail:
                continue
            self._upsert_edge(
                conn,
                head,
                tail,
                triple["relation"],
                weight=1.0,
                metadata={"context": triple.get("context", "")[:240]},
            )

        # Semantic items (original note: Task / Decision nodes), extracted
        # from the body text only (the title is not included here).
        for item in _semantic_items(text):
            sem_type = item["type"]
            sem_title = item["title"]
            sem_hash = _sha256_text(f"{content_id}:{sem_type}:{sem_title}")[:24]
            sem_id = f"{sem_type.lower()}:{sem_hash}"
            self._upsert_node(
                conn,
                sem_id,
                sem_type,
                sem_title,
                summary=item["summary"],
                metadata={"auto_extracted": True, "source_node": content_id},
                raw=item,
            )
            self._upsert_edge(conn, content_id, sem_id, "포함함", weight=0.9)

    return {
        "node_id": content_id,
        "type": "Document",
        "source_node_id": source_node_id,
        "content_hash": content_hash,
        "chunk_ids": chunk_ids,
        "chunk_count": len(chunk_ids),
        "duplicate": duplicate,
        "captured_at": captured_at,
    }
|
|
3096
|
-
|
|
3097
|
-
def record_provenance(
    self,
    *,
    node_id: str,
    source_type: str,
    pipeline: str = "unified-ingestion",
    source_uri: Optional[str] = None,
    content_hash: Optional[str] = None,
    title: Optional[str] = None,
    owner: Optional[str] = None,
    workspace_id: Optional[str] = None,
    captured_at: Optional[str] = None,
    modified_at: Optional[str] = None,
    embedded: bool = False,
    linked: bool = False,
    duplicate: bool = False,
    agent_used: Optional[str] = None,
    chunk_count: int = 0,
    permissions: Optional[Dict[str, Any]] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """Write one ``ingestion_provenance`` row for an ingested node.

    The row id is ``prov:`` plus a 24-character hash prefix of ``node_id``,
    ``content_hash`` and the ``_now()`` value taken in this call. The write
    is ``INSERT OR REPLACE``, so a row with the same id is overwritten.

    Returns:
        ``{"id": <row id>, "node_id": node_id, "created_at": <timestamp>}``.
    """
    now = _now()
    prov_id = "prov:" + _sha256_text(f"{node_id}|{content_hash or ''}|{now}")[:24]

    # Values in the same order as the column list in the statement below.
    values = (
        prov_id,
        node_id,
        source_type,
        source_uri,
        content_hash,
        title,
        pipeline,
        owner,
        workspace_id,
        captured_at,
        modified_at,
        1 if embedded else 0,  # flags are stored as 0/1 integers
        1 if linked else 0,
        1 if duplicate else 0,
        agent_used,
        int(chunk_count or 0),
        _json(permissions or {}),
        _json(metadata or {}),
        now,
    )
    statement = """
        INSERT OR REPLACE INTO ingestion_provenance(
            id, node_id, source_type, source_uri, content_hash, title, pipeline,
            owner, workspace_id, captured_at, modified_at, embedded, linked,
            duplicate, agent_used, chunk_count, permissions_json, metadata_json, created_at)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """
    with self._connect() as conn:
        conn.execute(statement, values)
    return {"id": prov_id, "node_id": node_id, "created_at": now}
|
|
3139
|
-
|
|
3140
|
-
@staticmethod
def _provenance_row(row: sqlite3.Row) -> Dict[str, Any]:
    """Convert one ``ingestion_provenance`` row into a plain dict.

    Flag columns are returned as ``bool``; ``permissions_json`` and
    ``metadata_json`` are decoded with ``_safe_loads`` and exposed as
    ``permissions`` and ``metadata``. Other columns are copied unchanged.
    """
    record: Dict[str, Any] = {}
    for column in (
        "id",
        "node_id",
        "source_type",
        "source_uri",
        "content_hash",
        "title",
        "pipeline",
        "owner",
        "workspace_id",
        "captured_at",
        "modified_at",
    ):
        record[column] = row[column]
    for flag in ("embedded", "linked", "duplicate"):
        record[flag] = bool(row[flag])
    record["agent_used"] = row["agent_used"]
    record["chunk_count"] = row["chunk_count"]
    record["permissions"] = _safe_loads(row["permissions_json"])
    record["metadata"] = _safe_loads(row["metadata_json"])
    record["created_at"] = row["created_at"]
    return record
|
|
3163
|
-
|
|
3164
|
-
def get_provenance(self, node_id: str) -> Optional[Dict[str, Any]]:
    """Return the newest provenance record for ``node_id``, or ``None``.

    "Newest" means highest ``created_at``, with ``rowid`` as the tie-breaker.
    """
    query = (
        "SELECT * FROM ingestion_provenance WHERE node_id = ? "
        "ORDER BY created_at DESC, rowid DESC LIMIT 1"
    )
    with self._connect() as conn:
        latest = conn.execute(query, (node_id,)).fetchone()
    if not latest:
        return None
    return self._provenance_row(latest)
|
|
3173
|
-
|
|
3174
|
-
def list_provenance(self, *, limit: int = 100, source_type: Optional[str] = None) -> Dict[str, Any]:
    """List provenance records, newest first, optionally for one source type.

    ``limit`` is clamped to the range 1..1000; a falsy ``limit`` means 100.

    Returns:
        ``{"items": [<record dict>, ...], "count": <number of items>}``.
    """
    capped = max(1, min(int(limit or 100), 1000))
    ordering = "ORDER BY created_at DESC, rowid DESC LIMIT ?"
    if source_type:
        query = "SELECT * FROM ingestion_provenance WHERE source_type = ? " + ordering
        params = (source_type, capped)
    else:
        query = "SELECT * FROM ingestion_provenance " + ordering
        params = (capped,)

    with self._connect() as conn:
        fetched = conn.execute(query, params).fetchall()

    items = [self._provenance_row(entry) for entry in fetched]
    return {"items": items, "count": len(fetched)}
|
|
3191
|
-
|
|
3192
|
-
def provenance_stats(self) -> Dict[str, Any]:
    """Summarise the ``ingestion_provenance`` table.

    Returns:
        A dict with the total row count, a per-``source_type`` count, the
        number of rows flagged ``embedded`` and ``duplicate``, and the
        highest ``created_at`` value (``None`` when the table is empty).
    """
    with self._connect() as conn:

        def count(sql: str) -> int:
            # Every counting query aliases its single result column as "c".
            return conn.execute(sql).fetchone()["c"]

        total = count("SELECT COUNT(*) AS c FROM ingestion_provenance")

        by_source: Dict[str, Any] = {}
        grouped = conn.execute(
            "SELECT source_type, COUNT(*) AS c FROM ingestion_provenance GROUP BY source_type"
        ).fetchall()
        for entry in grouped:
            by_source[entry["source_type"]] = entry["c"]

        embedded = count("SELECT COUNT(*) AS c FROM ingestion_provenance WHERE embedded = 1")
        duplicates = count("SELECT COUNT(*) AS c FROM ingestion_provenance WHERE duplicate = 1")
        newest = conn.execute(
            "SELECT created_at FROM ingestion_provenance ORDER BY created_at DESC LIMIT 1"
        ).fetchone()

    last_ingested_at = newest["created_at"] if newest else None
    return {
        "total": total,
        "by_source_type": by_source,
        "embedded": embedded,
        "duplicates": duplicates,
        "last_ingested_at": last_ingested_at,
    }
|
|
3218
|
-
|
|
3219
|
-
# ── v3.6.0 portability: logical export / import + binary backup ──────────────
|
|
3220
|
-
def schema_versions(self) -> Dict[str, Any]:
    """Return the schema/projection versions and embedding dimension.

    ``EMBED_DIM`` and ``KG_SCHEMA_V2_VERSION`` are read from ``kg_schema``;
    if that import fails for any reason the fallbacks 1024 and 2 are used.
    """
    try:
        from kg_schema import EMBED_DIM as embed_dim, KG_SCHEMA_V2_VERSION as v2_version
    except Exception:  # pragma: no cover - kg_schema always importable in practice
        # Deliberately broad: any import-time failure falls back to defaults.
        embed_dim = 1024
        v2_version = 2

    versions: Dict[str, Any] = {
        "graph_schema_version": GRAPH_SCHEMA_VERSION,
        "kg_v2_schema_version": v2_version,
        "projection_version": _PROJECTION_VERSION,
        "embed_dim": embed_dim,
    }
    return versions
|
|
3232
|
-
|
|
3233
|
-
def export_graph_data(self) -> Dict[str, Any]:
    """Dump five graph tables as lists of plain row dicts.

    Exports ``nodes``, ``edges``, ``chunks``, ``knowledge_sources`` and
    ``ingestion_provenance`` (under the key ``provenance``), plus a
    ``counts`` entry holding the row count per exported key. No
    vector-embedding table is read here; the original docstring points to
    ``backup_database`` for a binary copy that includes embeddings.
    """
    # (export key, table name) pairs; table names are fixed literals.
    table_by_key = (
        ("nodes", "nodes"),
        ("edges", "edges"),
        ("chunks", "chunks"),
        ("knowledge_sources", "knowledge_sources"),
        ("provenance", "ingestion_provenance"),
    )
    exported: Dict[str, Any] = {}
    with self._connect() as conn:
        for key, table in table_by_key:
            fetched = conn.execute(f"SELECT * FROM {table}").fetchall()
            exported[key] = [dict(entry) for entry in fetched]

    # Computed before "counts" is inserted, so it covers only the five tables.
    exported["counts"] = {key: len(entries) for key, entries in exported.items()}
    return exported
|
|
3252
|
-
|
|
3253
|
-
def import_graph_data(
    self, data: Dict[str, Any], *, mode: str = "merge", dry_run: bool = False
) -> Dict[str, Any]:
    """Import a logical export back into the store.

    ``mode='merge'`` upserts on top of existing data (id collisions update);
    ``mode='replace'`` calls :meth:`clear_all` first. ``dry_run=True`` reports
    the plan without writing. Refuses artifacts whose header declares a NEWER
    integer ``graph_schema_version`` than this build.

    Args:
        data: Export payload with optional ``nodes``, ``edges``, ``chunks``,
            ``knowledge_sources``, ``provenance`` and ``header`` entries.
            ``None`` is treated as an empty payload.
        mode: ``"replace"`` clears the graph first; any other value merges.
        dry_run: When true, only the row counts are returned.

    Returns:
        The plan dict (``mode`` plus per-section row counts), extended with
        ``dry_run=True`` or ``imported=True``.

    Raises:
        ValueError: If the artifact's graph schema version is newer than
            ``GRAPH_SCHEMA_VERSION``.
    """
    data = data or {}
    nodes = data.get("nodes") or []
    edges = data.get("edges") or []
    chunks = data.get("chunks") or []
    sources = data.get("knowledge_sources") or []
    provenance = data.get("provenance") or []

    header = data.get("header") or {}
    incoming_schema = header.get("graph_schema_version")
    if isinstance(incoming_schema, int) and incoming_schema > GRAPH_SCHEMA_VERSION:
        raise ValueError(
            f"Artifact graph_schema_version {incoming_schema} is newer than this "
            f"build ({GRAPH_SCHEMA_VERSION}); refusing to import."
        )

    plan = {
        "mode": mode,
        "nodes": len(nodes),
        "edges": len(edges),
        "chunks": len(chunks),
        "knowledge_sources": len(sources),
        "provenance": len(provenance),
    }
    if dry_run:
        plan["dry_run"] = True
        return plan

    # NOTE(review): the clear happens before the import connection is opened,
    # so a failure further down may leave the graph empty - confirm whether
    # clear_all() commits on its own.
    if mode == "replace":
        self.clear_all()

    with self._connect() as conn:
        # Nodes and chunks go in before edges so edge endpoints already exist.
        for n in nodes:
            self._upsert_node(
                conn, n["id"], n["type"], n.get("title") or "",
                summary=n.get("summary") or "",
                metadata=_safe_loads(n.get("metadata_json")),
                raw=_safe_loads(n.get("raw_json")),
            )
        for c in chunks:
            self._upsert_chunk(
                conn, chunk_id=c["id"], source_node=c["source_node"],
                text=c.get("text") or "", metadata=_safe_loads(c.get("metadata_json")),
            )
        for e in edges:
            raw_weight = e.get("weight")
            # Only a missing/blank weight defaults to 1.0; an explicit 0 must
            # survive the round trip instead of being promoted to 1.0.
            weight = 1.0 if raw_weight is None or raw_weight == "" else float(raw_weight)
            self._upsert_edge(
                conn, e["from_node"], e["to_node"], e["type"],
                weight=weight,
                metadata=_safe_loads(e.get("metadata_json")),
            )
        for s in sources:
            conn.execute(
                """
                INSERT OR REPLACE INTO knowledge_sources(
                    id, root_path, os_type, drive_id, label, status, include_ocr,
                    watch_enabled, consent_json, created_at, updated_at, last_scanned_at)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """,
                (
                    s["id"], s["root_path"], s["os_type"], s.get("drive_id"), s.get("label"),
                    s.get("status") or "active", int(s.get("include_ocr") or 0),
                    int(s.get("watch_enabled") or 0), s.get("consent_json") or "{}",
                    s.get("created_at") or _now(), s.get("updated_at") or _now(),
                    s.get("last_scanned_at"),
                ),
            )
        for p in provenance:
            conn.execute(
                """
                INSERT OR REPLACE INTO ingestion_provenance(
                    id, node_id, source_type, source_uri, content_hash, title, pipeline,
                    owner, workspace_id, captured_at, modified_at, embedded, linked,
                    duplicate, agent_used, chunk_count, permissions_json, metadata_json, created_at)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """,
                (
                    p["id"], p["node_id"], p["source_type"], p.get("source_uri"),
                    p.get("content_hash"), p.get("title"), p.get("pipeline") or "import",
                    p.get("owner"), p.get("workspace_id"), p.get("captured_at"),
                    p.get("modified_at"), int(p.get("embedded") or 0), int(p.get("linked") or 0),
                    int(p.get("duplicate") or 0), p.get("agent_used"), int(p.get("chunk_count") or 0),
                    p.get("permissions_json") or "{}", p.get("metadata_json") or "{}",
                    p.get("created_at") or _now(),
                ),
            )
    plan["imported"] = True
    return plan
|
|
3348
|
-
|
|
3349
|
-
def backup_database(self, dest_path) -> Path:
    """Write a clean, standalone snapshot of the live DB to ``dest_path``.

    Runs a full WAL checkpoint and then ``VACUUM INTO`` so the snapshot is a
    single self-contained database file. The snapshot is first written next
    to the destination under a ``.partial`` name and only moved over
    ``dest_path`` once it is complete, so a failed backup never destroys a
    previous good one.

    Args:
        dest_path: Target file path; missing parent directories are created.

    Returns:
        ``dest_path`` as a :class:`~pathlib.Path`.
    """
    import os

    dest = Path(dest_path)
    dest.parent.mkdir(parents=True, exist_ok=True)
    staging = dest.with_name(dest.name + ".partial")
    if staging.exists():
        staging.unlink()  # VACUUM INTO requires the target to not exist
    try:
        conn = self._connect()
        try:
            conn.execute("PRAGMA wal_checkpoint(FULL)")
            conn.execute("VACUUM INTO ?", (str(staging),))
        finally:
            conn.close()
        # Swap the finished snapshot in, replacing any older backup in one step.
        os.replace(str(staging), str(dest))
    except Exception:
        # Do not leave a half-written snapshot behind.
        if staging.exists():
            staging.unlink()
        raise
    return dest
|
|
3368
|
-
|
|
3369
|
-
def _ingest_structure_nodes(
|
|
3370
|
-
self,
|
|
3371
|
-
conn: sqlite3.Connection,
|
|
3372
|
-
file_id: str,
|
|
3373
|
-
filename: str,
|
|
3374
|
-
structure: Dict[str, Any],
|
|
3375
|
-
) -> None:
|
|
3376
|
-
for slide in structure.get("slides") or []:
|
|
3377
|
-
index = slide.get("index")
|
|
3378
|
-
slide_id = f"slide:{_sha256_text(f'{file_id}:slide:{index}')[:24]}"
|
|
3379
|
-
title = f"{filename} slide {index}"
|
|
3380
|
-
summary = "\n".join(slide.get("texts") or [])[:800]
|
|
3381
|
-
self._upsert_node(conn, slide_id, "Slide", title, summary=summary, metadata=slide)
|
|
3382
|
-
self._upsert_edge(conn, file_id, slide_id, "has_slide")
|
|
3383
|
-
for text in slide.get("texts") or []:
|
|
3384
|
-
for topic in _topic_candidates(text, limit=4):
|
|
3385
|
-
topic_id = f"topic:{_slug(topic)}"
|
|
3386
|
-
self._upsert_node(conn, topic_id, "Topic", topic, metadata={"auto_extracted": True})
|
|
3387
|
-
self._upsert_edge(conn, slide_id, topic_id, "discusses", weight=0.6)
|
|
3388
|
-
|
|
3389
|
-
for page in structure.get("pages") or []:
|
|
3390
|
-
index = page.get("index")
|
|
3391
|
-
page_id = f"page:{_sha256_text(f'{file_id}:page:{index}')[:24]}"
|
|
3392
|
-
title = f"{filename} page {index}"
|
|
3393
|
-
self._upsert_node(conn, page_id, "Page", title, summary=page.get("preview") or "", metadata=page)
|
|
3394
|
-
self._upsert_edge(conn, file_id, page_id, "has_page")
|
|
3395
|
-
for topic in _topic_candidates(page.get("preview") or "", limit=4):
|
|
3396
|
-
topic_id = f"topic:{_slug(topic)}"
|
|
3397
|
-
self._upsert_node(conn, topic_id, "Topic", topic, metadata={"auto_extracted": True})
|
|
3398
|
-
self._upsert_edge(conn, page_id, topic_id, "discusses", weight=0.6)
|
|
3399
|
-
|
|
3400
|
-
for sheet in (structure.get("sheets") or []):
|
|
3401
|
-
sheet_title = sheet.get("title")
|
|
3402
|
-
sheet_id = f"sheet:{_sha256_text(f'{file_id}:sheet:{sheet_title}')[:24]}"
|
|
3403
|
-
self._upsert_node(conn, sheet_id, "Sheet", f"{filename} / {sheet_title}", metadata=sheet)
|
|
3404
|
-
self._upsert_edge(conn, file_id, sheet_id, "has_sheet")
|
|
3405
|
-
|
|
3406
|
-
for image in (structure.get("images") or []):
|
|
3407
|
-
image_key = image.get("sha256") or _sha256_text(json.dumps(image, ensure_ascii=False, sort_keys=True))
|
|
3408
|
-
image_id = f"image:{str(image_key)[:24]}"
|
|
3409
|
-
title_parts = [filename, "image"]
|
|
3410
|
-
if image.get("page"):
|
|
3411
|
-
title_parts.append(f"page {image.get('page')}")
|
|
3412
|
-
if image.get("name"):
|
|
3413
|
-
title_parts.append(str(image.get("name")).split("/")[-1])
|
|
3414
|
-
self._upsert_node(conn, image_id, "Image", " / ".join(title_parts), metadata=image)
|
|
3415
|
-
self._upsert_edge(conn, file_id, image_id, "contains_image")
|
|
3416
|
-
|
|
3417
|
-
def _document_structure(self, path: Path, ext: str) -> Dict[str, Any]:
|
|
3418
|
-
try:
|
|
3419
|
-
if ext == ".pptx":
|
|
3420
|
-
return self._pptx_structure(path)
|
|
3421
|
-
if ext == ".pdf":
|
|
3422
|
-
return self._pdf_structure(path)
|
|
3423
|
-
if ext == ".docx":
|
|
3424
|
-
return self._docx_structure(path)
|
|
3425
|
-
if ext == ".xlsx":
|
|
3426
|
-
return self._xlsx_structure(path)
|
|
3427
|
-
except Exception as exc:
|
|
3428
|
-
return {"error": str(exc)}
|
|
3429
|
-
return {}
|
|
3430
|
-
|
|
3431
|
-
def _pptx_structure(self, path: Path) -> Dict[str, Any]:
|
|
3432
|
-
result: Dict[str, Any] = {"slides": [], "images": []}
|
|
3433
|
-
try:
|
|
3434
|
-
from PIL import Image
|
|
3435
|
-
from pptx import Presentation
|
|
3436
|
-
prs = Presentation(str(path))
|
|
3437
|
-
for slide_index, slide in enumerate(prs.slides, start=1):
|
|
3438
|
-
slide_info = {"index": slide_index, "shapes": [], "texts": []}
|
|
3439
|
-
for shape_index, shape in enumerate(slide.shapes, start=1):
|
|
3440
|
-
shape_info = {
|
|
3441
|
-
"index": shape_index,
|
|
3442
|
-
"name": getattr(shape, "name", ""),
|
|
3443
|
-
"shape_type": str(getattr(shape, "shape_type", "")),
|
|
3444
|
-
"bbox": {
|
|
3445
|
-
"left": int(getattr(shape, "left", 0) or 0),
|
|
3446
|
-
"top": int(getattr(shape, "top", 0) or 0),
|
|
3447
|
-
"width": int(getattr(shape, "width", 0) or 0),
|
|
3448
|
-
"height": int(getattr(shape, "height", 0) or 0),
|
|
3449
|
-
},
|
|
3450
|
-
}
|
|
3451
|
-
if getattr(shape, "has_text_frame", False):
|
|
3452
|
-
text = shape.text_frame.text.strip()
|
|
3453
|
-
if text:
|
|
3454
|
-
shape_info["text"] = text[:1000]
|
|
3455
|
-
slide_info["texts"].append(text)
|
|
3456
|
-
slide_info["shapes"].append(shape_info)
|
|
3457
|
-
result["slides"].append(slide_info)
|
|
3458
|
-
with zipfile.ZipFile(path) as zf:
|
|
3459
|
-
for name in zf.namelist():
|
|
3460
|
-
if not name.startswith("ppt/media/"):
|
|
3461
|
-
continue
|
|
3462
|
-
data = zf.read(name)
|
|
3463
|
-
image_info: Dict[str, Any] = {
|
|
3464
|
-
"name": name,
|
|
3465
|
-
"bytes": len(data),
|
|
3466
|
-
"sha256": _sha256_bytes(data),
|
|
3467
|
-
}
|
|
3468
|
-
try:
|
|
3469
|
-
from io import BytesIO
|
|
3470
|
-
with Image.open(BytesIO(data)) as img:
|
|
3471
|
-
image_info.update({"width": img.width, "height": img.height, "format": img.format})
|
|
3472
|
-
except Exception:
|
|
3473
|
-
pass
|
|
3474
|
-
result["images"].append(image_info)
|
|
3475
|
-
except Exception as exc:
|
|
3476
|
-
result["error"] = str(exc)
|
|
3477
|
-
return result
|
|
3478
|
-
|
|
3479
|
-
def _pdf_structure(self, path: Path) -> Dict[str, Any]:
|
|
3480
|
-
result: Dict[str, Any] = {"pages": [], "images": []}
|
|
3481
|
-
try:
|
|
3482
|
-
import pdfplumber
|
|
3483
|
-
with pdfplumber.open(str(path)) as pdf:
|
|
3484
|
-
metadata = dict(pdf.metadata or {})
|
|
3485
|
-
result["metadata"] = {str(k): str(v) for k, v in metadata.items()}
|
|
3486
|
-
for page_index, page in enumerate(pdf.pages, start=1):
|
|
3487
|
-
text = page.extract_text() or ""
|
|
3488
|
-
page_info = {
|
|
3489
|
-
"index": page_index,
|
|
3490
|
-
"width": float(page.width or 0),
|
|
3491
|
-
"height": float(page.height or 0),
|
|
3492
|
-
"chars": len(text),
|
|
3493
|
-
"preview": _clean_text(text)[:500],
|
|
3494
|
-
"image_count": len(page.images or []),
|
|
3495
|
-
}
|
|
3496
|
-
result["pages"].append(page_info)
|
|
3497
|
-
for image_index, image in enumerate(page.images or [], start=1):
|
|
3498
|
-
result["images"].append({
|
|
3499
|
-
"page": page_index,
|
|
3500
|
-
"index": image_index,
|
|
3501
|
-
"name": image.get("name"),
|
|
3502
|
-
"width": image.get("width"),
|
|
3503
|
-
"height": image.get("height"),
|
|
3504
|
-
"bbox": {
|
|
3505
|
-
"x0": image.get("x0"),
|
|
3506
|
-
"top": image.get("top"),
|
|
3507
|
-
"x1": image.get("x1"),
|
|
3508
|
-
"bottom": image.get("bottom"),
|
|
3509
|
-
},
|
|
3510
|
-
})
|
|
3511
|
-
except Exception as exc:
|
|
3512
|
-
result["error"] = str(exc)
|
|
3513
|
-
return result
|
|
3514
|
-
|
|
3515
|
-
def _docx_structure(self, path: Path) -> Dict[str, Any]:
    """Count paragraphs and tables of a .docx and collect its headings.

    Only non-blank paragraphs are counted. A paragraph is treated as a
    heading when its style name starts with "heading" (case-insensitive);
    at most 80 headings are returned, each with text cut to 240 characters.
    A paragraph whose style or style name is missing is treated as a
    non-heading rather than raising.

    Returns:
        ``{"paragraphs": int, "headings": [{"style", "text"}, ...], "tables": int}``.
    """
    from docx import Document

    doc = Document(str(path))
    headings = []
    paragraphs = 0
    for p in doc.paragraphs:
        text = p.text.strip()
        if not text:
            continue
        paragraphs += 1
        # The style name can be None; normalise it before calling str methods.
        style = getattr(p.style, "name", "") or ""
        if style.lower().startswith("heading"):
            headings.append({"style": style, "text": text[:240]})
    return {"paragraphs": paragraphs, "headings": headings[:80], "tables": len(doc.tables)}
|
|
3529
|
-
|
|
3530
|
-
def _xlsx_structure(self, path: Path) -> Dict[str, Any]:
    """List the worksheets of an .xlsx with their reported dimensions.

    The workbook is opened in read-only, values-only mode and is explicitly
    closed afterwards so the underlying file handle is released.

    Returns:
        ``{"sheets": [{"title", "max_row", "max_column"}, ...]}``.
    """
    from openpyxl import load_workbook

    wb = load_workbook(str(path), read_only=True, data_only=True)
    try:
        sheets = [
            {"title": ws.title, "max_row": ws.max_row, "max_column": ws.max_column}
            for ws in wb.worksheets
        ]
    finally:
        # Read-only workbooks keep the file open until close() is called.
        wb.close()
    return {"sheets": sheets}
|
|
3537
|
-
|
|
3538
|
-
# ── Node types shown in the graph (dot = noun) ──────────────────────────────
# Message / AIResponse / Chunk are stored only for RAG retrieval and are
# hidden from the graph.
_GRAPH_VISIBLE_TYPES = (
    "Computer",   # my computer
    "Drive",      # drive / volume
    "Folder",     # folder
    "File",       # regular file
    "Chat",       # conversation session
    "Document",   # file (PDF, PPT, Word, Excel, image)
    "CodeFile",   # code file
    "Spreadsheet",# Excel / CSV
    "SlideDeck",  # presentation
    "Image",      # image
    "ImageText",  # OCR text
    "Concept",    # concept / idea / technical term
    "Person",     # person
    "Error",      # error / bug
    "Code",       # code / function
    "Feature",    # software feature
    "Task",       # to-do item
    "Decision",   # decision
    # v3.6.0 Knowledge Graph First — expose first-class entities in the graph
    "Source",     # ingestion source (file / URL / browser tab / git)
    "Repository", # git repository
    "Meeting",    # meeting
    "Organization",  # organization
    "Workflow",   # workflow
    "Agent",      # agent
)
|
|
3567
|
-
|
|
3568
|
-
def list_documents(self, limit: int = 200) -> Dict[str, Any]:
    """List ``Document`` nodes, newest first, with their ingest + index state.

    For every document the number of ``Chunk`` nodes whose ``metadata_json``
    contains the document id is counted; a document is reported ``indexed``
    when that count is positive. The id is matched literally: ``%``, ``_``
    and ``\\`` inside it are escaped so they no longer act as LIKE wildcards
    and inflate the count.

    Args:
        limit: Maximum number of documents, clamped to 1..1000 (a falsy
            value falls back to 200).

    Returns:
        ``{"documents": [...], "total": <len(documents)>, "generated_at": <ISO time>}``.
    """
    limit = max(1, min(int(limit or 200), 1000))
    nt, _ = self._read_tables()
    documents: List[Dict[str, Any]] = []
    with self._connect() as conn:
        rows = conn.execute(
            f"SELECT id, title, summary, metadata_json, created_at, updated_at "
            f"FROM {nt} WHERE type='Document' ORDER BY updated_at DESC, id ASC LIMIT ?",
            (limit,),
        ).fetchall()
        for row in rows:
            meta = _safe_loads(row["metadata_json"]) or {}
            extracted = meta.get("extracted") or {}
            node_id = row["id"]
            # Escape LIKE metacharacters so the id is matched as plain text.
            literal_id = (
                str(node_id)
                .replace("\\", "\\\\")
                .replace("%", "\\%")
                .replace("_", "\\_")
            )
            chunk_count = int(
                conn.execute(
                    f"SELECT COUNT(*) AS c FROM {nt} "
                    f"WHERE type='Chunk' AND metadata_json LIKE ? ESCAPE '\\'",
                    (f"%{literal_id}%",),
                ).fetchone()["c"]
                or 0
            )
            indexed = chunk_count > 0
            documents.append({
                "id": node_id,
                "filename": meta.get("filename") or row["title"],
                "ext": meta.get("ext"),
                "mime_type": meta.get("mime_type"),
                "bytes": meta.get("bytes"),
                "sha256": meta.get("sha256"),
                "uploader": meta.get("uploader"),
                "chars": extracted.get("chars"),
                "chunks": chunk_count,
                "indexed": indexed,
                "ingest_state": "indexed" if indexed else "ingested",
                "created_at": row["created_at"],
                "updated_at": row["updated_at"],
            })
    return {
        "documents": documents,
        "total": len(documents),
        "generated_at": datetime.now().isoformat(timespec="seconds"),
    }
|
|
3612
|
-
|
|
3613
|
-
def graph(self, limit: int = 300) -> Dict[str, Any]:
    """Return the most recently updated visible nodes, their edges and metrics.

    Nodes are restricted to ``_GRAPH_VISIBLE_TYPES`` and capped at ``limit``
    (clamped to 1..2000). Edges are kept only when both endpoints fall in the
    same "latest visible nodes" selection. Every node gets ``importance``
    (raw score), ``importance_norm`` (raw score divided by the highest raw
    score among nodes of the same type, capped at 1.0) and a
    ``graph_metrics`` entry inside its metadata.

    Returns:
        ``{"nodes": [...], "edges": [...]}``.
    """
    limit = max(1, min(int(limit or 300), 2000))
    visible = ",".join(f"'{t}'" for t in self._GRAPH_VISIBLE_TYPES)
    nt, et = self._read_tables()
    with self._connect() as conn:
        node_cursor = conn.execute(
            f"SELECT id, type, title, summary, metadata_json, updated_at FROM {nt} WHERE type IN ({visible}) ORDER BY updated_at DESC, id ASC LIMIT ?",
            (limit,),
        )
        nodes = []
        for record in node_cursor:
            nodes.append({
                "id": record["id"],
                "type": record["type"],
                "title": record["title"],
                "summary": record["summary"],
                "metadata": _safe_loads(record["metadata_json"]),
                "updated_at": record["updated_at"],
            })

        edges: List[Dict[str, Any]] = []
        if nodes:
            # Both endpoints must belong to the same top-`limit` node selection.
            edge_rows = conn.execute(
                f"""
                SELECT id, from_node, to_node, type, weight, metadata_json
                FROM {et}
                WHERE from_node IN (
                    SELECT id FROM {nt} WHERE type IN ({visible})
                    ORDER BY updated_at DESC, id ASC LIMIT ?
                )
                AND to_node IN (
                    SELECT id FROM {nt} WHERE type IN ({visible})
                    ORDER BY updated_at DESC, id ASC LIMIT ?
                )
                ORDER BY weight DESC, created_at DESC, id ASC
                """,
                (limit, limit),
            ).fetchall()
            for record in edge_rows:
                edges.append({
                    "id": record["id"],
                    "from": record["from_node"],
                    "to": record["to_node"],
                    "type": record["type"],
                    "weight": record["weight"],
                    "metadata": _safe_loads(record["metadata_json"]),
                })

    now = datetime.now()
    node_by_id = {node["id"]: node for node in nodes}
    degree_map: Dict[str, int] = {}
    topic_metrics: Dict[str, Dict[str, Any]] = {}

    # Pass 1: degrees for every edge, plus mention statistics for Topic nodes.
    for edge in edges:
        src_id = edge["from"]
        dst_id = edge["to"]
        degree_map[src_id] = degree_map.get(src_id, 0) + 1
        degree_map[dst_id] = degree_map.get(dst_id, 0) + 1
        src_node = node_by_id.get(src_id)
        dst_node = node_by_id.get(dst_id)
        if not (src_node and dst_node):
            continue
        for topic_node, other_node in ((src_node, dst_node), (dst_node, src_node)):
            if topic_node["type"] != "Topic":
                continue
            stats = topic_metrics.setdefault(topic_node["id"], {
                "mention_count": 0.0,
                "conversation_ids": set(),
            })
            if edge["type"] in {"mentions", "discusses"}:
                stats["mention_count"] += max(0.5, float(edge.get("weight") or 1.0))
            other_meta = other_node.get("metadata") or {}
            linked_conversation = other_meta.get("conversation_id")
            # A Conversation node counts as its own conversation.
            conversation_id = (
                other_node["id"] if other_node["type"] == "Conversation" else linked_conversation
            )
            if conversation_id:
                stats["conversation_ids"].add(str(conversation_id))

    # Pass 2: raw importance per node and the per-type maximum.
    type_max_raw: Dict[str, float] = {}
    for node in nodes:
        node_type = node["type"]
        degree = degree_map.get(node["id"], 0)
        recency = _recency_score(node.get("updated_at"), now=now)
        metrics = {"degree": degree, "recency_score": round(recency, 4)}
        if node_type == "Topic":
            topic_stat = topic_metrics.get(node["id"], {})
            mention_count = float(topic_stat.get("mention_count") or 0.0)
            conversation_count = len(topic_stat.get("conversation_ids") or ())
            raw_importance = (
                math.log1p(mention_count) * 2.8
                + math.log1p(conversation_count) * 2.2
                + recency * 1.4
                + math.sqrt(max(0, degree)) * 0.45
            )
            metrics["mention_count"] = round(mention_count, 2)
            metrics["conversation_count"] = conversation_count
        else:
            raw_importance = math.log1p(max(0, degree)) * 1.4 + recency * 0.9

        metrics["importance_raw"] = round(raw_importance, 4)
        node["importance"] = round(raw_importance, 4)
        node["_raw_importance"] = raw_importance
        node["metadata"] = {**(node.get("metadata") or {}), "graph_metrics": metrics}
        type_max_raw[node_type] = max(type_max_raw.get(node_type, 0.0), raw_importance)

    # Pass 3: normalise against the best node of the same type.
    for node in nodes:
        ceiling = max(type_max_raw.get(node["type"], 0.0), 0.0001)
        raw_value = node.pop("_raw_importance", None)
        node["importance_norm"] = round(min(1.0, (raw_value or 0.0) / ceiling), 4)
        node["metadata"]["graph_metrics"]["importance_norm"] = node["importance_norm"]
    return {"nodes": nodes, "edges": edges}
|
|
3729
|
-
|
|
3730
|
-
def search(self, query: str, limit: int = 30) -> Dict[str, Any]:
    """Substring search over node titles, summaries and metadata.

    First the whole (stripped) query is matched with ``LIKE``. If that
    yields fewer than ``limit`` rows, topic candidates extracted from the
    query are matched individually and merged in (ids already found keep
    their original row). The merged rows are ranked by number of matching
    topic terms, then a boost for document-like/decision/task types, then
    ``updated_at``, and cut to ``limit`` (clamped to 1..100).

    Returns:
        ``{"query": <stripped query>, "matches": [node dicts]}``.
    """
    query = str(query or "").strip()
    whole_pattern = f"%{query}%"
    limit = max(1, min(int(limit or 30), 100))
    nt, et = self._read_tables()
    with self._connect() as conn:
        rows = []
        if query:
            rows = conn.execute(
                f"""
                SELECT id, type, title, summary, metadata_json, updated_at
                FROM {nt}
                WHERE title LIKE ? OR summary LIKE ? OR metadata_json LIKE ?
                ORDER BY updated_at DESC, id ASC
                LIMIT ?
                """,
                (whole_pattern, whole_pattern, whole_pattern, limit),
            ).fetchall()

        if len(rows) < limit:
            terms = _topic_candidates(query, limit=8)
            if terms:
                term_patterns = [f"%{term}%" for term in terms]
                per_term = "(title LIKE ? OR summary LIKE ? OR metadata_json LIKE ?)"
                where_terms = " OR ".join([per_term] * len(term_patterns))
                # Each term pattern is bound three times, once per column.
                params: List[str] = []
                for term_pattern in term_patterns:
                    params.extend([term_pattern] * 3)
                extra = conn.execute(
                    f"""
                    SELECT id, type, title, summary, metadata_json, updated_at
                    FROM {nt}
                    WHERE {where_terms}
                    ORDER BY updated_at DESC, id ASC
                    LIMIT ?
                    """,
                    (*params, limit * 3),
                ).fetchall()
                by_id = {row["id"]: row for row in rows}
                for row in extra:
                    by_id.setdefault(row["id"], row)
                rows = list(by_id.values())

        terms_for_score = set(_topic_candidates(query, limit=12))
        boosted_types = {
            "Decision", "Task", "File", "Document", "CodeFile",
            "Spreadsheet", "SlideDeck", "Image", "ImageText", "Page", "Slide",
        }

        def score(row: sqlite3.Row) -> tuple:
            haystack = f"{row['title']} {row['summary']} {row['metadata_json']}".lower()
            hits = sum(1 for term in terms_for_score if term.lower() in haystack)
            type_boost = 1 if row["type"] in boosted_types else 0
            return (hits, type_boost, row["updated_at"] or "")

        ranked = sorted(rows, key=score, reverse=True)
        matches = []
        for row in ranked[:limit]:
            matches.append({
                "id": row["id"],
                "type": row["type"],
                "title": row["title"],
                "summary": row["summary"],
                "metadata": _safe_loads(row["metadata_json"]),
                "updated_at": row["updated_at"],
            })
        return {"query": query, "matches": matches}
|
|
3797
|
-
|
|
3798
|
-
def context_for_query(self, query: str, limit: int = 6) -> str:
    """Build a compact, line-per-node context string for a chat query.

    Uses :meth:`search` first. When that finds nothing, up to four topic
    candidates from the query are each matched against node titles and
    metadata (three newest rows per topic), de-duplicated by id and capped
    at ``limit``. Each resulting line has the form
    ``- [type] title | source=... | summary`` with the summary cleaned and
    cut to 700 characters. A blank query returns an empty string.
    """
    query = str(query or "").strip()
    if not query:
        return ""

    matches = self.search(query, limit).get("matches", [])
    if not matches:
        topics = _topic_candidates(query, limit=4)
        if topics:
            nt, _ = self._read_tables()
            with self._connect() as conn:
                rows = []
                for topic in topics:
                    topic_pattern = f"%{topic}%"
                    found = conn.execute(
                        f"""
                        SELECT id, type, title, summary, metadata_json
                        FROM {nt}
                        WHERE title LIKE ? OR metadata_json LIKE ?
                        ORDER BY updated_at DESC, id ASC
                        LIMIT 3
                        """,
                        (topic_pattern, topic_pattern),
                    ).fetchall()
                    rows.extend(found)
            seen = set()
            matches = []
            for row in rows:
                row_id = row["id"]
                if row_id in seen:
                    continue
                seen.add(row_id)
                matches.append({
                    "id": row_id,
                    "type": row["type"],
                    "title": row["title"],
                    "summary": row["summary"],
                    "metadata": _safe_loads(row["metadata_json"]),
                })
                if len(matches) >= limit:
                    break

    lines = []
    for match in matches[:limit]:
        meta = match.get("metadata") or {}
        # First non-empty locator wins; the node id is the last resort.
        for key in ("relative_path", "filename", "conversation_id", "source"):
            source = meta.get(key)
            if source:
                break
        else:
            source = match["id"]
        summary = _clean_text(match.get("summary") or "")[:700]
        lines.append(f"- [{match['type']}] {match['title']} | source={source} | {summary}")
    return "\n".join(lines)
|
|
3849
|
-
|
|
3850
|
-
def neighbors(self, node_id: str) -> Dict[str, Any]:
    """Return the nodes one hop away from ``node_id`` and the connecting edges.

    Edges are every row where ``node_id`` is either endpoint (ordered by edge
    id); neighbors are the other endpoints, looked up in the node table and
    ordered by id. ``node_id`` itself is never listed as a neighbor.

    Returns:
        ``{"node_id": ..., "neighbors": [node dicts], "edges": [edge dicts]}``.
    """
    nt, et = self._read_tables()
    with self._connect() as conn:
        edge_rows = conn.execute(
            f"SELECT from_node, to_node, type, weight FROM {et} WHERE from_node=? OR to_node=? ORDER BY id ASC",
            (node_id, node_id),
        ).fetchall()
        edges = [
            {
                "from": record["from_node"],
                "to": record["to_node"],
                "type": record["type"],
                "weight": record["weight"],
            }
            for record in edge_rows
        ]
        neighbor_ids: set = {
            endpoint
            for record in edge_rows
            for endpoint in (record["from_node"], record["to_node"])
        }
        neighbor_ids.discard(node_id)

        nodes = []
        if neighbor_ids:
            placeholders = ",".join("?" * len(neighbor_ids))
            node_cursor = conn.execute(
                f"SELECT id, type, title, summary, metadata_json FROM {nt} WHERE id IN ({placeholders}) ORDER BY id ASC",
                list(neighbor_ids),
            )
            for record in node_cursor:
                nodes.append({
                    "id": record["id"],
                    "type": record["type"],
                    "title": record["title"],
                    "summary": record["summary"],
                    "metadata": _safe_loads(record["metadata_json"]),
                })
    return {"node_id": node_id, "neighbors": nodes, "edges": edges}
|
|
3882
|
-
|
|
3883
|
-
def get_node(self, node_id: str) -> Dict[str, Any]:
    """Fetch one node by id together with its edge count.

    Args:
        node_id: Node identifier; surrounding whitespace is ignored.

    Returns:
        The node's id, type, title, summary, decoded metadata and
        ``updated_at``, plus ``degree`` - the number of edges that have the
        node as either endpoint.

    Raises:
        ValueError: If ``node_id`` is blank or no such node exists.
    """
    node_id = str(node_id or "").strip()
    if not node_id:
        raise ValueError("node_id required")

    nt, et = self._read_tables()
    with self._connect() as conn:
        record = conn.execute(
            f"""
            SELECT id, type, title, summary, metadata_json, updated_at
            FROM {nt}
            WHERE id=?
            """,
            (node_id,),
        ).fetchone()
        if not record:
            raise ValueError(f"graph node not found: {node_id}")
        degree_row = conn.execute(
            f"SELECT COUNT(*) AS c FROM {et} WHERE from_node=? OR to_node=?",
            (node_id, node_id),
        ).fetchone()

        node: Dict[str, Any] = {}
        for column in ("id", "type", "title", "summary"):
            node[column] = record[column]
        node["metadata"] = _safe_loads(record["metadata_json"])
        node["updated_at"] = record["updated_at"]
        node["degree"] = degree_row["c"]
        return node
|
|
3912
|
-
|
|
3913
|
-
def relationship_search(
    self,
    *,
    query: str = "",
    node_id: str = "",
    relationship_type: str = "",
    limit: int = 30,
) -> Dict[str, Any]:
    """Search edges, returning each with its source and target node details.

    All given filters must hold at once: ``node_id`` matches either endpoint
    exactly, ``relationship_type`` is a substring match on the edge type, and
    ``query`` is a substring match on the edge type, edge metadata, or either
    endpoint's title/summary. Edges whose endpoints are missing from the node
    table are excluded by the inner joins. Results are ordered by weight,
    then creation time, then id, and capped at ``limit`` (clamped to 1..200).

    Returns:
        The normalised filters plus ``relationships``, a list of edge dicts
        each carrying nested ``source`` and ``target`` node dicts.
    """
    query = str(query or "").strip()
    node_id = str(node_id or "").strip()
    relationship_type = str(relationship_type or "").strip()
    limit = max(1, min(int(limit or 30), 200))
    nt, et = self._read_tables()

    # Each active filter contributes one SQL clause and its bound values.
    filters = []
    if node_id:
        filters.append(("(e.from_node=? OR e.to_node=?)", [node_id, node_id]))
    if relationship_type:
        filters.append(("e.type LIKE ?", [f"%{relationship_type}%"]))
    if query:
        filters.append((
            "(e.type LIKE ? OR e.metadata_json LIKE ? OR src.title LIKE ? OR dst.title LIKE ? OR src.summary LIKE ? OR dst.summary LIKE ?)",
            [f"%{query}%"] * 6,
        ))
    where_sql = ""
    if filters:
        where_sql = "WHERE " + " AND ".join(clause for clause, _ in filters)
    params: List[Any] = [value for _, values in filters for value in values]

    with self._connect() as conn:
        rows = conn.execute(
            f"""
            SELECT
                e.id, e.from_node, e.to_node, e.type, e.weight, e.metadata_json, e.created_at,
                src.type AS source_type, src.title AS source_title, src.summary AS source_summary,
                src.metadata_json AS source_metadata,
                dst.type AS target_type, dst.title AS target_title, dst.summary AS target_summary,
                dst.metadata_json AS target_metadata
            FROM {et} e
            JOIN {nt} src ON src.id=e.from_node
            JOIN {nt} dst ON dst.id=e.to_node
            {where_sql}
            ORDER BY e.weight DESC, e.created_at DESC, e.id ASC
            LIMIT ?
            """,
            (*params, limit),
        ).fetchall()

    relationships = []
    for row in rows:
        source = {
            "id": row["from_node"],
            "type": row["source_type"],
            "title": row["source_title"],
            "summary": row["source_summary"],
            "metadata": _safe_loads(row["source_metadata"]),
        }
        target = {
            "id": row["to_node"],
            "type": row["target_type"],
            "title": row["target_title"],
            "summary": row["target_summary"],
            "metadata": _safe_loads(row["target_metadata"]),
        }
        relationships.append({
            "id": row["id"],
            "type": row["type"],
            "weight": row["weight"],
            "metadata": _safe_loads(row["metadata_json"]),
            "created_at": row["created_at"],
            "source": source,
            "target": target,
        })
    return {
        "query": query,
        "node_id": node_id,
        "relationship_type": relationship_type,
        "relationships": relationships,
    }
|
|
3987
|
-
|
|
3988
|
-
def traverse(self, node_id: str, *, depth: int = 1, limit: int = 100) -> Dict[str, Any]:
|
|
3989
|
-
node_id = str(node_id or "").strip()
|
|
3990
|
-
if not node_id:
|
|
3991
|
-
raise ValueError("node_id required")
|
|
3992
|
-
depth = max(0, min(int(depth or 1), 4))
|
|
3993
|
-
limit = max(1, min(int(limit or 100), 500))
|
|
3994
|
-
nt, et = self._read_tables()
|
|
3995
|
-
visited = {node_id}
|
|
3996
|
-
frontier = {node_id}
|
|
3997
|
-
edges_by_id: Dict[str, Dict[str, Any]] = {}
|
|
3998
|
-
with self._connect() as conn:
|
|
3999
|
-
for _ in range(depth):
|
|
4000
|
-
if not frontier or len(visited) >= limit:
|
|
4001
|
-
break
|
|
4002
|
-
placeholders = ",".join("?" * len(frontier))
|
|
4003
|
-
rows = conn.execute(
|
|
4004
|
-
f"""
|
|
4005
|
-
SELECT id, from_node, to_node, type, weight, metadata_json
|
|
4006
|
-
FROM {et}
|
|
4007
|
-
WHERE from_node IN ({placeholders}) OR to_node IN ({placeholders})
|
|
4008
|
-
ORDER BY weight DESC, id ASC
|
|
4009
|
-
LIMIT ?
|
|
4010
|
-
""",
|
|
4011
|
-
(*frontier, *frontier, limit * 3),
|
|
4012
|
-
).fetchall()
|
|
4013
|
-
next_frontier = set()
|
|
4014
|
-
for row in rows:
|
|
4015
|
-
edges_by_id[row["id"]] = {
|
|
4016
|
-
"id": row["id"],
|
|
4017
|
-
"from": row["from_node"],
|
|
4018
|
-
"to": row["to_node"],
|
|
4019
|
-
"type": row["type"],
|
|
4020
|
-
"weight": row["weight"],
|
|
4021
|
-
"metadata": _safe_loads(row["metadata_json"]),
|
|
4022
|
-
}
|
|
4023
|
-
for candidate in (row["from_node"], row["to_node"]):
|
|
4024
|
-
if candidate not in visited and len(visited) < limit:
|
|
4025
|
-
visited.add(candidate)
|
|
4026
|
-
next_frontier.add(candidate)
|
|
4027
|
-
frontier = next_frontier
|
|
4028
|
-
placeholders = ",".join("?" * len(visited))
|
|
4029
|
-
node_rows = conn.execute(
|
|
4030
|
-
f"""
|
|
4031
|
-
SELECT id, type, title, summary, metadata_json, updated_at
|
|
4032
|
-
FROM {nt}
|
|
4033
|
-
WHERE id IN ({placeholders})
|
|
4034
|
-
ORDER BY updated_at DESC, id ASC
|
|
4035
|
-
""",
|
|
4036
|
-
list(visited),
|
|
4037
|
-
).fetchall()
|
|
4038
|
-
return {
|
|
4039
|
-
"root": node_id,
|
|
4040
|
-
"depth": depth,
|
|
4041
|
-
"nodes": [
|
|
4042
|
-
{
|
|
4043
|
-
"id": row["id"],
|
|
4044
|
-
"type": row["type"],
|
|
4045
|
-
"title": row["title"],
|
|
4046
|
-
"summary": row["summary"],
|
|
4047
|
-
"metadata": _safe_loads(row["metadata_json"]),
|
|
4048
|
-
"updated_at": row["updated_at"],
|
|
4049
|
-
}
|
|
4050
|
-
for row in node_rows
|
|
4051
|
-
],
|
|
4052
|
-
"edges": list(edges_by_id.values()),
|
|
4053
|
-
}
|
|
4054
|
-
|
|
4055
|
-
def _iter_vector_source_items(
|
|
4056
|
-
self,
|
|
4057
|
-
conn: sqlite3.Connection,
|
|
4058
|
-
*,
|
|
4059
|
-
include_nodes: bool = True,
|
|
4060
|
-
include_chunks: bool = True,
|
|
4061
|
-
) -> List[Dict[str, Any]]:
|
|
4062
|
-
items: List[Dict[str, Any]] = []
|
|
4063
|
-
if include_nodes:
|
|
4064
|
-
for row in conn.execute(
|
|
4065
|
-
"""
|
|
4066
|
-
SELECT id, type, title, summary, metadata_json
|
|
4067
|
-
FROM nodes
|
|
4068
|
-
WHERE type <> 'Chunk'
|
|
4069
|
-
ORDER BY updated_at DESC, id ASC
|
|
4070
|
-
"""
|
|
4071
|
-
).fetchall():
|
|
4072
|
-
metadata = _safe_loads(row["metadata_json"])
|
|
4073
|
-
text = self._vector_text_for_node(
|
|
4074
|
-
title=row["title"],
|
|
4075
|
-
summary=row["summary"] or "",
|
|
4076
|
-
metadata=metadata,
|
|
4077
|
-
)
|
|
4078
|
-
if text:
|
|
4079
|
-
items.append({
|
|
4080
|
-
"item_id": row["id"],
|
|
4081
|
-
"item_type": "node",
|
|
4082
|
-
"source_node": row["id"],
|
|
4083
|
-
"text": text,
|
|
4084
|
-
"metadata": {"node_type": row["type"], **metadata},
|
|
4085
|
-
})
|
|
4086
|
-
if include_chunks:
|
|
4087
|
-
for row in conn.execute(
|
|
4088
|
-
"""
|
|
4089
|
-
SELECT c.id, c.source_node AS parent_source_node, c.text, c.metadata_json
|
|
4090
|
-
FROM chunks c
|
|
4091
|
-
JOIN nodes n ON n.id=c.id
|
|
4092
|
-
ORDER BY c.created_at DESC, c.id ASC
|
|
4093
|
-
"""
|
|
4094
|
-
).fetchall():
|
|
4095
|
-
metadata = _safe_loads(row["metadata_json"])
|
|
4096
|
-
text = _clean_text(row["text"] or "")
|
|
4097
|
-
if text:
|
|
4098
|
-
items.append({
|
|
4099
|
-
"item_id": row["id"],
|
|
4100
|
-
"item_type": "chunk",
|
|
4101
|
-
"source_node": row["id"],
|
|
4102
|
-
"text": text,
|
|
4103
|
-
"metadata": {**metadata, "parent_source_node": row["parent_source_node"]},
|
|
4104
|
-
})
|
|
4105
|
-
return items
|
|
4106
|
-
|
|
4107
|
-
def rebuild_vector_index(
|
|
4108
|
-
self,
|
|
4109
|
-
*,
|
|
4110
|
-
full: bool = False,
|
|
4111
|
-
include_nodes: bool = True,
|
|
4112
|
-
include_chunks: bool = True,
|
|
4113
|
-
) -> Dict[str, Any]:
|
|
4114
|
-
"""Rebuild the derived vector index without mutating graph content."""
|
|
4115
|
-
op_id = f"vector-op:{_sha256_text(f'{time.time()}:{os.getpid()}')[:24]}"
|
|
4116
|
-
requested_at = _now()
|
|
4117
|
-
started = time.perf_counter()
|
|
4118
|
-
try:
|
|
4119
|
-
with self._connect() as conn:
|
|
4120
|
-
conn.execute(
|
|
4121
|
-
"""
|
|
4122
|
-
INSERT INTO vector_index_operations(
|
|
4123
|
-
id, operation, status, requested_at, started_at, metadata_json
|
|
4124
|
-
)
|
|
4125
|
-
VALUES (?, ?, 'running', ?, ?, ?)
|
|
4126
|
-
""",
|
|
4127
|
-
(
|
|
4128
|
-
op_id,
|
|
4129
|
-
"rebuild_full" if full else "rebuild_incremental",
|
|
4130
|
-
requested_at,
|
|
4131
|
-
requested_at,
|
|
4132
|
-
_json({"include_nodes": include_nodes, "include_chunks": include_chunks}),
|
|
4133
|
-
),
|
|
4134
|
-
)
|
|
4135
|
-
if full:
|
|
4136
|
-
filters = []
|
|
4137
|
-
if include_nodes:
|
|
4138
|
-
filters.append("'node'")
|
|
4139
|
-
if include_chunks:
|
|
4140
|
-
filters.append("'chunk'")
|
|
4141
|
-
if filters:
|
|
4142
|
-
conn.execute(f"DELETE FROM vector_embeddings WHERE item_type IN ({','.join(filters)})")
|
|
4143
|
-
items = self._iter_vector_source_items(
|
|
4144
|
-
conn,
|
|
4145
|
-
include_nodes=include_nodes,
|
|
4146
|
-
include_chunks=include_chunks,
|
|
4147
|
-
)
|
|
4148
|
-
indexed = skipped = 0
|
|
4149
|
-
for item in items:
|
|
4150
|
-
changed = self._upsert_vector_item(conn, **item)
|
|
4151
|
-
if changed:
|
|
4152
|
-
indexed += 1
|
|
4153
|
-
else:
|
|
4154
|
-
skipped += 1
|
|
4155
|
-
duration_ms = round((time.perf_counter() - started) * 1000, 2)
|
|
4156
|
-
conn.execute(
|
|
4157
|
-
"""
|
|
4158
|
-
UPDATE vector_index_operations
|
|
4159
|
-
SET status='completed', completed_at=?, items_total=?,
|
|
4160
|
-
items_indexed=?, items_skipped=?, metadata_json=?
|
|
4161
|
-
WHERE id=?
|
|
4162
|
-
""",
|
|
4163
|
-
(
|
|
4164
|
-
_now(),
|
|
4165
|
-
len(items),
|
|
4166
|
-
indexed,
|
|
4167
|
-
skipped,
|
|
4168
|
-
_json({
|
|
4169
|
-
"include_nodes": include_nodes,
|
|
4170
|
-
"include_chunks": include_chunks,
|
|
4171
|
-
"duration_ms": duration_ms,
|
|
4172
|
-
"embedding_model": self._embedding_model.model_id,
|
|
4173
|
-
"embedding_dim": self._embedding_model.dim,
|
|
4174
|
-
}),
|
|
4175
|
-
op_id,
|
|
4176
|
-
),
|
|
4177
|
-
)
|
|
4178
|
-
return {
|
|
4179
|
-
"status": "completed",
|
|
4180
|
-
"operation_id": op_id,
|
|
4181
|
-
"full": bool(full),
|
|
4182
|
-
"items_total": len(items),
|
|
4183
|
-
"items_indexed": indexed,
|
|
4184
|
-
"items_skipped": skipped,
|
|
4185
|
-
"duration_ms": duration_ms,
|
|
4186
|
-
"embedding_model": self._embedding_model.model_id,
|
|
4187
|
-
"embedding_dim": self._embedding_model.dim,
|
|
4188
|
-
}
|
|
4189
|
-
except Exception as exc:
|
|
4190
|
-
duration_ms = round((time.perf_counter() - started) * 1000, 2)
|
|
4191
|
-
with self._connect() as conn:
|
|
4192
|
-
conn.execute(
|
|
4193
|
-
"""
|
|
4194
|
-
INSERT INTO vector_index_operations(
|
|
4195
|
-
id, operation, status, requested_at, started_at, completed_at,
|
|
4196
|
-
error_message, metadata_json
|
|
4197
|
-
)
|
|
4198
|
-
VALUES (?, ?, 'failed', ?, ?, ?, ?, ?)
|
|
4199
|
-
ON CONFLICT(id) DO UPDATE SET
|
|
4200
|
-
status='failed',
|
|
4201
|
-
completed_at=excluded.completed_at,
|
|
4202
|
-
error_message=excluded.error_message,
|
|
4203
|
-
metadata_json=excluded.metadata_json
|
|
4204
|
-
""",
|
|
4205
|
-
(
|
|
4206
|
-
op_id,
|
|
4207
|
-
"rebuild_full" if full else "rebuild_incremental",
|
|
4208
|
-
requested_at,
|
|
4209
|
-
requested_at,
|
|
4210
|
-
_now(),
|
|
4211
|
-
str(exc),
|
|
4212
|
-
_json({"duration_ms": duration_ms}),
|
|
4213
|
-
),
|
|
4214
|
-
)
|
|
4215
|
-
raise
|
|
4216
|
-
|
|
4217
|
-
def index_status(self) -> Dict[str, Any]:
|
|
4218
|
-
with self._connect() as conn:
|
|
4219
|
-
vector_counts = {
|
|
4220
|
-
row["item_type"]: row["count"]
|
|
4221
|
-
for row in conn.execute(
|
|
4222
|
-
"SELECT item_type, COUNT(*) AS count FROM vector_embeddings GROUP BY item_type"
|
|
4223
|
-
)
|
|
4224
|
-
}
|
|
4225
|
-
source_items = self._iter_vector_source_items(conn)
|
|
4226
|
-
vector_rows = {
|
|
4227
|
-
row["item_id"]: row
|
|
4228
|
-
for row in conn.execute(
|
|
4229
|
-
"""
|
|
4230
|
-
SELECT item_id, text_hash, embedding_dim, embedding_model, indexed_at
|
|
4231
|
-
FROM vector_embeddings
|
|
4232
|
-
"""
|
|
4233
|
-
).fetchall()
|
|
4234
|
-
}
|
|
4235
|
-
latest_rows = conn.execute(
|
|
4236
|
-
"""
|
|
4237
|
-
SELECT id, operation, status, requested_at, started_at, completed_at,
|
|
4238
|
-
items_total, items_indexed, items_skipped, error_message, metadata_json
|
|
4239
|
-
FROM vector_index_operations
|
|
4240
|
-
ORDER BY requested_at DESC, id DESC
|
|
4241
|
-
LIMIT 5
|
|
4242
|
-
"""
|
|
4243
|
-
).fetchall()
|
|
4244
|
-
missing = stale = ready = 0
|
|
4245
|
-
for item in source_items:
|
|
4246
|
-
vector_row = vector_rows.get(item["item_id"])
|
|
4247
|
-
expected_hash = _sha256_text(_clean_text(item["text"]))
|
|
4248
|
-
if not vector_row:
|
|
4249
|
-
missing += 1
|
|
4250
|
-
elif (
|
|
4251
|
-
vector_row["text_hash"] != expected_hash
|
|
4252
|
-
or vector_row["embedding_dim"] != self._embedding_model.dim
|
|
4253
|
-
or vector_row["embedding_model"] != self._embedding_model.model_id
|
|
4254
|
-
):
|
|
4255
|
-
stale += 1
|
|
4256
|
-
else:
|
|
4257
|
-
ready += 1
|
|
4258
|
-
pending = missing + stale
|
|
4259
|
-
return {
|
|
4260
|
-
"status": "ready" if pending == 0 else "needs_reindex",
|
|
4261
|
-
"storage": {
|
|
4262
|
-
"db_path": str(self.db_path),
|
|
4263
|
-
"backend": "sqlite",
|
|
4264
|
-
"embedding_model": self._embedding_model.model_id,
|
|
4265
|
-
"embedding_dim": self._embedding_model.dim,
|
|
4266
|
-
},
|
|
4267
|
-
"source_items": len(source_items),
|
|
4268
|
-
"indexed_items": sum(vector_counts.values()),
|
|
4269
|
-
"ready_items": ready,
|
|
4270
|
-
"missing_items": missing,
|
|
4271
|
-
"stale_items": stale,
|
|
4272
|
-
"pending_items": pending,
|
|
4273
|
-
"by_item_type": vector_counts,
|
|
4274
|
-
"operations": [
|
|
4275
|
-
{
|
|
4276
|
-
"id": row["id"],
|
|
4277
|
-
"operation": row["operation"],
|
|
4278
|
-
"status": row["status"],
|
|
4279
|
-
"requested_at": row["requested_at"],
|
|
4280
|
-
"started_at": row["started_at"],
|
|
4281
|
-
"completed_at": row["completed_at"],
|
|
4282
|
-
"items_total": row["items_total"],
|
|
4283
|
-
"items_indexed": row["items_indexed"],
|
|
4284
|
-
"items_skipped": row["items_skipped"],
|
|
4285
|
-
"error_message": row["error_message"],
|
|
4286
|
-
"metadata": _safe_loads(row["metadata_json"]),
|
|
4287
|
-
}
|
|
4288
|
-
for row in latest_rows
|
|
4289
|
-
],
|
|
4290
|
-
}
|
|
4291
|
-
|
|
4292
|
-
def vector_search(
|
|
4293
|
-
self,
|
|
4294
|
-
query: str,
|
|
4295
|
-
*,
|
|
4296
|
-
limit: int = 30,
|
|
4297
|
-
min_score: float = 0.0,
|
|
4298
|
-
max_candidates: int = 10_000,
|
|
4299
|
-
) -> Dict[str, Any]:
|
|
4300
|
-
query = str(query or "").strip()
|
|
4301
|
-
limit = max(1, min(int(limit or 30), 100))
|
|
4302
|
-
min_score = float(min_score or 0.0)
|
|
4303
|
-
if not query:
|
|
4304
|
-
return {"query": query, "matches": []}
|
|
4305
|
-
query_vector = self._embedding_model.embed(query)
|
|
4306
|
-
max_candidates = max(limit, min(int(max_candidates or 10_000), 50_000))
|
|
4307
|
-
with self._connect() as conn:
|
|
4308
|
-
rows = conn.execute(
|
|
4309
|
-
"""
|
|
4310
|
-
SELECT
|
|
4311
|
-
ve.item_id, ve.item_type, ve.source_node, ve.embedding,
|
|
4312
|
-
ve.embedding_dim, ve.embedding_model, ve.metadata_json AS vector_metadata,
|
|
4313
|
-
n.type AS node_type, n.title AS node_title, n.summary AS node_summary,
|
|
4314
|
-
n.metadata_json AS node_metadata, n.updated_at AS node_updated_at,
|
|
4315
|
-
c.text AS chunk_text, c.source_node AS parent_node_id,
|
|
4316
|
-
pn.type AS parent_type, pn.title AS parent_title,
|
|
4317
|
-
pn.summary AS parent_summary, pn.metadata_json AS parent_metadata,
|
|
4318
|
-
pn.updated_at AS parent_updated_at
|
|
4319
|
-
FROM vector_embeddings ve
|
|
4320
|
-
LEFT JOIN nodes n ON n.id=ve.source_node
|
|
4321
|
-
LEFT JOIN chunks c ON c.id=ve.item_id
|
|
4322
|
-
LEFT JOIN nodes pn ON pn.id=c.source_node
|
|
4323
|
-
WHERE ve.embedding_model=? AND ve.embedding_dim=?
|
|
4324
|
-
ORDER BY ve.indexed_at DESC
|
|
4325
|
-
LIMIT ?
|
|
4326
|
-
""",
|
|
4327
|
-
(self._embedding_model.model_id, self._embedding_model.dim, max_candidates),
|
|
4328
|
-
).fetchall()
|
|
4329
|
-
scored = []
|
|
4330
|
-
for row in rows:
|
|
4331
|
-
vector = self._embedding_model.decode(row["embedding"], row["embedding_dim"])
|
|
4332
|
-
score = self._embedding_model.similarity(query_vector, vector)
|
|
4333
|
-
if score < min_score:
|
|
4334
|
-
continue
|
|
4335
|
-
is_chunk = row["item_type"] == "chunk"
|
|
4336
|
-
summary = row["chunk_text"] if is_chunk and row["chunk_text"] else row["node_summary"]
|
|
4337
|
-
parent_metadata = _safe_loads(row["parent_metadata"])
|
|
4338
|
-
node_metadata = _safe_loads(row["node_metadata"])
|
|
4339
|
-
scored.append({
|
|
4340
|
-
"id": row["item_id"],
|
|
4341
|
-
"node_id": row["parent_node_id"] if is_chunk and row["parent_node_id"] else row["source_node"],
|
|
4342
|
-
"item_type": row["item_type"],
|
|
4343
|
-
"type": "Chunk" if is_chunk else row["node_type"],
|
|
4344
|
-
"title": row["parent_title"] if is_chunk and row["parent_title"] else row["node_title"],
|
|
4345
|
-
"summary": _clean_text(summary or "")[:1000],
|
|
4346
|
-
"score": round(float(score), 6),
|
|
4347
|
-
"metadata": {
|
|
4348
|
-
**(parent_metadata if is_chunk else node_metadata),
|
|
4349
|
-
"vector": _safe_loads(row["vector_metadata"]),
|
|
4350
|
-
"parent_node_id": row["parent_node_id"],
|
|
4351
|
-
"parent_type": row["parent_type"],
|
|
4352
|
-
},
|
|
4353
|
-
"updated_at": row["parent_updated_at"] if is_chunk and row["parent_updated_at"] else row["node_updated_at"],
|
|
4354
|
-
})
|
|
4355
|
-
scored.sort(key=lambda item: (item["score"], item.get("updated_at") or ""), reverse=True)
|
|
4356
|
-
return {
|
|
4357
|
-
"query": query,
|
|
4358
|
-
"embedding_model": self._embedding_model.model_id,
|
|
4359
|
-
"embedding_dim": self._embedding_model.dim,
|
|
4360
|
-
"matches": scored[:limit],
|
|
4361
|
-
}
|
|
4362
|
-
|
|
4363
|
-
def delete_conversation(self, conversation_id: str) -> Dict[str, Any]:
|
|
4364
|
-
conversation_id = str(conversation_id or "").strip()
|
|
4365
|
-
if not conversation_id:
|
|
4366
|
-
return {"status": "skipped", "removed_nodes": 0}
|
|
4367
|
-
conv_id = f"conversation:{_slug(conversation_id)}"
|
|
4368
|
-
with self._connect() as conn:
|
|
4369
|
-
direct_ids = [
|
|
4370
|
-
row["to_node"]
|
|
4371
|
-
for row in conn.execute(
|
|
4372
|
-
"SELECT to_node FROM edges WHERE from_node=? AND type='contains'",
|
|
4373
|
-
(conv_id,),
|
|
4374
|
-
)
|
|
4375
|
-
]
|
|
4376
|
-
remove_ids = set(direct_ids)
|
|
4377
|
-
for source_id in list(direct_ids):
|
|
4378
|
-
for row in conn.execute(
|
|
4379
|
-
"""
|
|
4380
|
-
SELECT to_node FROM edges
|
|
4381
|
-
WHERE from_node=? AND type IN ('has_chunk', 'implies', 'contains_signal', 'has_page', 'has_slide', 'has_sheet', 'contains_image')
|
|
4382
|
-
""",
|
|
4383
|
-
(source_id,),
|
|
4384
|
-
):
|
|
4385
|
-
remove_ids.add(row["to_node"])
|
|
4386
|
-
remove_ids.add(conv_id)
|
|
4387
|
-
for node_id in remove_ids:
|
|
4388
|
-
conn.execute("DELETE FROM nodes WHERE id=?", (node_id,))
|
|
4389
|
-
if KGStoreV2 is not None:
|
|
4390
|
-
conn.execute("DELETE FROM nodes_v2 WHERE id=?", (node_id,)) # edges_v2 cascade
|
|
4391
|
-
conn.execute(
|
|
4392
|
-
"""
|
|
4393
|
-
DELETE FROM nodes
|
|
4394
|
-
WHERE type='Topic'
|
|
4395
|
-
AND id NOT IN (SELECT to_node FROM edges)
|
|
4396
|
-
AND id NOT IN (SELECT from_node FROM edges)
|
|
4397
|
-
"""
|
|
4398
|
-
)
|
|
4399
|
-
if KGStoreV2 is not None:
|
|
4400
|
-
conn.execute(
|
|
4401
|
-
"""
|
|
4402
|
-
DELETE FROM nodes_v2
|
|
4403
|
-
WHERE legacy_type='Topic'
|
|
4404
|
-
AND id NOT IN (SELECT target FROM edges_v2)
|
|
4405
|
-
AND id NOT IN (SELECT source FROM edges_v2)
|
|
4406
|
-
"""
|
|
4407
|
-
)
|
|
4408
|
-
return {"status": "ok", "conversation_id": conversation_id, "removed_nodes": len(remove_ids)}
|
|
4409
|
-
|
|
4410
|
-
def clear_all(self) -> Dict[str, Any]:
|
|
4411
|
-
with self._connect() as conn:
|
|
4412
|
-
counts = {
|
|
4413
|
-
"nodes": conn.execute("SELECT COUNT(*) AS c FROM nodes").fetchone()["c"],
|
|
4414
|
-
"edges": conn.execute("SELECT COUNT(*) AS c FROM edges").fetchone()["c"],
|
|
4415
|
-
"chunks": conn.execute("SELECT COUNT(*) AS c FROM chunks").fetchone()["c"],
|
|
4416
|
-
"knowledge_sources": conn.execute("SELECT COUNT(*) AS c FROM knowledge_sources").fetchone()["c"],
|
|
4417
|
-
"local_file_index": conn.execute("SELECT COUNT(*) AS c FROM local_file_index").fetchone()["c"],
|
|
4418
|
-
}
|
|
4419
|
-
conn.execute("DELETE FROM local_file_index")
|
|
4420
|
-
conn.execute("DELETE FROM knowledge_sources")
|
|
4421
|
-
conn.execute("DELETE FROM chunks")
|
|
4422
|
-
conn.execute("DELETE FROM edges")
|
|
4423
|
-
conn.execute("DELETE FROM nodes")
|
|
4424
|
-
if KGStoreV2 is not None:
|
|
4425
|
-
conn.execute("DELETE FROM edges_v2")
|
|
4426
|
-
conn.execute("DELETE FROM nodes_v2")
|
|
4427
|
-
if self.blob_dir.exists():
|
|
4428
|
-
shutil.rmtree(self.blob_dir, ignore_errors=True)
|
|
4429
|
-
self.blob_dir.mkdir(parents=True, exist_ok=True)
|
|
4430
|
-
return {"status": "ok", "removed": counts}
|
|
4431
|
-
|
|
4432
|
-
def stats(self) -> Dict[str, Any]:
|
|
4433
|
-
nt, et = self._read_tables()
|
|
4434
|
-
with self._connect() as conn:
|
|
4435
|
-
node_counts = {
|
|
4436
|
-
row["type"]: row["count"]
|
|
4437
|
-
for row in conn.execute(f"SELECT type, COUNT(*) AS count FROM {nt} GROUP BY type")
|
|
4438
|
-
}
|
|
4439
|
-
edge_counts = {
|
|
4440
|
-
row["type"]: row["count"]
|
|
4441
|
-
for row in conn.execute(f"SELECT type, COUNT(*) AS count FROM {et} GROUP BY type")
|
|
4442
|
-
}
|
|
4443
|
-
local_sources = conn.execute("SELECT COUNT(*) AS c FROM knowledge_sources").fetchone()["c"]
|
|
4444
|
-
local_file_status = {
|
|
4445
|
-
row["status"]: row["count"]
|
|
4446
|
-
for row in conn.execute("SELECT status, COUNT(*) AS count FROM local_file_index GROUP BY status")
|
|
4447
|
-
}
|
|
4448
|
-
v2 = None
|
|
4449
|
-
if KGStoreV2 is not None:
|
|
4450
|
-
try:
|
|
4451
|
-
v2 = KGStoreV2(self.db_path).stats()
|
|
4452
|
-
except Exception as e:
|
|
4453
|
-
v2 = {"available": False, "error": str(e)}
|
|
4454
|
-
return {
|
|
4455
|
-
"db_path": str(self.db_path),
|
|
4456
|
-
"schema_version": GRAPH_SCHEMA_VERSION,
|
|
4457
|
-
"v2_schema_available": KGStoreV2 is not None,
|
|
4458
|
-
"nodes": node_counts,
|
|
4459
|
-
"edges": edge_counts,
|
|
4460
|
-
"local_sources": local_sources,
|
|
4461
|
-
"local_file_status": local_file_status,
|
|
4462
|
-
"v2": v2,
|
|
4463
|
-
}
|
|
4464
|
-
|
|
4465
|
-
def search_for_document_generation(self, query: str, limit: int = 10) -> List[Dict[str, Any]]:
|
|
4466
|
-
"""Hybrid retrieval optimized for document generation.
|
|
4467
|
-
|
|
4468
|
-
Scoring: 0.5*text_relevance + 0.3*graph_relationship + 0.2*recency
|
|
4469
|
-
Returns nodes with rich context for document generation prompts.
|
|
4470
|
-
"""
|
|
4471
|
-
query = str(query or "").strip()
|
|
4472
|
-
if not query:
|
|
4473
|
-
return []
|
|
4474
|
-
limit = max(1, min(int(limit or 10), 50))
|
|
4475
|
-
terms = _topic_candidates(query, limit=12)
|
|
4476
|
-
now = datetime.now()
|
|
4477
|
-
nt, et = self._read_tables()
|
|
4478
|
-
|
|
4479
|
-
with self._connect() as conn:
|
|
4480
|
-
candidate_rows = []
|
|
4481
|
-
seen_ids = set()
|
|
4482
|
-
|
|
4483
|
-
if query:
|
|
4484
|
-
q = f"%{query}%"
|
|
4485
|
-
rows = conn.execute(
|
|
4486
|
-
f"""
|
|
4487
|
-
SELECT id, type, title, summary, metadata_json, updated_at
|
|
4488
|
-
FROM {nt}
|
|
4489
|
-
WHERE (title LIKE ? OR summary LIKE ? OR metadata_json LIKE ?)
|
|
4490
|
-
AND type IN ('Document', 'File', 'CodeFile', 'SlideDeck',
|
|
4491
|
-
'Spreadsheet', 'Image', 'ImageText', 'Chat',
|
|
4492
|
-
'Decision', 'Task', 'Concept', 'Feature',
|
|
4493
|
-
'Page', 'Slide')
|
|
4494
|
-
ORDER BY updated_at DESC, id ASC
|
|
4495
|
-
LIMIT ?
|
|
4496
|
-
""",
|
|
4497
|
-
(q, q, q, limit * 5),
|
|
4498
|
-
).fetchall()
|
|
4499
|
-
for row in rows:
|
|
4500
|
-
if row["id"] not in seen_ids:
|
|
4501
|
-
seen_ids.add(row["id"])
|
|
4502
|
-
candidate_rows.append(row)
|
|
4503
|
-
|
|
4504
|
-
for term in terms:
|
|
4505
|
-
t = f"%{term}%"
|
|
4506
|
-
rows = conn.execute(
|
|
4507
|
-
f"""
|
|
4508
|
-
SELECT id, type, title, summary, metadata_json, updated_at
|
|
4509
|
-
FROM {nt}
|
|
4510
|
-
WHERE (title LIKE ? OR summary LIKE ? OR metadata_json LIKE ?)
|
|
4511
|
-
AND type IN ('Document', 'File', 'CodeFile', 'SlideDeck',
|
|
4512
|
-
'Spreadsheet', 'Image', 'ImageText', 'Chat',
|
|
4513
|
-
'Decision', 'Task', 'Concept', 'Feature',
|
|
4514
|
-
'Page', 'Slide')
|
|
4515
|
-
ORDER BY updated_at DESC, id ASC
|
|
4516
|
-
LIMIT ?
|
|
4517
|
-
""",
|
|
4518
|
-
(t, t, t, limit * 3),
|
|
4519
|
-
).fetchall()
|
|
4520
|
-
for row in rows:
|
|
4521
|
-
if row["id"] not in seen_ids:
|
|
4522
|
-
seen_ids.add(row["id"])
|
|
4523
|
-
candidate_rows.append(row)
|
|
4524
|
-
|
|
4525
|
-
scored_results = []
|
|
4526
|
-
for row in candidate_rows:
|
|
4527
|
-
haystack = f"{row['title']} {row['summary']} {row['metadata_json']}".lower()
|
|
4528
|
-
|
|
4529
|
-
text_hits = sum(1 for term in terms if term.lower() in haystack)
|
|
4530
|
-
text_score = min(1.0, text_hits / max(len(terms), 1))
|
|
4531
|
-
|
|
4532
|
-
edge_count = conn.execute(
|
|
4533
|
-
f"SELECT COUNT(*) AS c FROM {et} WHERE from_node=? OR to_node=?",
|
|
4534
|
-
(row["id"], row["id"]),
|
|
4535
|
-
).fetchone()["c"]
|
|
4536
|
-
graph_score = min(1.0, math.log1p(edge_count) / 4.0)
|
|
4537
|
-
|
|
4538
|
-
recency = _recency_score(row["updated_at"], now=now, half_life_days=14.0)
|
|
4539
|
-
|
|
4540
|
-
doc_type_boost = 1.2 if row["type"] in (
|
|
4541
|
-
"Document", "File", "SlideDeck", "Decision",
|
|
4542
|
-
) else 1.0
|
|
4543
|
-
|
|
4544
|
-
hybrid_score = (
|
|
4545
|
-
0.5 * text_score
|
|
4546
|
-
+ 0.3 * graph_score
|
|
4547
|
-
+ 0.2 * recency
|
|
4548
|
-
) * doc_type_boost
|
|
4549
|
-
|
|
4550
|
-
meta = _safe_loads(row["metadata_json"])
|
|
4551
|
-
neighbor_concepts = []
|
|
4552
|
-
neighbor_rows = conn.execute(
|
|
4553
|
-
f"""
|
|
4554
|
-
SELECT n.title, n.type FROM {et} e
|
|
4555
|
-
JOIN {nt} n ON n.id = CASE WHEN e.from_node = ? THEN e.to_node ELSE e.from_node END
|
|
4556
|
-
WHERE (e.from_node = ? OR e.to_node = ?)
|
|
4557
|
-
AND n.type IN ('Concept', 'Feature', 'Decision', 'Task')
|
|
4558
|
-
LIMIT 8
|
|
4559
|
-
""",
|
|
4560
|
-
(row["id"], row["id"], row["id"]),
|
|
4561
|
-
).fetchall()
|
|
4562
|
-
for nr in neighbor_rows:
|
|
4563
|
-
neighbor_concepts.append({"title": nr["title"], "type": nr["type"]})
|
|
4564
|
-
|
|
4565
|
-
scored_results.append({
|
|
4566
|
-
"id": row["id"],
|
|
4567
|
-
"type": row["type"],
|
|
4568
|
-
"title": row["title"],
|
|
4569
|
-
"summary": row["summary"],
|
|
4570
|
-
"metadata": meta,
|
|
4571
|
-
"updated_at": row["updated_at"],
|
|
4572
|
-
"hybrid_score": round(hybrid_score, 4),
|
|
4573
|
-
"scores": {
|
|
4574
|
-
"text": round(text_score, 4),
|
|
4575
|
-
"graph": round(graph_score, 4),
|
|
4576
|
-
"recency": round(recency, 4),
|
|
4577
|
-
},
|
|
4578
|
-
"related_concepts": neighbor_concepts,
|
|
4579
|
-
})
|
|
4580
|
-
|
|
4581
|
-
scored_results.sort(key=lambda x: x["hybrid_score"], reverse=True)
|
|
4582
|
-
return scored_results[:limit]
|
|
4583
|
-
|
|
4584
|
-
def multi_hop_context(self, node_ids: List[str], max_hops: int = 2) -> Dict[str, Any]:
|
|
4585
|
-
"""Multi-hop graph traversal from seed nodes for richer context."""
|
|
4586
|
-
visited_nodes = set()
|
|
4587
|
-
visited_edges = set()
|
|
4588
|
-
all_nodes = []
|
|
4589
|
-
all_edges = []
|
|
4590
|
-
frontier = set(node_ids)
|
|
4591
|
-
nt, et = self._read_tables()
|
|
4592
|
-
|
|
4593
|
-
with self._connect() as conn:
|
|
4594
|
-
for hop in range(max_hops):
|
|
4595
|
-
if not frontier:
|
|
4596
|
-
break
|
|
4597
|
-
next_frontier = set()
|
|
4598
|
-
for nid in frontier:
|
|
4599
|
-
if nid in visited_nodes:
|
|
4600
|
-
continue
|
|
4601
|
-
visited_nodes.add(nid)
|
|
4602
|
-
row = conn.execute(
|
|
4603
|
-
f"SELECT id, type, title, summary, metadata_json, updated_at FROM {nt} WHERE id=?",
|
|
4604
|
-
(nid,),
|
|
4605
|
-
).fetchone()
|
|
4606
|
-
if row:
|
|
4607
|
-
all_nodes.append({
|
|
4608
|
-
"id": row["id"], "type": row["type"],
|
|
4609
|
-
"title": row["title"], "summary": row["summary"],
|
|
4610
|
-
"metadata": _safe_loads(row["metadata_json"]),
|
|
4611
|
-
"hop": hop,
|
|
4612
|
-
})
|
|
4613
|
-
edge_rows = conn.execute(
|
|
4614
|
-
f"""
|
|
4615
|
-
SELECT id, from_node, to_node, type, weight
|
|
4616
|
-
FROM {et} WHERE from_node=? OR to_node=?
|
|
4617
|
-
ORDER BY id ASC
|
|
4618
|
-
""",
|
|
4619
|
-
(nid, nid),
|
|
4620
|
-
).fetchall()
|
|
4621
|
-
for er in edge_rows:
|
|
4622
|
-
if er["id"] not in visited_edges:
|
|
4623
|
-
visited_edges.add(er["id"])
|
|
4624
|
-
all_edges.append({
|
|
4625
|
-
"from": er["from_node"], "to": er["to_node"],
|
|
4626
|
-
"type": er["type"], "weight": er["weight"],
|
|
4627
|
-
})
|
|
4628
|
-
other = er["to_node"] if er["from_node"] == nid else er["from_node"]
|
|
4629
|
-
if other not in visited_nodes:
|
|
4630
|
-
next_frontier.add(other)
|
|
4631
|
-
frontier = next_frontier
|
|
4632
|
-
|
|
4633
|
-
return {"nodes": all_nodes, "edges": all_edges}
|
|
27
|
+
from latticeai.brain.store import KnowledgeGraphStore
|
|
28
|
+
|
|
29
|
+
__all__ = [
|
|
30
|
+
"KnowledgeGraphStore",
|
|
31
|
+
"GRAPH_SCHEMA_VERSION",
|
|
32
|
+
"EDGE_VERB",
|
|
33
|
+
"_PROJECTION_VERSION",
|
|
34
|
+
"_KG_DB_FORMAT_VERSION",
|
|
35
|
+
"set_llm_router",
|
|
36
|
+
"_slug",
|
|
37
|
+
"_extract_concepts",
|
|
38
|
+
"_extract_concepts_rules",
|
|
39
|
+
"_extract_triples",
|
|
40
|
+
"_extract_triples_rules",
|
|
41
|
+
]
|