superlocalmemory 2.8.6 → 3.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +9 -1
- package/NOTICE +63 -0
- package/README.md +165 -480
- package/bin/slm +17 -449
- package/bin/slm-npm +62 -48
- package/conftest.py +5 -0
- package/docs/api-reference.md +284 -0
- package/docs/architecture.md +149 -0
- package/docs/auto-memory.md +150 -0
- package/docs/cli-reference.md +276 -0
- package/docs/compliance.md +191 -0
- package/docs/configuration.md +182 -0
- package/docs/getting-started.md +102 -0
- package/docs/ide-setup.md +261 -0
- package/docs/mcp-tools.md +220 -0
- package/docs/migration-from-v2.md +170 -0
- package/docs/profiles.md +173 -0
- package/docs/troubleshooting.md +310 -0
- package/{configs → ide/configs}/antigravity-mcp.json +3 -3
- package/ide/configs/chatgpt-desktop-mcp.json +16 -0
- package/{configs → ide/configs}/claude-desktop-mcp.json +3 -3
- package/{configs → ide/configs}/codex-mcp.toml +4 -4
- package/{configs → ide/configs}/continue-mcp.yaml +4 -3
- package/{configs → ide/configs}/continue-skills.yaml +6 -6
- package/ide/configs/cursor-mcp.json +15 -0
- package/{configs → ide/configs}/gemini-cli-mcp.json +2 -2
- package/{configs → ide/configs}/jetbrains-mcp.json +2 -2
- package/{configs → ide/configs}/opencode-mcp.json +2 -2
- package/{configs → ide/configs}/perplexity-mcp.json +2 -2
- package/{configs → ide/configs}/vscode-copilot-mcp.json +2 -2
- package/{configs → ide/configs}/windsurf-mcp.json +3 -3
- package/{configs → ide/configs}/zed-mcp.json +2 -2
- package/{hooks → ide/hooks}/context-hook.js +9 -20
- package/ide/hooks/memory-list-skill.js +70 -0
- package/ide/hooks/memory-profile-skill.js +101 -0
- package/ide/hooks/memory-recall-skill.js +62 -0
- package/ide/hooks/memory-remember-skill.js +68 -0
- package/ide/hooks/memory-reset-skill.js +160 -0
- package/{hooks → ide/hooks}/post-recall-hook.js +2 -2
- package/ide/integrations/langchain/README.md +106 -0
- package/ide/integrations/langchain/langchain_superlocalmemory/__init__.py +9 -0
- package/ide/integrations/langchain/langchain_superlocalmemory/chat_message_history.py +201 -0
- package/ide/integrations/langchain/pyproject.toml +38 -0
- package/{src/learning → ide/integrations/langchain}/tests/__init__.py +1 -0
- package/ide/integrations/langchain/tests/test_chat_message_history.py +215 -0
- package/ide/integrations/langchain/tests/test_security.py +117 -0
- package/ide/integrations/llamaindex/README.md +81 -0
- package/ide/integrations/llamaindex/llama_index/storage/chat_store/superlocalmemory/__init__.py +9 -0
- package/ide/integrations/llamaindex/llama_index/storage/chat_store/superlocalmemory/base.py +316 -0
- package/ide/integrations/llamaindex/pyproject.toml +43 -0
- package/{src/lifecycle → ide/integrations/llamaindex}/tests/__init__.py +1 -2
- package/ide/integrations/llamaindex/tests/test_chat_store.py +294 -0
- package/ide/integrations/llamaindex/tests/test_security.py +241 -0
- package/{skills → ide/skills}/slm-build-graph/SKILL.md +6 -6
- package/{skills → ide/skills}/slm-list-recent/SKILL.md +5 -5
- package/{skills → ide/skills}/slm-recall/SKILL.md +5 -5
- package/{skills → ide/skills}/slm-remember/SKILL.md +6 -6
- package/{skills → ide/skills}/slm-show-patterns/SKILL.md +7 -7
- package/{skills → ide/skills}/slm-status/SKILL.md +9 -9
- package/{skills → ide/skills}/slm-switch-profile/SKILL.md +9 -9
- package/package.json +13 -22
- package/pyproject.toml +85 -0
- package/scripts/build-dmg.sh +417 -0
- package/scripts/install-skills.ps1 +334 -0
- package/scripts/postinstall.js +2 -2
- package/scripts/start-dashboard.ps1 +52 -0
- package/scripts/start-dashboard.sh +41 -0
- package/scripts/sync-wiki.ps1 +127 -0
- package/scripts/sync-wiki.sh +82 -0
- package/scripts/test-dmg.sh +161 -0
- package/scripts/test-npm-package.ps1 +252 -0
- package/scripts/test-npm-package.sh +207 -0
- package/scripts/verify-install.ps1 +294 -0
- package/scripts/verify-install.sh +266 -0
- package/src/superlocalmemory/__init__.py +0 -0
- package/src/superlocalmemory/attribution/__init__.py +9 -0
- package/src/superlocalmemory/attribution/mathematical_dna.py +235 -0
- package/src/superlocalmemory/attribution/signer.py +153 -0
- package/src/superlocalmemory/attribution/watermark.py +189 -0
- package/src/superlocalmemory/cli/__init__.py +5 -0
- package/src/superlocalmemory/cli/commands.py +245 -0
- package/src/superlocalmemory/cli/main.py +89 -0
- package/src/superlocalmemory/cli/migrate_cmd.py +55 -0
- package/src/superlocalmemory/cli/post_install.py +99 -0
- package/src/superlocalmemory/cli/setup_wizard.py +129 -0
- package/src/superlocalmemory/compliance/__init__.py +0 -0
- package/src/superlocalmemory/compliance/abac.py +204 -0
- package/src/superlocalmemory/compliance/audit.py +314 -0
- package/src/superlocalmemory/compliance/eu_ai_act.py +131 -0
- package/src/superlocalmemory/compliance/gdpr.py +294 -0
- package/src/superlocalmemory/compliance/lifecycle.py +158 -0
- package/src/superlocalmemory/compliance/retention.py +232 -0
- package/src/superlocalmemory/compliance/scheduler.py +148 -0
- package/src/superlocalmemory/core/__init__.py +0 -0
- package/src/superlocalmemory/core/config.py +391 -0
- package/src/superlocalmemory/core/embeddings.py +293 -0
- package/src/superlocalmemory/core/engine.py +701 -0
- package/src/superlocalmemory/core/hooks.py +65 -0
- package/src/superlocalmemory/core/maintenance.py +172 -0
- package/src/superlocalmemory/core/modes.py +140 -0
- package/src/superlocalmemory/core/profiles.py +234 -0
- package/src/superlocalmemory/core/registry.py +117 -0
- package/src/superlocalmemory/dynamics/__init__.py +0 -0
- package/src/superlocalmemory/dynamics/fisher_langevin_coupling.py +223 -0
- package/src/superlocalmemory/encoding/__init__.py +0 -0
- package/src/superlocalmemory/encoding/consolidator.py +485 -0
- package/src/superlocalmemory/encoding/emotional.py +125 -0
- package/src/superlocalmemory/encoding/entity_resolver.py +525 -0
- package/src/superlocalmemory/encoding/entropy_gate.py +104 -0
- package/src/superlocalmemory/encoding/fact_extractor.py +775 -0
- package/src/superlocalmemory/encoding/foresight.py +91 -0
- package/src/superlocalmemory/encoding/graph_builder.py +302 -0
- package/src/superlocalmemory/encoding/observation_builder.py +160 -0
- package/src/superlocalmemory/encoding/scene_builder.py +183 -0
- package/src/superlocalmemory/encoding/signal_inference.py +90 -0
- package/src/superlocalmemory/encoding/temporal_parser.py +426 -0
- package/src/superlocalmemory/encoding/type_router.py +235 -0
- package/src/superlocalmemory/hooks/__init__.py +3 -0
- package/src/superlocalmemory/hooks/auto_capture.py +111 -0
- package/src/superlocalmemory/hooks/auto_recall.py +93 -0
- package/src/superlocalmemory/hooks/ide_connector.py +204 -0
- package/src/superlocalmemory/hooks/rules_engine.py +99 -0
- package/src/superlocalmemory/infra/__init__.py +3 -0
- package/src/superlocalmemory/infra/auth_middleware.py +82 -0
- package/src/superlocalmemory/infra/backup.py +317 -0
- package/src/superlocalmemory/infra/cache_manager.py +267 -0
- package/src/superlocalmemory/infra/event_bus.py +381 -0
- package/src/superlocalmemory/infra/rate_limiter.py +135 -0
- package/src/{webhook_dispatcher.py → superlocalmemory/infra/webhook_dispatcher.py} +104 -101
- package/src/superlocalmemory/learning/__init__.py +0 -0
- package/src/superlocalmemory/learning/adaptive.py +172 -0
- package/src/superlocalmemory/learning/behavioral.py +490 -0
- package/src/superlocalmemory/learning/behavioral_listener.py +94 -0
- package/src/superlocalmemory/learning/bootstrap.py +298 -0
- package/src/superlocalmemory/learning/cross_project.py +399 -0
- package/src/superlocalmemory/learning/database.py +376 -0
- package/src/superlocalmemory/learning/engagement.py +323 -0
- package/src/superlocalmemory/learning/features.py +138 -0
- package/src/superlocalmemory/learning/feedback.py +316 -0
- package/src/superlocalmemory/learning/outcomes.py +255 -0
- package/src/superlocalmemory/learning/project_context.py +366 -0
- package/src/superlocalmemory/learning/ranker.py +155 -0
- package/src/superlocalmemory/learning/source_quality.py +303 -0
- package/src/superlocalmemory/learning/workflows.py +309 -0
- package/src/superlocalmemory/llm/__init__.py +0 -0
- package/src/superlocalmemory/llm/backbone.py +316 -0
- package/src/superlocalmemory/math/__init__.py +0 -0
- package/src/superlocalmemory/math/fisher.py +356 -0
- package/src/superlocalmemory/math/langevin.py +398 -0
- package/src/superlocalmemory/math/sheaf.py +257 -0
- package/src/superlocalmemory/mcp/__init__.py +0 -0
- package/src/superlocalmemory/mcp/resources.py +245 -0
- package/src/superlocalmemory/mcp/server.py +61 -0
- package/src/superlocalmemory/mcp/tools.py +18 -0
- package/src/superlocalmemory/mcp/tools_core.py +305 -0
- package/src/superlocalmemory/mcp/tools_v28.py +223 -0
- package/src/superlocalmemory/mcp/tools_v3.py +286 -0
- package/src/superlocalmemory/retrieval/__init__.py +0 -0
- package/src/superlocalmemory/retrieval/agentic.py +295 -0
- package/src/superlocalmemory/retrieval/ann_index.py +223 -0
- package/src/superlocalmemory/retrieval/bm25_channel.py +185 -0
- package/src/superlocalmemory/retrieval/bridge_discovery.py +170 -0
- package/src/superlocalmemory/retrieval/engine.py +390 -0
- package/src/superlocalmemory/retrieval/entity_channel.py +179 -0
- package/src/superlocalmemory/retrieval/fusion.py +78 -0
- package/src/superlocalmemory/retrieval/profile_channel.py +105 -0
- package/src/superlocalmemory/retrieval/reranker.py +154 -0
- package/src/superlocalmemory/retrieval/semantic_channel.py +232 -0
- package/src/superlocalmemory/retrieval/strategy.py +96 -0
- package/src/superlocalmemory/retrieval/temporal_channel.py +175 -0
- package/src/superlocalmemory/server/__init__.py +1 -0
- package/src/superlocalmemory/server/api.py +248 -0
- package/src/superlocalmemory/server/routes/__init__.py +4 -0
- package/src/superlocalmemory/server/routes/agents.py +107 -0
- package/src/superlocalmemory/server/routes/backup.py +91 -0
- package/src/superlocalmemory/server/routes/behavioral.py +127 -0
- package/src/superlocalmemory/server/routes/compliance.py +160 -0
- package/src/superlocalmemory/server/routes/data_io.py +188 -0
- package/src/superlocalmemory/server/routes/events.py +183 -0
- package/src/superlocalmemory/server/routes/helpers.py +85 -0
- package/src/superlocalmemory/server/routes/learning.py +273 -0
- package/src/superlocalmemory/server/routes/lifecycle.py +116 -0
- package/src/superlocalmemory/server/routes/memories.py +399 -0
- package/src/superlocalmemory/server/routes/profiles.py +219 -0
- package/src/superlocalmemory/server/routes/stats.py +346 -0
- package/src/superlocalmemory/server/routes/v3_api.py +365 -0
- package/src/superlocalmemory/server/routes/ws.py +82 -0
- package/src/superlocalmemory/server/security_middleware.py +57 -0
- package/src/superlocalmemory/server/ui.py +245 -0
- package/src/superlocalmemory/storage/__init__.py +0 -0
- package/src/superlocalmemory/storage/access_control.py +182 -0
- package/src/superlocalmemory/storage/database.py +594 -0
- package/src/superlocalmemory/storage/migrations.py +303 -0
- package/src/superlocalmemory/storage/models.py +406 -0
- package/src/superlocalmemory/storage/schema.py +726 -0
- package/src/superlocalmemory/storage/v2_migrator.py +317 -0
- package/src/superlocalmemory/trust/__init__.py +0 -0
- package/src/superlocalmemory/trust/gate.py +130 -0
- package/src/superlocalmemory/trust/provenance.py +124 -0
- package/src/superlocalmemory/trust/scorer.py +347 -0
- package/src/superlocalmemory/trust/signals.py +153 -0
- package/ui/index.html +278 -5
- package/ui/js/auto-settings.js +70 -0
- package/ui/js/dashboard.js +90 -0
- package/ui/js/fact-detail.js +92 -0
- package/ui/js/feedback.js +2 -2
- package/ui/js/ide-status.js +102 -0
- package/ui/js/math-health.js +98 -0
- package/ui/js/recall-lab.js +127 -0
- package/ui/js/settings.js +2 -2
- package/ui/js/trust-dashboard.js +73 -0
- package/api_server.py +0 -724
- package/bin/aider-smart +0 -72
- package/bin/superlocalmemoryv2-learning +0 -4
- package/bin/superlocalmemoryv2-list +0 -3
- package/bin/superlocalmemoryv2-patterns +0 -4
- package/bin/superlocalmemoryv2-profile +0 -3
- package/bin/superlocalmemoryv2-recall +0 -3
- package/bin/superlocalmemoryv2-remember +0 -3
- package/bin/superlocalmemoryv2-reset +0 -3
- package/bin/superlocalmemoryv2-status +0 -3
- package/configs/chatgpt-desktop-mcp.json +0 -16
- package/configs/cursor-mcp.json +0 -15
- package/hooks/memory-list-skill.js +0 -139
- package/hooks/memory-profile-skill.js +0 -273
- package/hooks/memory-recall-skill.js +0 -114
- package/hooks/memory-remember-skill.js +0 -127
- package/hooks/memory-reset-skill.js +0 -274
- package/mcp_server.py +0 -1808
- package/requirements-core.txt +0 -22
- package/requirements-learning.txt +0 -12
- package/requirements.txt +0 -12
- package/src/agent_registry.py +0 -411
- package/src/auth_middleware.py +0 -61
- package/src/auto_backup.py +0 -459
- package/src/behavioral/__init__.py +0 -49
- package/src/behavioral/behavioral_listener.py +0 -203
- package/src/behavioral/behavioral_patterns.py +0 -275
- package/src/behavioral/cross_project_transfer.py +0 -206
- package/src/behavioral/outcome_inference.py +0 -194
- package/src/behavioral/outcome_tracker.py +0 -193
- package/src/behavioral/tests/__init__.py +0 -4
- package/src/behavioral/tests/test_behavioral_integration.py +0 -108
- package/src/behavioral/tests/test_behavioral_patterns.py +0 -150
- package/src/behavioral/tests/test_cross_project_transfer.py +0 -142
- package/src/behavioral/tests/test_mcp_behavioral.py +0 -139
- package/src/behavioral/tests/test_mcp_report_outcome.py +0 -117
- package/src/behavioral/tests/test_outcome_inference.py +0 -107
- package/src/behavioral/tests/test_outcome_tracker.py +0 -96
- package/src/cache_manager.py +0 -518
- package/src/compliance/__init__.py +0 -48
- package/src/compliance/abac_engine.py +0 -149
- package/src/compliance/abac_middleware.py +0 -116
- package/src/compliance/audit_db.py +0 -215
- package/src/compliance/audit_logger.py +0 -148
- package/src/compliance/retention_manager.py +0 -289
- package/src/compliance/retention_scheduler.py +0 -186
- package/src/compliance/tests/__init__.py +0 -4
- package/src/compliance/tests/test_abac_enforcement.py +0 -95
- package/src/compliance/tests/test_abac_engine.py +0 -124
- package/src/compliance/tests/test_abac_mcp_integration.py +0 -118
- package/src/compliance/tests/test_audit_db.py +0 -123
- package/src/compliance/tests/test_audit_logger.py +0 -98
- package/src/compliance/tests/test_mcp_audit.py +0 -128
- package/src/compliance/tests/test_mcp_retention_policy.py +0 -125
- package/src/compliance/tests/test_retention_manager.py +0 -131
- package/src/compliance/tests/test_retention_scheduler.py +0 -99
- package/src/compression/__init__.py +0 -25
- package/src/compression/cli.py +0 -150
- package/src/compression/cold_storage.py +0 -217
- package/src/compression/config.py +0 -72
- package/src/compression/orchestrator.py +0 -133
- package/src/compression/tier2_compressor.py +0 -228
- package/src/compression/tier3_compressor.py +0 -153
- package/src/compression/tier_classifier.py +0 -148
- package/src/db_connection_manager.py +0 -536
- package/src/embedding_engine.py +0 -63
- package/src/embeddings/__init__.py +0 -47
- package/src/embeddings/cache.py +0 -70
- package/src/embeddings/cli.py +0 -113
- package/src/embeddings/constants.py +0 -47
- package/src/embeddings/database.py +0 -91
- package/src/embeddings/engine.py +0 -247
- package/src/embeddings/model_loader.py +0 -145
- package/src/event_bus.py +0 -562
- package/src/graph/__init__.py +0 -36
- package/src/graph/build_helpers.py +0 -74
- package/src/graph/cli.py +0 -87
- package/src/graph/cluster_builder.py +0 -188
- package/src/graph/cluster_summary.py +0 -148
- package/src/graph/constants.py +0 -47
- package/src/graph/edge_builder.py +0 -162
- package/src/graph/entity_extractor.py +0 -95
- package/src/graph/graph_core.py +0 -226
- package/src/graph/graph_search.py +0 -231
- package/src/graph/hierarchical.py +0 -207
- package/src/graph/schema.py +0 -99
- package/src/graph_engine.py +0 -52
- package/src/hnsw_index.py +0 -628
- package/src/hybrid_search.py +0 -46
- package/src/learning/__init__.py +0 -217
- package/src/learning/adaptive_ranker.py +0 -682
- package/src/learning/bootstrap/__init__.py +0 -69
- package/src/learning/bootstrap/constants.py +0 -93
- package/src/learning/bootstrap/db_queries.py +0 -316
- package/src/learning/bootstrap/sampling.py +0 -82
- package/src/learning/bootstrap/text_utils.py +0 -71
- package/src/learning/cross_project_aggregator.py +0 -857
- package/src/learning/db/__init__.py +0 -40
- package/src/learning/db/constants.py +0 -44
- package/src/learning/db/schema.py +0 -279
- package/src/learning/engagement_tracker.py +0 -628
- package/src/learning/feature_extractor.py +0 -708
- package/src/learning/feedback_collector.py +0 -806
- package/src/learning/learning_db.py +0 -915
- package/src/learning/project_context_manager.py +0 -572
- package/src/learning/ranking/__init__.py +0 -33
- package/src/learning/ranking/constants.py +0 -84
- package/src/learning/ranking/helpers.py +0 -278
- package/src/learning/source_quality_scorer.py +0 -676
- package/src/learning/synthetic_bootstrap.py +0 -755
- package/src/learning/tests/test_adaptive_ranker.py +0 -325
- package/src/learning/tests/test_adaptive_ranker_v28.py +0 -60
- package/src/learning/tests/test_aggregator.py +0 -306
- package/src/learning/tests/test_auto_retrain_v28.py +0 -35
- package/src/learning/tests/test_e2e_ranking_v28.py +0 -82
- package/src/learning/tests/test_feature_extractor_v28.py +0 -93
- package/src/learning/tests/test_feedback_collector.py +0 -294
- package/src/learning/tests/test_learning_db.py +0 -602
- package/src/learning/tests/test_learning_db_v28.py +0 -110
- package/src/learning/tests/test_learning_init_v28.py +0 -48
- package/src/learning/tests/test_outcome_signals.py +0 -48
- package/src/learning/tests/test_project_context.py +0 -292
- package/src/learning/tests/test_schema_migration.py +0 -319
- package/src/learning/tests/test_signal_inference.py +0 -397
- package/src/learning/tests/test_source_quality.py +0 -351
- package/src/learning/tests/test_synthetic_bootstrap.py +0 -429
- package/src/learning/tests/test_workflow_miner.py +0 -318
- package/src/learning/workflow_pattern_miner.py +0 -655
- package/src/lifecycle/__init__.py +0 -54
- package/src/lifecycle/bounded_growth.py +0 -239
- package/src/lifecycle/compaction_engine.py +0 -226
- package/src/lifecycle/lifecycle_engine.py +0 -355
- package/src/lifecycle/lifecycle_evaluator.py +0 -257
- package/src/lifecycle/lifecycle_scheduler.py +0 -130
- package/src/lifecycle/retention_policy.py +0 -285
- package/src/lifecycle/tests/test_bounded_growth.py +0 -193
- package/src/lifecycle/tests/test_compaction.py +0 -179
- package/src/lifecycle/tests/test_lifecycle_engine.py +0 -137
- package/src/lifecycle/tests/test_lifecycle_evaluation.py +0 -177
- package/src/lifecycle/tests/test_lifecycle_scheduler.py +0 -127
- package/src/lifecycle/tests/test_lifecycle_search.py +0 -109
- package/src/lifecycle/tests/test_mcp_compact.py +0 -149
- package/src/lifecycle/tests/test_mcp_lifecycle_status.py +0 -114
- package/src/lifecycle/tests/test_retention_policy.py +0 -162
- package/src/mcp_tools_v28.py +0 -281
- package/src/memory/__init__.py +0 -36
- package/src/memory/cli.py +0 -205
- package/src/memory/constants.py +0 -39
- package/src/memory/helpers.py +0 -28
- package/src/memory/schema.py +0 -166
- package/src/memory-profiles.py +0 -595
- package/src/memory-reset.py +0 -491
- package/src/memory_compression.py +0 -989
- package/src/memory_store_v2.py +0 -1155
- package/src/migrate_v1_to_v2.py +0 -629
- package/src/pattern_learner.py +0 -34
- package/src/patterns/__init__.py +0 -24
- package/src/patterns/analyzers.py +0 -251
- package/src/patterns/learner.py +0 -271
- package/src/patterns/scoring.py +0 -171
- package/src/patterns/store.py +0 -225
- package/src/patterns/terminology.py +0 -140
- package/src/provenance_tracker.py +0 -312
- package/src/qualixar_attribution.py +0 -139
- package/src/qualixar_watermark.py +0 -78
- package/src/query_optimizer.py +0 -511
- package/src/rate_limiter.py +0 -83
- package/src/search/__init__.py +0 -20
- package/src/search/cli.py +0 -77
- package/src/search/constants.py +0 -26
- package/src/search/engine.py +0 -241
- package/src/search/fusion.py +0 -122
- package/src/search/index_loader.py +0 -114
- package/src/search/methods.py +0 -162
- package/src/search_engine_v2.py +0 -401
- package/src/setup_validator.py +0 -482
- package/src/subscription_manager.py +0 -391
- package/src/tree/__init__.py +0 -59
- package/src/tree/builder.py +0 -185
- package/src/tree/nodes.py +0 -202
- package/src/tree/queries.py +0 -257
- package/src/tree/schema.py +0 -80
- package/src/tree_manager.py +0 -19
- package/src/trust/__init__.py +0 -45
- package/src/trust/constants.py +0 -66
- package/src/trust/queries.py +0 -157
- package/src/trust/schema.py +0 -95
- package/src/trust/scorer.py +0 -299
- package/src/trust/signals.py +0 -95
- package/src/trust_scorer.py +0 -44
- package/ui/app.js +0 -1588
- package/ui/js/graph-cytoscape-monolithic-backup.js +0 -1168
- package/ui/js/graph-cytoscape.js +0 -1168
- package/ui/js/graph-d3-backup.js +0 -32
- package/ui/js/graph.js +0 -32
- package/ui_server.py +0 -286
- /package/docs/{ACCESSIBILITY.md → v2-archive/ACCESSIBILITY.md} +0 -0
- /package/docs/{ARCHITECTURE.md → v2-archive/ARCHITECTURE.md} +0 -0
- /package/docs/{CLI-COMMANDS-REFERENCE.md → v2-archive/CLI-COMMANDS-REFERENCE.md} +0 -0
- /package/docs/{COMPRESSION-README.md → v2-archive/COMPRESSION-README.md} +0 -0
- /package/docs/{FRAMEWORK-INTEGRATIONS.md → v2-archive/FRAMEWORK-INTEGRATIONS.md} +0 -0
- /package/docs/{MCP-MANUAL-SETUP.md → v2-archive/MCP-MANUAL-SETUP.md} +0 -0
- /package/docs/{MCP-TROUBLESHOOTING.md → v2-archive/MCP-TROUBLESHOOTING.md} +0 -0
- /package/docs/{PATTERN-LEARNING.md → v2-archive/PATTERN-LEARNING.md} +0 -0
- /package/docs/{PROFILES-GUIDE.md → v2-archive/PROFILES-GUIDE.md} +0 -0
- /package/docs/{RESET-GUIDE.md → v2-archive/RESET-GUIDE.md} +0 -0
- /package/docs/{SEARCH-ENGINE-V2.2.0.md → v2-archive/SEARCH-ENGINE-V2.2.0.md} +0 -0
- /package/docs/{SEARCH-INTEGRATION-GUIDE.md → v2-archive/SEARCH-INTEGRATION-GUIDE.md} +0 -0
- /package/docs/{UI-SERVER.md → v2-archive/UI-SERVER.md} +0 -0
- /package/docs/{UNIVERSAL-INTEGRATION.md → v2-archive/UNIVERSAL-INTEGRATION.md} +0 -0
- /package/docs/{V2.2.0-OPTIONAL-SEARCH.md → v2-archive/V2.2.0-OPTIONAL-SEARCH.md} +0 -0
- /package/docs/{WINDOWS-INSTALL-README.txt → v2-archive/WINDOWS-INSTALL-README.txt} +0 -0
- /package/docs/{WINDOWS-POST-INSTALL.txt → v2-archive/WINDOWS-POST-INSTALL.txt} +0 -0
- /package/docs/{example_graph_usage.py → v2-archive/example_graph_usage.py} +0 -0
- /package/{completions → ide/completions}/slm.bash +0 -0
- /package/{completions → ide/completions}/slm.zsh +0 -0
- /package/{configs → ide/configs}/cody-commands.json +0 -0
- /package/{install-skills.sh → scripts/install-skills.sh} +0 -0
- /package/{install.ps1 → scripts/install.ps1} +0 -0
- /package/{install.sh → scripts/install.sh} +0 -0
|
@@ -0,0 +1,775 @@
|
|
|
1
|
+
# Copyright (c) 2026 Varun Pratap Bhardwaj / Qualixar
|
|
2
|
+
# Licensed under the MIT License - see LICENSE file
|
|
3
|
+
# Part of SuperLocalMemory V3 | https://qualixar.com | https://varunpratap.com
|
|
4
|
+
|
|
5
|
+
"""Fact extraction — converts raw conversation turns into structured AtomicFacts.
|
|
6
|
+
|
|
7
|
+
Three extraction strategies aligned to operating modes:
|
|
8
|
+
Mode A Zero LLM — regex entities, date inference, keyword type classification.
|
|
9
|
+
Mode B Local Ollama — LLM-guided extraction with JSON output, Mode A fallback.
|
|
10
|
+
Mode C Cloud LLM — narrative fact extraction (2-5 per chunk), richest quality.
|
|
11
|
+
|
|
12
|
+
This module is the primary driver of encoding quality. Competitor analysis
|
|
13
|
+
(EverMemOS 93%, Hindsight 89.6%, Mastra 94.9%) shows that structured
|
|
14
|
+
extraction at encoding time — not retrieval sophistication — accounts for
|
|
15
|
+
the majority of benchmark score differences.
|
|
16
|
+
|
|
17
|
+
Key patterns implemented:
|
|
18
|
+
- Conversation chunking (5-10 turns, 2-turn overlap)
|
|
19
|
+
- Three-date temporal model (observation, referenced, interval)
|
|
20
|
+
- Typed fact classification (episodic / semantic / opinion / temporal)
|
|
21
|
+
- Importance scoring (entity frequency + emotional markers + recency)
|
|
22
|
+
- Narrative fact extraction in LLM modes (self-contained, context-rich)
|
|
23
|
+
|
|
24
|
+
Part of Qualixar | Author: Varun Pratap Bhardwaj
|
|
25
|
+
License: MIT
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
from __future__ import annotations
|
|
29
|
+
|
|
30
|
+
import json
|
|
31
|
+
import logging
|
|
32
|
+
import re
|
|
33
|
+
import uuid
|
|
34
|
+
from typing import Any, Protocol, runtime_checkable
|
|
35
|
+
|
|
36
|
+
from superlocalmemory.core.config import EncodingConfig
|
|
37
|
+
from superlocalmemory.storage.models import AtomicFact, FactType, Mode, SignalType
|
|
38
|
+
|
|
39
|
+
logger = logging.getLogger(__name__)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
# ---------------------------------------------------------------------------
|
|
43
|
+
# Protocols — accept any LLM / embedder without importing concrete classes
|
|
44
|
+
# ---------------------------------------------------------------------------
|
|
45
|
+
|
|
46
|
+
@runtime_checkable
class LLMBackboneProtocol(Protocol):
    """Structural (duck-typed) interface for any LLM backend.

    The extractor only needs availability probing and plain text
    generation, so concrete backbone classes are never imported here.
    """

    def is_available(self) -> bool: ...

    def generate(
        self,
        prompt: str,
        system: str = "",
        temperature: float | None = None,
        max_tokens: int | None = None,
    ) -> str: ...
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
@runtime_checkable
class EmbedderProtocol(Protocol):
    """Structural interface for an embedding provider.

    Used by Mode A keyword/embedding type classification; any object
    exposing ``embed(text) -> list[float]`` satisfies it.
    """

    def embed(self, text: str) -> list[float]: ...
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
# ---------------------------------------------------------------------------
|
|
68
|
+
# Constants — regex patterns, markers, templates
|
|
69
|
+
# ---------------------------------------------------------------------------
|
|
70
|
+
|
|
71
|
+
# Matches the first date-like substring: ISO (2024-01-15), US (1/15/24),
# "Month Day[, Year]", or relative words (yesterday / last week / next Friday).
_DATE_RE = re.compile(
    r"\b(\d{4}-\d{2}-\d{2})"  # ISO
    r"|\b(\d{1,2}/\d{1,2}/\d{2,4})"  # US
    r"|\b((?:January|February|March|April|May|June|July"
    r"|August|September|October|November|December)"
    r"\s+\d{1,2}(?:,?\s+\d{4})?)"  # Month Day Year
    r"|\b(yesterday|today|tomorrow|last\s+\w+|next\s+\w+)\b",
    re.IGNORECASE,
)

# Captures the two endpoints of "from X to Y" / "between X and Y" phrases
# (three-date temporal model: interval facts — see module docstring).
_INTERVAL_RE = re.compile(
    r"\b(?:from|between)\s+(.+?)\s+(?:to|and|until|through)\s+(.+?)(?:[.,;]|$)",
    re.IGNORECASE,
)

# Naive proper-noun heuristic: runs of 1-4 capitalized words.
_ENTITY_RE = re.compile(
    r"\b([A-Z][a-z]+(?:\s[A-Z][a-z]+){0,3})\b"  # Capitalized word sequences
)

_QUOTED_RE = re.compile(r'"([^"]+)"')  # Quoted strings as entities

# Keyword markers used by Mode A (zero-LLM) fact-type classification:
# subjective belief / preference phrasing -> opinion facts.
_OPINION_MARKERS = re.compile(
    r"\b(?:I think|I believe|I feel|in my opinion|I prefer|I like|I love|"
    r"I hate|I want|I need|I wish|personally|my favorite|"
    r"probably|seems like|might be|could be|I guess|"
    r"thinks?|believes?|prefers?|preferred|likes?|liked|loves?|loved|hates?|hated|"
    r"overrated|underrated|best|worst|favorite|"
    r"should|shouldn't|ought to|better|rather)\b",
    re.IGNORECASE,
)

# First-person past-experience phrasing -> episodic facts.
_EXPERIENCE_MARKERS = re.compile(
    r"\b(?:I went|I visited|I saw|I met|I did|I made|I had|I was|"
    r"we went|we visited|we had|I've been|I've done|I used to|"
    r"I remember|I once|last time I|when I was|my experience)\b",
    re.IGNORECASE,
)

# Deadline / schedule vocabulary -> temporal facts.
_TEMPORAL_MARKERS = re.compile(
    r"\b(?:deadline|due date|expires?|scheduled|appointment|meeting|"
    r"on \w+day|at \d{1,2}:\d{2}|by \w+|until|before|after|"
    r"in \d+ (?:days?|weeks?|months?|years?)|"
    r"next week|next month|this weekend|tomorrow|yesterday)\b",
    re.IGNORECASE,
)

# Emotionally charged words; per the module docstring these feed importance
# scoring (emotional markers component). Membership test is case-sensitive
# here — presumably callers lowercase tokens first; TODO confirm.
_EMOTIONAL_KEYWORDS = frozenset({
    "love", "hate", "amazing", "terrible", "wonderful", "awful", "excited",
    "angry", "happy", "sad", "scared", "thrilled", "devastated", "furious",
    "anxious", "grateful", "disappointed", "proud", "embarrassed", "jealous",
    "best", "worst", "incredible", "horrible", "fantastic", "miserable",
})

# Prefixes of small-talk turns (greetings, thanks, sign-offs) — the LLM
# prompt rule 7 skips these; this tuple is the non-LLM equivalent.
# NOTE(review): mixed granularity — "hi " / "hey " keep a trailing space so
# they don't match "history" / "heyday"-style words, "okay" does not.
_FILLER_PREFIXES = (
    "good to see", "nice to", "hello", "hi ", "hey ", "how are you",
    "thanks", "thank you", "bye", "goodbye", "see you", "take care",
    "sure thing", "no problem", "okay",
)
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
# ---------------------------------------------------------------------------
|
|
132
|
+
# LLM Prompt Templates
|
|
133
|
+
# ---------------------------------------------------------------------------
|
|
134
|
+
|
|
135
|
+
# System prompt shared by the LLM extraction modes (B and C). The contract
# it imposes — a bare JSON array of fact objects with text / fact_type /
# entities / referenced_date / importance / confidence keys — must stay in
# sync with whatever parses the model response downstream; TODO confirm the
# parser's expected keys against this example schema.
_SYSTEM_PROMPT = (
    "You are a precise fact extraction engine for a memory system.\n"
    "Given conversation turns, extract 2-5 atomic facts. Rules:\n"
    "1. Use EXPLICIT NAMES — never pronouns (he/she/they/it). Every fact "
    "must name the subject explicitly.\n"
    "2. Each fact must be a COMPLETE, STANDALONE statement understandable "
    "without the original conversation.\n"
    "3. Convert ALL relative time to ABSOLUTE dates when possible. "
    "'Yesterday' with session date 2024-01-15 becomes '2024-01-14'. "
    "'Next month' becomes the actual month and year.\n"
    "4. Resolve ALL coreferences. 'He went there' must become "
    "'[Person name] went to [Place name]'.\n"
    "5. Extract relationships between people when mentioned.\n"
    "6. Extract preferences, opinions, and experiences as SEPARATE facts.\n"
    "7. Skip greetings, filler, social pleasantries, and confirmations.\n"
    "8. For opinions, include a confidence between 0.0-1.0.\n\n"
    "Classify each fact:\n"
    "- episodic: personal event or experience (visited, attended, did)\n"
    "- semantic: objective fact about the world (jobs, locations, relations)\n"
    "- opinion: subjective belief or preference (likes, thinks, prefers)\n"
    "- temporal: time-bound fact with dates or deadlines\n\n"
    "Respond ONLY with a JSON array. Example:\n"
    '[{"text":"Alice works at Google as a software engineer",'
    '"fact_type":"semantic","entities":["Alice","Google"],'
    '"referenced_date":null,"importance":7,"confidence":0.95},'
    '{"text":"Alice prefers Python over Java",'
    '"fact_type":"opinion","entities":["Alice"],'
    '"referenced_date":null,"importance":5,"confidence":0.8}]'
)
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
# ---------------------------------------------------------------------------
|
|
167
|
+
# Helpers
|
|
168
|
+
# ---------------------------------------------------------------------------
|
|
169
|
+
|
|
170
|
+
def _new_id() -> str:
|
|
171
|
+
return uuid.uuid4().hex[:16]
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def _split_sentences(text: str) -> list[str]:
|
|
175
|
+
"""Split text into sentences using punctuation boundaries."""
|
|
176
|
+
parts = re.split(r"(?<=[.!?])\s+", text.strip())
|
|
177
|
+
return [p.strip() for p in parts if len(p.strip()) >= 8]
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def _extract_date_string(text: str) -> str | None:
    """Extract the first recognizable date string from text.

    Returns the first non-empty alternation group of the `_DATE_RE`
    match, stripped, or None when nothing matches.
    """
    found = _DATE_RE.search(text)
    if found is None:
        return None
    return next((g.strip() for g in found.groups() if g), None)
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def _try_parse_date(raw: str, reference_date: str | None = None) -> str | None:
|
|
192
|
+
"""Attempt to resolve a date string to ISO format.
|
|
193
|
+
|
|
194
|
+
Uses dateutil.parser for structured dates and dateparser for
|
|
195
|
+
relative expressions ("last Monday", "next week").
|
|
196
|
+
Returns None on failure — never raises.
|
|
197
|
+
"""
|
|
198
|
+
if not raw:
|
|
199
|
+
return None
|
|
200
|
+
|
|
201
|
+
# Fast path: already ISO
|
|
202
|
+
iso_match = re.match(r"^\d{4}-\d{2}-\d{2}$", raw.strip())
|
|
203
|
+
if iso_match:
|
|
204
|
+
return raw.strip()
|
|
205
|
+
|
|
206
|
+
# dateutil for structured dates (March 15, 2026 / 3/15/2026)
|
|
207
|
+
try:
|
|
208
|
+
from dateutil import parser as du_parser
|
|
209
|
+
result = du_parser.parse(raw, fuzzy=True)
|
|
210
|
+
return result.date().isoformat()
|
|
211
|
+
except Exception:
|
|
212
|
+
pass
|
|
213
|
+
|
|
214
|
+
# dateparser for relative dates (yesterday, last week, next Friday)
|
|
215
|
+
try:
|
|
216
|
+
import dateparser
|
|
217
|
+
settings: dict[str, Any] = {"PREFER_DATES_FROM": "past"}
|
|
218
|
+
if reference_date:
|
|
219
|
+
ref = dateparser.parse(reference_date)
|
|
220
|
+
if ref:
|
|
221
|
+
settings["RELATIVE_BASE"] = ref
|
|
222
|
+
result = dateparser.parse(raw, settings=settings)
|
|
223
|
+
if result:
|
|
224
|
+
return result.date().isoformat()
|
|
225
|
+
except Exception:
|
|
226
|
+
pass
|
|
227
|
+
|
|
228
|
+
return None
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
def _extract_interval(text: str, ref_date: str | None = None) -> tuple[str | None, str | None]:
    """Extract a temporal interval (start, end) from text.

    Both endpoints are resolved through `_try_parse_date`; either may
    be None when unparseable.
    """
    found = _INTERVAL_RE.search(text)
    if found is None:
        return None, None
    start = _try_parse_date(found.group(1).strip(), ref_date)
    end = _try_parse_date(found.group(2).strip(), ref_date)
    return start, end
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def _extract_entities(text: str) -> list[str]:
    """Extract candidate entity names from text using regex heuristics."""
    # Common English words that start sentences; a capitalized candidate
    # whose FIRST word is in this set is rejected as a false positive.
    stop_words = frozenset({
        "the", "this", "that", "these", "those", "what", "when", "where",
        "which", "how", "who", "why", "also", "then", "just", "very",
        "really", "actually", "maybe", "well", "still", "even",
        "she", "he", "they", "them", "her", "him", "his", "its",
        "but", "and", "not", "yes", "yeah", "sure", "okay", "ok",
        "here", "there", "now", "today", "some", "all", "any",
        "been", "being", "have", "has", "had", "was", "were",
        "for", "with", "from", "about", "into", "over",
        # Sentence starters and conversational words
        "wow", "did", "so", "gonna", "got", "by", "thanks", "thank",
        "hey", "hi", "hello", "bye", "good", "great", "nice", "cool",
        "right", "like", "know", "think", "feel", "want", "need",
        "make", "take", "give", "tell", "said", "told", "get",
        "let", "can", "will", "would", "could", "should", "might",
        "much", "many", "more", "most", "lot", "way", "thing",
        "something", "anything", "everything", "nothing", "someone",
        "it", "my", "your", "our", "their", "me", "you", "we", "us",
        "do", "does", "if", "or", "no", "to", "at", "on", "in",
        "up", "out", "off", "too", "go", "come", "see", "look",
        "say", "ask", "try", "keep", "put", "run", "set", "move",
        "call", "end", "start", "find", "show", "hear", "play",
        "work", "read", "talk", "turn", "help", "miss", "hope",
        "love", "hate", "wish", "seem", "mean", "mind", "care",
    })

    found: set[str] = set()

    # Capitalized word sequences (proper nouns).
    for m in _ENTITY_RE.finditer(text):
        candidate = m.group(1).strip()
        first_word = candidate.split()[0].lower() if candidate else ""
        if first_word not in stop_words:
            found.add(candidate)

    # Quoted strings of at least two characters.
    for m in _QUOTED_RE.finditer(text):
        quoted = m.group(1).strip()
        if len(quoted) >= 2:
            found.add(quoted)

    return sorted(found)
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
def _classify_sentence(sentence: str) -> FactType:
    """Classify a sentence into a FactType using keyword markers.

    Marker precedence: temporal > opinion > episodic; semantic is the
    default when no marker matches.
    """
    ordered_checks = (
        (_TEMPORAL_MARKERS, FactType.TEMPORAL),
        (_OPINION_MARKERS, FactType.OPINION),
        (_EXPERIENCE_MARKERS, FactType.EPISODIC),
    )
    for pattern, fact_type in ordered_checks:
        if pattern.search(sentence):
            return fact_type
    return FactType.SEMANTIC
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
def _score_importance(
    text: str,
    entities: list[str],
    entity_frequency: dict[str, int],
    has_date: bool,
) -> float:
    """Score importance 0.0-1.0 based on entity frequency, emotion, temporality.

    Scoring formula:
        base = 0.3
        +0.2 if contains emotional keywords
        +0.2 if temporally grounded (has a date reference)
        +0.3 scaled by entity prominence (max entity frequency / total)
    """
    score = 0.3

    # Emotional boost: any word overlap with the emotional keyword set.
    if set(text.lower().split()) & _EMOTIONAL_KEYWORDS:
        score += 0.2

    # Temporal boost.
    if has_date:
        score += 0.2

    # Entity prominence boost — facts about frequently-mentioned
    # entities matter more.
    if entities and entity_frequency:
        total_mentions = sum(entity_frequency.values()) or 1
        top_freq = max((entity_frequency.get(name, 0) for name in entities), default=0)
        score += 0.3 * (top_freq / total_mentions)

    return min(1.0, round(score, 3))
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
def _signal_from_fact_type(ft: FactType) -> SignalType:
    """Map FactType to SignalType for V2 compatibility."""
    if ft is FactType.OPINION:
        return SignalType.OPINION
    if ft is FactType.TEMPORAL:
        return SignalType.TEMPORAL
    # EPISODIC, SEMANTIC, and any unknown value all map to FACTUAL.
    return SignalType.FACTUAL
|
|
340
|
+
|
|
341
|
+
|
|
342
|
+
def _is_filler(text: str) -> bool:
    """Return True if text is a greeting, filler, or social pleasantry."""
    lowered = text.strip().lower()
    # str.startswith accepts a tuple of prefixes — one call covers all.
    return lowered.startswith(tuple(_FILLER_PREFIXES))
|
|
346
|
+
|
|
347
|
+
|
|
348
|
+
# ---------------------------------------------------------------------------
|
|
349
|
+
# Chunk builder
|
|
350
|
+
# ---------------------------------------------------------------------------
|
|
351
|
+
|
|
352
|
+
def chunk_turns(
    turns: list[str],
    chunk_size: int = 10,
    overlap: int = 2,
) -> list[list[str]]:
    """Group conversation turns into overlapping chunks.

    Each chunk is up to ``chunk_size`` turns with ``overlap`` turns
    carried over from the previous chunk to preserve cross-boundary context.
    Trailing fragments smaller than ``overlap + 1`` are merged into the
    final chunk to avoid low-context extraction passes.
    """
    if not turns:
        return []
    total = len(turns)
    if total <= chunk_size:
        return [list(turns)]

    stride = max(1, chunk_size - overlap)
    windows: list[list[str]] = []
    cursor = 0

    while cursor < total:
        stop = min(cursor + chunk_size, total)
        # Absorb a tiny trailing fragment into this window instead of
        # emitting it as its own low-context chunk.
        if 0 < total - stop <= overlap:
            stop = total
        windows.append(list(turns[cursor:stop]))
        if stop == total:
            break
        cursor += stride

    return windows
|
|
385
|
+
|
|
386
|
+
|
|
387
|
+
# ---------------------------------------------------------------------------
|
|
388
|
+
# FactExtractor
|
|
389
|
+
# ---------------------------------------------------------------------------
|
|
390
|
+
|
|
391
|
+
class FactExtractor:
    """Extract structured AtomicFacts from conversation turns.

    Strategies:
        Mode A — Rule-based: regex entities, keyword classification, heuristic importance.
        Mode B — Local LLM (Ollama): structured JSON extraction, Mode A fallback.
        Mode C — Cloud LLM: narrative fact extraction (2-5 per chunk), richest output.
    """

    # "[Speaker]: " sentence prefix. Compiled once at class level — the
    # previous implementation re-imported `re` and recompiled this pattern
    # inside the per-sentence extraction loop of _extract_local.
    _SPEAKER_PREFIX_RE = re.compile(r"^\[([A-Za-z ]+)\]:\s*")

    def __init__(
        self,
        config: EncodingConfig,
        llm: LLMBackboneProtocol | None = None,
        embedder: EmbedderProtocol | None = None,
        mode: Mode = Mode.A,
    ) -> None:
        self._config = config
        self._llm = llm
        self._embedder = embedder
        self._mode = mode

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def extract_facts(
        self,
        turns: list[str],
        session_id: str,
        session_date: str | None = None,
        speaker_a: str = "",
        speaker_b: str = "",
    ) -> list[AtomicFact]:
        """Extract structured atomic facts from conversation turns.

        Chunks the conversation into overlapping windows, extracts facts from
        each chunk, and deduplicates the merged results.

        Args:
            turns: Raw conversation turn strings.
            session_id: Identifier for the conversation session.
            session_date: ISO-8601 date of the session (observation date).
            speaker_a: Name/identifier for the first speaker (e.g. user).
            speaker_b: Name/identifier for the second speaker (e.g. assistant).

        Returns:
            Deduplicated list of AtomicFact objects.
        """
        if not turns:
            return []

        chunks = chunk_turns(turns, self._config.chunk_size, overlap=2)
        all_facts: list[AtomicFact] = []

        for chunk in chunks:
            chunk_facts = self._extract_chunk(
                chunk, session_id, session_date, speaker_a, speaker_b,
            )
            all_facts.extend(chunk_facts)

        return self._deduplicate(all_facts)

    # ------------------------------------------------------------------
    # Chunk-level dispatch
    # ------------------------------------------------------------------

    def _extract_chunk(
        self,
        turns: list[str],
        session_id: str,
        session_date: str | None,
        speaker_a: str,
        speaker_b: str,
    ) -> list[AtomicFact]:
        """Extract facts from a single chunk — dispatches by mode."""
        use_llm = (
            self._mode in (Mode.B, Mode.C)
            and self._llm is not None
            and self._llm.is_available()
        )
        if use_llm:
            facts = self._extract_llm(
                turns, session_id, session_date, speaker_a, speaker_b,
            )
            if facts:
                return facts
            # Fallback to local if LLM produced nothing
            logger.info("LLM extraction returned no facts, falling back to local.")

        return self._extract_local(
            turns, session_id, session_date, speaker_a, speaker_b,
        )

    # ------------------------------------------------------------------
    # Mode A: Rule-based extraction
    # ------------------------------------------------------------------

    def _extract_local(
        self,
        turns: list[str],
        session_id: str,
        session_date: str | None,
        speaker_a: str,
        speaker_b: str,
    ) -> list[AtomicFact]:
        """Rule-based extraction: regex entities, keyword classification, scoring."""
        combined = "\n".join(turns)
        sentences = _split_sentences(combined)
        if not sentences:
            # If no proper sentences, treat each turn as a sentence
            sentences = [t.strip() for t in turns if len(t.strip()) >= 8]

        # Build entity frequency map for importance scoring
        entity_freq: dict[str, int] = {}
        for sent in sentences:
            for ent in _extract_entities(sent):
                entity_freq[ent] = entity_freq.get(ent, 0) + 1

        facts: list[AtomicFact] = []
        seen_texts: set[str] = set()

        for sent in sentences:
            if _is_filler(sent):
                continue
            normalized = sent.strip()
            if normalized in seen_texts or len(normalized) < 10:
                continue
            seen_texts.add(normalized)

            # Resolve [Speaker]: prefix to "Speaker" in content
            # "[Caroline]: I went to..." → "Caroline: I went to..."
            spk_match = self._SPEAKER_PREFIX_RE.match(normalized)
            if spk_match:
                speaker_name = spk_match.group(1)
                normalized = f"{speaker_name}: {normalized[spk_match.end():]}"

            entities = _extract_entities(normalized)
            fact_type = _classify_sentence(normalized)

            # Three-date model: extract and resolve relative dates
            raw_date = _extract_date_string(normalized)
            referenced_date = _try_parse_date(raw_date, session_date) if raw_date else None
            interval_start, interval_end = _extract_interval(normalized, session_date)

            # Resolve relative dates in content for better retrieval
            # "I went yesterday" + session_date=2023-05-08 → "I went on 2023-05-07"
            if raw_date and referenced_date and raw_date.lower() in (
                "yesterday", "today", "last week", "last month", "last year",
                "this morning", "this afternoon", "this evening",
                "the other day", "recently", "the day before",
            ):
                date_str = referenced_date[:10]  # YYYY-MM-DD
                normalized = normalized.replace(raw_date, f"on {date_str}")

            has_date = referenced_date is not None or interval_start is not None
            importance = _score_importance(normalized, entities, entity_freq, has_date)

            if importance < self._config.min_fact_confidence:
                continue

            # NOTE(review): a previous version computed
            # self._infer_speaker(...) here but never attached the result
            # to the fact; the side-effect-free call was removed. Re-add
            # it if AtomicFact ever grows a speaker field.

            facts.append(AtomicFact(
                fact_id=_new_id(),
                content=normalized,
                fact_type=fact_type,
                entities=entities,
                observation_date=session_date,
                referenced_date=referenced_date,
                interval_start=interval_start,
                interval_end=interval_end,
                confidence=0.7 if fact_type == FactType.SEMANTIC else 0.6,
                importance=importance,
                session_id=session_id,
                signal_type=_signal_from_fact_type(fact_type),
            ))

        # Cap at max_facts_per_chunk, keeping highest importance
        facts.sort(key=lambda f: f.importance, reverse=True)
        return facts[: self._config.max_facts_per_chunk]

    # ------------------------------------------------------------------
    # Mode B/C: LLM-based extraction
    # ------------------------------------------------------------------

    def _extract_llm(
        self,
        turns: list[str],
        session_id: str,
        session_date: str | None,
        speaker_a: str,
        speaker_b: str,
    ) -> list[AtomicFact]:
        """LLM-guided extraction: structured JSON prompt, parsed into AtomicFacts."""
        conversation_text = "\n".join(turns)
        speakers = []
        if speaker_a:
            speakers.append(f"Speaker A: {speaker_a}")
        if speaker_b:
            speakers.append(f"Speaker B: {speaker_b}")
        speaker_info = ", ".join(speakers) if speakers else "unknown"

        prompt = (
            f"Extract atomic facts from the following conversation.\n"
            f"Speakers: {speaker_info}\n"
            f"Conversation date: {session_date or 'unknown'}\n\n"
            f"--- CONVERSATION ---\n{conversation_text}\n--- END ---\n\n"
            f"Rules:\n"
            f"- Extract 2-5 comprehensive, self-contained facts.\n"
            f"- Use explicit names (never pronouns).\n"
            f"- Each fact must make sense WITHOUT the original conversation.\n"
            f"- For dates mentioned (\"yesterday\", \"next week\"), resolve to "
            f"ISO format relative to {session_date or 'today'}.\n"
            f"- Skip greetings, filler, and confirmations.\n"
            f"- importance: 1 (trivial) to 10 (critical)\n"
            f"- confidence: 0.0 (uncertain) to 1.0 (definite)\n\n"
            f"Respond with ONLY a JSON array."
        )

        try:
            raw = self._llm.generate(  # type: ignore[union-attr]
                prompt=prompt,
                system=_SYSTEM_PROMPT,
                temperature=0.0,
                max_tokens=1024,
            )
            return self._parse_llm_response(raw, session_id, session_date)
        except Exception as exc:
            logger.warning("LLM fact extraction failed: %s", exc)
            return []

    def _parse_llm_response(
        self,
        raw: str,
        session_id: str,
        session_date: str | None,
    ) -> list[AtomicFact]:
        """Parse JSON array from LLM response into AtomicFact list."""
        if not raw or not raw.strip():
            return []

        # Extract JSON array from potentially wrapped response
        try:
            match = re.search(r"\[.*\]", raw, re.DOTALL)
            if not match:
                logger.warning("No JSON array found in LLM response.")
                return []
            items = json.loads(match.group())
            if not isinstance(items, list):
                return []
        except (json.JSONDecodeError, ValueError) as exc:
            logger.warning("JSON parse error in LLM fact response: %s", exc)
            return []

        facts: list[AtomicFact] = []
        for item in items[:10]:  # Hard cap at 10 per chunk
            if not isinstance(item, dict):
                continue
            fact = self._item_to_fact(item, session_id, session_date)
            if fact is not None:
                facts.append(fact)

        return facts

    def _item_to_fact(
        self,
        item: dict[str, Any],
        session_id: str,
        session_date: str | None,
    ) -> AtomicFact | None:
        """Convert a single LLM JSON item to an AtomicFact.

        Returns None if the item is malformed or is filler.
        """
        text = str(item.get("text", "")).strip()
        if not text or len(text) < 8 or _is_filler(text):
            return None

        # Fact type
        raw_type = str(item.get("fact_type", item.get("type", "semantic"))).lower()
        type_map = {
            "episodic": FactType.EPISODIC,
            "experience": FactType.EPISODIC,
            "semantic": FactType.SEMANTIC,
            "world": FactType.SEMANTIC,
            "opinion": FactType.OPINION,
            "temporal": FactType.TEMPORAL,
        }
        fact_type = type_map.get(raw_type, FactType.SEMANTIC)

        # Entities — accept list or single string; fall back to regex extraction
        raw_entities = item.get("entities", [])
        if isinstance(raw_entities, list):
            entities = [str(e).strip() for e in raw_entities if str(e).strip()]
        elif isinstance(raw_entities, str):
            entities = [raw_entities.strip()] if raw_entities.strip() else []
        else:
            entities = _extract_entities(text)

        # Referenced date — from LLM or inferred
        ref_date_raw = item.get("referenced_date") or item.get("date")
        referenced_date: str | None = None
        if ref_date_raw and str(ref_date_raw).strip().lower() != "null":
            referenced_date = _try_parse_date(str(ref_date_raw), session_date)

        # Interval
        interval_start = item.get("interval_start")
        interval_end = item.get("interval_end")
        if interval_start:
            interval_start = _try_parse_date(str(interval_start), session_date)
        if interval_end:
            interval_end = _try_parse_date(str(interval_end), session_date)

        # Importance (LLM returns 1-10, we normalize to 0.0-1.0)
        raw_importance = item.get("importance", 5)
        try:
            importance = min(1.0, max(0.0, float(raw_importance) / 10.0))
        except (TypeError, ValueError):
            importance = 0.5

        # Confidence
        raw_conf = item.get("confidence", 0.8)
        try:
            confidence = min(1.0, max(0.0, float(raw_conf)))
        except (TypeError, ValueError):
            confidence = 0.8

        return AtomicFact(
            fact_id=_new_id(),
            content=text,
            fact_type=fact_type,
            entities=entities,
            observation_date=session_date,
            referenced_date=referenced_date,
            interval_start=interval_start,
            interval_end=interval_end,
            confidence=confidence,
            importance=importance,
            session_id=session_id,
            signal_type=_signal_from_fact_type(fact_type),
        )

    # ------------------------------------------------------------------
    # Speaker inference (Mode A heuristic)
    # ------------------------------------------------------------------

    @staticmethod
    def _infer_speaker(
        sentence: str,
        turns: list[str],
        speaker_a: str,
        speaker_b: str,
    ) -> str:
        """Infer which speaker said a sentence based on turn position.

        Checks which turn contains the sentence and uses even/odd indexing
        (even = speaker_a, odd = speaker_b by convention).
        """
        if not speaker_a and not speaker_b:
            return ""
        for i, turn in enumerate(turns):
            if sentence in turn:
                return speaker_a if i % 2 == 0 else speaker_b
        return speaker_a or speaker_b

    # ------------------------------------------------------------------
    # Deduplication
    # ------------------------------------------------------------------

    @staticmethod
    def _deduplicate(facts: list[AtomicFact]) -> list[AtomicFact]:
        """Remove near-duplicate facts by content normalization.

        Uses lowercased, whitespace-collapsed content as dedup key.
        When duplicates exist, keeps the one with higher importance.
        """
        seen: dict[str, AtomicFact] = {}
        for fact in facts:
            key = re.sub(r"\s+", " ", fact.content.lower().strip())
            existing = seen.get(key)
            if existing is None or fact.importance > existing.importance:
                seen[key] = fact
        return list(seen.values())
|