@geravant/sinain 1.19.0 → 1.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -76,7 +76,7 @@ def query_facts_by_entities(
76
76
  attrs = store.entity(fid)
77
77
  if not attrs:
78
78
  continue
79
- fact = {"entityId": fid}
79
+ fact = {"entity_id": fid}
80
80
  for attr_name, values in attrs.items():
81
81
  if attr_name == "tag":
82
82
  continue # Don't include tags in output (noise)
@@ -117,7 +117,7 @@ def query_top_facts(db_path: str, limit: int = 30) -> list[dict]:
117
117
  attrs = store.entity(fid)
118
118
  if not attrs:
119
119
  continue
120
- fact = {"entityId": fid}
120
+ fact = {"entity_id": fid}
121
121
  for attr_name, values in attrs.items():
122
122
  fact[attr_name] = values[0] if len(values) == 1 else values
123
123
  facts.append(fact)
@@ -425,12 +425,14 @@ def query_facts_hybrid(
425
425
  pass
426
426
 
427
427
  # Graph boost: facts linked to mentioned entities via backrefs get priority
428
+ # +0.05 is significant vs RRF scores of ~0.015-0.033 — ensures entity-linked facts
429
+ # rank above FTS noise in large graphs (100K+ triples)
428
430
  if graph_fact_ids or community_fact_ids:
429
431
  for eid in rrf_scores:
430
432
  if eid in graph_fact_ids:
431
- rrf_scores[eid] += 0.02 # direct graph-linked facts
433
+ rrf_scores[eid] += 0.05 # direct graph-linked facts
432
434
  elif eid in community_fact_ids:
433
- rrf_scores[eid] += 0.01 # community-expanded facts (half weight)
435
+ rrf_scores[eid] += 0.025 # community-expanded facts (half weight)
434
436
 
435
437
  # Apply confidence decay as secondary signal (fresh facts rank above stale ones)
436
438
  from triplestore import decayed_confidence
@@ -584,6 +586,446 @@ def domain_fact_counts(db_path: str) -> dict[str, int]:
584
586
  return {}
585
587
 
586
588
 
589
+ def _slug_variants(query: str) -> set[str]:
590
+ """Generate slug variations to handle 'Al Futaim' / 'al-futaim' / 'alfutaim'.
591
+
592
+ The web search bar accepts free text but the knowledge graph stores
593
+ content-addressed slugs. We normalize aggressively: lowercase, then
594
+ produce hyphenated, underscored, and no-separator variants so a slug
595
+ match works regardless of how the user typed it.
596
+ """
597
+ norm = "-".join(w for w in query.lower().split() if w)
598
+ if not norm:
599
+ return set()
600
+ return {
601
+ norm,
602
+ norm.replace("-", ""),
603
+ norm.replace("-", "_"),
604
+ query.lower().replace(" ", ""),
605
+ }
606
+
607
+
608
+ # English stopwords + a few internet-noise words. Per-token passes (prefix
609
+ # wildcards, tag-exact) skip these because they false-positive against the
610
+ # whole corpus — "not*" matched 518 rows in our test DB, "real*" matched 88.
611
+ # We don't filter them from the main FTS5 query because phrase-style
612
+ # multi-word matches benefit from preserving them.
613
+ _STOPWORDS = frozenset({
614
+ "a", "an", "and", "are", "as", "at", "be", "but", "by", "do", "did",
615
+ "does", "for", "from", "had", "has", "have", "he", "her", "him", "his",
616
+ "how", "i", "if", "in", "is", "it", "its", "me", "my", "no", "not", "of",
617
+ "on", "or", "our", "she", "so", "than", "that", "the", "their", "them",
618
+ "then", "there", "these", "they", "this", "to", "was", "we", "were",
619
+ "what", "when", "where", "which", "who", "why", "will", "with", "you",
620
+ "your", "yes", "real", "true", "false",
621
+ })
622
+
623
+
624
+ def _fts5_safe_tokens(query: str) -> list[str]:
625
+ """Strip FTS5-special chars, return clean lowercase tokens (>=2 chars).
626
+
627
+ FTS5 treats ``"()*+-^|`` and AND/OR/NOT/NEAR as operators; raw user input
628
+ can produce confusing results or syntax errors. We defang to a plain
629
+ token list and re-build queries from there.
630
+ """
631
+ import re
632
+ cleaned = re.sub(r"[^\w\s]", " ", query.lower(), flags=re.UNICODE)
633
+ return [t for t in cleaned.split() if len(t) >= 2]
634
+
635
+
636
def _significant_tokens(query: str) -> list[str]:
    """Return the query tokens that deserve their own per-token pass.

    A token qualifies when it is at least 3 characters long and is not listed
    in ``_STOPWORDS``. Token order follows ``_fts5_safe_tokens``.
    """
    significant: list[str] = []
    for token in _fts5_safe_tokens(query):
        if token in _STOPWORDS:
            continue
        if len(token) < 3:
            continue
        significant.append(token)
    return significant
640
+
641
+
642
def search_entities(db_path: str, query: str, limit: int = 20) -> list[dict]:
    """High-recall entity search for the web UI search bar.

    FTS5 indexes the ``value`` column of *every* triple (not just a fact's
    ``value`` attribute), so tags, refs (their stringified target) and
    everything else are searchable. Restricting FTS hits to
    ``attribute = 'value'`` would drop most of the index — most facts have
    ~10 tag triples and 1 value triple.

    The passes below union into one ranked list:
      1. Exact entity_id slug match (variants: hyphen / underscore / no-sep)
         → score 2.0. Top of the list.
      2. entity_id LIKE substring for each variant → score 1.0. Catches
         ``fact:al-futtaim-cto-...`` when the user types ``al futtaim``.
      3. FTS5 over the FULL index (no attribute filter) for the raw query.
         Per-hit score weighted by attribute: tag=0.3, value=0.2, other=0.1.
      4. FTS5 prefix wildcard (``term*``) for each significant token of 4+
         characters, only when pass 3 returned fewer than 5 rows (+0.05 each).
      5. Direct tag-exact match: ``attribute='tag' AND LOWER(value) = ?``
         (+0.4 per significant token).
      6. fact_count and snippet backfill for results that landed via
         slug-only paths.

    Scores are uncapped during accumulation and rounded to 3 dp at the end.

    Args:
        db_path: Path to the knowledge-graph database file.
        query: Free-text user query.
        limit: Maximum number of results to return.

    Returns:
        ``[{entity, type, score, fact_count, snippet, last_seen}]`` sorted by
        descending score, then descending fact_count. An empty list when the
        database file is missing, the query is blank, or any error occurs
        (the error is written to stderr; this function does not raise).
    """
    if not Path(db_path).exists() or not query.strip():
        return []

    try:
        from contextlib import closing

        from triplestore import TripleStore

        # closing() guarantees store.close() runs on every exit path,
        # including when one of the passes raises; a failure inside close()
        # itself is still reported through the handler below.
        with closing(TripleStore(db_path)) as store:
            return _search_entities_in_store(store, query, limit)
    except Exception as e:
        sys.stderr.write(f"search_entities error: {e}\n")
        return []


def _search_entities_in_store(store, query: str, limit: int) -> list[dict]:
    """Run all search passes against an already-open store and rank the hits.

    The caller owns ``store`` and is responsible for closing it. Rows are
    accessed by column name, so this presumably relies on the store's
    connection using a name-addressable row factory — TODO confirm.
    """
    candidates: dict[str, dict] = {}
    # Cache outgoing-ref lookups — many FTS hits hit the same fact_eid.
    ref_cache: dict[str, str | None] = {}

    def lookup_outbound_ref(fact_eid: str) -> str | None:
        """Return the ``entity:*`` target this fact points at, if any."""
        if fact_eid in ref_cache:
            return ref_cache[fact_eid]
        # Filter for entity refs in SQL: a fact may carry other ref edges as
        # well (e.g. a session link), and an unfiltered LIMIT 1 could pick
        # one of those and lose the entity association.
        ref_row = store._conn.execute(
            """SELECT value FROM triples
               WHERE entity_id = ? AND value_type = 'ref' AND retracted = 0
                 AND value GLOB 'entity:*'
               LIMIT 1""",
            (fact_eid,),
        ).fetchone()
        v = ref_row["value"] if ref_row else None
        ref_cache[fact_eid] = v if v and str(v).startswith("entity:") else None
        return ref_cache[fact_eid]

    def upsert(eid: str, *, score: float = 0.0, snippet: str = "",
               ts: str | None = None) -> dict:
        """Create or update the candidate entry for ``eid`` and return it."""
        entry = candidates.setdefault(eid, {
            "entity": eid,
            "type": eid.split(":", 1)[0] if ":" in eid else "unknown",
            "score": 0.0, "fact_count": 0,
            "snippet": "", "last_seen": None,
        })
        # For exact-match scores (>=1.0) take the max; for evidence
        # contributions (<1.0) accumulate so multiple weak hits stack.
        if score >= 1.0:
            entry["score"] = max(entry["score"], score)
        else:
            entry["score"] += score
        if snippet and not entry["snippet"]:
            entry["snippet"] = snippet[:140]
        if ts and (entry["last_seen"] is None or ts > entry["last_seen"]):
            entry["last_seen"] = ts
        return entry

    # ── Pass 1: exact slug match for each variant ────────────────────────
    variants = _slug_variants(query)
    for variant in variants:
        for prefix in ("entity:", "fact:"):
            eid = f"{prefix}{variant}"
            row = store._conn.execute(
                "SELECT 1 FROM triples WHERE entity_id = ? AND retracted = 0 LIMIT 1",
                (eid,),
            ).fetchone()
            if row:
                upsert(eid, score=2.0)

    # ── Pass 2: entity_id substring LIKE ─────────────────────────────────
    # NOTE(review): '%' and '_' inside a variant act as LIKE wildcards here.
    for variant in variants:
        if len(variant) < 2:
            continue
        rows = store._conn.execute(
            """SELECT DISTINCT entity_id FROM triples
               WHERE retracted = 0
                 AND (entity_id LIKE ? OR entity_id LIKE ?)
               LIMIT 200""",
            (f"entity:%{variant}%", f"fact:%{variant}%"),
        ).fetchall()
        for r in rows:
            upsert(r["entity_id"], score=1.0)

    # ── Pass 3: FTS5 over the FULL index (the big recall fix) ────────────
    # Per-hit score weighted by attribute: tags carry the strongest
    # topical signal because they're auto-extracted keywords.
    attr_weight = {"tag": 0.3, "value": 0.2}
    try:
        fts_rows = store._conn.execute(
            """SELECT t.entity_id, t.attribute, t.value, t.created_at
               FROM triples_fts fts
               JOIN triples t ON fts.rowid = t.id
               WHERE triples_fts MATCH ? AND t.retracted = 0
               LIMIT 500""",
            (query,),
        ).fetchall()
    except Exception:
        # Defang the query and retry with cleaned tokens; fall back to
        # LIKE if FTS5 itself is unavailable.
        tokens = _fts5_safe_tokens(query)
        if tokens:
            try:
                fts_rows = store._conn.execute(
                    """SELECT t.entity_id, t.attribute, t.value, t.created_at
                       FROM triples_fts fts JOIN triples t ON fts.rowid = t.id
                       WHERE triples_fts MATCH ? AND t.retracted = 0
                       LIMIT 500""",
                    (" ".join(tokens),),
                ).fetchall()
            except Exception:
                conds = " OR ".join(["LOWER(value) LIKE ?"] * len(tokens))
                params = [f"%{t}%" for t in tokens]
                fts_rows = store._conn.execute(
                    f"""SELECT entity_id, attribute, value, created_at FROM triples
                        WHERE retracted = 0 AND ({conds}) LIMIT 500""",
                    params,
                ).fetchall()
        else:
            fts_rows = []

    for r in fts_rows:
        fact_eid = r["entity_id"]
        attr = r["attribute"]
        value = r["value"] or ""
        ts = r["created_at"]
        # Credit the hit to the entity the fact points at (when it has an
        # entity ref) rather than to the fact itself.
        target_eid = lookup_outbound_ref(fact_eid) or fact_eid
        weight = attr_weight.get(attr, 0.1)
        # Only feed snippets from real value text — tags are noisy as
        # snippets ("citibank", "cto", ...).
        snippet_text = value if attr == "value" else ""
        entry = upsert(target_eid, score=weight,
                       snippet=snippet_text, ts=ts)
        if attr == "value":
            entry["fact_count"] += 1

    # ── Pass 4: prefix wildcards — only when Pass 3 was dry ─────────────
    # "intel" → "intellij", "intelligence", etc. FTS5 prefix is
    # forward-only, so this is for partial input (still typing). For
    # multi-word queries we trust FTS5's implicit AND in Pass 3 for
    # precision; running OR'd per-token prefix here would give common
    # nouns like "real" enough hits (88+ rows) to drown signal.
    sig_tokens = _significant_tokens(query)
    if len(fts_rows) < 5 and sig_tokens:
        for token in sig_tokens:
            if len(token) < 4:
                continue  # 3-char prefixes match too broadly
            try:
                # Use FTS5 rank ordering so the top 300 are the most
                # relevant by bm25, not arbitrary insertion order.
                prefix_rows = store._conn.execute(
                    """SELECT t.entity_id, t.attribute, t.value, t.created_at
                       FROM triples_fts fts JOIN triples t ON fts.rowid = t.id
                       WHERE triples_fts MATCH ? AND t.retracted = 0
                       ORDER BY rank
                       LIMIT 300""",
                    (token + "*",),
                ).fetchall()
            except Exception:
                # Best effort: a failing prefix query contributes nothing.
                prefix_rows = []
            for r in prefix_rows:
                fact_eid = r["entity_id"]
                target_eid = lookup_outbound_ref(fact_eid) or fact_eid
                upsert(target_eid, score=0.05,
                       snippet=(r["value"] if r["attribute"] == "value" else ""),
                       ts=r["created_at"])

    # ── Pass 5: direct tag exact-match (high-precision boost) ────────────
    # Stopword guard prevents "not"/"are"/"with" from blanket-boosting
    # entities tagged with those (which they shouldn't be, but real
    # auto-tag pipelines occasionally produce them).
    for token in sig_tokens:
        rows = store._conn.execute(
            """SELECT DISTINCT entity_id FROM triples
               WHERE attribute = 'tag' AND LOWER(value) = ? AND retracted = 0
               LIMIT 200""",
            (token,),
        ).fetchall()
        for r in rows:
            fact_eid = r["entity_id"]
            target_eid = lookup_outbound_ref(fact_eid) or fact_eid
            upsert(target_eid, score=0.4)

    # Compute fact_count for slug-match candidates that weren't seen via FTS.
    for eid, entry in candidates.items():
        if entry["fact_count"] == 0:
            # Count facts that reference this entity via any ref attribute.
            cnt_row = store._conn.execute(
                """SELECT COUNT(DISTINCT entity_id) AS n FROM triples
                   WHERE value = ? AND value_type = 'ref' AND retracted = 0""",
                (eid,),
            ).fetchone()
            entry["fact_count"] = int(cnt_row["n"]) if cnt_row else 0
        # If no incoming refs but the entity itself has triples, count
        # that as 1 for display (it's at least a real entity).
        if entry["fact_count"] == 0:
            self_row = store._conn.execute(
                "SELECT 1 FROM triples WHERE entity_id = ? AND retracted = 0 LIMIT 1",
                (eid,),
            ).fetchone()
            if self_row:
                entry["fact_count"] = 1

    # Round for display; no hard cap — exact matches start at 2.0 and
    # evidence accumulation below 1.0 should be allowed to sum freely so
    # entities with many independent hit types out-rank one-trick hits.
    for c in candidates.values():
        c["score"] = round(c["score"], 3)

    results = sorted(candidates.values(),
                     key=lambda x: (-x["score"], -x["fact_count"]))[:limit]

    # Backfill snippets for top results that came from slug-only matches
    # (no FTS hit on value text). Bounded to `limit` queries — cheap.
    for c in results:
        if c["snippet"]:
            continue
        if c["entity"].startswith("entity:"):
            # Borrow the value text of some fact that references the entity.
            row = store._conn.execute(
                """SELECT t.value, t.created_at FROM triples t
                   WHERE t.attribute = 'value' AND t.retracted = 0
                     AND t.entity_id IN (
                       SELECT entity_id FROM triples
                       WHERE value = ? AND value_type = 'ref' AND retracted = 0
                       LIMIT 5
                     ) LIMIT 1""",
                (c["entity"],),
            ).fetchone()
        elif c["entity"].startswith("fact:"):
            row = store._conn.execute(
                """SELECT value, created_at FROM triples
                   WHERE entity_id = ? AND attribute = 'value' AND retracted = 0
                   LIMIT 1""",
                (c["entity"],),
            ).fetchone()
        else:
            row = None
        if row:
            c["snippet"] = (row["value"] or "")[:140]
            if row["created_at"] and not c["last_seen"]:
                c["last_seen"] = row["created_at"]

    return results
909
+
910
+
911
def graph_children(db_path: str, entity: str, limit: int = 200) -> dict:
    """Lazy-load children of an entity for the web UI graph tree.

    Children are the entities whose triples point at ``entity`` through a
    typed ref (``value_type='ref'``), plus "legacy" children that store the
    pointer as a plain string slug under ``attribute='entity'``.

    Two-level grouping:

      • Top level: by edge attribute (the "kind" of relation — employed_by,
        related_to, etc.).
      • When a group has at least 8 children, at least 70% of them are
        ``fact:*`` and at least one carries a ``domain``, the group is
        sub-grouped by domain (children without one go under "other").

    Args:
        db_path: Path to the knowledge-graph database file.
        entity: Entity id to expand (e.g. ``entity:some-slug``).
        limit: Row cap applied separately to the typed-ref query and to the
            legacy string-ref query.

    Returns:
        ``{"entity": entity, "groups": [{label, edge_attr, children}]}`` where
        each child is ``{entity, fact_count, domain, snippet, expandable}``.
        ``groups`` is empty when the database file is missing or any error
        occurs (the error is written to stderr; this function does not raise).
    """
    if not Path(db_path).exists():
        return {"entity": entity, "groups": []}

    try:
        from contextlib import closing

        from triplestore import TripleStore

        # closing() guarantees store.close() runs even when a query raises.
        with closing(TripleStore(db_path)) as store:
            groups = _graph_children_groups(store, entity, limit)
        return {"entity": entity, "groups": groups}
    except Exception as e:
        sys.stderr.write(f"graph_children error: {e}\n")
        return {"entity": entity, "groups": []}


def _graph_children_groups(store, entity: str, limit: int) -> list[dict]:
    """Collect, annotate and group the children of ``entity`` in an open store.

    The caller owns ``store`` and is responsible for closing it.
    """
    # Find all triples where value=entity (backref lookup).
    rows = store._conn.execute(
        """SELECT entity_id, attribute FROM triples
           WHERE value = ? AND value_type = 'ref' AND retracted = 0
           LIMIT ?""",
        (entity, limit),
    ).fetchall()
    children_by_attr: dict[str, set[str]] = {}
    for r in rows:
        children_by_attr.setdefault(r["attribute"] or "related", set()) \
            .add(r["entity_id"])

    # Legacy string-typed refs: facts with attribute='entity', value=<slug>.
    slug_part = entity.split(":", 1)[1] if ":" in entity else entity
    legacy_rows = store._conn.execute(
        """SELECT DISTINCT entity_id FROM triples
           WHERE attribute = 'entity' AND value = ?
             AND value_type = 'string' AND retracted = 0
           LIMIT ?""",
        (slug_part, limit),
    ).fetchall()
    for r in legacy_rows:
        children_by_attr.setdefault("entity", set()).add(r["entity_id"])

    # Pre-fetch per-child metadata (fact_count, domain, value snippet,
    # has-its-own-backrefs). Four small queries per distinct child.
    all_children = {c for cs in children_by_attr.values() for c in cs}
    meta: dict[str, dict] = {}
    for child_eid in all_children:
        cnt = store._conn.execute(
            "SELECT COUNT(*) AS n FROM triples WHERE entity_id = ? AND retracted = 0",
            (child_eid,),
        ).fetchone()["n"]
        domain_row = store._conn.execute(
            """SELECT value FROM triples WHERE entity_id = ?
               AND attribute = 'domain' AND retracted = 0 LIMIT 1""",
            (child_eid,),
        ).fetchone()
        value_row = store._conn.execute(
            """SELECT value FROM triples WHERE entity_id = ?
               AND attribute = 'value' AND retracted = 0 LIMIT 1""",
            (child_eid,),
        ).fetchone()
        backref_row = store._conn.execute(
            """SELECT 1 FROM triples WHERE value = ? AND value_type = 'ref'
               AND retracted = 0 LIMIT 1""",
            (child_eid,),
        ).fetchone()
        meta[child_eid] = {
            "entity": child_eid,
            "fact_count": cnt,
            "domain": (domain_row["value"] if domain_row else None),
            "snippet": ((value_row["value"] or "")[:80] if value_row else ""),
            # A child can be expanded further when something refs it in turn.
            "expandable": bool(backref_row),
        }

    def by_fact_count(child: dict) -> tuple:
        # Children come out of a set, so break fact_count ties on the entity
        # id to keep the output order stable between calls.
        return (-child["fact_count"], child["entity"])

    out_groups: list[dict] = []
    for attr, child_set in sorted(children_by_attr.items()):
        entries = [meta[c] for c in child_set if c in meta]
        attr_label = attr.replace("_", " ").title()
        fact_share = (
            sum(1 for e in entries if e["entity"].startswith("fact:"))
            / max(1, len(entries))
        )

        # Only sub-group by domain when (a) the group is big enough that
        # flat would be unwieldy, (b) it's mostly facts, AND (c) we have at
        # least one usable domain signal — otherwise everything ends up in
        # a single catch-all bucket that hides the parent attribute label
        # ("About", "Mentions") which IS useful structure.
        if (len(entries) >= 8 and fact_share >= 0.7
                and any(e.get("domain") for e in entries)):
            by_domain: dict[str, list[dict]] = {}
            for e in entries:
                d = (e["domain"] or "other").lower()
                by_domain.setdefault(d, []).append(e)
            # Largest domain first; equal-sized domains in name order.
            for domain, group_entries in sorted(
                    by_domain.items(), key=lambda x: (-len(x[1]), x[0])):
                out_groups.append({
                    "label": f"{attr_label}: {domain.replace('_', ' ').title()}",
                    "edge_attr": f"{attr}:{domain}",
                    "children": sorted(group_entries, key=by_fact_count),
                })
        else:
            out_groups.append({
                "label": f"{attr_label} ({len(entries)})",
                "edge_attr": attr,
                "children": sorted(entries, key=by_fact_count),
            })

    return out_groups
1027
+
1028
+
587
1029
  def main() -> None:
588
1030
  parser = argparse.ArgumentParser(description="Graph Query")
589
1031
  parser.add_argument("--db", required=True, help="Path to knowledge-graph.db")
@@ -592,8 +1034,23 @@ def main() -> None:
592
1034
  parser.add_argument("--domain-counts", action="store_true", help="Show fact counts per domain")
593
1035
  parser.add_argument("--max-facts", type=int, default=5, help="Maximum facts to return")
594
1036
  parser.add_argument("--format", choices=["text", "json", "compact"], default="json", help="Output format")
1037
+ parser.add_argument("--search-entities", default=None, help="Search query for entity-prioritized lookup")
1038
+ parser.add_argument("--search-limit", type=int, default=20, help="Max entity results")
1039
+ parser.add_argument("--graph-children", default=None, help="Entity to expand for graph tree")
1040
+ parser.add_argument("--graph-limit", type=int, default=50, help="Max children per parent")
595
1041
  args = parser.parse_args()
596
1042
 
1043
+ if args.search_entities is not None:
1044
+ results = search_entities(args.db, args.search_entities, limit=args.search_limit)
1045
+ score_max = max((r["score"] for r in results), default=0.0)
1046
+ print(json.dumps({"results": results, "topic_fallback": score_max < 0.4}, ensure_ascii=False))
1047
+ return
1048
+
1049
+ if args.graph_children is not None:
1050
+ result = graph_children(args.db, args.graph_children, limit=args.graph_limit)
1051
+ print(json.dumps(result, ensure_ascii=False))
1052
+ return
1053
+
597
1054
  if args.domain_counts:
598
1055
  counts = domain_fact_counts(args.db)
599
1056
  print(json.dumps(counts, indent=2))
@@ -418,16 +418,33 @@ def _extract_entity_from_fact(fact_text: str, known_entities: list) -> str:
418
418
 
419
419
 
420
420
  def _facts_to_graph_ops(digest: dict) -> list[dict]:
421
- """Convert distiller facts/entities/decisions directly to graph ops.
421
+ """Convert ALL distiller output + raw feed items to graph ops.
422
422
 
423
- DETERMINISTIC — no LLM needed. The distiller already extracted structured
424
- facts with entity names. This function mechanically converts them to
425
- assert operations for the triplestore.
423
+ DETERMINISTIC — no LLM needed. Stores distilled knowledge (facts,
424
+ decisions, patterns, preferences, summary) AND verbatim raw captures
425
+ (audio quotes, agent analysis) so the triplestore is the single
426
+ source of truth for session recall.
426
427
  """
427
428
  ops = []
428
429
  known_entities = digest.get("entities", [])
430
+ raw_items = digest.pop("_rawItems", None) or []
429
431
 
430
- # Each fact becomes an assert op
432
+ # Session anchor from whatHappened
433
+ session_ts = digest.get("ts", "")[:16] # "2026-05-07T10:08"
434
+ session_eid = f"session:{session_ts}" if session_ts else None
435
+ if session_eid and digest.get("whatHappened"):
436
+ ops.append({
437
+ "op": "assert",
438
+ "entity": session_ts,
439
+ "attribute": "value",
440
+ "value": digest["whatHappened"],
441
+ "confidence": 0.9,
442
+ "domain": "session",
443
+ "kind": "distilled",
444
+ "session_ref": session_eid,
445
+ })
446
+
447
+ # Facts (distilled)
431
448
  for fact_text in digest.get("facts", []):
432
449
  if not fact_text or len(fact_text) < 5:
433
450
  continue
@@ -435,13 +452,14 @@ def _facts_to_graph_ops(digest: dict) -> list[dict]:
435
452
  ops.append({
436
453
  "op": "assert",
437
454
  "entity": entity,
438
- "attribute": "fact",
455
+ "attribute": "value",
439
456
  "value": fact_text,
440
457
  "confidence": 0.9,
441
- "domain": "",
458
+ "kind": "distilled",
459
+ "session_ref": session_eid,
442
460
  })
443
461
 
444
- # Each decision becomes an assert with lower confidence (time-bound)
462
+ # Decisions (distilled, lower confidence time-bound)
445
463
  for decision_text in digest.get("decisions", []):
446
464
  if not decision_text or len(decision_text) < 5:
447
465
  continue
@@ -449,10 +467,63 @@ def _facts_to_graph_ops(digest: dict) -> list[dict]:
449
467
  ops.append({
450
468
  "op": "assert",
451
469
  "entity": entity,
452
- "attribute": "decision",
470
+ "attribute": "value",
453
471
  "value": decision_text,
454
472
  "confidence": 0.7,
455
- "domain": "",
473
+ "kind": "distilled",
474
+ "session_ref": session_eid,
475
+ })
476
+
477
+ # Patterns + Preferences (distilled)
478
+ for text in digest.get("patterns", []) + digest.get("preferences", []):
479
+ if not text or not isinstance(text, str) or len(text) < 5:
480
+ continue
481
+ entity = _extract_entity_from_fact(text, known_entities)
482
+ ops.append({
483
+ "op": "assert",
484
+ "entity": entity,
485
+ "attribute": "value",
486
+ "value": text,
487
+ "confidence": 0.7,
488
+ "kind": "distilled",
489
+ "session_ref": session_eid,
490
+ })
491
+
492
+ # Verbatim audio quotes (top 20 by length, > 30 chars)
493
+ audio = [i for i in raw_items
494
+ if i.get("source") == "audio" and len(i.get("text", "")) > 30]
495
+ for item in sorted(audio, key=lambda x: -len(x.get("text", "")))[:20]:
496
+ text = re.sub(r"^\[.*?\]\s*", "", item["text"]) # strip emoji prefixes
497
+ if len(text) < 20:
498
+ continue
499
+ entity = _extract_entity_from_fact(text, known_entities)
500
+ ops.append({
501
+ "op": "assert",
502
+ "entity": entity,
503
+ "attribute": "value",
504
+ "value": text,
505
+ "confidence": 0.95,
506
+ "kind": "verbatim",
507
+ "session_ref": session_eid,
508
+ })
509
+
510
+ # Agent analysis responses (last 10, > 50 chars — verbatim)
511
+ agents = [i for i in raw_items
512
+ if i.get("source") in ("agent", "openclaw")
513
+ and len(i.get("text", "")) > 50]
514
+ for item in agents[-10:]:
515
+ text = re.sub(r"^\[.*?\]\s*", "", item["text"]) # strip emoji prefixes
516
+ if len(text) < 30:
517
+ continue
518
+ entity = _extract_entity_from_fact(text, known_entities)
519
+ ops.append({
520
+ "op": "assert",
521
+ "entity": entity,
522
+ "attribute": "value",
523
+ "value": text,
524
+ "confidence": 0.8,
525
+ "kind": "verbatim",
526
+ "session_ref": session_eid,
456
527
  })
457
528
 
458
529
  return ops
@@ -506,6 +577,12 @@ def _execute_graph_ops(db_path: str, ops: list[dict], digest_ts: str, digest_ent
506
577
  store.assert_triple(tx, entity_id, "reinforce_count", "1")
507
578
  if domain:
508
579
  store.assert_triple(tx, entity_id, "domain", domain)
580
+ kind = op_data.get("kind", "distilled")
581
+ store.assert_triple(tx, entity_id, "kind", kind)
582
+ # Link to session anchor via ref edge
583
+ session_ref = op_data.get("session_ref")
584
+ if session_ref:
585
+ store.assert_triple(tx, entity_id, "session", session_ref, value_type="ref")
509
586
  # Auto-tag for keyword-based discovery
510
587
  for tag in _extract_tags(value):
511
588
  store.assert_triple(tx, entity_id, "tag", tag)