@geravant/sinain 1.18.3 → 1.20.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -584,6 +584,446 @@ def domain_fact_counts(db_path: str) -> dict[str, int]:
584
584
  return {}
585
585
 
586
586
 
587
def _slug_variants(query: str) -> set[str]:
    """Return lowercase slug spellings of a free-text query.

    The query is lowercased and rendered in several separator styles so that
    'Al Futaim', 'al-futaim', 'al_futaim' and 'alfutaim' can all reach the
    same stored slug.

    Two families of variants are produced and unioned:

    * whitespace-joined: words split on whitespace only, joined with ``-``,
      with ``_`` and with nothing, plus the lowercased query with plain
      spaces removed;
    * separator-agnostic: words split on whitespace, ``-`` and ``_`` alike,
      joined the same three ways. Without this family a query typed with
      underscores never produced its hyphenated form.

    Args:
        query: Raw user input.

    Returns:
        A set of non-empty variant strings; an empty set when the query is
        empty or whitespace-only.
    """
    lowered = query.lower()
    norm = "-".join(lowered.split())
    if not norm:
        return set()

    variants = {
        norm,
        norm.replace("-", ""),
        norm.replace("-", "_"),
        lowered.replace(" ", ""),
    }

    # Treat '-' and '_' as word separators too, so every separator style the
    # user might type maps onto every separator style a slug might use.
    words = lowered.replace("_", " ").replace("-", " ").split()
    if words:
        variants.update(("-".join(words), "_".join(words), "".join(words)))

    # Separator-only input (e.g. '---') would otherwise contribute an empty
    # slug, which can never identify an entity.
    variants.discard("")
    return variants
604
+
605
+
606
# Common English function words plus a handful of noisy internet words.
# The per-token passes (prefix wildcard, exact tag match) ignore these: as
# single tokens they hit far too much of the corpus — the original notes
# record "not*" matching 518 rows and "real*" matching 88 in a test DB.
# The main FTS5 query is deliberately NOT filtered with this set, because
# phrase-like multi-word matches benefit from keeping the words.
_STOPWORDS = frozenset(
    """
    a an and are as at be but by do did
    does for from had has have he her him his
    how i if in is it its me my no not of
    on or our she so than that the their them
    then there these they this to was we were
    what when where which who why will with you
    your yes real true false
    """.split()
)
620
+
621
+
622
def _fts5_safe_tokens(query: str) -> list[str]:
    """Reduce a raw query to plain lowercase tokens of two or more characters.

    The query is lowercased, every character that is neither a word
    character nor whitespace is replaced by a space, and the result is split
    on whitespace. One-character pieces are dropped.

    The purpose is to neutralise FTS5 query syntax: characters such as
    ``"()*+-^|`` and the keywords AND/OR/NOT/NEAR act as operators there, so
    raw user input can yield surprising matches or syntax errors. Callers
    rebuild their queries from the returned token list.
    """
    import re

    lowered = query.lower()
    spaced = re.sub(r"[^\w\s]", " ", lowered, flags=re.UNICODE)

    tokens: list[str] = []
    for piece in spaced.split():
        if len(piece) > 1:
            tokens.append(piece)
    return tokens
632
+
633
+
634
def _significant_tokens(query: str) -> list[str]:
    """Return the cleaned query tokens that merit a per-token search pass.

    A token qualifies when it is at least three characters long and is not
    listed in ``_STOPWORDS``. Order follows ``_fts5_safe_tokens``.
    """
    kept: list[str] = []
    for tok in _fts5_safe_tokens(query):
        if len(tok) < 3 or tok in _STOPWORDS:
            continue
        kept.append(tok)
    return kept
638
+
639
+
640
def search_entities(db_path: str, query: str, limit: int = 20) -> list[dict]:
    """High-recall entity search for the web UI search bar.

    Design note (from the original author): FTS5 indexes the ``value``
    column of *every* triple, not only the ``value`` attribute of facts, so
    tags and stringified refs are searchable as well. An earlier predicate
    ``AND t.attribute = 'value'`` discarded most of that signal; the FTS
    passes below therefore run without an attribute filter.

    Passes, unioned into one ranked candidate list:
      1. Exact entity_id match for each slug variant (hyphen / underscore /
         no separator), tried under both ``entity:`` and ``fact:`` → 2.0.
      2. entity_id LIKE substring match for each variant → 1.0. Catches
         ``fact:al-futtaim-cto-...`` when the user types ``al futtaim``.
      3. FTS5 MATCH on the raw query over the full index. Each hit adds a
         weight by attribute: tag=0.3, value=0.2, anything else=0.1.
      4. FTS5 prefix wildcard (``term*``) per significant token of four or
         more characters, run only when pass 3 returned fewer than five
         rows → +0.05 per hit.
      5. Exact tag match (``attribute='tag' AND LOWER(value) = ?``) per
         significant token → +0.4 per hit.
      6. Snippet backfill for returned results that still have no snippet.

    Scores of 1.0 or more are combined with ``max``; scores below 1.0
    accumulate. The final score is rounded to 3 decimal places and is not
    capped.

    Args:
        db_path: Path to the knowledge-graph database file.
        query: Free-text search string.
        limit: Maximum number of results returned.

    Returns:
        A list of dicts with keys ``entity``, ``type``, ``score``,
        ``fact_count``, ``snippet`` and ``last_seen``, ordered by descending
        score and then descending fact_count. An empty list is returned when
        the database file does not exist, the query is blank, or any error
        occurs (the error is written to stderr).
    """
    if not Path(db_path).exists() or not query.strip():
        return []

    # Bound before the ``try`` so the ``finally`` clause can tell whether a
    # store was ever opened.
    store = None
    try:
        from triplestore import TripleStore
        store = TripleStore(db_path)

        candidates: dict[str, dict] = {}
        # Cache outgoing-ref lookups — many FTS hits share the same fact_eid.
        ref_cache: dict[str, str | None] = {}

        def lookup_outbound_ref(fact_eid: str) -> str | None:
            """Return one ``entity:`` ref target of *fact_eid*, or None."""
            if fact_eid in ref_cache:
                return ref_cache[fact_eid]
            ref_row = store._conn.execute(
                """SELECT value FROM triples
                   WHERE entity_id = ? AND value_type = 'ref' AND retracted = 0
                   LIMIT 1""",
                (fact_eid,),
            ).fetchone()
            v = ref_row["value"] if ref_row else None
            ref_cache[fact_eid] = v if v and str(v).startswith("entity:") else None
            return ref_cache[fact_eid]

        def upsert(eid: str, *, score: float = 0.0, snippet: str = "",
                   ts: str | None = None) -> dict:
            """Create or update the candidate for *eid* and return it."""
            entry = candidates.setdefault(eid, {
                "entity": eid,
                "type": eid.split(":", 1)[0] if ":" in eid else "unknown",
                "score": 0.0, "fact_count": 0,
                "snippet": "", "last_seen": None,
            })
            # For exact-match scores (>=1.0) take the max; for evidence
            # contributions (<1.0) accumulate so multiple weak hits stack.
            if score >= 1.0:
                entry["score"] = max(entry["score"], score)
            else:
                entry["score"] += score
            # First non-empty snippet wins, truncated for display.
            if snippet and not entry["snippet"]:
                entry["snippet"] = snippet[:140]
            # Keep the greatest timestamp seen (string comparison).
            if ts and (entry["last_seen"] is None or ts > entry["last_seen"]):
                entry["last_seen"] = ts
            return entry

        # ── Pass 1: exact slug match for each variant ────────────────────
        variants = _slug_variants(query)
        for variant in variants:
            for prefix in ("entity:", "fact:"):
                eid = f"{prefix}{variant}"
                row = store._conn.execute(
                    "SELECT 1 FROM triples WHERE entity_id = ? AND retracted = 0 LIMIT 1",
                    (eid,),
                ).fetchone()
                if row:
                    upsert(eid, score=2.0)

        # ── Pass 2: entity_id substring LIKE ─────────────────────────────
        for variant in variants:
            if len(variant) < 2:
                continue
            rows = store._conn.execute(
                """SELECT DISTINCT entity_id FROM triples
                   WHERE retracted = 0
                     AND (entity_id LIKE ? OR entity_id LIKE ?)
                   LIMIT 200""",
                (f"entity:%{variant}%", f"fact:%{variant}%"),
            ).fetchall()
            for r in rows:
                upsert(r["entity_id"], score=1.0)

        # ── Pass 3: FTS5 over the FULL index (the big recall fix) ────────
        # Per-hit score weighted by attribute: tags carry the strongest
        # topical signal because they're auto-extracted keywords.
        attr_weight = {"tag": 0.3, "value": 0.2}
        try:
            fts_rows = store._conn.execute(
                """SELECT t.entity_id, t.attribute, t.value, t.created_at
                   FROM triples_fts fts
                   JOIN triples t ON fts.rowid = t.id
                   WHERE triples_fts MATCH ? AND t.retracted = 0
                   LIMIT 500""",
                (query,),
            ).fetchall()
        except Exception:
            # Defang the query and retry with cleaned tokens; fall back to
            # LIKE if FTS5 itself is unavailable.
            tokens = _fts5_safe_tokens(query)
            if tokens:
                try:
                    fts_rows = store._conn.execute(
                        """SELECT t.entity_id, t.attribute, t.value, t.created_at
                           FROM triples_fts fts JOIN triples t ON fts.rowid = t.id
                           WHERE triples_fts MATCH ? AND t.retracted = 0
                           LIMIT 500""",
                        (" ".join(tokens),),
                    ).fetchall()
                except Exception:
                    conds = " OR ".join(["LOWER(value) LIKE ?"] * len(tokens))
                    params = [f"%{t}%" for t in tokens]
                    fts_rows = store._conn.execute(
                        f"""SELECT entity_id, attribute, value, created_at FROM triples
                            WHERE retracted = 0 AND ({conds}) LIMIT 500""",
                        params,
                    ).fetchall()
            else:
                fts_rows = []

        for r in fts_rows:
            fact_eid = r["entity_id"]
            attr = r["attribute"]
            value = r["value"] or ""
            ts = r["created_at"]
            # Credit the entity the fact points at when it has an outbound
            # ``entity:`` ref; otherwise credit the fact itself.
            target_eid = lookup_outbound_ref(fact_eid) or fact_eid
            weight = attr_weight.get(attr, 0.1)
            # Only feed snippets from real value text — tags are noisy as
            # snippets ("citibank", "cto", ...).
            snippet_text = value if attr == "value" else ""
            entry = upsert(target_eid, score=weight,
                           snippet=snippet_text, ts=ts)
            if attr == "value":
                entry["fact_count"] += 1

        # ── Pass 4: prefix wildcards — only when Pass 3 was dry ─────────
        # "intel" → "intellij", "intelligence", etc. FTS5 prefix is
        # forward-only, so this is for partial input (still typing). For
        # multi-word queries we trust FTS5's implicit AND in Pass 3 for
        # precision; running OR'd per-token prefix here would give common
        # nouns like "real" enough hits (88+ rows) to drown signal.
        sig_tokens = _significant_tokens(query)
        if len(fts_rows) < 5 and sig_tokens:
            for token in sig_tokens:
                if len(token) < 4:
                    continue  # 3-char prefixes match too broadly
                try:
                    # Order by FTS5 rank so the LIMIT keeps the most relevant
                    # rows rather than an arbitrary subset.
                    prefix_rows = store._conn.execute(
                        """SELECT t.entity_id, t.attribute, t.value, t.created_at
                           FROM triples_fts fts JOIN triples t ON fts.rowid = t.id
                           WHERE triples_fts MATCH ? AND t.retracted = 0
                           ORDER BY rank
                           LIMIT 300""",
                        (token + "*",),
                    ).fetchall()
                except Exception:
                    prefix_rows = []
                for r in prefix_rows:
                    fact_eid = r["entity_id"]
                    target_eid = lookup_outbound_ref(fact_eid) or fact_eid
                    upsert(target_eid, score=0.05,
                           snippet=(r["value"] if r["attribute"] == "value" else ""),
                           ts=r["created_at"])

        # ── Pass 5: direct tag exact-match (high-precision boost) ────────
        # Stopword guard prevents "not"/"are"/"with" from blanket-boosting
        # entities tagged with those (which they shouldn't be, but real
        # auto-tag pipelines occasionally produce them).
        for token in sig_tokens:
            rows = store._conn.execute(
                """SELECT DISTINCT entity_id FROM triples
                   WHERE attribute = 'tag' AND LOWER(value) = ? AND retracted = 0
                   LIMIT 200""",
                (token,),
            ).fetchall()
            for r in rows:
                fact_eid = r["entity_id"]
                target_eid = lookup_outbound_ref(fact_eid) or fact_eid
                upsert(target_eid, score=0.4)

        # Compute fact_count for candidates that got no value-attribute hit.
        for eid, entry in candidates.items():
            if entry["fact_count"] == 0:
                # Count facts that reference this entity via any ref attribute.
                cnt_row = store._conn.execute(
                    """SELECT COUNT(DISTINCT entity_id) AS n FROM triples
                       WHERE value = ? AND value_type = 'ref' AND retracted = 0""",
                    (eid,),
                ).fetchone()
                entry["fact_count"] = int(cnt_row["n"]) if cnt_row else 0
                # If no incoming refs but the entity itself has triples, count
                # that as 1 for display (it's at least a real entity).
                if entry["fact_count"] == 0:
                    self_row = store._conn.execute(
                        "SELECT 1 FROM triples WHERE entity_id = ? AND retracted = 0 LIMIT 1",
                        (eid,),
                    ).fetchone()
                    if self_row:
                        entry["fact_count"] = 1

        # Round for display; no hard cap — exact matches start at 2.0 and
        # evidence accumulation below 1.0 should be allowed to sum freely so
        # entities with many independent hit types out-rank one-trick hits.
        for c in candidates.values():
            c["score"] = round(c["score"], 3)

        results = sorted(candidates.values(),
                         key=lambda x: (-x["score"], -x["fact_count"]))[:limit]

        # Backfill snippets for top results that came from slug-only matches
        # (no FTS hit on value text). Bounded to `limit` queries — cheap.
        for c in results:
            if c["snippet"]:
                continue
            if c["entity"].startswith("entity:"):
                row = store._conn.execute(
                    """SELECT t.value, t.created_at FROM triples t
                       WHERE t.attribute = 'value' AND t.retracted = 0
                         AND t.entity_id IN (
                             SELECT entity_id FROM triples
                             WHERE value = ? AND value_type = 'ref' AND retracted = 0
                             LIMIT 5
                         ) LIMIT 1""",
                    (c["entity"],),
                ).fetchone()
                if row:
                    c["snippet"] = (row["value"] or "")[:140]
                    if row["created_at"] and not c["last_seen"]:
                        c["last_seen"] = row["created_at"]
            elif c["entity"].startswith("fact:"):
                row = store._conn.execute(
                    """SELECT value, created_at FROM triples
                       WHERE entity_id = ? AND attribute = 'value' AND retracted = 0
                       LIMIT 1""",
                    (c["entity"],),
                ).fetchone()
                if row:
                    c["snippet"] = (row["value"] or "")[:140]
                    if row["created_at"] and not c["last_seen"]:
                        c["last_seen"] = row["created_at"]

        return results
    except Exception as e:
        sys.stderr.write(f"search_entities error: {e}\n")
        return []
    finally:
        # Release the store on every exit path. Previously it was closed only
        # after a fully successful search, so any failure leaked it.
        if store is not None:
            try:
                store.close()
            except Exception as close_err:
                sys.stderr.write(f"search_entities close error: {close_err}\n")
907
+
908
+
909
def graph_children(db_path: str, entity: str, limit: int = 200) -> dict:
    """Lazy-load children of an entity for the web UI graph tree.

    Children are the entity_ids of non-retracted triples whose ``value`` is
    *entity* with ``value_type='ref'`` (back-references). Grouping is
    two-level:

      • Top level: by edge attribute (the "kind" of relation — employed_by,
        related_to, etc.); a missing attribute is grouped as ``related``.
      • A group is split further by each child's ``domain`` attribute when
        it has at least 8 entries, at least 70% of them are ``fact:*``
        entities, and at least one entry has a domain. Children without a
        domain land in ``other``.

    A second "legacy" pass also collects children that store the pointer as
    a plain string: ``attribute='entity'``, ``value=<slug>``,
    ``value_type='string'``, where the slug is *entity* without its
    ``type:`` prefix. These go into the ``entity`` group.

    Args:
        db_path: Path to the knowledge-graph database file.
        entity: Entity id to expand.
        limit: Row cap applied to each of the two child queries separately.

    Returns:
        ``{"entity": entity, "groups": [{"label", "edge_attr", "children"}]}``
        where each child is a dict with ``entity``, ``fact_count``,
        ``domain``, ``snippet`` and ``expandable``. ``groups`` is empty when
        the database file does not exist or any error occurs (the error is
        written to stderr).
    """
    if not Path(db_path).exists():
        return {"entity": entity, "groups": []}

    # Bound before the ``try`` so the ``finally`` clause can tell whether a
    # store was ever opened.
    store = None
    try:
        from triplestore import TripleStore
        store = TripleStore(db_path)

        # Find all triples where value=entity (typed back-references).
        rows = store._conn.execute(
            """SELECT entity_id, attribute FROM triples
               WHERE value = ? AND value_type = 'ref' AND retracted = 0
               LIMIT ?""",
            (entity, limit),
        ).fetchall()
        children_by_attr: dict[str, set[str]] = {}
        for r in rows:
            children_by_attr.setdefault(r["attribute"] or "related", set()) \
                .add(r["entity_id"])

        # Legacy string-typed refs: facts with attribute='entity', value=<slug>.
        slug_part = entity.split(":", 1)[1] if ":" in entity else entity
        legacy_rows = store._conn.execute(
            """SELECT DISTINCT entity_id FROM triples
               WHERE attribute = 'entity' AND value = ?
                 AND value_type = 'string' AND retracted = 0
               LIMIT ?""",
            (slug_part, limit),
        ).fetchall()
        for r in legacy_rows:
            children_by_attr.setdefault("entity", set()).add(r["entity_id"])

        # Pre-fetch per-child metadata (triple count, domain, value snippet,
        # has-its-own-backrefs) once per distinct child.
        all_children = {c for cs in children_by_attr.values() for c in cs}
        meta: dict[str, dict] = {}
        for child_eid in all_children:
            cnt = store._conn.execute(
                "SELECT COUNT(*) AS n FROM triples WHERE entity_id = ? AND retracted = 0",
                (child_eid,),
            ).fetchone()["n"]
            domain_row = store._conn.execute(
                """SELECT value FROM triples WHERE entity_id = ?
                   AND attribute = 'domain' AND retracted = 0 LIMIT 1""",
                (child_eid,),
            ).fetchone()
            value_row = store._conn.execute(
                """SELECT value FROM triples WHERE entity_id = ?
                   AND attribute = 'value' AND retracted = 0 LIMIT 1""",
                (child_eid,),
            ).fetchone()
            backref_row = store._conn.execute(
                """SELECT 1 FROM triples WHERE value = ? AND value_type = 'ref'
                   AND retracted = 0 LIMIT 1""",
                (child_eid,),
            ).fetchone()
            meta[child_eid] = {
                "entity": child_eid,
                "fact_count": cnt,
                "domain": (domain_row["value"] if domain_row else None),
                "snippet": ((value_row["value"] or "")[:80] if value_row else ""),
                # A child can be expanded when something references it in turn.
                "expandable": bool(backref_row),
            }

        out_groups: list[dict] = []
        for attr, child_set in sorted(children_by_attr.items()):
            entries = [meta[c] for c in child_set if c in meta]
            attr_label = attr.replace("_", " ").title()
            fact_share = sum(1 for e in entries if e["entity"].startswith("fact:")) / max(1, len(entries))

            # Only sub-group by domain when (a) the group is big enough that
            # flat would be unwieldy, (b) it's mostly facts, AND (c) we have at
            # least one usable domain signal — otherwise everything ends up in
            # an "Uncategorized" bucket that hides the parent attribute label
            # ("About", "Mentions") which IS useful structure.
            if (len(entries) >= 8 and fact_share >= 0.7
                    and any(e.get("domain") for e in entries)):
                by_domain: dict[str, list[dict]] = {}
                for e in entries:
                    d = (e["domain"] or "other").lower()
                    by_domain.setdefault(d, []).append(e)
                # Largest domain buckets first.
                for domain, group_entries in sorted(by_domain.items(),
                                                    key=lambda x: -len(x[1])):
                    out_groups.append({
                        "label": f"{attr_label}: {domain.replace('_', ' ').title()}",
                        "edge_attr": f"{attr}:{domain}",
                        "children": sorted(group_entries, key=lambda x: -x["fact_count"]),
                    })
            else:
                out_groups.append({
                    "label": f"{attr_label} ({len(entries)})",
                    "edge_attr": attr,
                    "children": sorted(entries, key=lambda x: -x["fact_count"]),
                })

        return {"entity": entity, "groups": out_groups}
    except Exception as e:
        sys.stderr.write(f"graph_children error: {e}\n")
        return {"entity": entity, "groups": []}
    finally:
        # Release the store on every exit path. Previously it was closed only
        # on success, so any failure leaked it.
        if store is not None:
            try:
                store.close()
            except Exception as close_err:
                sys.stderr.write(f"graph_children close error: {close_err}\n")
1025
+
1026
+
587
1027
  def main() -> None:
588
1028
  parser = argparse.ArgumentParser(description="Graph Query")
589
1029
  parser.add_argument("--db", required=True, help="Path to knowledge-graph.db")
@@ -592,8 +1032,23 @@ def main() -> None:
592
1032
  parser.add_argument("--domain-counts", action="store_true", help="Show fact counts per domain")
593
1033
  parser.add_argument("--max-facts", type=int, default=5, help="Maximum facts to return")
594
1034
  parser.add_argument("--format", choices=["text", "json", "compact"], default="json", help="Output format")
1035
+ parser.add_argument("--search-entities", default=None, help="Search query for entity-prioritized lookup")
1036
+ parser.add_argument("--search-limit", type=int, default=20, help="Max entity results")
1037
+ parser.add_argument("--graph-children", default=None, help="Entity to expand for graph tree")
1038
+ parser.add_argument("--graph-limit", type=int, default=50, help="Max children per parent")
595
1039
  args = parser.parse_args()
596
1040
 
1041
+ if args.search_entities is not None:
1042
+ results = search_entities(args.db, args.search_entities, limit=args.search_limit)
1043
+ score_max = max((r["score"] for r in results), default=0.0)
1044
+ print(json.dumps({"results": results, "topic_fallback": score_max < 0.4}, ensure_ascii=False))
1045
+ return
1046
+
1047
+ if args.graph_children is not None:
1048
+ result = graph_children(args.db, args.graph_children, limit=args.graph_limit)
1049
+ print(json.dumps(result, ensure_ascii=False))
1050
+ return
1051
+
597
1052
  if args.domain_counts:
598
1053
  counts = domain_fact_counts(args.db)
599
1054
  print(json.dumps(counts, indent=2))