PyPI - topos-node - Versions diffs - 0.1.0__py3-none-any.whl - Mend

topos-node 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (249) hide show

shared/__init__.py +59 -0
shared/filtering.py +640 -0
shared/schema_registry.py +229 -0
topos/__init__.py +5 -0
topos/__version__.py +6 -0
topos/analytics/__init__.py +15 -0
topos/analytics/duckdb_adapter.py +48 -0
topos/analytics/messenger_communities.py +349 -0
topos/analytics/messenger_graph.py +522 -0
topos/analytics/messenger_labels.py +321 -0
topos/analytics/profiles.py +22 -0
topos/analytics/query_engine.py +64 -0
topos/analytics/raw_queries.py +174 -0
topos/api/__init__.py +1 -0
topos/api/analytics.py +52 -0
topos/api/app_registry.py +31 -0
topos/api/backup.py +15 -0
topos/api/compute_remote.py +175 -0
topos/api/data_commit.py +158 -0
topos/api/data_explorer_table_prefs.py +81 -0
topos/api/db.py +10 -0
topos/api/device.py +25 -0
topos/api/enrichment.py +959 -0
topos/api/filter_lab.py +195 -0
topos/api/health.py +61 -0
topos/api/ingestion_api.py +37 -0
topos/api/ingestion_compat.py +21 -0
topos/api/ingestion_sources.py +600 -0
topos/api/llm.py +76 -0
topos/api/local_mcp.py +46 -0
topos/api/messenger_analytics.py +385 -0
topos/api/query_api.py +13 -0
topos/api/sanitization_ollama_config.py +64 -0
topos/api/source_install.py +324 -0
topos/api/sources.py +13 -0
topos/api/sync.py +10 -0
topos/api/ui_config.py +83 -0
topos/api/uma_data.py +311 -0
topos/api/usage.py +49 -0
topos/api/user_identity.py +46 -0
topos/app.py +239 -0
topos/auth.py +17 -0
topos/canonicalization/__init__.py +1 -0
topos/canonicalization/mappers/__init__.py +22 -0
topos/canonicalization/mappers/base.py +26 -0
topos/canonicalization/mappers/chatgpt_mapper.py +40 -0
topos/canonicalization/mappers/grok_mapper.py +17 -0
topos/canonicalization/mappers/messenger_mapper.py +58 -0
topos/canonicalization/models.py +31 -0
topos/canonicalization/resolver.py +23 -0
topos/cli/__init__.py +1 -0
topos/cli/__main__.py +6 -0
topos/cli/commands.py +132 -0
topos/config/__init__.py +1 -0
topos/config/sanitization_ollama.py +189 -0
topos/config/settings.py +310 -0
topos/contacts/__init__.py +5 -0
topos/contacts/identity.py +24 -0
topos/control_plane_client.py +300 -0
topos/core/__init__.py +1 -0
topos/core/api_models.py +128 -0
topos/core/connection_resilience.py +99 -0
topos/core/device_helpers.py +8 -0
topos/core/errors.py +13 -0
topos/core/events.py +12 -0
topos/core/handlers.py +5625 -0
topos/core/logging.py +175 -0
topos/core/metrics.py +21 -0
topos/core/startup_banner.py +62 -0
topos/core/state.py +682 -0
topos/core/table_layers.py +45 -0
topos/core/types.py +13 -0
topos/data_explorer_table_prefs.py +150 -0
topos/engine/__init__.py +29 -0
topos/engine/backends/__init__.py +50 -0
topos/engine/backends/base.py +21 -0
topos/engine/backends/huggingface.py +151 -0
topos/engine/backends/ollama.py +181 -0
topos/engine/backends/stub.py +22 -0
topos/engine/engine.py +165 -0
topos/engine/intake.py +32 -0
topos/engine/queue_manager.py +112 -0
topos/engine/registration.py +126 -0
topos/engine/result_formatter.py +38 -0
topos/engine/router.py +19 -0
topos/engine/scoped_token.py +82 -0
topos/engine/tasks.py +154 -0
topos/engine/transport.py +44 -0
topos/engine/usage_guard.py +100 -0
topos/engine/usage_observation.py +129 -0
topos/engine/validator.py +23 -0
topos/enrichment/__init__.py +1 -0
topos/enrichment/derived_tables.py +214 -0
topos/enrichment/jobs/__init__.py +30 -0
topos/enrichment/jobs/base.py +54 -0
topos/enrichment/jobs/canonical/__init__.py +1 -0
topos/enrichment/jobs/canonical/embeddings_job.py +27 -0
topos/enrichment/jobs/canonical/emo_27_job.py +97 -0
topos/enrichment/jobs/canonical/entities_job.py +27 -0
topos/enrichment/jobs/canonical/sentiment_job.py +27 -0
topos/enrichment/jobs/canonical/topics_job.py +27 -0
topos/enrichment/jobs/raw/__init__.py +1 -0
topos/enrichment/jobs/raw/attachments_job.py +12 -0
topos/enrichment/jobs/raw/language_job.py +12 -0
topos/enrichment/jobs/raw/time_normalization_job.py +12 -0
topos/enrichment/jobs/raw/tool_calls_job.py +12 -0
topos/enrichment/models/__init__.py +1 -0
topos/enrichment/models/manager.py +8 -0
topos/enrichment/models/registry.py +71 -0
topos/enrichment/models/versioning.py +8 -0
topos/enrichment/orchestrator.py +177 -0
topos/enrichment/processor.py +17 -0
topos/enrichment/progress_bar.py +122 -0
topos/enrichment/website_classifier.py +31 -0
topos/filter_lab/__init__.py +1 -0
topos/filter_lab/bundles.py +300 -0
topos/filter_lab/schema.py +86 -0
topos/filter_lab/service.py +167 -0
topos/filter_lab/store.py +374 -0
topos/filter_lab/worker.py +250 -0
topos/hosted_pool_lease.py +153 -0
topos/ingestion/__init__.py +1 -0
topos/ingestion/checkpoints/__init__.py +6 -0
topos/ingestion/checkpoints/checkpoint_store.py +24 -0
topos/ingestion/checkpoints/sqlite_checkpoint_store.py +82 -0
topos/ingestion/ingest_helpers.py +504 -0
topos/ingestion/jobs.py +91 -0
topos/ingestion/local_sync.py +823 -0
topos/ingestion/log_preview.py +21 -0
topos/ingestion/manager.py +1100 -0
topos/ingestion/parser.py +174 -0
topos/ingestion/parsers/__init__.py +32 -0
topos/ingestion/parsers/base.py +24 -0
topos/ingestion/parsers/browser_parser.py +171 -0
topos/ingestion/parsers/calendar_parser.py +21 -0
topos/ingestion/parsers/chatgpt_conversation_flattener.py +266 -0
topos/ingestion/parsers/chatgpt_parser.py +67 -0
topos/ingestion/parsers/grok_parser.py +21 -0
topos/ingestion/parsers/messenger_parser.py +97 -0
topos/ingestion/progress.py +54 -0
topos/ingestion/sources/__init__.py +20 -0
topos/ingestion/sources/base.py +39 -0
topos/ingestion/sources/calendar.py +29 -0
topos/ingestion/sources/chatgpt.py +29 -0
topos/ingestion/sources/contact_importers.py +274 -0
topos/ingestion/sources/grok.py +29 -0
topos/ingestion/sources/imessage_reader.py +479 -0
topos/ingestion/sources/signal_export_parser.py +132 -0
topos/ingestion/sources/signal_reader.py +491 -0
topos/ingestion/state_machine.py +70 -0
topos/ingestion/triggers/__init__.py +1 -0
topos/ingestion/triggers/file_trigger.py +36 -0
topos/ingestion/triggers/sqlite_trigger.py +18 -0
topos/ingestion/validation/__init__.py +1 -0
topos/ingestion/validation/base.py +27 -0
topos/ingestion/validation/schema_registry.py +111 -0
topos/ingestion/validation/schema_validator.py +13 -0
topos/lineage/__init__.py +1 -0
topos/lineage/provenance.py +9 -0
topos/lineage/tracker.py +9 -0
topos/mcp_stdio_proxy.py +83 -0
topos/observability/__init__.py +1 -0
topos/observability/alerts.py +7 -0
topos/observability/metrics.py +25 -0
topos/observability/tracing.py +18 -0
topos/openai_client.py +69 -0
topos/projections/__init__.py +1 -0
topos/projections/vector_index/__init__.py +1 -0
topos/projections/vector_index/base.py +21 -0
topos/projections/vector_index/builders.py +11 -0
topos/projections/vector_index/health_checks.py +5 -0
topos/rate_limit.py +43 -0
topos/sanitization/__init__.py +16 -0
topos/sanitization/ollama_transforms.py +276 -0
topos/scope_resolution.py +89 -0
topos/services/__init__.py +1 -0
topos/services/container.py +46 -0
topos/services/embeddings/__init__.py +1 -0
topos/services/embeddings/base.py +7 -0
topos/services/embeddings/local.py +9 -0
topos/services/embeddings/remote.py +9 -0
topos/services/interfaces.py +40 -0
topos/services/llm/__init__.py +1 -0
topos/services/llm/base.py +7 -0
topos/services/llm/openai.py +126 -0
topos/services/local.py +123 -0
topos/services/postgres.py +385 -0
topos/sources/__init__.py +6 -0
topos/sources/definitions.py +114 -0
topos/sources/install_service.py +836 -0
topos/sources/registry.py +263 -0
topos/sources/runtime_install.py +427 -0
topos/storage/__init__.py +1 -0
topos/storage/canonical/__init__.py +18 -0
topos/storage/canonical/ai_chat/__init__.py +22 -0
topos/storage/canonical/ai_chat/canonicalizer.py +147 -0
topos/storage/canonical/ai_chat/mapper.py +168 -0
topos/storage/canonical/ai_chat/model.py +87 -0
topos/storage/canonical/ai_chat/tables.py +179 -0
topos/storage/canonical/canonical_store.py +24 -0
topos/storage/canonical/conversations_tables.py +1020 -0
topos/storage/canonical/mapping_store.py +30 -0
topos/storage/canonical/postgres.py +10 -0
topos/storage/db/__init__.py +1 -0
topos/storage/db/client.py +8 -0
topos/storage/db/migrations/__init__.py +1 -0
topos/storage/db/migrations/stage9_column_renames.py +78 -0
topos/storage/db/paths.py +122 -0
topos/storage/db/postgres.py +240 -0
topos/storage/db/schema.py +6 -0
topos/storage/enrichment/__init__.py +1 -0
topos/storage/enrichment/canonical_enrichment_store.py +7 -0
topos/storage/enrichment/raw_enrichment_store.py +18 -0
topos/storage/normalized/__init__.py +1 -0
topos/storage/normalized/normalized_store.py +24 -0
topos/storage/oplog/__init__.py +1 -0
topos/storage/oplog/decision.py +6 -0
topos/storage/oplog/oplog_store.py +17 -0
topos/storage/oplog/postgres.py +10 -0
topos/storage/projections/__init__.py +1 -0
topos/storage/projections/index_ops_store.py +6 -0
topos/storage/projections/vector_index_store.py +6 -0
topos/storage/raw/__init__.py +1 -0
topos/storage/raw/browser_flat_tables.py +303 -0
topos/storage/raw/file_store.py +100 -0
topos/storage/raw/raw_store.py +29 -0
topos/storage/raw/raw_tables_manager.py +295 -0
topos/storage/raw/sqlite_raw_store.py +17 -0
topos/storage/security/encryption.py +21 -0
topos/storage/signal_identity.py +71 -0
topos/storage/source_settings.py +116 -0
topos/storage/user_identity.py +69 -0
topos/sync/__init__.py +5 -0
topos/sync/client.py +272 -0
topos/sync_handlers.py +70 -0
topos/testing/__init__.py +1 -0
topos/testing/lifespan.py +7 -0
topos/uma_contact_enrichment.py +1032 -0
topos/uma_filters.py +669 -0
topos/uma_resource_id.py +24 -0
topos/uma_rpt.py +69 -0
topos/utils/base_object.py +61 -0
topos/websocket_client.py +21 -0
topos_node-0.1.0.dist-info/METADATA +199 -0
topos_node-0.1.0.dist-info/RECORD +249 -0
topos_node-0.1.0.dist-info/WHEEL +5 -0
topos_node-0.1.0.dist-info/entry_points.txt +2 -0
topos_node-0.1.0.dist-info/licenses/LICENSE +201 -0
topos_node-0.1.0.dist-info/top_level.txt +2 -0

topos/analytics/messenger_labels.py ADDED Viewed

@@ -0,0 +1,321 @@
+"""Helpers for resolving participant labels in messenger analytics."""
+from __future__ import annotations
+from collections import defaultdict
+from typing import Any, Dict, List, Sequence, Set, Tuple
+def _rows_to_dicts(rows: Sequence[Any], cursor: Any = None) -> List[Dict[str, Any]]:
+    """Map DB rows to dicts. Plain sqlite3 connections return tuples; use ``cursor.description``."""
+    out: List[Dict[str, Any]] = []
+    col_names: List[str] | None = None
+    if cursor is not None and getattr(cursor, "description", None):
+        col_names = [d[0] for d in cursor.description if d is not None]
+    for row in rows:
+        if hasattr(row, "keys"):
+            out.append({k: row[k] for k in row.keys()})
+        elif col_names is not None and isinstance(row, (tuple, list)) and len(row) == len(col_names):
+            out.append({col_names[i]: row[i] for i in range(len(col_names))})
+        else:
+            out.append(dict(row))
+    return out
+def _in_clause(values: Sequence[str]) -> tuple[str, List[str]]:
+    placeholders = ",".join(["?"] * len(values))
+    return f"({placeholders})", list(values)
+def _normalize_contact_key(value: Any) -> str:
+    s = str(value or "").strip()
+    if not s:
+        return ""
+    low = s.lower()
+    if low == "self":
+        return "self"
+    if "@" in low:
+        return low
+    digits = "".join(ch for ch in s if ch.isdigit())
+    if digits:
+        return f"+{digits}" if s.startswith("+") else digits
+    return low
+def sender_matches_focus_identifier(sender_id: str, profile_identifier: str) -> bool:
+    """True if message ``sender_id`` refers to the same party as the profile row's primary identifier."""
+    a = str(sender_id or "").strip()
+    b = str(profile_identifier or "").strip()
+    if not a or not b:
+        return False
+    if _identifier_candidates(a) & _identifier_candidates(b):
+        return True
+    na, nb = _normalize_contact_key(a), _normalize_contact_key(b)
+    return bool(na and nb and na == nb)
+def _identifier_candidates(value: str) -> Set[str]:
+    raw = str(value or "").strip()
+    if not raw:
+        return set()
+    out = {raw, raw.lower()}
+    normalized = _normalize_contact_key(raw)
+    if normalized:
+        out.add(normalized)
+    digits = "".join(ch for ch in raw if ch.isdigit())
+    if digits:
+        out.add(digits)
+        out.add(f"+{digits}")
+        # Common NANP variant: some imports drop leading country code 1.
+        if len(digits) == 11 and digits.startswith("1"):
+            local10 = digits[1:]
+            out.add(local10)
+            out.add(f"+{local10}")
+    return {v for v in out if v}
+def resolve_participant_labels(
+    conn: Any,
+    *,
+    dataset_id: str,
+    participant_ids: Sequence[str],
+) -> Dict[str, Dict[str, str]]:
+    """Resolve display labels for participant contact IDs.
+    Priority:
+    1) contacts.display_name
+    2) a contact identifier from contact_identifiers
+    3) raw participant_id
+    ``conn`` must provide ``execute(sql, params)`` returning an object with ``fetchall()``;
+    the SQL uses ``?`` placeholders. ``participant_ids`` are stringified, stripped,
+    de-duplicated and sorted before use; an empty result short-circuits to ``{}``.
+    Returns a dict keyed by each normalised participant id whose value has the keys
+    ``label`` (display name, else identifier, else the participant id), ``display_name``
+    (possibly empty) and ``identifier``.
+    """
+    normalized_participants = sorted({str(pid).strip() for pid in participant_ids if str(pid).strip()})
+    if not normalized_participants:
+        return {}
+    contacts_in_clause, contacts_params = _in_clause(normalized_participants)
+    # Every spelling (raw, lowercase, normalised, digit variants) of each participant id.
+    participant_candidates: Dict[str, Set[str]] = {
+        participant_id: _identifier_candidates(participant_id)
+        for participant_id in normalized_participants
+    }
+    all_identifier_candidates = sorted({cand for cands in participant_candidates.values() for cand in cands})
+    # Query 1: participants that are themselves contact_ids -> their display names.
+    _cur_contacts = conn.execute(
+        f"""
+            SELECT contact_id, display_name
+            FROM contacts
+            WHERE dataset_id = ? AND contact_id IN {contacts_in_clause}
+            """,
+        tuple([dataset_id] + contacts_params),
+    )
+    contacts_rows = _rows_to_dicts(_cur_contacts.fetchall(), _cur_contacts)
+    # Only non-blank display names are kept.
+    display_name_by_contact_id = {
+        str(row["contact_id"]): str(row["display_name"]).strip()
+        for row in contacts_rows
+        if row.get("contact_id") and row.get("display_name") and str(row["display_name"]).strip()
+    }
+    # Keep identifier fallback for participants that are already contact_ids.
+    # Query 2: rows with source_id '*' sort last, then newest updated_at first.
+    _cur_cid = conn.execute(
+        f"""
+            SELECT contact_id, identifier, source_id
+            FROM contact_identifiers
+            WHERE dataset_id = ?
+              AND contact_id IN {contacts_in_clause}
+            ORDER BY CASE WHEN source_id = '*' THEN 1 ELSE 0 END, updated_at DESC
+            """,
+        tuple([dataset_id] + contacts_params),
+    )
+    contact_identifier_rows = _rows_to_dicts(_cur_cid.fetchall(), _cur_cid)
+    # Query 3: participants that are identifiers (any spelling) -> owning contact and its name.
+    identifier_rows: List[Dict[str, Any]] = []
+    if all_identifier_candidates:
+        identifiers_in_clause, identifiers_params = _in_clause(all_identifier_candidates)
+        _cur_ident = conn.execute(
+            f"""
+                SELECT ci.contact_id, ci.identifier, ci.source_id, c.display_name
+                FROM contact_identifiers ci
+                LEFT JOIN contacts c
+                  ON c.dataset_id = ci.dataset_id
+                 AND c.contact_id = ci.contact_id
+                WHERE ci.dataset_id = ?
+                  AND ci.identifier IN {identifiers_in_clause}
+                ORDER BY CASE WHEN ci.source_id = '*' THEN 1 ELSE 0 END, ci.updated_at DESC
+                """,
+            tuple([dataset_id] + identifiers_params),
+        )
+        identifier_rows = _rows_to_dicts(_cur_ident.fetchall(), _cur_ident)
+    # Lookup tables filled by _index_identifier_rows; each keeps the FIRST value seen,
+    # so the ORDER BY of the queries above decides which identifier/name is preferred.
+    best_identifier_by_contact_id: Dict[str, str] = {}
+    display_name_by_identifier: Dict[str, str] = {}
+    contact_ids_by_identifier: Dict[str, List[str]] = defaultdict(list)
+    def _index_identifier_rows(rows: Sequence[Dict[str, Any]]) -> None:
+        # Folds rows into the three tables above and into display_name_by_contact_id.
+        for row in rows:
+            contact_id = str(row.get("contact_id") or "").strip()
+            identifier = str(row.get("identifier") or "").strip()
+            # Rows from the query without a contacts join have no display_name -> "".
+            display_name = str(row.get("display_name") or "").strip()
+            if not contact_id or not identifier:
+                continue
+            if display_name and contact_id not in display_name_by_contact_id:
+                display_name_by_contact_id[contact_id] = display_name
+            if contact_id not in best_identifier_by_contact_id:
+                best_identifier_by_contact_id[contact_id] = identifier
+            # Index the identifier under every spelling so later lookups are variant-insensitive.
+            for candidate in _identifier_candidates(identifier):
+                if candidate and contact_id not in contact_ids_by_identifier[candidate]:
+                    contact_ids_by_identifier[candidate].append(contact_id)
+                if candidate and display_name and candidate not in display_name_by_identifier:
+                    display_name_by_identifier[candidate] = display_name
+    _index_identifier_rows(identifier_rows)
+    # Also index identifiers that belong to participant contact_ids directly (used for fallback labeling).
+    _index_identifier_rows(contact_identifier_rows)
+    # Second hop: look up every spelling of each contact's chosen identifier, so other
+    # contacts sharing that identifier (and their names) get indexed too.
+    secondary_identifier_candidates = sorted(
+        {
+            candidate
+            for identifier in best_identifier_by_contact_id.values()
+            for candidate in _identifier_candidates(identifier)
+        }
+    )
+    if secondary_identifier_candidates:
+        secondary_in_clause, secondary_params = _in_clause(secondary_identifier_candidates)
+        _cur_sec = conn.execute(
+            f"""
+                SELECT ci.contact_id, ci.identifier, ci.source_id, c.display_name
+                FROM contact_identifiers ci
+                LEFT JOIN contacts c
+                  ON c.dataset_id = ci.dataset_id
+                 AND c.contact_id = ci.contact_id
+                WHERE ci.dataset_id = ?
+                  AND ci.identifier IN {secondary_in_clause}
+                ORDER BY CASE WHEN ci.source_id = '*' THEN 1 ELSE 0 END, ci.updated_at DESC
+                """,
+            tuple([dataset_id] + secondary_params),
+        )
+        secondary_rows = _rows_to_dicts(_cur_sec.fetchall(), _cur_sec)
+        _index_identifier_rows(secondary_rows)
+    # NOTE(review): contact_identifier_rows was already passed through
+    # _index_identifier_rows above, which sets best_identifier_by_contact_id under the
+    # same condition, so this loop looks like a no-op; ``display_name`` is unused here.
+    for row in contact_identifier_rows:
+        contact_id = str(row.get("contact_id") or "").strip()
+        identifier = str(row.get("identifier") or "").strip()
+        display_name = str(row.get("display_name") or "").strip()
+        if contact_id and identifier and contact_id not in best_identifier_by_contact_id:
+            best_identifier_by_contact_id[contact_id] = identifier
+    out: Dict[str, Dict[str, str]] = {}
+    for participant_id in normalized_participants:
+        # Step 1: treat the participant id as a contact_id.
+        display_name = display_name_by_contact_id.get(participant_id, "")
+        identifier = best_identifier_by_contact_id.get(participant_id, "")
+        if not display_name:
+            # Step 2: treat the participant id as an identifier and find its owning contact.
+            # NOTE(review): the candidates are a set, so when several spellings hit
+            # different contacts the one chosen depends on set iteration order.
+            matched_contact_id = ""
+            for candidate in participant_candidates.get(participant_id, set()):
+                contact_ids = contact_ids_by_identifier.get(candidate, [])
+                if not contact_ids:
+                    continue
+                matched_contact_id = contact_ids[0]
+                if matched_contact_id:
+                    break
+            if matched_contact_id:
+                display_name = display_name_by_contact_id.get(matched_contact_id, "") or display_name
+                identifier = best_identifier_by_contact_id.get(matched_contact_id, "") or identifier
+            if not display_name:
+                # Fallback to identifier-level display mapping (e.g., when contact row has sparse data).
+                for candidate in participant_candidates.get(participant_id, set()):
+                    maybe_name = display_name_by_identifier.get(candidate, "")
+                    if maybe_name:
+                        display_name = maybe_name
+                        break
+        if not identifier:
+            identifier = participant_id
+        # If this participant maps to an unnamed contact_id but we do have an identifier,
+        # try resolving that identifier to another contact with a display name
+        # (common after contact import where normalized phone variants point to different contact_ids).
+        if not display_name and identifier:
+            identifier_matched_contact_id = ""
+            fallback_contact_id = ""
+            for candidate in _identifier_candidates(identifier):
+                contact_ids = contact_ids_by_identifier.get(candidate, [])
+                if not contact_ids:
+                    continue
+                # Prefer a contact that has a name; remember the first unnamed one as a fallback.
+                named_ids = [cid for cid in contact_ids if display_name_by_contact_id.get(cid)]
+                if named_ids:
+                    identifier_matched_contact_id = named_ids[0]
+                    break
+                if not fallback_contact_id:
+                    fallback_contact_id = contact_ids[0]
+            if not identifier_matched_contact_id and fallback_contact_id:
+                identifier_matched_contact_id = fallback_contact_id
+            if identifier_matched_contact_id:
+                display_name = display_name_by_contact_id.get(identifier_matched_contact_id, "") or display_name
+                identifier = best_identifier_by_contact_id.get(identifier_matched_contact_id, "") or identifier
+        # identifier is never empty here, so label always has a value.
+        label = display_name or identifier or participant_id
+        out[participant_id] = {
+            "label": label,
+            "display_name": display_name,
+            "identifier": identifier,
+        }
+    return out
+def enrich_conversation_thread_previews(
+    conn: Any,
+    *,
+    dataset_id: str,
+    profile_identifier: str,
+    previews: List[Dict[str, Any]],
+) -> None:
+    """Mutates each message in ``previews``: adds ``sender_display_name`` and ``is_focus_contact``."""
+    senders: List[str] = []
+    for block in previews:
+        for m in block.get("messages") or []:
+            if not isinstance(m, dict):
+                continue
+            sid = str(m.get("sender_id") or "").strip()
+            if sid:
+                senders.append(sid)
+    labels = resolve_participant_labels(conn, dataset_id=dataset_id, participant_ids=senders)
+    for block in previews:
+        for m in block.get("messages") or []:
+            if not isinstance(m, dict):
+                continue
+            sid = str(m.get("sender_id") or "").strip()
+            info = labels.get(sid, {}) if sid else {}
+            label = str(info.get("label") or "").strip()
+            m["sender_display_name"] = label or sid or "Unknown"
+            m["is_focus_contact"] = bool(sid) and sender_matches_focus_identifier(sid, profile_identifier)
+def enrich_contact_rows_with_resolved_display_names(
+    conn: Any,
+    *,
+    dataset_id: str,
+    contacts: List[Dict[str, Any]],
+) -> None:
+    """Fill empty ``display_name`` on owner/API contact rows (parity with messenger social graph).
+    ``list_contacts`` returns ``contacts.display_name`` per row only. Analytics uses
+    :func:`resolve_participant_labels` to promote names across identifier variants and
+    duplicate contact_ids (e.g. iMessage sender vs address-book import). Apply the same
+    resolution here so grant privacy UI and filters see the same labels as the graph.
+    """
+    participant_ids: List[str] = []
+    for c in contacts:
+        cid = str(c.get("contact_id") or "").strip()
+        if cid:
+            participant_ids.append(cid)
+        ident = str(c.get("identifier") or "").strip()
+        if ident:
+            participant_ids.append(ident)
+    if not participant_ids:
+        return
+    labels = resolve_participant_labels(conn, dataset_id=dataset_id, participant_ids=participant_ids)
+    for c in contacts:
+        if str(c.get("display_name") or "").strip():
+            continue
+        cid = str(c.get("contact_id") or "").strip()
+        resolved = str((labels.get(cid) or {}).get("display_name") or "").strip()
+        if resolved:
+            c["display_name"] = resolved

topos/analytics/profiles.py ADDED Viewed

@@ -0,0 +1,22 @@
+from __future__ import annotations
+CHATGPT_DEV_PROFILE = {
+    "profile_id": "chatgpt_dev",
+    "queries": [
+        "messages_per_day",
+        "total_messages",
+        "messages_by_sender",
+        "avg_message_length",
+    ],
+}
+PROFILE_REGISTRY = {
+    CHATGPT_DEV_PROFILE["profile_id"]: CHATGPT_DEV_PROFILE,
+    # Allow per-source profile ids to map to the shared ChatGPT profile.
+    "chatgpt_file_ingestion": CHATGPT_DEV_PROFILE,
+    "chatgpt_ui_conversation": CHATGPT_DEV_PROFILE,
+}
+def get_profile(profile_id: str) -> dict | None:
+    return PROFILE_REGISTRY.get(profile_id)

topos/analytics/query_engine.py ADDED Viewed

@@ -0,0 +1,64 @@
+from __future__ import annotations
+from typing import Any, Dict, List, Optional
+from .duckdb_adapter import DuckDBAdapter
+class QueryEngine:
+    def __init__(self, adapter: DuckDBAdapter):
+        self.adapter = adapter
+    def query_messages_per_day(self, dataset_id: Optional[str] = None) -> List[Dict[str, Any]]:
+        query = """
+            SELECT DATE(ts) as day, COUNT(*) as message_count
+            FROM projection.messages
+        """
+        params: List[Any] = []
+        if dataset_id:
+            query += " WHERE dataset_id = ?"
+            params.append(dataset_id)
+        query += " GROUP BY day ORDER BY day DESC"
+        return self.adapter.execute(query, params)
+    def query_total_messages(self, dataset_id: Optional[str] = None) -> Dict[str, Any]:
+        query = "SELECT COUNT(*) as total_messages FROM projection.messages"
+        params: List[Any] = []
+        if dataset_id:
+            query += " WHERE dataset_id = ?"
+            params.append(dataset_id)
+        rows = self.adapter.execute(query, params)
+        return rows[0] if rows else {"total_messages": 0}
+    def query_messages_by_sender(self, dataset_id: Optional[str] = None) -> List[Dict[str, Any]]:
+        query = """
+            SELECT sender_type, COUNT(*) as count
+            FROM projection.messages
+        """
+        params: List[Any] = []
+        if dataset_id:
+            query += " WHERE dataset_id = ?"
+            params.append(dataset_id)
+        query += " GROUP BY sender_type ORDER BY count DESC"
+        return self.adapter.execute(query, params)
+    def query_avg_message_length(self, dataset_id: Optional[str] = None) -> Dict[str, Any]:
+        query = """
+            SELECT AVG(LENGTH(content)) as avg_length,
+                   MIN(LENGTH(content)) as min_length,
+                   MAX(LENGTH(content)) as max_length
+            FROM projection.messages
+        """
+        params: List[Any] = []
+        if dataset_id:
+            query += " WHERE dataset_id = ?"
+            params.append(dataset_id)
+        rows = self.adapter.execute(query, params)
+        if rows:
+            row = rows[0]
+            return {
+                "avg_length": float(row.get("avg_length") or 0),
+                "min_length": int(row.get("min_length") or 0),
+                "max_length": int(row.get("max_length") or 0),
+            }
+        return {"avg_length": 0.0, "min_length": 0, "max_length": 0}

topos/analytics/raw_queries.py ADDED Viewed

@@ -0,0 +1,174 @@
+from __future__ import annotations
+import json
+import logging
+from collections import Counter, defaultdict
+from datetime import datetime, timedelta, timezone
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+from ..storage.raw.file_store import RawFileStore
+# Module-level logger; the name is a fixed string rather than being derived from __name__.
+logger = logging.getLogger("topos.analytics.raw_queries")
+def _normalize_ts(value: Any) -> str:
+    if isinstance(value, (int, float)):
+        return datetime.fromtimestamp(value, tz=timezone.utc).isoformat()
+    if isinstance(value, str):
+        return value
+    return ""
+def _normalize_sender(payload: dict) -> str:
+    role = (payload.get("role") or "").lower()
+    if role:
+        return "human" if role == "user" else role
+    sender_type = payload.get("sender_type")
+    return sender_type or "assistant"
+def _message_from_payload(payload: dict, fallback_id: str, dataset_id: str) -> dict:
+    created_at = payload.get("created_at") or payload.get("ts")
+    out: Dict[str, Any] = {
+        "message_id": payload.get("id") or payload.get("message_id") or fallback_id,
+        "dataset_id": dataset_id,
+        "sender_type": _normalize_sender(payload),
+        "ts": _normalize_ts(created_at),
+        "content": payload.get("content", ""),
+    }
+    if payload.get("source_id") is not None:
+        out["source_id"] = str(payload["source_id"])
+    return out
+def _parse_ts_to_datetime(ts: str) -> Optional[datetime]:
+    """Parse ISO-like ts string to datetime for comparison. Returns None if unparseable."""
+    if not ts:
+        return None
+    try:
+        if isinstance(ts, (int, float)):
+            return datetime.fromtimestamp(ts, tz=timezone.utc)
+        s = str(ts).strip()
+        if "T" in s:
+            dt = datetime.fromisoformat(s.replace("Z", "+00:00"))
+            return dt if dt.tzinfo else dt.replace(tzinfo=timezone.utc)
+        return datetime.strptime(s[:10], "%Y-%m-%d").replace(tzinfo=timezone.utc)
+    except (ValueError, TypeError):
+        return None
+def _apply_filter_manifest_to_messages(
+    messages: List[Dict[str, Any]],
+    manifest: Optional[Dict[str, Any]],
+) -> List[Dict[str, Any]]:
+    """Apply filter_manifest (rolling_window_days, date_range, source_filter) in Python. Stage 2b."""
+    if not manifest or not isinstance(manifest, dict):
+        return messages
+    out: List[Dict[str, Any]] = []
+    now = datetime.now(timezone.utc)
+    rolling_days: Optional[int] = None
+    if manifest.get("rolling_window_days") is not None:
+        try:
+            rolling_days = max(0, int(manifest["rolling_window_days"]))
+        except (TypeError, ValueError):
+            pass
+    range_start: Optional[datetime] = None
+    if manifest.get("date_range_start"):
+        range_start = _parse_ts_to_datetime(str(manifest["date_range_start"]))
+    range_end: Optional[datetime] = None
+    if manifest.get("date_range_end"):
+        range_end = _parse_ts_to_datetime(str(manifest["date_range_end"]))
+    source_allow: Optional[List[str]] = None
+    if isinstance(manifest.get("source_filter"), list) and len(manifest["source_filter"]) > 0:
+        source_allow = [str(s) for s in manifest["source_filter"]]
+    for msg in messages:
+        ts_str = msg.get("ts")
+        dt = _parse_ts_to_datetime(ts_str) if ts_str else None
+        if rolling_days is not None and dt is not None:
+            if dt < now - timedelta(days=rolling_days):
+                continue
+        if range_start is not None and dt is not None and dt < range_start:
+            continue
+        if range_end is not None and dt is not None and dt > range_end:
+            continue
+        if source_allow is not None:
+            sid = msg.get("source_id")
+            if sid is not None and str(sid) not in source_allow:
+                continue
+        out.append(msg)
+    return out
+def load_raw_messages(
+    *,
+    dataset_id: str,
+    schema_id: str,
+    limit: Optional[int] = None,
+    offset: int = 0,
+    filter_manifest: Optional[Dict[str, Any]] = None,
+) -> List[Dict[str, Any]]:
+    file_store = RawFileStore()
+    file_path = file_store.get_file_path(dataset_id, schema_id)
+    logger.debug(
+        "[PIPELINE:ANALYTICS] Loading raw messages: dataset_id=%s, schema_id=%s, file_path=%s, limit=%s, offset=%s",
+        dataset_id,
+        schema_id,
+        file_path,
+        limit,
+        offset,
+    )
+    if not file_path.exists():
+        logger.debug("[PIPELINE:ANALYTICS] Raw file does not exist: %s", file_path)
+        return []
+    messages: List[Dict[str, Any]] = []
+    with Path(file_path).open("r", encoding="utf-8") as handle:
+        for idx, line in enumerate(handle):
+            line = line.strip()
+            if not line:
+                continue
+            try:
+                payload = json.loads(line)
+            except json.JSONDecodeError:
+                continue
+            messages.append(_message_from_payload(payload, str(idx + 1), dataset_id))
+    messages = _apply_filter_manifest_to_messages(messages, filter_manifest)
+    if offset:
+        messages = messages[offset:]
+    if limit is not None:
+        messages = messages[:limit]
+    logger.debug(
+        "[PIPELINE:ANALYTICS] Loaded %d messages (after limit/offset)",
+        len(messages),
+    )
+    return messages
+def messages_per_day(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    counts: dict[str, int] = defaultdict(int)
+    for message in messages:
+        ts = message.get("ts") or ""
+        if ts:
+            day = ts.split("T", 1)[0]
+            counts[day] += 1
+    return [{"day": day, "count": counts[day]} for day in sorted(counts.keys())]
+def total_messages(messages: List[Dict[str, Any]]) -> Dict[str, Any]:
+    return {"total_messages": len(messages)}
+def messages_by_sender(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    counts = Counter(msg.get("sender_type") or "unknown" for msg in messages)
+    return [{"sender_type": sender, "count": count} for sender, count in counts.most_common()]
+def avg_message_length(messages: List[Dict[str, Any]]) -> Dict[str, Any]:
+    if not messages:
+        return {"avg_length": 0.0, "min_length": 0, "max_length": 0}
+    lengths = [len(msg.get("content") or "") for msg in messages]
+    return {
+        "avg_length": float(sum(lengths)) / len(lengths),
+        "min_length": min(lengths),
+        "max_length": max(lengths),
+    }

topos/api/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ """API routers for Topos."""

topos/api/analytics.py ADDED Viewed

@@ -0,0 +1,52 @@
+from __future__ import annotations
+from typing import Optional
+from fastapi import APIRouter, Query
+from ..analytics.duckdb_adapter import DuckDBAdapter
+from ..analytics.profiles import get_profile
+from ..analytics.query_engine import QueryEngine
+router = APIRouter()  # the /analytics endpoint below registers on this router
+@router.get("/analytics")
+async def get_analytics_endpoint(
+    query: Optional[str] = Query(None, description="Analytics query name"),
+    profile_id: Optional[str] = Query(None, description="Analytics profile id"),
+    dataset_id: Optional[str] = Query(None),
+) -> dict:
+    adapter = DuckDBAdapter()
+    engine = QueryEngine(adapter)
+    if profile_id:
+        profile = get_profile(profile_id)
+        if not profile:
+            return {"status": "error", "error": "unknown profile_id"}
+        results = {}
+        for item in profile["queries"]:
+            try:
+                if item == "messages_per_day":
+                    results[item] = engine.query_messages_per_day(dataset_id=dataset_id)
+                elif item == "total_messages":
+                    results[item] = engine.query_total_messages(dataset_id=dataset_id)
+                elif item == "messages_by_sender":
+                    results[item] = engine.query_messages_by_sender(dataset_id=dataset_id)
+                elif item == "avg_message_length":
+                    results[item] = engine.query_avg_message_length(dataset_id=dataset_id)
+                else:
+                    results[item] = {"error": "unsupported query"}
+            except Exception:
+                results[item] = []
+        return {"profile_id": profile_id, "results": results}
+    if query == "messages_per_day":
+        return {"query": query, "result": engine.query_messages_per_day(dataset_id=dataset_id)}
+    if query == "total_messages":
+        return {"query": query, "result": engine.query_total_messages(dataset_id=dataset_id)}
+    if query == "messages_by_sender":
+        return {"query": query, "result": engine.query_messages_by_sender(dataset_id=dataset_id)}
+    if query == "avg_message_length":
+        return {"query": query, "result": engine.query_avg_message_length(dataset_id=dataset_id)}
+    return {"status": "stub", "query": query, "dataset_id": dataset_id}

topos/api/app_registry.py ADDED Viewed

@@ -0,0 +1,31 @@
+from __future__ import annotations
+from typing import Any, Dict
+from fastapi import APIRouter, Body, Depends
+from ..auth import require_api_key
+router = APIRouter()  # the /apps endpoints below register on this router
+@router.get("/apps", dependencies=[Depends(require_api_key)])
+async def list_apps() -> Dict[str, Any]:
+    return {"status": "stub", "apps": []}
+@router.post("/apps", dependencies=[Depends(require_api_key)])
+async def create_app(payload: Dict[str, Any] = Body(default_factory=dict)) -> Dict[str, Any]:
+    return {"status": "stub", "app": payload}
+@router.get("/apps/{app_id}/sources", dependencies=[Depends(require_api_key)])
+async def list_app_sources(app_id: str) -> Dict[str, Any]:
+    return {"status": "stub", "app_id": app_id, "sources": []}
+@router.post("/apps/{app_id}/sources", dependencies=[Depends(require_api_key)])
+async def create_app_source(
+    app_id: str, payload: Dict[str, Any] = Body(default_factory=dict)
+) -> Dict[str, Any]:
+    return {"status": "stub", "app_id": app_id, "source": payload}

topos/api/backup.py ADDED Viewed

@@ -0,0 +1,15 @@
+from __future__ import annotations
+from fastapi import APIRouter
+router = APIRouter()  # the /backup and /restore endpoints below register here
+@router.post("/backup")
+async def backup_database() -> dict:
+    return {"status": "stub"}
+@router.post("/restore")
+async def restore_database() -> dict:
+    return {"status": "stub"}