topos-node 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- shared/__init__.py +59 -0
- shared/filtering.py +640 -0
- shared/schema_registry.py +229 -0
- topos/__init__.py +5 -0
- topos/__version__.py +6 -0
- topos/analytics/__init__.py +15 -0
- topos/analytics/duckdb_adapter.py +48 -0
- topos/analytics/messenger_communities.py +349 -0
- topos/analytics/messenger_graph.py +522 -0
- topos/analytics/messenger_labels.py +321 -0
- topos/analytics/profiles.py +22 -0
- topos/analytics/query_engine.py +64 -0
- topos/analytics/raw_queries.py +174 -0
- topos/api/__init__.py +1 -0
- topos/api/analytics.py +52 -0
- topos/api/app_registry.py +31 -0
- topos/api/backup.py +15 -0
- topos/api/compute_remote.py +175 -0
- topos/api/data_commit.py +158 -0
- topos/api/data_explorer_table_prefs.py +81 -0
- topos/api/db.py +10 -0
- topos/api/device.py +25 -0
- topos/api/enrichment.py +959 -0
- topos/api/filter_lab.py +195 -0
- topos/api/health.py +61 -0
- topos/api/ingestion_api.py +37 -0
- topos/api/ingestion_compat.py +21 -0
- topos/api/ingestion_sources.py +600 -0
- topos/api/llm.py +76 -0
- topos/api/local_mcp.py +46 -0
- topos/api/messenger_analytics.py +385 -0
- topos/api/query_api.py +13 -0
- topos/api/sanitization_ollama_config.py +64 -0
- topos/api/source_install.py +324 -0
- topos/api/sources.py +13 -0
- topos/api/sync.py +10 -0
- topos/api/ui_config.py +83 -0
- topos/api/uma_data.py +311 -0
- topos/api/usage.py +49 -0
- topos/api/user_identity.py +46 -0
- topos/app.py +239 -0
- topos/auth.py +17 -0
- topos/canonicalization/__init__.py +1 -0
- topos/canonicalization/mappers/__init__.py +22 -0
- topos/canonicalization/mappers/base.py +26 -0
- topos/canonicalization/mappers/chatgpt_mapper.py +40 -0
- topos/canonicalization/mappers/grok_mapper.py +17 -0
- topos/canonicalization/mappers/messenger_mapper.py +58 -0
- topos/canonicalization/models.py +31 -0
- topos/canonicalization/resolver.py +23 -0
- topos/cli/__init__.py +1 -0
- topos/cli/__main__.py +6 -0
- topos/cli/commands.py +132 -0
- topos/config/__init__.py +1 -0
- topos/config/sanitization_ollama.py +189 -0
- topos/config/settings.py +310 -0
- topos/contacts/__init__.py +5 -0
- topos/contacts/identity.py +24 -0
- topos/control_plane_client.py +300 -0
- topos/core/__init__.py +1 -0
- topos/core/api_models.py +128 -0
- topos/core/connection_resilience.py +99 -0
- topos/core/device_helpers.py +8 -0
- topos/core/errors.py +13 -0
- topos/core/events.py +12 -0
- topos/core/handlers.py +5625 -0
- topos/core/logging.py +175 -0
- topos/core/metrics.py +21 -0
- topos/core/startup_banner.py +62 -0
- topos/core/state.py +682 -0
- topos/core/table_layers.py +45 -0
- topos/core/types.py +13 -0
- topos/data_explorer_table_prefs.py +150 -0
- topos/engine/__init__.py +29 -0
- topos/engine/backends/__init__.py +50 -0
- topos/engine/backends/base.py +21 -0
- topos/engine/backends/huggingface.py +151 -0
- topos/engine/backends/ollama.py +181 -0
- topos/engine/backends/stub.py +22 -0
- topos/engine/engine.py +165 -0
- topos/engine/intake.py +32 -0
- topos/engine/queue_manager.py +112 -0
- topos/engine/registration.py +126 -0
- topos/engine/result_formatter.py +38 -0
- topos/engine/router.py +19 -0
- topos/engine/scoped_token.py +82 -0
- topos/engine/tasks.py +154 -0
- topos/engine/transport.py +44 -0
- topos/engine/usage_guard.py +100 -0
- topos/engine/usage_observation.py +129 -0
- topos/engine/validator.py +23 -0
- topos/enrichment/__init__.py +1 -0
- topos/enrichment/derived_tables.py +214 -0
- topos/enrichment/jobs/__init__.py +30 -0
- topos/enrichment/jobs/base.py +54 -0
- topos/enrichment/jobs/canonical/__init__.py +1 -0
- topos/enrichment/jobs/canonical/embeddings_job.py +27 -0
- topos/enrichment/jobs/canonical/emo_27_job.py +97 -0
- topos/enrichment/jobs/canonical/entities_job.py +27 -0
- topos/enrichment/jobs/canonical/sentiment_job.py +27 -0
- topos/enrichment/jobs/canonical/topics_job.py +27 -0
- topos/enrichment/jobs/raw/__init__.py +1 -0
- topos/enrichment/jobs/raw/attachments_job.py +12 -0
- topos/enrichment/jobs/raw/language_job.py +12 -0
- topos/enrichment/jobs/raw/time_normalization_job.py +12 -0
- topos/enrichment/jobs/raw/tool_calls_job.py +12 -0
- topos/enrichment/models/__init__.py +1 -0
- topos/enrichment/models/manager.py +8 -0
- topos/enrichment/models/registry.py +71 -0
- topos/enrichment/models/versioning.py +8 -0
- topos/enrichment/orchestrator.py +177 -0
- topos/enrichment/processor.py +17 -0
- topos/enrichment/progress_bar.py +122 -0
- topos/enrichment/website_classifier.py +31 -0
- topos/filter_lab/__init__.py +1 -0
- topos/filter_lab/bundles.py +300 -0
- topos/filter_lab/schema.py +86 -0
- topos/filter_lab/service.py +167 -0
- topos/filter_lab/store.py +374 -0
- topos/filter_lab/worker.py +250 -0
- topos/hosted_pool_lease.py +153 -0
- topos/ingestion/__init__.py +1 -0
- topos/ingestion/checkpoints/__init__.py +6 -0
- topos/ingestion/checkpoints/checkpoint_store.py +24 -0
- topos/ingestion/checkpoints/sqlite_checkpoint_store.py +82 -0
- topos/ingestion/ingest_helpers.py +504 -0
- topos/ingestion/jobs.py +91 -0
- topos/ingestion/local_sync.py +823 -0
- topos/ingestion/log_preview.py +21 -0
- topos/ingestion/manager.py +1100 -0
- topos/ingestion/parser.py +174 -0
- topos/ingestion/parsers/__init__.py +32 -0
- topos/ingestion/parsers/base.py +24 -0
- topos/ingestion/parsers/browser_parser.py +171 -0
- topos/ingestion/parsers/calendar_parser.py +21 -0
- topos/ingestion/parsers/chatgpt_conversation_flattener.py +266 -0
- topos/ingestion/parsers/chatgpt_parser.py +67 -0
- topos/ingestion/parsers/grok_parser.py +21 -0
- topos/ingestion/parsers/messenger_parser.py +97 -0
- topos/ingestion/progress.py +54 -0
- topos/ingestion/sources/__init__.py +20 -0
- topos/ingestion/sources/base.py +39 -0
- topos/ingestion/sources/calendar.py +29 -0
- topos/ingestion/sources/chatgpt.py +29 -0
- topos/ingestion/sources/contact_importers.py +274 -0
- topos/ingestion/sources/grok.py +29 -0
- topos/ingestion/sources/imessage_reader.py +479 -0
- topos/ingestion/sources/signal_export_parser.py +132 -0
- topos/ingestion/sources/signal_reader.py +491 -0
- topos/ingestion/state_machine.py +70 -0
- topos/ingestion/triggers/__init__.py +1 -0
- topos/ingestion/triggers/file_trigger.py +36 -0
- topos/ingestion/triggers/sqlite_trigger.py +18 -0
- topos/ingestion/validation/__init__.py +1 -0
- topos/ingestion/validation/base.py +27 -0
- topos/ingestion/validation/schema_registry.py +111 -0
- topos/ingestion/validation/schema_validator.py +13 -0
- topos/lineage/__init__.py +1 -0
- topos/lineage/provenance.py +9 -0
- topos/lineage/tracker.py +9 -0
- topos/mcp_stdio_proxy.py +83 -0
- topos/observability/__init__.py +1 -0
- topos/observability/alerts.py +7 -0
- topos/observability/metrics.py +25 -0
- topos/observability/tracing.py +18 -0
- topos/openai_client.py +69 -0
- topos/projections/__init__.py +1 -0
- topos/projections/vector_index/__init__.py +1 -0
- topos/projections/vector_index/base.py +21 -0
- topos/projections/vector_index/builders.py +11 -0
- topos/projections/vector_index/health_checks.py +5 -0
- topos/rate_limit.py +43 -0
- topos/sanitization/__init__.py +16 -0
- topos/sanitization/ollama_transforms.py +276 -0
- topos/scope_resolution.py +89 -0
- topos/services/__init__.py +1 -0
- topos/services/container.py +46 -0
- topos/services/embeddings/__init__.py +1 -0
- topos/services/embeddings/base.py +7 -0
- topos/services/embeddings/local.py +9 -0
- topos/services/embeddings/remote.py +9 -0
- topos/services/interfaces.py +40 -0
- topos/services/llm/__init__.py +1 -0
- topos/services/llm/base.py +7 -0
- topos/services/llm/openai.py +126 -0
- topos/services/local.py +123 -0
- topos/services/postgres.py +385 -0
- topos/sources/__init__.py +6 -0
- topos/sources/definitions.py +114 -0
- topos/sources/install_service.py +836 -0
- topos/sources/registry.py +263 -0
- topos/sources/runtime_install.py +427 -0
- topos/storage/__init__.py +1 -0
- topos/storage/canonical/__init__.py +18 -0
- topos/storage/canonical/ai_chat/__init__.py +22 -0
- topos/storage/canonical/ai_chat/canonicalizer.py +147 -0
- topos/storage/canonical/ai_chat/mapper.py +168 -0
- topos/storage/canonical/ai_chat/model.py +87 -0
- topos/storage/canonical/ai_chat/tables.py +179 -0
- topos/storage/canonical/canonical_store.py +24 -0
- topos/storage/canonical/conversations_tables.py +1020 -0
- topos/storage/canonical/mapping_store.py +30 -0
- topos/storage/canonical/postgres.py +10 -0
- topos/storage/db/__init__.py +1 -0
- topos/storage/db/client.py +8 -0
- topos/storage/db/migrations/__init__.py +1 -0
- topos/storage/db/migrations/stage9_column_renames.py +78 -0
- topos/storage/db/paths.py +122 -0
- topos/storage/db/postgres.py +240 -0
- topos/storage/db/schema.py +6 -0
- topos/storage/enrichment/__init__.py +1 -0
- topos/storage/enrichment/canonical_enrichment_store.py +7 -0
- topos/storage/enrichment/raw_enrichment_store.py +18 -0
- topos/storage/normalized/__init__.py +1 -0
- topos/storage/normalized/normalized_store.py +24 -0
- topos/storage/oplog/__init__.py +1 -0
- topos/storage/oplog/decision.py +6 -0
- topos/storage/oplog/oplog_store.py +17 -0
- topos/storage/oplog/postgres.py +10 -0
- topos/storage/projections/__init__.py +1 -0
- topos/storage/projections/index_ops_store.py +6 -0
- topos/storage/projections/vector_index_store.py +6 -0
- topos/storage/raw/__init__.py +1 -0
- topos/storage/raw/browser_flat_tables.py +303 -0
- topos/storage/raw/file_store.py +100 -0
- topos/storage/raw/raw_store.py +29 -0
- topos/storage/raw/raw_tables_manager.py +295 -0
- topos/storage/raw/sqlite_raw_store.py +17 -0
- topos/storage/security/encryption.py +21 -0
- topos/storage/signal_identity.py +71 -0
- topos/storage/source_settings.py +116 -0
- topos/storage/user_identity.py +69 -0
- topos/sync/__init__.py +5 -0
- topos/sync/client.py +272 -0
- topos/sync_handlers.py +70 -0
- topos/testing/__init__.py +1 -0
- topos/testing/lifespan.py +7 -0
- topos/uma_contact_enrichment.py +1032 -0
- topos/uma_filters.py +669 -0
- topos/uma_resource_id.py +24 -0
- topos/uma_rpt.py +69 -0
- topos/utils/base_object.py +61 -0
- topos/websocket_client.py +21 -0
- topos_node-0.1.0.dist-info/METADATA +199 -0
- topos_node-0.1.0.dist-info/RECORD +249 -0
- topos_node-0.1.0.dist-info/WHEEL +5 -0
- topos_node-0.1.0.dist-info/entry_points.txt +2 -0
- topos_node-0.1.0.dist-info/licenses/LICENSE +201 -0
- topos_node-0.1.0.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,229 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Stage 9 schema registry: machine-readable contract for table/column canonical names,
|
|
3
|
+
types, and categories (informational vs organizational).
|
|
4
|
+
|
|
5
|
+
Source of truth: docs/SCHEMA_CONVENTIONS.md §7.
|
|
6
|
+
Used by: engine, control plane, and UI (via assist APIs).
|
|
7
|
+
Organizational columns are non-filterable by default.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from typing import Any, Dict, List, Optional
|
|
13
|
+
|
|
14
|
+
# -----------------------------------------------------------------------------
|
|
15
|
+
# Column entry: one row per (table, column) with current DB name, canonical name, type, category.
|
|
16
|
+
# For Stage 9 rename targets, current_column_name != canonical_column_name (current = name in DB today).
|
|
17
|
+
# -----------------------------------------------------------------------------
|
|
18
|
+
|
|
19
|
+
CATEGORY_INFORMATIONAL = "informational"
|
|
20
|
+
CATEGORY_ORGANIZATIONAL = "organizational"
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _row(
|
|
24
|
+
table: str,
|
|
25
|
+
current: str,
|
|
26
|
+
canonical: str,
|
|
27
|
+
type_name: str,
|
|
28
|
+
category: str,
|
|
29
|
+
) -> Dict[str, Any]:
|
|
30
|
+
"""Build a registry row; rename_target True when current != canonical."""
|
|
31
|
+
return {
|
|
32
|
+
"table_name": table,
|
|
33
|
+
"current_column_name": current,
|
|
34
|
+
"canonical_column_name": canonical,
|
|
35
|
+
"type": type_name,
|
|
36
|
+
"category": category,
|
|
37
|
+
"filterable_by_default": category == CATEGORY_INFORMATIONAL,
|
|
38
|
+
"rename_target": current != canonical,
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
# Registry: all Stage 9 mapped tables from SCHEMA_CONVENTIONS.md §7.1–§7.7.
|
|
43
|
+
# §7.0 rename targets: current = name in DB today; canonical = name after migration.
|
|
44
|
+
# Flat list of registry entries, one per (table, column), each built by
# _row(table, current_name, canonical_name, type, category).
# Entries tagged "# rename" pass a current name that differs from the canonical name.
SCHEMA_REGISTRY: List[Dict[str, Any]] = [
    # ----- conversation_messages (7.1) -----
    _row("conversation_messages", "message_id", "message_id", "identifier", CATEGORY_ORGANIZATIONAL),
    _row("conversation_messages", "conversation_id", "conversation_id", "identifier", CATEGORY_ORGANIZATIONAL),
    _row("conversation_messages", "dataset_id", "dataset_id", "identifier", CATEGORY_ORGANIZATIONAL),
    _row("conversation_messages", "sender_type", "sender_type", "text", CATEGORY_INFORMATIONAL),
    _row("conversation_messages", "sender_id", "sender_id", "text", CATEGORY_INFORMATIONAL),
    _row("conversation_messages", "reply_to_message_id", "reply_to_message_id", "identifier", CATEGORY_ORGANIZATIONAL),
    _row("conversation_messages", "message_type", "message_type", "text", CATEGORY_INFORMATIONAL),
    _row("conversation_messages", "event_type", "event_type", "text", CATEGORY_INFORMATIONAL),
    _row("conversation_messages", "content", "content", "text", CATEGORY_INFORMATIONAL),
    _row("conversation_messages", "ts", "event_at", "timestamp_utc", CATEGORY_INFORMATIONAL),  # rename
    _row("conversation_messages", "source_id", "source_id", "text", CATEGORY_ORGANIZATIONAL),
    _row("conversation_messages", "metadata_json", "metadata_json", "json", CATEGORY_INFORMATIONAL),
    _row("conversation_messages", "created_at", "created_at", "timestamp_utc", CATEGORY_ORGANIZATIONAL),
    _row("conversation_messages", "from_self", "is_from_self", "integer", CATEGORY_ORGANIZATIONAL),  # rename
    _row("conversation_messages", "owner_user_id", "owner_user_id", "identifier", CATEGORY_ORGANIZATIONAL),
    # ----- ai_chat_messages (7.2) -----
    _row("ai_chat_messages", "message_id", "message_id", "identifier", CATEGORY_ORGANIZATIONAL),
    _row("ai_chat_messages", "conversation_id", "conversation_id", "identifier", CATEGORY_ORGANIZATIONAL),
    _row("ai_chat_messages", "sender_type", "sender_type", "text", CATEGORY_INFORMATIONAL),
    _row("ai_chat_messages", "sender_id", "sender_id", "text", CATEGORY_INFORMATIONAL),
    _row("ai_chat_messages", "ts", "event_at", "timestamp_utc", CATEGORY_INFORMATIONAL),  # rename
    _row("ai_chat_messages", "content", "content", "text", CATEGORY_INFORMATIONAL),
    _row("ai_chat_messages", "content_rendered", "content_rendered", "text", CATEGORY_INFORMATIONAL),
    _row("ai_chat_messages", "metadata_json", "metadata_json", "json", CATEGORY_INFORMATIONAL),
    _row("ai_chat_messages", "seq", "sequence", "integer", CATEGORY_ORGANIZATIONAL),  # rename
    _row("ai_chat_messages", "source_id", "source_id", "text", CATEGORY_ORGANIZATIONAL),
    # ----- ai_chat_conversations (7.3) -----
    _row("ai_chat_conversations", "conversation_id", "conversation_id", "identifier", CATEGORY_ORGANIZATIONAL),
    _row("ai_chat_conversations", "owner_user_id", "owner_user_id", "identifier", CATEGORY_ORGANIZATIONAL),
    _row("ai_chat_conversations", "title", "title", "text", CATEGORY_INFORMATIONAL),
    _row("ai_chat_conversations", "source", "source_id", "text", CATEGORY_ORGANIZATIONAL),  # rename
    _row("ai_chat_conversations", "created_at", "created_at", "timestamp_utc", CATEGORY_ORGANIZATIONAL),
    _row("ai_chat_conversations", "updated_at", "updated_at", "timestamp_utc", CATEGORY_ORGANIZATIONAL),
    # ----- browser_visits (7.4) -----
    _row("browser_visits", "record_id", "record_id", "identifier", CATEGORY_ORGANIZATIONAL),
    _row("browser_visits", "dataset_id", "dataset_id", "identifier", CATEGORY_ORGANIZATIONAL),
    _row("browser_visits", "url", "url", "text", CATEGORY_INFORMATIONAL),
    _row("browser_visits", "visited_at", "visited_at", "timestamp_utc", CATEGORY_INFORMATIONAL),
    _row("browser_visits", "title", "title", "text", CATEGORY_INFORMATIONAL),
    _row("browser_visits", "favicon_url", "favicon_url", "text", CATEGORY_INFORMATIONAL),
    _row("browser_visits", "hostname", "hostname", "text", CATEGORY_INFORMATIONAL),
    _row("browser_visits", "device_name", "device_name", "text", CATEGORY_INFORMATIONAL),
    _row("browser_visits", "tab_id", "tab_id", "integer", CATEGORY_ORGANIZATIONAL),
    _row("browser_visits", "window_id", "window_id", "integer", CATEGORY_ORGANIZATIONAL),
    _row("browser_visits", "incognito", "incognito", "integer", CATEGORY_ORGANIZATIONAL),
    _row("browser_visits", "transition_type", "transition_type", "text", CATEGORY_INFORMATIONAL),
    _row("browser_visits", "pinned", "pinned", "integer", CATEGORY_ORGANIZATIONAL),
    _row("browser_visits", "audible", "audible", "integer", CATEGORY_ORGANIZATIONAL),
    _row("browser_visits", "muted", "muted", "integer", CATEGORY_ORGANIZATIONAL),
    _row("browser_visits", "opener_tab_id", "opener_tab_id", "integer", CATEGORY_ORGANIZATIONAL),
    _row("browser_visits", "referred_by", "referred_by", "text", CATEGORY_INFORMATIONAL),
    _row("browser_visits", "created_at", "created_at", "timestamp_utc", CATEGORY_ORGANIZATIONAL),
    # ----- browser_events (7.5) -----
    _row("browser_events", "record_id", "record_id", "identifier", CATEGORY_ORGANIZATIONAL),
    _row("browser_events", "dataset_id", "dataset_id", "identifier", CATEGORY_ORGANIZATIONAL),
    _row("browser_events", "event_type", "event_type", "text", CATEGORY_INFORMATIONAL),
    _row("browser_events", "url", "url", "text", CATEGORY_INFORMATIONAL),
    _row("browser_events", "visited_at", "visited_at", "timestamp_utc", CATEGORY_INFORMATIONAL),
    _row("browser_events", "title", "title", "text", CATEGORY_INFORMATIONAL),
    _row("browser_events", "favicon_url", "favicon_url", "text", CATEGORY_INFORMATIONAL),
    _row("browser_events", "hostname", "hostname", "text", CATEGORY_INFORMATIONAL),
    _row("browser_events", "device_name", "device_name", "text", CATEGORY_INFORMATIONAL),
    _row("browser_events", "transition_type", "transition_type", "text", CATEGORY_INFORMATIONAL),
    _row("browser_events", "content", "content", "text", CATEGORY_INFORMATIONAL),
    _row("browser_events", "tab_id", "tab_id", "integer", CATEGORY_ORGANIZATIONAL),
    _row("browser_events", "window_id", "window_id", "integer", CATEGORY_ORGANIZATIONAL),
    _row("browser_events", "incognito", "incognito", "integer", CATEGORY_ORGANIZATIONAL),
    _row("browser_events", "pinned", "pinned", "integer", CATEGORY_ORGANIZATIONAL),
    _row("browser_events", "audible", "audible", "integer", CATEGORY_ORGANIZATIONAL),
    _row("browser_events", "muted", "muted", "integer", CATEGORY_ORGANIZATIONAL),
    _row("browser_events", "opener_tab_id", "opener_tab_id", "integer", CATEGORY_ORGANIZATIONAL),
    _row("browser_events", "starred_at", "starred_at", "timestamp_utc", CATEGORY_INFORMATIONAL),
    _row("browser_events", "created_at", "created_at", "timestamp_utc", CATEGORY_ORGANIZATIONAL),
    # ----- browser_url_classification (7.6) -----
    _row("browser_url_classification", "source_table", "enriched_from_table", "identifier", CATEGORY_ORGANIZATIONAL),  # rename
    _row("browser_url_classification", "record_id", "record_id", "identifier", CATEGORY_ORGANIZATIONAL),
    _row("browser_url_classification", "dataset_id", "dataset_id", "identifier", CATEGORY_ORGANIZATIONAL),
    _row("browser_url_classification", "url", "url", "text", CATEGORY_INFORMATIONAL),
    _row("browser_url_classification", "title", "title", "text", CATEGORY_INFORMATIONAL),
    _row("browser_url_classification", "url_category", "url_category", "text", CATEGORY_INFORMATIONAL),
    _row("browser_url_classification", "url_confidence", "url_confidence", "real", CATEGORY_INFORMATIONAL),
    _row("browser_url_classification", "model_name", "model_name", "text", CATEGORY_ORGANIZATIONAL),
    _row("browser_url_classification", "created_at", "created_at", "timestamp_utc", CATEGORY_ORGANIZATIONAL),
    _row("browser_url_classification", "updated_at", "updated_at", "timestamp_utc", CATEGORY_ORGANIZATIONAL),
    # ----- message_emotions (7.7) -----
    _row("message_emotions", "message_id", "message_id", "identifier", CATEGORY_ORGANIZATIONAL),
    _row("message_emotions", "source_id", "source_id", "text", CATEGORY_ORGANIZATIONAL),
    _row("message_emotions", "emotion_label", "emotion_label", "text", CATEGORY_INFORMATIONAL),
    _row("message_emotions", "confidence", "confidence", "real", CATEGORY_INFORMATIONAL),
    _row("message_emotions", "model", "model_name", "text", CATEGORY_ORGANIZATIONAL),  # rename
    _row("message_emotions", "all_emotions", "all_emotions_json", "json", CATEGORY_INFORMATIONAL),  # rename
    _row("message_emotions", "created_at", "created_at", "timestamp_utc", CATEGORY_ORGANIZATIONAL),
    # ----- contacts (Stage 11: contacts:resolve — canonical messenger address book) -----
    # See topos/storage/canonical/conversations_tables.py CREATE TABLE contacts / contact_identifiers
    _row("contacts", "contact_id", "contact_id", "identifier", CATEGORY_ORGANIZATIONAL),
    _row("contacts", "dataset_id", "dataset_id", "identifier", CATEGORY_ORGANIZATIONAL),
    _row("contacts", "source_id", "source_id", "text", CATEGORY_ORGANIZATIONAL),
    _row("contacts", "display_name", "display_name", "text", CATEGORY_INFORMATIONAL),
    _row("contacts", "known_usernames_json", "known_usernames_json", "json", CATEGORY_INFORMATIONAL),
    _row("contacts", "is_self", "is_self", "integer", CATEGORY_ORGANIZATIONAL),
    _row("contacts", "last_import_source", "last_import_source", "text", CATEGORY_ORGANIZATIONAL),
    _row("contacts", "last_import_run_id", "last_import_run_id", "identifier", CATEGORY_ORGANIZATIONAL),
    _row("contacts", "last_imported_at", "last_imported_at", "timestamp_utc", CATEGORY_ORGANIZATIONAL),
    _row("contacts", "sharing_policy_json", "sharing_policy_json", "json", CATEGORY_INFORMATIONAL),
    _row("contacts", "created_at", "created_at", "timestamp_utc", CATEGORY_ORGANIZATIONAL),
    _row("contacts", "updated_at", "updated_at", "timestamp_utc", CATEGORY_ORGANIZATIONAL),
    # ----- contact_identifiers (listed under the contacts section above) -----
    _row("contact_identifiers", "dataset_id", "dataset_id", "identifier", CATEGORY_ORGANIZATIONAL),
    _row("contact_identifiers", "source_id", "source_id", "text", CATEGORY_ORGANIZATIONAL),
    _row("contact_identifiers", "identifier", "identifier", "text", CATEGORY_INFORMATIONAL),
    _row("contact_identifiers", "identifier_type", "identifier_type", "text", CATEGORY_INFORMATIONAL),
    _row("contact_identifiers", "contact_id", "contact_id", "identifier", CATEGORY_ORGANIZATIONAL),
    _row("contact_identifiers", "created_at", "created_at", "timestamp_utc", CATEGORY_ORGANIZATIONAL),
    _row("contact_identifiers", "updated_at", "updated_at", "timestamp_utc", CATEGORY_ORGANIZATIONAL),
]
|
|
160
|
+
|
|
161
|
+
# Tables covered by this registry (for validation and iteration).
|
|
162
|
+
# Table names in the same order as their sections appear in SCHEMA_REGISTRY.
STAGE_9_TABLE_NAMES = [
    "conversation_messages",
    "ai_chat_messages",
    "ai_chat_conversations",
    "browser_visits",
    "browser_events",
    "browser_url_classification",
    "message_emotions",
    # NOTE(review): the registry labels the contacts section "Stage 11", yet these
    # tables are listed under the STAGE_9 name — confirm the naming is intentional.
    "contacts",
    "contact_identifiers",
]
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def get_columns_for_table(
    table_name: str,
    *,
    include_organizational: bool = True,
    use_canonical_names: bool = False,
) -> List[Dict[str, Any]]:
    """
    Collect the registry entries belonging to one table.

    Each returned dict is a copy of the registry entry with an extra
    ``column_name`` key holding the name to display/use.

    include_organizational: When False, only informational columns are returned.
    use_canonical_names: When True, ``column_name`` is the canonical name;
        otherwise it is the current name (the name in the DB today).
    """
    name_key = "canonical_column_name" if use_canonical_names else "current_column_name"
    selected: List[Dict[str, Any]] = []
    for entry in SCHEMA_REGISTRY:
        if entry["table_name"] != table_name:
            continue
        if not include_organizational and entry["category"] != CATEGORY_INFORMATIONAL:
            continue
        # Copy so the shared registry entry is not mutated by adding column_name.
        enriched = dict(entry)
        enriched["column_name"] = entry[name_key]
        selected.append(enriched)
    return selected
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def get_informational_columns(table_name: str) -> List[Dict[str, Any]]:
    """Return the table's informational columns only, keyed by current column names."""
    informational_only = get_columns_for_table(
        table_name,
        include_organizational=False,
    )
    return informational_only
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def get_rename_targets() -> List[Dict[str, Any]]:
    """Return every registry entry flagged as a rename target (current name differs from canonical)."""
    targets: List[Dict[str, Any]] = []
    for entry in SCHEMA_REGISTRY:
        if entry["rename_target"]:
            targets.append(entry)
    return targets
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def get_registry_as_list(
    *,
    include_organizational: bool = True,
) -> List[Dict[str, Any]]:
    """Return registry entries as a new list (e.g. for JSON export or API).

    With ``include_organizational=False`` only informational entries are kept.
    The list is new, but the entry dicts are the registry's own objects.
    """
    if not include_organizational:
        return [
            entry
            for entry in SCHEMA_REGISTRY
            if entry["category"] == CATEGORY_INFORMATIONAL
        ]
    return SCHEMA_REGISTRY.copy()
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def get_table_names() -> List[str]:
    """Return a fresh copy of the registry's table-name list."""
    return [*STAGE_9_TABLE_NAMES]
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
def resolve_column_to_canonical(table_name: str, current_column_name: str) -> Optional[str]:
    """
    Look up the canonical name for a table's current (DB) column name.

    Returns the canonical name of the first matching registry entry; for a
    column that is not a rename target this equals ``current_column_name``.
    Returns None when the (table, column) pair is not in the registry.
    """
    canonical_names = (
        entry["canonical_column_name"]
        for entry in SCHEMA_REGISTRY
        if entry["table_name"] == table_name
        and entry["current_column_name"] == current_column_name
    )
    return next(canonical_names, None)
|
topos/__init__.py
ADDED
topos/__version__.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""Analytics layer for Topos."""
|
|
2
|
+
|
|
3
|
+
from .messenger_communities import (
|
|
4
|
+
compute_and_persist_messenger_analytics,
|
|
5
|
+
compute_importance_and_communities,
|
|
6
|
+
ensure_messenger_analytics_tables,
|
|
7
|
+
)
|
|
8
|
+
from .messenger_graph import extract_messenger_graph
|
|
9
|
+
|
|
10
|
+
# Names re-exported by this package; each is imported above from a submodule.
__all__ = [
    "extract_messenger_graph",
    "compute_importance_and_communities",
    "compute_and_persist_messenger_analytics",
    "ensure_messenger_analytics_tables",
]
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
"""DuckDB adapter for analytics queries."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any, Dict, List, Optional
|
|
8
|
+
|
|
9
|
+
try:
|
|
10
|
+
import duckdb
|
|
11
|
+
except ImportError:
|
|
12
|
+
duckdb = None # type: ignore
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger("topos.analytics.duckdb")
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class DuckDBAdapter:
    """Small wrapper around a DuckDB connection that returns rows as dicts."""

    def __init__(self, db_path: Optional[Path] = None):
        """Open a DuckDB connection.

        Args:
            db_path: Database file to open; when omitted (or falsy) an
                in-memory database is used.

        Raises:
            ImportError: If the optional ``duckdb`` package is not installed.
        """
        if duckdb is None:
            raise ImportError("duckdb package not installed")
        self.conn = duckdb.connect(str(db_path) if db_path else ":memory:")

    @staticmethod
    def _quote_literal(value: str) -> str:
        """Return ``value`` as a single-quoted SQL string literal (quotes doubled)."""
        return "'" + value.replace("'", "''") + "'"

    def _fetch_dicts(self, query: str, params: Optional[List[Any]] = None) -> List[Dict[str, Any]]:
        """Run ``query`` with bound ``params`` and return each row as a column->value dict."""
        result = self.conn.execute(query, params or []).fetchall()
        description = self.conn.description
        columns = [desc[0] for desc in description] if description else []
        return [dict(zip(columns, row)) for row in result]

    def attach_sqlite(self, sqlite_path: str) -> None:
        """Attach the SQLite database at ``sqlite_path`` under the alias ``projection``."""
        # The path is embedded as an escaped literal, exactly as before.
        self.conn.execute(
            f"ATTACH {self._quote_literal(sqlite_path)} AS projection (TYPE SQLITE)"
        )

    def query_jsonl_file(
        self,
        file_path: str,
        dataset_id: Optional[str] = None,
    ) -> List[Dict[str, Any]]:
        """Read a newline-delimited JSON file and return its rows as dicts.

        Args:
            file_path: Path handed to DuckDB's ``read_ndjson``.
            dataset_id: When truthy, only rows whose ``dataset_id`` column
                equals this value are returned.
        """
        query = f"SELECT * FROM read_ndjson({self._quote_literal(file_path)})"
        params: List[Any] = []
        if dataset_id:
            # Bind the value rather than interpolating it: a dataset_id that
            # contains a quote would otherwise break or alter the statement.
            query += " WHERE dataset_id = ?"
            params.append(dataset_id)
        return self._fetch_dicts(query, params)

    def execute(self, query: str, params: Optional[List[Any]] = None) -> List[Dict[str, Any]]:
        """Execute ``query`` with optional positional ``params``; return rows as dicts."""
        return self._fetch_dicts(query, params)

    def close(self) -> None:
        """Close the underlying connection if one is set."""
        if self.conn:
            self.conn.close()
|
|
@@ -0,0 +1,349 @@
|
|
|
1
|
+
"""Messenger graph importance and community detection (Sprint 02)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from datetime import datetime, timezone
|
|
7
|
+
from typing import Any, Dict, List, Optional, Sequence
|
|
8
|
+
|
|
9
|
+
import networkx as nx
|
|
10
|
+
|
|
11
|
+
from .messenger_graph import extract_messenger_graph
|
|
12
|
+
|
|
13
|
+
# Names of the derived tables created by ensure_messenger_analytics_tables().
MESSENGER_SOCIAL_EDGES_TABLE = "messenger_social_edges"
MESSENGER_PARTICIPANT_IMPORTANCE_TABLE = "messenger_participant_importance"
MESSENGER_COMMUNITIES_TABLE = "messenger_communities"
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _utc_now() -> str:
|
|
19
|
+
return datetime.now(timezone.utc).isoformat()
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _source_scope(source_ids: Optional[Sequence[str]]) -> str:
|
|
23
|
+
if not source_ids:
|
|
24
|
+
return "all"
|
|
25
|
+
normalized = sorted({str(s).strip() for s in source_ids if str(s).strip()})
|
|
26
|
+
return ",".join(normalized) if normalized else "all"
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def ensure_messenger_analytics_tables(conn: Any) -> None:
    """Create Sprint 02 derived messenger analytics tables.

    Issues ``CREATE TABLE IF NOT EXISTS`` for the social-edges,
    participant-importance and communities tables, each followed by a
    ``CREATE INDEX IF NOT EXISTS`` on ``(dataset_id, period_key, source_scope)``,
    then commits. Because every statement uses ``IF NOT EXISTS``, objects that
    are already present are left as they are.

    Args:
        conn: Connection object exposing ``execute`` and ``commit``.
    """
    # Edge rows: one per (dataset, period, scope, source, target).
    conn.execute(
        f"""
        CREATE TABLE IF NOT EXISTS {MESSENGER_SOCIAL_EDGES_TABLE} (
            dataset_id TEXT NOT NULL,
            period_key TEXT NOT NULL,
            source_scope TEXT NOT NULL DEFAULT 'all',
            source_id TEXT NOT NULL,
            target_id TEXT NOT NULL,
            weight REAL NOT NULL,
            edge_type TEXT,
            edge_type_counts_json TEXT,
            created_at TEXT NOT NULL,
            updated_at TEXT NOT NULL,
            PRIMARY KEY (dataset_id, period_key, source_scope, source_id, target_id)
        )
        """
    )
    conn.execute(
        f"""
        CREATE INDEX IF NOT EXISTS idx_{MESSENGER_SOCIAL_EDGES_TABLE}_dataset_period
        ON {MESSENGER_SOCIAL_EDGES_TABLE}(dataset_id, period_key, source_scope)
        """
    )

    # Centrality rows: one per (dataset, period, scope, participant).
    conn.execute(
        f"""
        CREATE TABLE IF NOT EXISTS {MESSENGER_PARTICIPANT_IMPORTANCE_TABLE} (
            dataset_id TEXT NOT NULL,
            period_key TEXT NOT NULL,
            source_scope TEXT NOT NULL DEFAULT 'all',
            participant_id TEXT NOT NULL,
            centrality_degree REAL NOT NULL,
            centrality_betweenness REAL NOT NULL,
            created_at TEXT NOT NULL,
            updated_at TEXT NOT NULL,
            PRIMARY KEY (dataset_id, period_key, source_scope, participant_id)
        )
        """
    )
    conn.execute(
        f"""
        CREATE INDEX IF NOT EXISTS idx_{MESSENGER_PARTICIPANT_IMPORTANCE_TABLE}_dataset_period
        ON {MESSENGER_PARTICIPANT_IMPORTANCE_TABLE}(dataset_id, period_key, source_scope)
        """
    )

    # Community membership rows: one per (dataset, period, scope, participant).
    conn.execute(
        f"""
        CREATE TABLE IF NOT EXISTS {MESSENGER_COMMUNITIES_TABLE} (
            dataset_id TEXT NOT NULL,
            period_key TEXT NOT NULL,
            source_scope TEXT NOT NULL DEFAULT 'all',
            participant_id TEXT NOT NULL,
            community_id INTEGER NOT NULL,
            created_at TEXT NOT NULL,
            updated_at TEXT NOT NULL,
            PRIMARY KEY (dataset_id, period_key, source_scope, participant_id)
        )
        """
    )
    conn.execute(
        f"""
        CREATE INDEX IF NOT EXISTS idx_{MESSENGER_COMMUNITIES_TABLE}_dataset_period
        ON {MESSENGER_COMMUNITIES_TABLE}(dataset_id, period_key, source_scope)
        """
    )
    # Single commit after all six DDL statements.
    conn.commit()
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def build_networkx_graph(period_payload: Dict[str, Any]) -> nx.Graph:
    """Turn one period payload into an undirected, weighted ``nx.Graph``.

    Nodes come from ``period_payload["nodes"]``: the stripped string form of
    each entry's ``id`` is the node key and the entry's own items become node
    attributes; entries without a usable id are ignored. Edges come from
    ``period_payload["edges"]``: entries with a blank endpoint or identical
    endpoints are ignored, and the stored ``weight`` is clamped to be
    non-negative (missing or falsy weights count as ``0.0``).
    """
    graph = nx.Graph()

    for node_attrs in period_payload.get("nodes", []):
        identifier = str(node_attrs.get("id") or "").strip()
        if identifier:
            graph.add_node(identifier, **node_attrs)

    for link in period_payload.get("edges", []):
        endpoint_a = str(link.get("source") or "").strip()
        endpoint_b = str(link.get("target") or "").strip()
        # Keep only well-formed edges between two distinct participants.
        if endpoint_a and endpoint_b and endpoint_a != endpoint_b:
            raw_weight = float(link.get("weight") or 0.0)
            graph.add_edge(endpoint_a, endpoint_b, weight=max(raw_weight, 0.0))

    return graph
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def compute_importance_and_communities(period_payload: Dict[str, Any]) -> Dict[str, Any]:
    """Derive per-node centrality and community ids for a single period.

    Returns a dict with three keys:
        ``importance``: node id -> ``centrality_degree`` / ``centrality_betweenness``.
        ``communities``: node id -> integer community id.
        ``graph``: the graph built by ``build_networkx_graph``.

    A payload that yields no nodes produces empty ``importance`` and
    ``communities`` mappings.
    """
    graph = build_networkx_graph(period_payload)
    node_ids = list(graph.nodes())
    if not node_ids:
        return {"importance": {}, "communities": {}, "graph": graph}

    degree = nx.degree_centrality(graph)
    # Use unweighted betweenness to keep metric stable with strength-based edges.
    betweenness = nx.betweenness_centrality(graph, weight=None, normalized=True)

    if graph.number_of_edges() == 0:
        # No edges: every node is its own community, numbered in sorted-id order.
        communities = {member: rank for rank, member in enumerate(sorted(node_ids))}
    else:
        try:
            from community import community_louvain  # type: ignore

            communities = community_louvain.best_partition(graph, weight="weight", random_state=42)
        except Exception:
            # Fallback keeps pipeline functional if python-louvain is unavailable at runtime.
            communities = {
                member: component_index
                for component_index, members in enumerate(nx.connected_components(graph))
                for member in members
            }

    importance: Dict[str, Dict[str, float]] = {
        member: {
            "centrality_degree": float(degree.get(member, 0.0)),
            "centrality_betweenness": float(betweenness.get(member, 0.0)),
        }
        for member in node_ids
    }
    normalized_communities = {member: int(label) for member, label in communities.items()}
    return {
        "importance": importance,
        "communities": normalized_communities,
        "graph": graph,
    }
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def _persist_period_results(
    conn: Any,
    *,
    dataset_id: str,
    period_key: str,
    source_scope: str,
    period_payload: Dict[str, Any],
    importance: Dict[str, Dict[str, float]],
    communities: Dict[str, int],
) -> Dict[str, int]:
    """Replace the stored analytics rows for one (dataset, period, scope).

    Existing edge, importance and community rows for the key are deleted and
    the new rows inserted, followed by a commit. If any statement fails, the
    connection is rolled back before the error is re-raised, so the pending
    DELETEs cannot be persisted by a later commit on the same connection.

    Args:
        conn: Connection exposing ``execute``, ``executemany``, ``commit``
            and ``rollback``.
        dataset_id: Dataset the rows belong to.
        period_key: Period identifier the rows belong to.
        source_scope: Canonical source-scope string stored with every row.
        period_payload: Period payload; its ``edges`` list is persisted.
        importance: Participant id -> centrality metrics.
        communities: Participant id -> community id.

    Returns:
        Counts of rows written, under ``edges_written``,
        ``importance_written`` and ``communities_written``.

    Raises:
        Exception: Whatever the connection raised while writing; re-raised
            unchanged after the rollback attempt.
    """
    try:
        return _write_period_rows(
            conn,
            dataset_id=dataset_id,
            period_key=period_key,
            source_scope=source_scope,
            period_payload=period_payload,
            importance=importance,
            communities=communities,
        )
    except Exception:
        try:
            conn.rollback()
        except Exception:
            # Best-effort cleanup: a failing rollback must not mask the
            # original write error that is re-raised below.
            pass
        raise


def _write_period_rows(
    conn: Any,
    *,
    dataset_id: str,
    period_key: str,
    source_scope: str,
    period_payload: Dict[str, Any],
    importance: Dict[str, Dict[str, float]],
    communities: Dict[str, int],
) -> Dict[str, int]:
    """Delete, re-insert and commit one period's rows; no error handling."""
    now = _utc_now()
    conn.execute(
        f"""
        DELETE FROM {MESSENGER_SOCIAL_EDGES_TABLE}
        WHERE dataset_id = ? AND period_key = ? AND source_scope = ?
        """,
        (dataset_id, period_key, source_scope),
    )
    conn.execute(
        f"""
        DELETE FROM {MESSENGER_PARTICIPANT_IMPORTANCE_TABLE}
        WHERE dataset_id = ? AND period_key = ? AND source_scope = ?
        """,
        (dataset_id, period_key, source_scope),
    )
    conn.execute(
        f"""
        DELETE FROM {MESSENGER_COMMUNITIES_TABLE}
        WHERE dataset_id = ? AND period_key = ? AND source_scope = ?
        """,
        (dataset_id, period_key, source_scope),
    )

    edge_rows = []
    for edge in period_payload.get("edges", []):
        source_id = str(edge.get("source") or "").strip()
        target_id = str(edge.get("target") or "").strip()
        # Edges with a blank endpoint cannot satisfy the NOT NULL key columns
        # meaningfully, so they are skipped.
        if not source_id or not target_id:
            continue
        edge_rows.append(
            (
                dataset_id,
                period_key,
                source_scope,
                source_id,
                target_id,
                float(edge.get("weight") or 0.0),
                str(edge.get("edge_type") or ""),
                json.dumps(edge.get("edge_type_counts") or {}, ensure_ascii=False),
                now,
                now,
            )
        )
    if edge_rows:
        conn.executemany(
            f"""
            INSERT INTO {MESSENGER_SOCIAL_EDGES_TABLE}
            (
                dataset_id, period_key, source_scope, source_id, target_id,
                weight, edge_type, edge_type_counts_json, created_at, updated_at
            )
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """,
            edge_rows,
        )

    importance_rows = [
        (
            dataset_id,
            period_key,
            source_scope,
            participant_id,
            float(metrics.get("centrality_degree", 0.0)),
            float(metrics.get("centrality_betweenness", 0.0)),
            now,
            now,
        )
        for participant_id, metrics in importance.items()
    ]
    if importance_rows:
        conn.executemany(
            f"""
            INSERT INTO {MESSENGER_PARTICIPANT_IMPORTANCE_TABLE}
            (
                dataset_id, period_key, source_scope, participant_id,
                centrality_degree, centrality_betweenness, created_at, updated_at
            )
            VALUES (?, ?, ?, ?, ?, ?, ?, ?)
            """,
            importance_rows,
        )

    community_rows = [
        (
            dataset_id,
            period_key,
            source_scope,
            participant_id,
            int(community_id),
            now,
            now,
        )
        for participant_id, community_id in communities.items()
    ]
    if community_rows:
        conn.executemany(
            f"""
            INSERT INTO {MESSENGER_COMMUNITIES_TABLE}
            (
                dataset_id, period_key, source_scope, participant_id,
                community_id, created_at, updated_at
            )
            VALUES (?, ?, ?, ?, ?, ?, ?)
            """,
            community_rows,
        )
    conn.commit()
    return {
        "edges_written": len(edge_rows),
        "importance_written": len(importance_rows),
        "communities_written": len(community_rows),
    }
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
def compute_and_persist_messenger_analytics(
    *,
    dataset_id: str,
    conn: Optional[Any] = None,
    start_ts: Optional[str] = None,
    end_ts: Optional[str] = None,
    source_ids: Optional[Sequence[str]] = None,
    period_granularity: str = "month",
    cumulative: bool = False,
) -> Dict[str, Any]:
    """Extract the messenger graph, compute metrics per period and store them.

    Uses ``conn`` when supplied, otherwise the connection returned by
    ``get_db_connection``. The analytics tables are ensured first, then each
    extracted period with a non-empty ``period_key`` is analysed and
    persisted.

    Returns:
        A summary with ``dataset_id``, ``period_granularity``,
        ``source_scope``, a per-period list of write counts plus payload
        node/edge counts (``periods``), and the summed write counts
        (``totals``).

    Raises:
        RuntimeError: If no database connection is available.
    """
    db = conn
    if db is None:
        # Imported lazily, only when the caller did not supply a connection.
        from ..core.state import get_db_connection

        db = get_db_connection()
    if db is None:
        raise RuntimeError("Database connection not available")

    ensure_messenger_analytics_tables(db)
    extraction = extract_messenger_graph(
        dataset_id=dataset_id,
        conn=db,
        start_ts=start_ts,
        end_ts=end_ts,
        source_ids=source_ids,
        period_granularity=period_granularity,
        cumulative=cumulative,
    )

    scope = _source_scope(source_ids)
    totals = {"edges_written": 0, "importance_written": 0, "communities_written": 0}
    periods_out: List[Dict[str, Any]] = []

    for payload in extraction.get("periods", []):
        key = str(payload.get("period_key") or "")
        if key:
            metrics = compute_importance_and_communities(payload)
            writes = _persist_period_results(
                db,
                dataset_id=dataset_id,
                period_key=key,
                source_scope=scope,
                period_payload=payload,
                importance=metrics["importance"],
                communities=metrics["communities"],
            )
            for counter in totals:
                totals[counter] += writes[counter]

            summary: Dict[str, Any] = {"period_key": key}
            summary.update(writes)
            summary["nodes_count"] = len(payload.get("nodes", []))
            summary["edges_count"] = len(payload.get("edges", []))
            periods_out.append(summary)

    return {
        "dataset_id": dataset_id,
        "period_granularity": period_granularity,
        "source_scope": scope,
        "periods": periods_out,
        "totals": totals,
    }
|