topos-node 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (249) hide show
  1. shared/__init__.py +59 -0
  2. shared/filtering.py +640 -0
  3. shared/schema_registry.py +229 -0
  4. topos/__init__.py +5 -0
  5. topos/__version__.py +6 -0
  6. topos/analytics/__init__.py +15 -0
  7. topos/analytics/duckdb_adapter.py +48 -0
  8. topos/analytics/messenger_communities.py +349 -0
  9. topos/analytics/messenger_graph.py +522 -0
  10. topos/analytics/messenger_labels.py +321 -0
  11. topos/analytics/profiles.py +22 -0
  12. topos/analytics/query_engine.py +64 -0
  13. topos/analytics/raw_queries.py +174 -0
  14. topos/api/__init__.py +1 -0
  15. topos/api/analytics.py +52 -0
  16. topos/api/app_registry.py +31 -0
  17. topos/api/backup.py +15 -0
  18. topos/api/compute_remote.py +175 -0
  19. topos/api/data_commit.py +158 -0
  20. topos/api/data_explorer_table_prefs.py +81 -0
  21. topos/api/db.py +10 -0
  22. topos/api/device.py +25 -0
  23. topos/api/enrichment.py +959 -0
  24. topos/api/filter_lab.py +195 -0
  25. topos/api/health.py +61 -0
  26. topos/api/ingestion_api.py +37 -0
  27. topos/api/ingestion_compat.py +21 -0
  28. topos/api/ingestion_sources.py +600 -0
  29. topos/api/llm.py +76 -0
  30. topos/api/local_mcp.py +46 -0
  31. topos/api/messenger_analytics.py +385 -0
  32. topos/api/query_api.py +13 -0
  33. topos/api/sanitization_ollama_config.py +64 -0
  34. topos/api/source_install.py +324 -0
  35. topos/api/sources.py +13 -0
  36. topos/api/sync.py +10 -0
  37. topos/api/ui_config.py +83 -0
  38. topos/api/uma_data.py +311 -0
  39. topos/api/usage.py +49 -0
  40. topos/api/user_identity.py +46 -0
  41. topos/app.py +239 -0
  42. topos/auth.py +17 -0
  43. topos/canonicalization/__init__.py +1 -0
  44. topos/canonicalization/mappers/__init__.py +22 -0
  45. topos/canonicalization/mappers/base.py +26 -0
  46. topos/canonicalization/mappers/chatgpt_mapper.py +40 -0
  47. topos/canonicalization/mappers/grok_mapper.py +17 -0
  48. topos/canonicalization/mappers/messenger_mapper.py +58 -0
  49. topos/canonicalization/models.py +31 -0
  50. topos/canonicalization/resolver.py +23 -0
  51. topos/cli/__init__.py +1 -0
  52. topos/cli/__main__.py +6 -0
  53. topos/cli/commands.py +132 -0
  54. topos/config/__init__.py +1 -0
  55. topos/config/sanitization_ollama.py +189 -0
  56. topos/config/settings.py +310 -0
  57. topos/contacts/__init__.py +5 -0
  58. topos/contacts/identity.py +24 -0
  59. topos/control_plane_client.py +300 -0
  60. topos/core/__init__.py +1 -0
  61. topos/core/api_models.py +128 -0
  62. topos/core/connection_resilience.py +99 -0
  63. topos/core/device_helpers.py +8 -0
  64. topos/core/errors.py +13 -0
  65. topos/core/events.py +12 -0
  66. topos/core/handlers.py +5625 -0
  67. topos/core/logging.py +175 -0
  68. topos/core/metrics.py +21 -0
  69. topos/core/startup_banner.py +62 -0
  70. topos/core/state.py +682 -0
  71. topos/core/table_layers.py +45 -0
  72. topos/core/types.py +13 -0
  73. topos/data_explorer_table_prefs.py +150 -0
  74. topos/engine/__init__.py +29 -0
  75. topos/engine/backends/__init__.py +50 -0
  76. topos/engine/backends/base.py +21 -0
  77. topos/engine/backends/huggingface.py +151 -0
  78. topos/engine/backends/ollama.py +181 -0
  79. topos/engine/backends/stub.py +22 -0
  80. topos/engine/engine.py +165 -0
  81. topos/engine/intake.py +32 -0
  82. topos/engine/queue_manager.py +112 -0
  83. topos/engine/registration.py +126 -0
  84. topos/engine/result_formatter.py +38 -0
  85. topos/engine/router.py +19 -0
  86. topos/engine/scoped_token.py +82 -0
  87. topos/engine/tasks.py +154 -0
  88. topos/engine/transport.py +44 -0
  89. topos/engine/usage_guard.py +100 -0
  90. topos/engine/usage_observation.py +129 -0
  91. topos/engine/validator.py +23 -0
  92. topos/enrichment/__init__.py +1 -0
  93. topos/enrichment/derived_tables.py +214 -0
  94. topos/enrichment/jobs/__init__.py +30 -0
  95. topos/enrichment/jobs/base.py +54 -0
  96. topos/enrichment/jobs/canonical/__init__.py +1 -0
  97. topos/enrichment/jobs/canonical/embeddings_job.py +27 -0
  98. topos/enrichment/jobs/canonical/emo_27_job.py +97 -0
  99. topos/enrichment/jobs/canonical/entities_job.py +27 -0
  100. topos/enrichment/jobs/canonical/sentiment_job.py +27 -0
  101. topos/enrichment/jobs/canonical/topics_job.py +27 -0
  102. topos/enrichment/jobs/raw/__init__.py +1 -0
  103. topos/enrichment/jobs/raw/attachments_job.py +12 -0
  104. topos/enrichment/jobs/raw/language_job.py +12 -0
  105. topos/enrichment/jobs/raw/time_normalization_job.py +12 -0
  106. topos/enrichment/jobs/raw/tool_calls_job.py +12 -0
  107. topos/enrichment/models/__init__.py +1 -0
  108. topos/enrichment/models/manager.py +8 -0
  109. topos/enrichment/models/registry.py +71 -0
  110. topos/enrichment/models/versioning.py +8 -0
  111. topos/enrichment/orchestrator.py +177 -0
  112. topos/enrichment/processor.py +17 -0
  113. topos/enrichment/progress_bar.py +122 -0
  114. topos/enrichment/website_classifier.py +31 -0
  115. topos/filter_lab/__init__.py +1 -0
  116. topos/filter_lab/bundles.py +300 -0
  117. topos/filter_lab/schema.py +86 -0
  118. topos/filter_lab/service.py +167 -0
  119. topos/filter_lab/store.py +374 -0
  120. topos/filter_lab/worker.py +250 -0
  121. topos/hosted_pool_lease.py +153 -0
  122. topos/ingestion/__init__.py +1 -0
  123. topos/ingestion/checkpoints/__init__.py +6 -0
  124. topos/ingestion/checkpoints/checkpoint_store.py +24 -0
  125. topos/ingestion/checkpoints/sqlite_checkpoint_store.py +82 -0
  126. topos/ingestion/ingest_helpers.py +504 -0
  127. topos/ingestion/jobs.py +91 -0
  128. topos/ingestion/local_sync.py +823 -0
  129. topos/ingestion/log_preview.py +21 -0
  130. topos/ingestion/manager.py +1100 -0
  131. topos/ingestion/parser.py +174 -0
  132. topos/ingestion/parsers/__init__.py +32 -0
  133. topos/ingestion/parsers/base.py +24 -0
  134. topos/ingestion/parsers/browser_parser.py +171 -0
  135. topos/ingestion/parsers/calendar_parser.py +21 -0
  136. topos/ingestion/parsers/chatgpt_conversation_flattener.py +266 -0
  137. topos/ingestion/parsers/chatgpt_parser.py +67 -0
  138. topos/ingestion/parsers/grok_parser.py +21 -0
  139. topos/ingestion/parsers/messenger_parser.py +97 -0
  140. topos/ingestion/progress.py +54 -0
  141. topos/ingestion/sources/__init__.py +20 -0
  142. topos/ingestion/sources/base.py +39 -0
  143. topos/ingestion/sources/calendar.py +29 -0
  144. topos/ingestion/sources/chatgpt.py +29 -0
  145. topos/ingestion/sources/contact_importers.py +274 -0
  146. topos/ingestion/sources/grok.py +29 -0
  147. topos/ingestion/sources/imessage_reader.py +479 -0
  148. topos/ingestion/sources/signal_export_parser.py +132 -0
  149. topos/ingestion/sources/signal_reader.py +491 -0
  150. topos/ingestion/state_machine.py +70 -0
  151. topos/ingestion/triggers/__init__.py +1 -0
  152. topos/ingestion/triggers/file_trigger.py +36 -0
  153. topos/ingestion/triggers/sqlite_trigger.py +18 -0
  154. topos/ingestion/validation/__init__.py +1 -0
  155. topos/ingestion/validation/base.py +27 -0
  156. topos/ingestion/validation/schema_registry.py +111 -0
  157. topos/ingestion/validation/schema_validator.py +13 -0
  158. topos/lineage/__init__.py +1 -0
  159. topos/lineage/provenance.py +9 -0
  160. topos/lineage/tracker.py +9 -0
  161. topos/mcp_stdio_proxy.py +83 -0
  162. topos/observability/__init__.py +1 -0
  163. topos/observability/alerts.py +7 -0
  164. topos/observability/metrics.py +25 -0
  165. topos/observability/tracing.py +18 -0
  166. topos/openai_client.py +69 -0
  167. topos/projections/__init__.py +1 -0
  168. topos/projections/vector_index/__init__.py +1 -0
  169. topos/projections/vector_index/base.py +21 -0
  170. topos/projections/vector_index/builders.py +11 -0
  171. topos/projections/vector_index/health_checks.py +5 -0
  172. topos/rate_limit.py +43 -0
  173. topos/sanitization/__init__.py +16 -0
  174. topos/sanitization/ollama_transforms.py +276 -0
  175. topos/scope_resolution.py +89 -0
  176. topos/services/__init__.py +1 -0
  177. topos/services/container.py +46 -0
  178. topos/services/embeddings/__init__.py +1 -0
  179. topos/services/embeddings/base.py +7 -0
  180. topos/services/embeddings/local.py +9 -0
  181. topos/services/embeddings/remote.py +9 -0
  182. topos/services/interfaces.py +40 -0
  183. topos/services/llm/__init__.py +1 -0
  184. topos/services/llm/base.py +7 -0
  185. topos/services/llm/openai.py +126 -0
  186. topos/services/local.py +123 -0
  187. topos/services/postgres.py +385 -0
  188. topos/sources/__init__.py +6 -0
  189. topos/sources/definitions.py +114 -0
  190. topos/sources/install_service.py +836 -0
  191. topos/sources/registry.py +263 -0
  192. topos/sources/runtime_install.py +427 -0
  193. topos/storage/__init__.py +1 -0
  194. topos/storage/canonical/__init__.py +18 -0
  195. topos/storage/canonical/ai_chat/__init__.py +22 -0
  196. topos/storage/canonical/ai_chat/canonicalizer.py +147 -0
  197. topos/storage/canonical/ai_chat/mapper.py +168 -0
  198. topos/storage/canonical/ai_chat/model.py +87 -0
  199. topos/storage/canonical/ai_chat/tables.py +179 -0
  200. topos/storage/canonical/canonical_store.py +24 -0
  201. topos/storage/canonical/conversations_tables.py +1020 -0
  202. topos/storage/canonical/mapping_store.py +30 -0
  203. topos/storage/canonical/postgres.py +10 -0
  204. topos/storage/db/__init__.py +1 -0
  205. topos/storage/db/client.py +8 -0
  206. topos/storage/db/migrations/__init__.py +1 -0
  207. topos/storage/db/migrations/stage9_column_renames.py +78 -0
  208. topos/storage/db/paths.py +122 -0
  209. topos/storage/db/postgres.py +240 -0
  210. topos/storage/db/schema.py +6 -0
  211. topos/storage/enrichment/__init__.py +1 -0
  212. topos/storage/enrichment/canonical_enrichment_store.py +7 -0
  213. topos/storage/enrichment/raw_enrichment_store.py +18 -0
  214. topos/storage/normalized/__init__.py +1 -0
  215. topos/storage/normalized/normalized_store.py +24 -0
  216. topos/storage/oplog/__init__.py +1 -0
  217. topos/storage/oplog/decision.py +6 -0
  218. topos/storage/oplog/oplog_store.py +17 -0
  219. topos/storage/oplog/postgres.py +10 -0
  220. topos/storage/projections/__init__.py +1 -0
  221. topos/storage/projections/index_ops_store.py +6 -0
  222. topos/storage/projections/vector_index_store.py +6 -0
  223. topos/storage/raw/__init__.py +1 -0
  224. topos/storage/raw/browser_flat_tables.py +303 -0
  225. topos/storage/raw/file_store.py +100 -0
  226. topos/storage/raw/raw_store.py +29 -0
  227. topos/storage/raw/raw_tables_manager.py +295 -0
  228. topos/storage/raw/sqlite_raw_store.py +17 -0
  229. topos/storage/security/encryption.py +21 -0
  230. topos/storage/signal_identity.py +71 -0
  231. topos/storage/source_settings.py +116 -0
  232. topos/storage/user_identity.py +69 -0
  233. topos/sync/__init__.py +5 -0
  234. topos/sync/client.py +272 -0
  235. topos/sync_handlers.py +70 -0
  236. topos/testing/__init__.py +1 -0
  237. topos/testing/lifespan.py +7 -0
  238. topos/uma_contact_enrichment.py +1032 -0
  239. topos/uma_filters.py +669 -0
  240. topos/uma_resource_id.py +24 -0
  241. topos/uma_rpt.py +69 -0
  242. topos/utils/base_object.py +61 -0
  243. topos/websocket_client.py +21 -0
  244. topos_node-0.1.0.dist-info/METADATA +199 -0
  245. topos_node-0.1.0.dist-info/RECORD +249 -0
  246. topos_node-0.1.0.dist-info/WHEEL +5 -0
  247. topos_node-0.1.0.dist-info/entry_points.txt +2 -0
  248. topos_node-0.1.0.dist-info/licenses/LICENSE +201 -0
  249. topos_node-0.1.0.dist-info/top_level.txt +2 -0
@@ -0,0 +1,123 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any, Dict, Optional
4
+
5
+ from ..core.api_models import (
6
+ DeviceInfoResponse,
7
+ DeviceNameResponse,
8
+ PairDeviceResponse,
9
+ PairingCodeResponse,
10
+ StoreMessageResponse,
11
+ SyncDatabaseResponse,
12
+ SyncResponse,
13
+ )
14
+ from ..core import state
15
+ from ..config.settings import settings
16
+ from fastapi import HTTPException, status
17
+
18
+
19
class LocalDbService:
    """Placeholder database service for local mode.

    Every operation is a stub: it accepts the documented arguments, ignores
    them, and raises ``NotImplementedError``.
    """

    # Single message shared by all stubbed operations.
    _PENDING = "LocalDbService not implemented yet"

    async def store_message(self, payload: Dict[str, Any]) -> StoreMessageResponse:
        """Stub; always raises ``NotImplementedError``."""
        del payload
        raise NotImplementedError(self._PENDING)

    async def get_oplog(self, dataset_id: Optional[str], limit: int, offset: int) -> Dict[str, Any]:
        """Stub; always raises ``NotImplementedError``."""
        del dataset_id, limit, offset
        raise NotImplementedError(self._PENDING)

    async def get_messages(self, dataset_id: Optional[str], limit: int, offset: int) -> Dict[str, Any]:
        """Stub; always raises ``NotImplementedError``."""
        del dataset_id, limit, offset
        raise NotImplementedError(self._PENDING)

    async def replay_projection(self, dataset_id: Optional[str]) -> Dict[str, Any]:
        """Stub; always raises ``NotImplementedError``."""
        del dataset_id
        raise NotImplementedError(self._PENDING)

    async def reset_database(self) -> Dict[str, Any]:
        """Stub; always raises ``NotImplementedError``."""
        raise NotImplementedError(self._PENDING)

    async def sync_database(self) -> SyncDatabaseResponse:
        """Stub; always raises ``NotImplementedError``."""
        raise NotImplementedError(self._PENDING)

    async def backup_database(self, encrypted: bool) -> Any:
        """Stub; always raises ``NotImplementedError``."""
        del encrypted
        raise NotImplementedError(self._PENDING)

    async def restore_database(self, file, authenticated_user_id: str, encrypted: bool) -> Dict[str, Any]:
        """Stub; always raises ``NotImplementedError``."""
        del file, authenticated_user_id, encrypted
        raise NotImplementedError(self._PENDING)
49
+
50
+
51
class LocalSyncService:
    """Placeholder sync service for local mode."""

    _PENDING = "LocalSyncService not implemented yet"

    async def trigger_sync(self) -> SyncResponse:
        """Stub; always raises ``NotImplementedError``."""
        raise NotImplementedError(self._PENDING)
54
+
55
+
56
class LocalDeviceService:
    """Device endpoints for a node running in local mode."""

    async def get_pairing_code(self) -> PairingCodeResponse:
        """Always answer HTTP 501; pairing has no local implementation."""
        raise HTTPException(status_code=status.HTTP_501_NOT_IMPLEMENTED, detail="Pairing not implemented")

    async def pair_device(self, pairing_code: str, keep_existing_data: bool) -> PairDeviceResponse:
        """Always answer HTTP 501; both arguments are ignored."""
        del pairing_code, keep_existing_data
        raise HTTPException(status_code=status.HTTP_501_NOT_IMPLEMENTED, detail="Pairing not implemented")

    async def get_device_info(self) -> DeviceInfoResponse:
        """Assemble a ``DeviceInfoResponse`` from settings and engine state.

        The user id comes from the database when a connection is open and a
        value is stored there, otherwise from ``settings.topos_user_id``. The
        three sync markers are read from the engine config only when a
        database connection exists; the oplog statistics are always ``None``.
        """
        # Database-stored user id wins; settings are the fallback.
        stored_user = None
        if state.db_conn:
            from ..core.state import get_user_id

            stored_user = get_user_id(state.db_conn)
        owner = stored_user if stored_user else settings.topos_user_id
        scoped_dataset = f"{owner}:{settings.topos_default_dataset_id}" if owner else None

        connected = state.sync_client.is_connected() if state.sync_client else False
        enabled = settings.enable_sync and settings.get_sync_url() is not None

        marker_keys = ("last_sync_at", "last_received_hlc_ts", "last_received_op_id")
        if state.db_conn:
            markers = {key: state.get_engine_config_value(state.db_conn, key) for key in marker_keys}
        else:
            markers = dict.fromkeys(marker_keys)

        # Configured engine name first; the host name is only looked up when it is unset.
        display_name = settings.engine_name or state.get_system_info().get("hostname")

        return DeviceInfoResponse(
            user_id=owner,
            dataset_id=scoped_dataset,
            sync_connected=connected,
            sync_enabled=enabled,
            engine_class=state.get_engine_class(),
            engine_mode=state.get_engine_mode(),
            llm_enabled=settings.enable_llm and state.get_engine_mode() == "full",
            database_mode=settings.topos_database_mode,
            engine_name=display_name,
            engine_version=None,
            system=state.get_system_info(),
            last_sync_at=markers["last_sync_at"],
            last_received_hlc_ts=markers["last_received_hlc_ts"],
            last_received_op_id=markers["last_received_op_id"],
            oplog_count=None,
            oplog_bytes=None,
            ops_since_last_sync=None,
            oplog_bytes_since_last_sync=None,
        )

    async def set_device_name(self, device_name: str) -> DeviceNameResponse:
        """Validate and store a device name.

        The name is stripped, must be non-empty and at most 64 characters
        (HTTP 400 otherwise), and is written to the engine config under
        ``"device_name"`` only when a database connection is open.
        """
        cleaned = device_name.strip()
        if len(cleaned) == 0:
            raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Device name cannot be empty")
        if len(cleaned) > 64:
            raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Device name cannot exceed 64 characters")

        if state.db_conn:
            state.set_engine_config_value(state.db_conn, "device_name", cleaned)

        return DeviceNameResponse(status="ok", device_name=cleaned)
118
+
119
+
120
class LocalLLMService:
    """Placeholder LLM service for local mode."""

    _PENDING = "LocalLLMService not implemented yet"

    async def generate(self, payload: Dict[str, Any]) -> Dict[str, Any]:
        """Stub; ignores ``payload`` and always raises ``NotImplementedError``."""
        del payload
        raise NotImplementedError(self._PENDING)
@@ -0,0 +1,385 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from datetime import datetime, timezone
5
+ from typing import Any, Dict, Optional
6
+ from uuid import uuid4
7
+
8
+ from fastapi import HTTPException, status
9
+
10
+ from ..config.settings import settings
11
+ from ..core.api_models import (
12
+ DeviceInfoResponse,
13
+ DeviceNameResponse,
14
+ PairDeviceResponse,
15
+ PairingCodeResponse,
16
+ StoreMessageResponse,
17
+ SyncDatabaseResponse,
18
+ SyncResponse,
19
+ )
20
+ from ..storage.db.postgres import (
21
+ PostgresConfigurationError,
22
+ connect_postgres,
23
+ execute_query,
24
+ fetch_all,
25
+ )
26
+
27
+
28
+ def _utc_now_iso() -> str:
29
+ return datetime.now(timezone.utc).isoformat()
30
+
31
+
32
def _http_error(status_code: int, code: str, message: str) -> HTTPException:
    """Build (not raise) an ``HTTPException`` with a ``code``/``message`` detail."""
    detail = {"code": code, "message": message}
    return HTTPException(status_code=status_code, detail=detail)
34
+
35
+
36
+ def _tenant_from_dataset(dataset_id: Optional[str]) -> str:
37
+ ds = (dataset_id or "").strip()
38
+ if not ds:
39
+ raise _http_error(status.HTTP_400_BAD_REQUEST, "dataset_required", "dataset_id is required")
40
+ if ":" in ds:
41
+ tenant, _ = ds.split(":", 1)
42
+ tenant = tenant.strip()
43
+ else:
44
+ tenant = ds
45
+ if not tenant:
46
+ raise _http_error(status.HTTP_400_BAD_REQUEST, "tenant_required", "dataset_id must include tenant scope")
47
+ return tenant
48
+
49
+
50
def _assert_authenticated_tenant(tenant_id: str) -> None:
    """Reject a tenant that differs from the configured ``settings.topos_user_id``.

    When no user id is configured (empty or blank) the check is skipped and
    every tenant is accepted.

    Raises:
        HTTPException: 403 ``tenant_access_denied`` on mismatch.
    """
    configured = (settings.topos_user_id or "").strip()
    if configured and tenant_id != configured:
        raise _http_error(
            status.HTTP_403_FORBIDDEN,
            "tenant_access_denied",
            "Requested dataset tenant does not match authenticated tenant",
        )
60
+
61
+
62
class PostgresDbService:
    """Hosted-mode database service backed by Postgres.

    Every dataset-scoped operation derives the tenant from ``dataset_id`` and
    checks it with ``_assert_authenticated_tenant`` before touching the
    database. Failures raised while talking to Postgres are reported as
    HTTP 503 with a ``{"code", "message"}`` detail.
    """

    @staticmethod
    def _db_failure(exc: Exception, failure_code: str) -> HTTPException:
        """Translate a database-layer exception into an HTTP 503 error.

        ``PostgresConfigurationError`` maps to ``postgres_not_configured``;
        anything else maps to ``failure_code``.
        """
        code = "postgres_not_configured" if isinstance(exc, PostgresConfigurationError) else failure_code
        return _http_error(status.HTTP_503_SERVICE_UNAVAILABLE, code, str(exc))

    async def store_message(self, payload: Dict[str, Any]) -> StoreMessageResponse:
        """Insert one message row plus a matching ``store_message`` oplog row.

        ``dataset_id``, ``sender_type`` and ``content`` are required (HTTP 400
        when blank). ``message_id`` defaults to a new UUID and ``ts`` to the
        current UTC time; ``ts`` is also used as the oplog ``hlc_ts``.
        """
        dataset_id = (payload.get("dataset_id") or "").strip()
        tenant_id = _tenant_from_dataset(dataset_id)
        _assert_authenticated_tenant(tenant_id)

        sender_type = (payload.get("sender_type") or "").strip()
        if not sender_type:
            raise _http_error(status.HTTP_400_BAD_REQUEST, "sender_type_required", "sender_type is required")
        content = (payload.get("content") or "").strip()
        if not content:
            raise _http_error(status.HTTP_400_BAD_REQUEST, "content_required", "content is required")

        message_id = (payload.get("message_id") or "").strip() or str(uuid4())
        op_id = str(uuid4())
        ts = (payload.get("ts") or "").strip() or _utc_now_iso()
        user_id = (payload.get("user_id") or "").strip() or None

        op_payload = json.dumps(
            {
                "message_id": message_id,
                "sender_type": sender_type,
                "content": content,
                "ts": ts,
            }
        )

        try:
            with connect_postgres() as conn:
                execute_query(
                    conn,
                    """
                    INSERT INTO messages (tenant_id, dataset_id, message_id, sender_type, content, ts, user_id)
                    VALUES (%s, %s, %s, %s, %s, %s, %s)
                    """,
                    (tenant_id, dataset_id, message_id, sender_type, content, ts, user_id),
                )
                execute_query(
                    conn,
                    """
                    INSERT INTO oplog (tenant_id, dataset_id, op_id, op_type, payload_json, hlc_ts)
                    VALUES (%s, %s, %s, %s, %s, %s)
                    """,
                    (tenant_id, dataset_id, op_id, "store_message", op_payload, ts),
                )
        except HTTPException:
            raise
        except Exception as exc:
            raise self._db_failure(exc, "postgres_write_failed") from exc

        return StoreMessageResponse(op_id=op_id, message_id=message_id, status="ok")

    async def get_oplog(self, dataset_id: Optional[str], limit: int, offset: int) -> Dict[str, Any]:
        """Return one page of oplog entries for a dataset, newest ``hlc_ts`` first.

        ``limit`` is clamped to 1..1000 and ``offset`` to >= 0. The dataset id
        is stripped before querying so it matches what ``store_message`` persists.
        """
        dataset_id = (dataset_id or "").strip()
        tenant_id = _tenant_from_dataset(dataset_id)
        _assert_authenticated_tenant(tenant_id)
        page_limit = max(1, min(int(limit), 1000))
        page_offset = max(0, int(offset))
        try:
            with connect_postgres() as conn:
                rows = fetch_all(
                    conn,
                    """
                    SELECT op_id, op_type, payload_json, hlc_ts, dataset_id
                    FROM oplog
                    WHERE tenant_id = %s AND dataset_id = %s
                    ORDER BY hlc_ts DESC
                    LIMIT %s OFFSET %s
                    """,
                    (tenant_id, dataset_id, page_limit, page_offset),
                )
        except HTTPException:
            raise
        except Exception as exc:
            raise self._db_failure(exc, "postgres_read_failed") from exc

        items = [
            {
                "op_id": row[0],
                "op_type": row[1],
                # payload_json is decoded here; an empty/NULL value becomes {}.
                "payload": json.loads(row[2]) if row[2] else {},
                "hlc_ts": row[3],
                "dataset_id": row[4],
            }
            for row in rows
        ]
        return {"status": "ok", "dataset_id": dataset_id, "items": items, "limit": page_limit, "offset": page_offset}

    async def get_messages(self, dataset_id: Optional[str], limit: int, offset: int) -> Dict[str, Any]:
        """Return one page of messages for a dataset, newest ``ts`` first.

        ``limit`` is clamped to 1..1000 and ``offset`` to >= 0. The dataset id
        is stripped before querying so it matches what ``store_message`` persists.
        """
        dataset_id = (dataset_id or "").strip()
        tenant_id = _tenant_from_dataset(dataset_id)
        _assert_authenticated_tenant(tenant_id)
        page_limit = max(1, min(int(limit), 1000))
        page_offset = max(0, int(offset))
        try:
            with connect_postgres() as conn:
                rows = fetch_all(
                    conn,
                    """
                    SELECT message_id, sender_type, content, ts, user_id, dataset_id
                    FROM messages
                    WHERE tenant_id = %s AND dataset_id = %s
                    ORDER BY ts DESC
                    LIMIT %s OFFSET %s
                    """,
                    (tenant_id, dataset_id, page_limit, page_offset),
                )
        except HTTPException:
            raise
        except Exception as exc:
            raise self._db_failure(exc, "postgres_read_failed") from exc

        items = [
            {
                "message_id": row[0],
                "sender_type": row[1],
                "content": row[2],
                "ts": row[3],
                "user_id": row[4],
                "dataset_id": row[5],
            }
            for row in rows
        ]
        return {"status": "ok", "dataset_id": dataset_id, "messages": items, "limit": page_limit, "offset": page_offset}

    async def replay_projection(self, dataset_id: Optional[str]) -> Dict[str, Any]:
        """Report how many messages exist for a dataset.

        No projection is rebuilt here; the method only counts matching rows
        and returns that count as ``replayed_messages``.
        """
        dataset_id = (dataset_id or "").strip()
        tenant_id = _tenant_from_dataset(dataset_id)
        _assert_authenticated_tenant(tenant_id)
        try:
            with connect_postgres() as conn:
                rows = fetch_all(
                    conn,
                    """
                    SELECT COUNT(*) FROM messages
                    WHERE tenant_id = %s AND dataset_id = %s
                    """,
                    (tenant_id, dataset_id),
                )
        except HTTPException:
            raise
        except Exception as exc:
            raise self._db_failure(exc, "postgres_replay_failed") from exc
        replayed = int(rows[0][0]) if rows else 0
        return {"status": "ok", "dataset_id": dataset_id, "replayed_messages": replayed}

    async def reset_database(self) -> Dict[str, Any]:
        """Always refuse with HTTP 403 ``reset_forbidden``."""
        raise _http_error(
            status.HTTP_403_FORBIDDEN,
            "reset_forbidden",
            "Hosted mode database reset is not allowed without tenant-scoped maintenance flow",
        )

    async def sync_database(self) -> SyncDatabaseResponse:
        """Return a static ``ok`` response; nothing is exported or synced."""
        return SyncDatabaseResponse(
            status="ok",
            message="Hosted mode uses server-backed Postgres and does not require local sync export",
        )

    async def backup_database(self, encrypted: bool) -> Any:
        """Export messages and oplog rows as UTF-8 encoded JSON bytes.

        When ``settings.topos_user_id`` is configured the export is limited to
        that tenant, matching the tenant check applied by every other method
        (``restore_database`` discards other tenants' rows anyway). Without a
        configured user id all rows are exported, as before.

        ``encrypted`` is accepted for interface compatibility and ignored.
        """
        _ = encrypted
        tenant_scope = (settings.topos_user_id or "").strip()
        # Both values are constants chosen here, never caller input.
        scope_sql = "WHERE tenant_id = %s" if tenant_scope else ""
        # Parameters are passed only for the scoped query, keeping the
        # unscoped call identical to the original two-argument form.
        scope_args = ((tenant_scope,),) if tenant_scope else ()
        try:
            with connect_postgres() as conn:
                messages = fetch_all(
                    conn,
                    f"""
                    SELECT tenant_id, dataset_id, message_id, sender_type, content, ts, user_id
                    FROM messages
                    {scope_sql}
                    ORDER BY ts ASC
                    """,
                    *scope_args,
                )
                oplog = fetch_all(
                    conn,
                    f"""
                    SELECT tenant_id, dataset_id, op_id, op_type, payload_json, hlc_ts
                    FROM oplog
                    {scope_sql}
                    ORDER BY hlc_ts ASC
                    """,
                    *scope_args,
                )
        except HTTPException:
            raise
        except Exception as exc:
            raise self._db_failure(exc, "postgres_backup_failed") from exc

        payload = {
            "messages": [
                {
                    "tenant_id": row[0],
                    "dataset_id": row[1],
                    "message_id": row[2],
                    "sender_type": row[3],
                    "content": row[4],
                    "ts": row[5],
                    "user_id": row[6],
                }
                for row in messages
            ],
            "oplog": [
                {
                    "tenant_id": row[0],
                    "dataset_id": row[1],
                    "op_id": row[2],
                    "op_type": row[3],
                    "payload_json": row[4],
                    "hlc_ts": row[5],
                }
                for row in oplog
            ],
        }
        return json.dumps(payload).encode("utf-8")

    async def restore_database(self, file, authenticated_user_id: str, encrypted: bool) -> Dict[str, Any]:
        """Replace one tenant's messages and oplog with the rows from a backup.

        ``file`` may be ``bytes``/``bytearray`` or an object with a ``read()``
        method (synchronous or awaitable). The payload must be a UTF-8 JSON
        object whose ``messages`` and ``oplog`` entries, when present, are
        lists. Rows that are not dicts or belong to another tenant are skipped.

        The payload is fully validated and filtered *before* the tenant's
        existing rows are deleted, so a malformed backup is rejected with
        HTTP 400 instead of failing after the delete.

        ``encrypted`` is accepted for interface compatibility and ignored.
        """
        import inspect  # local import: only needed to accept sync and async readers

        _ = encrypted
        tenant_id = (authenticated_user_id or "").strip()
        if not tenant_id:
            raise _http_error(status.HTTP_400_BAD_REQUEST, "authenticated_user_required", "authenticated_user_id is required")
        _assert_authenticated_tenant(tenant_id)

        if hasattr(file, "read"):
            raw = file.read()
            if inspect.isawaitable(raw):
                raw = await raw
        elif isinstance(file, (bytes, bytearray)):
            raw = bytes(file)
        else:
            raise _http_error(status.HTTP_400_BAD_REQUEST, "invalid_backup_file", "Backup payload must be bytes or readable file")
        try:
            payload = json.loads(raw.decode("utf-8"))
        except Exception as exc:
            raise _http_error(status.HTTP_400_BAD_REQUEST, "invalid_backup_payload", "Backup payload must be valid UTF-8 JSON") from exc

        # Shape checks happen before any DELETE so bad input cannot wipe data.
        if not isinstance(payload, dict):
            raise _http_error(status.HTTP_400_BAD_REQUEST, "invalid_backup_payload", "Backup payload must be a JSON object")
        message_rows = payload.get("messages") or []
        oplog_rows = payload.get("oplog") or []
        if not isinstance(message_rows, list) or not isinstance(oplog_rows, list):
            raise _http_error(
                status.HTTP_400_BAD_REQUEST,
                "invalid_backup_payload",
                "Backup payload 'messages' and 'oplog' must be lists",
            )

        def _owned(rows):
            """Keep only dict rows whose tenant_id matches the restoring tenant."""
            return [
                row
                for row in rows
                if isinstance(row, dict) and str(row.get("tenant_id") or "").strip() == tenant_id
            ]

        owned_messages = _owned(message_rows)
        owned_ops = _owned(oplog_rows)

        try:
            with connect_postgres() as conn:
                execute_query(conn, "DELETE FROM messages WHERE tenant_id = %s", (tenant_id,))
                execute_query(conn, "DELETE FROM oplog WHERE tenant_id = %s", (tenant_id,))
                for row in owned_messages:
                    execute_query(
                        conn,
                        """
                        INSERT INTO messages (tenant_id, dataset_id, message_id, sender_type, content, ts, user_id)
                        VALUES (%s, %s, %s, %s, %s, %s, %s)
                        """,
                        (
                            tenant_id,
                            row.get("dataset_id"),
                            row.get("message_id"),
                            row.get("sender_type"),
                            row.get("content"),
                            row.get("ts") or _utc_now_iso(),
                            row.get("user_id"),
                        ),
                    )
                for row in owned_ops:
                    execute_query(
                        conn,
                        """
                        INSERT INTO oplog (tenant_id, dataset_id, op_id, op_type, payload_json, hlc_ts)
                        VALUES (%s, %s, %s, %s, %s, %s)
                        """,
                        (
                            tenant_id,
                            row.get("dataset_id"),
                            row.get("op_id") or str(uuid4()),
                            row.get("op_type") or "restore_replay",
                            row.get("payload_json") or "{}",
                            row.get("hlc_ts") or _utc_now_iso(),
                        ),
                    )
        except HTTPException:
            raise
        except Exception as exc:
            raise self._db_failure(exc, "postgres_restore_failed") from exc

        return {
            "status": "ok",
            "tenant_id": tenant_id,
            "restored_messages": len(owned_messages),
            "restored_oplog_entries": len(owned_ops),
        }
365
+
366
+
367
class HostedDeviceService:
    """Placeholder device service for hosted mode; every method is a stub."""

    _PENDING = "HostedDeviceService not implemented yet"

    async def get_pairing_code(self) -> PairingCodeResponse:
        """Stub; always raises ``NotImplementedError``."""
        raise NotImplementedError(self._PENDING)

    async def pair_device(self, pairing_code: str, keep_existing_data: bool) -> PairDeviceResponse:
        """Stub; ignores its arguments and raises ``NotImplementedError``."""
        del pairing_code, keep_existing_data
        raise NotImplementedError(self._PENDING)

    async def get_device_info(self) -> DeviceInfoResponse:
        """Stub; always raises ``NotImplementedError``."""
        raise NotImplementedError(self._PENDING)

    async def set_device_name(self, device_name: str) -> DeviceNameResponse:
        """Stub; ignores ``device_name`` and raises ``NotImplementedError``."""
        del device_name
        raise NotImplementedError(self._PENDING)
381
+
382
+
383
class HostedSyncService:
    """Placeholder sync service for hosted mode."""

    _PENDING = "HostedSyncService not implemented yet"

    async def trigger_sync(self) -> SyncResponse:
        """Stub; always raises ``NotImplementedError``."""
        raise NotImplementedError(self._PENDING)
@@ -0,0 +1,6 @@
1
+ """Data source registry."""
2
+
3
+ from .definitions import DataSourceDefinition
4
+ from .registry import REGISTRY, list_sources
5
+
6
+ __all__ = ["DataSourceDefinition", "REGISTRY", "list_sources"]
@@ -0,0 +1,114 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass, field
4
+ from typing import Any, Dict, List, Literal, Optional
5
+
6
+ from shared.filtering import filter_manifest_from_storage, get_filter_definition
7
+
8
+
9
def _validate_field_transform_defaults(value: Optional[List[Dict[str, Any]]]) -> None:
    """Validate the entries of a ``field_transform_defaults`` list.

    Every entry must be a dict carrying a non-empty string ``field`` and a
    ``transform_ids`` list of non-empty strings, and each transform id (after
    stripping whitespace) must be resolvable via ``get_filter_definition``.
    ``None`` and empty inputs are accepted as-is. An entry's ``table_id`` key,
    if present, is not inspected.

    Args:
        value: The list of per-field transform defaults, or ``None``.

    Raises:
        ValueError: If an entry is not a dict, lacks a usable ``field`` or
            ``transform_ids``, or names a transform id that
            ``get_filter_definition`` does not know.
    """
    if not value:
        return
    for index, entry in enumerate(value):
        label = f"field_transform_defaults[{index}]"
        if not isinstance(entry, dict):
            raise ValueError(f"{label} must be a dict")
        field_name = entry.get("field")
        if not isinstance(field_name, str) or not field_name.strip():
            raise ValueError(f"{label} must have non-empty 'field'")
        transform_ids = entry.get("transform_ids")
        if not isinstance(transform_ids, list):
            raise ValueError(f"{label} must have 'transform_ids' list")
        for tid in transform_ids:
            if not isinstance(tid, str) or not tid.strip():
                raise ValueError(f"{label}.transform_ids must contain non-empty strings")
            # The lookup uses the stripped id; the message shows the raw value.
            if get_filter_definition(tid.strip()) is None:
                raise ValueError(f"{label}: unknown transform_id {tid!r}")
28
+
29
+
30
@dataclass(frozen=True)
class DataSourceDefinition:
    """Immutable description of a data source.

    Construction validates ``filter_tier_kind``, the tier names and manifests
    of ``default_filter_tiers``, and ``field_transform_defaults``.
    """

    source_id: str
    display_name: str
    source_type: str
    schema_id: str
    parser_id: str
    canonical_mapper_id: Optional[str] = None
    canonical_group_id: Optional[str] = None
    raw_enrichment_jobs: List[str] = field(default_factory=list)
    canonical_enrichment_jobs: List[str] = field(default_factory=list)
    analytics_profile_id: Optional[str] = None
    enrichment_trigger: Literal["automatic", "manual"] = "automatic"
    # When processing starts after an upload.
    ingestion_trigger: Literal["automatic", "manual"] = "automatic"
    # MVP roles/scopes (Sprint 02 Stage 1): the scope this source's canonical output maps to.
    default_scope_id: str = "unknown"
    allowed_scope_ids: Optional[List[str]] = None
    default_filter_hints: Optional[List[str]] = None
    filter_tier_kind: Optional[Literal["sensitivity", "inferability"]] = None
    default_filter_tiers: Optional[Dict[str, Dict[str, object]]] = None
    # Stage 10: field-level default transforms declared by the registerer
    # (which fields need which filters).
    field_transform_defaults: Optional[List[Dict[str, Any]]] = None
    # Managed source install metadata used by runtime ingestion for file sources.
    tables: Optional[List[Dict[str, Any]]] = None
    file_ingest_shape: Optional[Dict[str, Any]] = None
    parser_column_map: Optional[Dict[str, str]] = None
    canonical_mapping_connected: Optional[bool] = None
    pipeline_include_parser: Optional[bool] = None
    pipeline_include_data_table: Optional[bool] = None
    pipeline_data_table_after_parser: Optional[bool] = None
    pipeline_data_table_match_parser_output: Optional[bool] = None

    def __post_init__(self) -> None:
        """Validate filter-tier settings and field-level transform defaults.

        Raises:
            ValueError: If ``filter_tier_kind`` is set to an unsupported value
                or ``default_filter_tiers`` contains a tier name other than
                ``low``/``medium``/``high``. The manifest and transform
                validators invoked here presumably raise on bad input too.
        """
        tier_kind = self.filter_tier_kind
        if not (tier_kind is None or tier_kind in {"sensitivity", "inferability"}):
            raise ValueError("filter_tier_kind must be 'sensitivity' or 'inferability'")

        tiers = self.default_filter_tiers
        if tiers is not None:
            for name, manifest in tiers.items():
                if name not in {"low", "medium", "high"}:
                    raise ValueError(f"Unsupported default filter tier {name!r}")
                # Invoked for its checks only; the return value is discarded.
                filter_manifest_from_storage(manifest)

        _validate_field_transform_defaults(self.field_transform_defaults)

    def to_dict(self) -> dict:
        """Return a plain-dict view of this definition.

        The core keys are always present. Optional attributes are emitted only
        when they are not ``None``; list and dict values are shallow-copied and
        the pipeline/mapping flags are coerced with ``bool``.
        """
        payload = {
            "source_id": self.source_id,
            "display_name": self.display_name,
            "source_type": self.source_type,
            "schema_id": self.schema_id,
            "parser_id": self.parser_id,
            "canonical_mapper_id": self.canonical_mapper_id,
            "canonical_group_id": self.canonical_group_id,
            "raw_enrichment_jobs": list(self.raw_enrichment_jobs),
            "canonical_enrichment_jobs": list(self.canonical_enrichment_jobs),
            "analytics_profile_id": self.analytics_profile_id,
            "enrichment_trigger": self.enrichment_trigger,
            "ingestion_trigger": self.ingestion_trigger,
            "default_scope_id": self.default_scope_id,
        }

        # (attribute, copier) pairs in output order; ``None`` copies nothing
        # and stores the attribute value unchanged.
        optional_attrs = (
            ("allowed_scope_ids", list),
            ("default_filter_hints", list),
            ("filter_tier_kind", None),
            ("default_filter_tiers", dict),
            ("field_transform_defaults", list),
            ("tables", list),
            ("file_ingest_shape", dict),
            ("parser_column_map", dict),
            ("canonical_mapping_connected", bool),
            ("pipeline_include_parser", bool),
            ("pipeline_include_data_table", bool),
            ("pipeline_data_table_after_parser", bool),
            ("pipeline_data_table_match_parser_output", bool),
        )
        for attr_name, copier in optional_attrs:
            current = getattr(self, attr_name)
            if current is None:
                continue
            payload[attr_name] = current if copier is None else copier(current)
        return payload