voidaccess 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. analysis/__init__.py +49 -0
  2. analysis/opsec.py +454 -0
  3. analysis/patterns.py +202 -0
  4. analysis/temporal.py +201 -0
  5. api/__init__.py +1 -0
  6. api/auth.py +163 -0
  7. api/main.py +509 -0
  8. api/routes/__init__.py +1 -0
  9. api/routes/admin.py +214 -0
  10. api/routes/auth.py +157 -0
  11. api/routes/entities.py +871 -0
  12. api/routes/export.py +359 -0
  13. api/routes/investigations.py +2567 -0
  14. api/routes/monitors.py +405 -0
  15. api/routes/search.py +157 -0
  16. api/routes/settings.py +851 -0
  17. auth/__init__.py +1 -0
  18. auth/token_blacklist.py +108 -0
  19. cli/__init__.py +3 -0
  20. cli/adapters/__init__.py +1 -0
  21. cli/adapters/sqlite.py +273 -0
  22. cli/browser.py +376 -0
  23. cli/commands/__init__.py +1 -0
  24. cli/commands/configure.py +185 -0
  25. cli/commands/enrich.py +154 -0
  26. cli/commands/export.py +158 -0
  27. cli/commands/investigate.py +601 -0
  28. cli/commands/show.py +87 -0
  29. cli/config.py +180 -0
  30. cli/display.py +212 -0
  31. cli/main.py +154 -0
  32. cli/tor_detect.py +71 -0
  33. config.py +180 -0
  34. crawler/__init__.py +28 -0
  35. crawler/dedup.py +97 -0
  36. crawler/frontier.py +115 -0
  37. crawler/spider.py +462 -0
  38. crawler/utils.py +122 -0
  39. db/__init__.py +47 -0
  40. db/migrations/__init__.py +0 -0
  41. db/migrations/env.py +80 -0
  42. db/migrations/versions/0001_initial_schema.py +270 -0
  43. db/migrations/versions/0002_add_investigation_status_column.py +27 -0
  44. db/migrations/versions/0002_add_missing_tables.py +33 -0
  45. db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
  46. db/migrations/versions/0004_add_page_posted_at.py +41 -0
  47. db/migrations/versions/0005_add_extraction_method.py +32 -0
  48. db/migrations/versions/0006_add_monitor_alerts.py +26 -0
  49. db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
  50. db/migrations/versions/0008_add_users_table.py +47 -0
  51. db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
  52. db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
  53. db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
  54. db/migrations/versions/0013_add_graph_status.py +31 -0
  55. db/migrations/versions/0015_add_progress_fields.py +41 -0
  56. db/migrations/versions/0016_backfill_graph_status.py +33 -0
  57. db/migrations/versions/0017_add_user_api_keys.py +44 -0
  58. db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
  59. db/migrations/versions/0019_add_content_safety_log.py +46 -0
  60. db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
  61. db/models.py +618 -0
  62. db/queries.py +841 -0
  63. db/session.py +270 -0
  64. export/__init__.py +34 -0
  65. export/misp.py +257 -0
  66. export/sigma.py +342 -0
  67. export/stix.py +418 -0
  68. extractor/__init__.py +21 -0
  69. extractor/llm_extract.py +372 -0
  70. extractor/ner.py +512 -0
  71. extractor/normalizer.py +638 -0
  72. extractor/pipeline.py +401 -0
  73. extractor/regex_patterns.py +325 -0
  74. fingerprint/__init__.py +33 -0
  75. fingerprint/profiler.py +240 -0
  76. fingerprint/stylometry.py +249 -0
  77. graph/__init__.py +73 -0
  78. graph/builder.py +894 -0
  79. graph/export.py +225 -0
  80. graph/model.py +83 -0
  81. graph/queries.py +297 -0
  82. graph/visualize.py +178 -0
  83. i18n/__init__.py +24 -0
  84. i18n/detect.py +76 -0
  85. i18n/query_expand.py +72 -0
  86. i18n/translate.py +210 -0
  87. monitor/__init__.py +27 -0
  88. monitor/_db.py +74 -0
  89. monitor/alerts.py +345 -0
  90. monitor/config.py +118 -0
  91. monitor/diff.py +75 -0
  92. monitor/jobs.py +247 -0
  93. monitor/scheduler.py +184 -0
  94. scraper/__init__.py +0 -0
  95. scraper/scrape.py +857 -0
  96. scraper/scrape_js.py +272 -0
  97. search/__init__.py +318 -0
  98. search/circuit_breaker.py +240 -0
  99. search/search.py +334 -0
  100. sources/__init__.py +96 -0
  101. sources/blockchain.py +444 -0
  102. sources/cache.py +93 -0
  103. sources/cisa.py +108 -0
  104. sources/dns_enrichment.py +557 -0
  105. sources/domain_reputation.py +643 -0
  106. sources/email_reputation.py +635 -0
  107. sources/engines.py +244 -0
  108. sources/enrichment.py +1244 -0
  109. sources/github_scraper.py +589 -0
  110. sources/gitlab_scraper.py +624 -0
  111. sources/hash_reputation.py +856 -0
  112. sources/historical_intel.py +253 -0
  113. sources/ip_reputation.py +521 -0
  114. sources/paste_scraper.py +484 -0
  115. sources/pastes.py +278 -0
  116. sources/rss_scraper.py +576 -0
  117. sources/seed_manager.py +373 -0
  118. sources/seeds.py +368 -0
  119. sources/shodan.py +103 -0
  120. sources/telegram.py +199 -0
  121. sources/virustotal.py +113 -0
  122. utils/__init__.py +0 -0
  123. utils/async_utils.py +89 -0
  124. utils/content_safety.py +193 -0
  125. utils/defang.py +94 -0
  126. utils/encryption.py +34 -0
  127. utils/ioc_freshness.py +124 -0
  128. utils/user_keys.py +33 -0
  129. vector/__init__.py +39 -0
  130. vector/embedder.py +100 -0
  131. vector/model_singleton.py +49 -0
  132. vector/search.py +87 -0
  133. vector/store.py +514 -0
  134. voidaccess/__init__.py +0 -0
  135. voidaccess/llm.py +717 -0
  136. voidaccess/llm_utils.py +696 -0
  137. voidaccess-1.3.0.dist-info/METADATA +395 -0
  138. voidaccess-1.3.0.dist-info/RECORD +142 -0
  139. voidaccess-1.3.0.dist-info/WHEEL +5 -0
  140. voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
  141. voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
  142. voidaccess-1.3.0.dist-info/top_level.txt +19 -0
db/models.py ADDED
@@ -0,0 +1,618 @@
1
+ """
2
+ SQLAlchemy ORM models for VoidAccess's persistent storage layer.
3
+
4
+ Tables
5
+ ------
6
+ investigations — one record per pipeline run
7
+ sources — canonical .onion domain registry (global, deduped by address)
8
+ investigation_sources — many-to-many: which sources appeared in which investigation
9
+ pages — individual scraped pages (URL-level, one per unique URL)
10
+ entities — structured intelligence artifacts extracted from pages
11
+ entity_relationships — directed edges between two entities
12
+
13
+ Design notes
14
+ ------------
15
+ - Primary keys are UUID4, generated in Python so they're globally unique and safe
16
+ to produce offline before insertion.
17
+ - Enum-valued columns are declared as plain VARCHAR (sa.String) rather than sa.Enum, for portability between
18
+ PostgreSQL (production) and SQLite (tests) and to avoid DDL-level ENUM management; the enum classes below provide application-level validation only.
19
+ - DateTime columns are timezone-aware (UTC throughout).
20
+ - Soft cascade rules: deleting a Page cascades to its Entities and their Relationships.
21
+ Deleting an Investigation does NOT delete its Sources (they are global).
22
+ """
23
+
24
+ import enum
25
+ import uuid
26
+ from datetime import datetime, timezone
27
+ from typing import Any, List, Optional
28
+
29
+ import sqlalchemy as sa
30
+ from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship
31
+ from sqlalchemy.schema import UniqueConstraint
32
+
33
+
34
+ # ---------------------------------------------------------------------------
35
+ # Enums (application-level validation; stored as VARCHAR in the DB)
36
+ # ---------------------------------------------------------------------------
37
+
38
@enum.unique
class SourceStatus(str, enum.Enum):
    """Allowed status strings for a source.

    The ``str`` mixin makes each member compare equal to its plain string
    value. ``@enum.unique`` rejects duplicate values at import time so two
    names can never silently alias the same string.
    """

    ACTIVE = "active"
    DOWN = "down"
    UNKNOWN = "unknown"
42
+
43
+
44
@enum.unique
class SourceType(str, enum.Enum):
    """Allowed source-type strings describing how a source was obtained.

    The ``str`` mixin makes each member compare equal to its plain string
    value. ``@enum.unique`` rejects duplicate values at import time.
    """

    SEARCH_RESULT = "search_result"
    CRAWLED = "crawled"
    SEED = "seed"
    TELEGRAM = "telegram"
49
+
50
+
51
@enum.unique
class EntityType(str, enum.Enum):
    """Entity types stored as VARCHAR in the DB.

    ``@enum.unique`` rejects duplicate values at import time, so every
    stored string maps back to exactly one member.
    """

    CRYPTO_WALLET = "crypto_wallet"
    EMAIL = "email"
    PGP_KEY = "pgp_key"
    ONION_URL = "onion_url"
    CVE = "cve"
    IP_ADDRESS = "ip_address"
    PHONE = "phone"
    HANDLE = "handle"
    MALWARE = "malware"
    RANSOMWARE_GROUP = "ransomware_group"
    DOMAIN = "domain"
    OTHER = "other"
    FILE_HASH_MD5 = "file_hash_md5"
    FILE_HASH_SHA1 = "file_hash_sha1"
    FILE_HASH_SHA256 = "file_hash_sha256"
    MITRE_TECHNIQUE = "mitre_technique"
69
+
70
+
71
@enum.unique
class RelationshipType(str, enum.Enum):
    """Edge types for the entity graph (Phase 3 will query these).

    Unlike the other enums in this module, each value is spelled exactly
    like its member name (upper case). ``@enum.unique`` rejects duplicate
    values at import time.
    """

    CO_APPEARED_ON = "CO_APPEARED_ON"
    POSTED_BY = "POSTED_BY"
    LINKED_TO = "LINKED_TO"
    PAID_TO = "PAID_TO"
    MEMBER_OF = "MEMBER_OF"
    USED = "USED"
    CLAIMED = "CLAIMED"
    LIKELY_SAME_ACTOR = "LIKELY_SAME_ACTOR"
    CONFIRMED_SAME_ACTOR = "CONFIRMED_SAME_ACTOR"
    FUNDED_BY = "FUNDED_BY"
    POSSIBLE_SAME_AUTHOR = "POSSIBLE_SAME_AUTHOR"
84
+
85
+
86
+ # ---------------------------------------------------------------------------
87
+ # Declarative base
88
+ # ---------------------------------------------------------------------------
89
+
90
class Base(DeclarativeBase):
    """Declarative base whose ``metadata`` collects the tables defined in this module."""
92
+
93
+
94
+ # ---------------------------------------------------------------------------
95
+ # Junction table: Investigation <-> Source (many-to-many)
96
+ # ---------------------------------------------------------------------------
97
+
98
# Association table for the Investigation <-> Source many-to-many link.
# Both foreign keys form the composite primary key, and each is declared
# ON DELETE CASCADE so the link row goes away with either side.
investigation_sources = sa.Table(
    "investigation_sources",
    Base.metadata,
    sa.Column(
        "investigation_id",
        sa.UUID(as_uuid=True),
        sa.ForeignKey("investigations.id", ondelete="CASCADE"),
        primary_key=True,
    ),
    sa.Column(
        "source_id",
        sa.UUID(as_uuid=True),
        sa.ForeignKey("sources.id", ondelete="CASCADE"),
        primary_key=True,
    ),
    # Timestamp of when the link was recorded (UTC, filled in Python).
    sa.Column(
        "added_at",
        sa.DateTime(timezone=True),
        default=lambda: datetime.now(timezone.utc),
        nullable=False,
    ),
)
120
+
121
+
122
+ # ---------------------------------------------------------------------------
123
+ # Models
124
+ # ---------------------------------------------------------------------------
125
+
126
class Investigation(Base):
    """
    One row per pipeline run. Stores the query, parameters, and final summary.

    Columns that declare both ``default`` and ``server_default`` receive the
    same initial value whether the row is inserted through the ORM or by SQL
    that omits the column.
    """
    __tablename__ = "investigations"

    id: Mapped[uuid.UUID] = mapped_column(
        sa.UUID(as_uuid=True), primary_key=True, default=uuid.uuid4
    )
    # Unique, indexed identifier generated independently of ``id``.
    run_id: Mapped[uuid.UUID] = mapped_column(
        sa.UUID(as_uuid=True), unique=True, nullable=False, default=uuid.uuid4,
        index=True,
    )
    query: Mapped[str] = mapped_column(sa.Text, nullable=False)
    refined_query: Mapped[Optional[str]] = mapped_column(sa.Text, nullable=True)
    model_used: Mapped[Optional[str]] = mapped_column(sa.String(100), nullable=True)
    preset: Mapped[Optional[str]] = mapped_column(sa.String(50), nullable=True)
    summary: Mapped[Optional[str]] = mapped_column(sa.Text, nullable=True)
    status: Mapped[str] = mapped_column(
        sa.String(20), nullable=False, default="pending", server_default="pending"
    )
    created_at: Mapped[datetime] = mapped_column(
        sa.DateTime(timezone=True),
        nullable=False,
        default=lambda: datetime.now(timezone.utc),
    )
    # sa.false() renders a boolean literal suited to the target backend
    # (``false`` where booleans are native, ``0`` otherwise). A plain string
    # default is emitted as the quoted text 'false', which a backend without
    # native booleans (e.g. SQLite) would keep as a non-empty string and
    # read back as a truthy value.
    is_seed: Mapped[bool] = mapped_column(
        sa.Boolean, nullable=False, default=False, server_default=sa.false()
    )
    graph_status: Mapped[str] = mapped_column(
        sa.String(20), nullable=False, default="pending", server_default="pending"
    )
    # Progress-reporting fields; all start at zero / empty.
    current_step: Mapped[int] = mapped_column(
        sa.Integer, nullable=False, default=0, server_default="0"
    )
    current_step_label: Mapped[str] = mapped_column(
        sa.String(200), nullable=False, default="", server_default=""
    )
    entity_count: Mapped[int] = mapped_column(
        sa.Integer, nullable=False, default=0, server_default="0"
    )
    page_count: Mapped[int] = mapped_column(
        sa.Integer, nullable=False, default=0, server_default="0"
    )
    # Optional owner; the database nulls this out when the user row is deleted.
    user_id: Mapped[Optional[int]] = mapped_column(
        sa.Integer,
        sa.ForeignKey("users.id", ondelete="SET NULL"),
        nullable=True,
        index=True,
    )

    # Many-to-many through ``investigation_sources``; no delete cascade is
    # configured toward Source, so deleting an investigation leaves sources
    # in place.
    sources: Mapped[List["Source"]] = relationship(
        "Source",
        secondary=investigation_sources,
        back_populates="investigations",
        lazy="select",
    )
183
+
184
+
185
+
186
@enum.unique
class MonitorAlertSeverity(str, enum.Enum):
    """Stored as VARCHAR in ``monitor_alerts.severity``.

    Member names are lower case (unlike the other enums in this module) and
    are kept that way because they are part of the public interface.
    ``@enum.unique`` rejects duplicate values at import time.
    """

    info = "info"
    warning = "warning"
    critical = "critical"
192
+
193
+
194
class MonitorAlert(Base):
    """
    Stored record of an alert fired by the monitoring system.

    A row is written whenever a monitor detects a change significant enough
    to alert on. Delivery and acknowledgement state are tracked on the same
    row.
    """

    __tablename__ = "monitor_alerts"

    id: Mapped[int] = mapped_column(sa.Integer, primary_key=True, autoincrement=True)
    monitor_name: Mapped[str] = mapped_column(sa.String, index=True, nullable=False)
    triggered_at: Mapped[datetime] = mapped_column(
        sa.DateTime(timezone=True),
        default=lambda: datetime.now(timezone.utc),
        nullable=False,
        index=True,
    )
    change_type: Mapped[str] = mapped_column(sa.String(50), nullable=False)
    summary: Mapped[str] = mapped_column(sa.Text, default="", nullable=False)
    # Free-form JSON payload describing the change; may be absent.
    diff_data: Mapped[Optional[dict[str, Any]]] = mapped_column(sa.JSON, nullable=True)
    # Plain string column; defaults to the "info" severity value.
    severity: Mapped[str] = mapped_column(
        sa.String(20),
        default=MonitorAlertSeverity.info.value,
        nullable=False,
    )
    entity_count_delta: Mapped[int] = mapped_column(
        sa.Integer, default=0, nullable=False
    )
    delivered: Mapped[bool] = mapped_column(sa.Boolean, default=False, nullable=False)
    delivery_channels: Mapped[Optional[List[Any]]] = mapped_column(sa.JSON, nullable=True)
    acknowledged: Mapped[bool] = mapped_column(
        sa.Boolean, default=False, nullable=False
    )
    acknowledged_at: Mapped[Optional[datetime]] = mapped_column(
        sa.DateTime(timezone=True), nullable=True
    )

    # Composite index on (monitor_name, triggered_at), in addition to the
    # single-column indexes declared on each of those columns.
    __table_args__ = (
        sa.Index("ix_monitor_alerts_monitor_triggered", "monitor_name", "triggered_at"),
    )
233
+
234
+
235
class InvestigationEntityLink(Base):
    """
    Associates an entity with further investigations besides the one it
    originated from, so cross-investigation deduplication does not require
    moving entity ownership.
    """
    __tablename__ = "investigation_entity_links"

    id: Mapped[uuid.UUID] = mapped_column(
        sa.UUID(as_uuid=True), primary_key=True, default=uuid.uuid4
    )
    # Both foreign keys are ON DELETE CASCADE: the link row is removed by the
    # database when either the entity or the investigation is deleted.
    entity_id: Mapped[uuid.UUID] = mapped_column(
        sa.UUID(as_uuid=True),
        sa.ForeignKey("entities.id", ondelete="CASCADE"),
        nullable=False,
    )
    investigation_id: Mapped[uuid.UUID] = mapped_column(
        sa.UUID(as_uuid=True),
        sa.ForeignKey("investigations.id", ondelete="CASCADE"),
        nullable=False,
    )
    linked_at: Mapped[datetime] = mapped_column(
        sa.DateTime(timezone=True), default=lambda: datetime.now(timezone.utc)
    )

    # At most one link per (entity, investigation) pair.
    __table_args__ = (
        sa.UniqueConstraint("entity_id", "investigation_id"),
    )
258
+
259
+
260
class ActorStyleProfile(Base):
    """
    Stores aggregated writing style fingerprints for unique actors.
    Updated incrementally as new text samples are discovered.

    One row per ``(canonical_value, entity_type)`` pair.
    """
    __tablename__ = "actor_style_profiles"

    id: Mapped[int] = mapped_column(sa.Integer, primary_key=True, autoincrement=True)
    canonical_value: Mapped[str] = mapped_column(sa.String, nullable=False, index=True)
    entity_type: Mapped[str] = mapped_column(sa.String, nullable=False)
    # JSON document holding the style fingerprint.
    style_vector: Mapped[dict[str, Any]] = mapped_column(sa.JSON, nullable=False)
    sample_count: Mapped[int] = mapped_column(sa.Integer, default=0, server_default="0")
    total_chars: Mapped[int] = mapped_column(sa.Integer, default=0, server_default="0")
    # ``onupdate`` refreshes the timestamp on every UPDATE that does not set
    # the column explicitly; without it the value was only written at insert
    # time and went stale as the profile was updated. An explicit assignment
    # by the caller still takes precedence.
    last_updated: Mapped[datetime] = mapped_column(
        sa.DateTime(timezone=True),
        nullable=False,
        default=lambda: datetime.now(timezone.utc),
        onupdate=lambda: datetime.now(timezone.utc),
    )

    __table_args__ = (
        UniqueConstraint("canonical_value", "entity_type"),
    )
282
+
283
class User(Base):
    """
    Account record for a VoidAccess system user; backs authentication and
    access control.
    """
    __tablename__ = "users"

    id: Mapped[int] = mapped_column(sa.Integer, primary_key=True, autoincrement=True)
    email: Mapped[str] = mapped_column(
        sa.String(255), unique=True, index=True, nullable=False
    )
    hashed_password: Mapped[str] = mapped_column(sa.String, nullable=False)
    is_active: Mapped[bool] = mapped_column(sa.Boolean, nullable=False, default=True)

    # When set, the user must reset their password on next login.
    # Set to True for the default admin account.
    must_reset_password: Mapped[bool] = mapped_column(
        sa.Boolean, nullable=False, default=False
    )

    created_at: Mapped[datetime] = mapped_column(
        sa.DateTime(timezone=True),
        default=lambda: datetime.now(timezone.utc),
        nullable=False,
    )
    last_login_at: Mapped[Optional[datetime]] = mapped_column(
        sa.DateTime(timezone=True), nullable=True
    )

    def __repr__(self) -> str:
        """Return a short debug representation showing the e-mail address."""
        return f"<User {self.email!r}>"
307
+
308
+
309
class UserApiKey(Base):
    """
    Per-user encrypted API key storage.

    Keys are encrypted at rest using Fernet (AES-128) with a key derived from
    JWT_SECRET. This model only holds the resulting ciphertext in
    ``encrypted_value``.
    """
    __tablename__ = "user_api_keys"

    id: Mapped[int] = mapped_column(sa.Integer, primary_key=True, autoincrement=True)
    # Owning user; rows are removed by the database when the user is deleted.
    user_id: Mapped[int] = mapped_column(
        sa.Integer,
        sa.ForeignKey("users.id", ondelete="CASCADE"),
        nullable=False,
    )
    key_name: Mapped[str] = mapped_column(sa.String(64), nullable=False)
    encrypted_value: Mapped[str] = mapped_column(sa.Text, nullable=False)
    # Both timestamps are filled by the database clock.
    created_at: Mapped[datetime] = mapped_column(
        sa.DateTime(timezone=True),
        server_default=sa.func.now(),
    )
    updated_at: Mapped[datetime] = mapped_column(
        sa.DateTime(timezone=True),
        server_default=sa.func.now(),
        onupdate=sa.func.now(),
    )

    # A user can hold at most one value per key name.
    __table_args__ = (
        sa.UniqueConstraint("user_id", "key_name"),
    )
337
+
338
+
339
class ContentSafetyEvent(Base):
    """
    Audit-log row for a content safety block event.

    The prohibited content itself is never stored here, only event metadata
    and a hash prefix used for correlation.
    """
    __tablename__ = "content_safety_events"

    id: Mapped[int] = mapped_column(sa.Integer, primary_key=True, autoincrement=True)
    # One of: "query_blocked", "url_blocked", "content_blocked"
    event_type: Mapped[str] = mapped_column(sa.String(50), nullable=False)
    # Plain integer with no foreign key declared on it.
    user_id: Mapped[Optional[int]] = mapped_column(sa.Integer, nullable=True)
    # Hash prefix for correlation; never the actual content.
    content_hash: Mapped[Optional[str]] = mapped_column(sa.String(64), nullable=True)
    timestamp: Mapped[datetime] = mapped_column(
        sa.DateTime(timezone=True),
        server_default=sa.func.now(),
    )
358
+
359
+
360
class Entity(Base):
    """
    Structured intelligence artifacts extracted from pages.

    Each entity belongs to exactly one page (``page_id`` is required) and may
    optionally reference the investigation that produced it. Deleting an
    entity through the ORM also deletes every ``EntityRelationship`` edge
    that has it as either endpoint.
    """
    __tablename__ = "entities"

    id: Mapped[uuid.UUID] = mapped_column(
        sa.UUID(as_uuid=True), primary_key=True, default=uuid.uuid4
    )
    page_id: Mapped[uuid.UUID] = mapped_column(
        sa.UUID(as_uuid=True),
        sa.ForeignKey("pages.id", ondelete="CASCADE"),
        nullable=False,
    )
    # Nulled out by the database if the investigation is deleted.
    investigation_id: Mapped[Optional[uuid.UUID]] = mapped_column(
        sa.UUID(as_uuid=True),
        sa.ForeignKey("investigations.id", ondelete="SET NULL"),
        nullable=True,
    )
    entity_type: Mapped[str] = mapped_column(sa.String(50), nullable=False)
    value: Mapped[str] = mapped_column(sa.Text, nullable=False)
    # Only a server-side default is declared, so the attribute stays unset on
    # a new object until the row has been written and reloaded.
    confidence: Mapped[float] = mapped_column(
        sa.Float(), nullable=False, server_default="1.0"
    )
    # DB column is context_snippet; `context` is a backward-compat Python alias
    context_snippet: Mapped[Optional[str]] = mapped_column(sa.Text, nullable=True)
    canonical_value: Mapped[Optional[str]] = mapped_column(
        sa.String, nullable=True, index=True
    )
    historical_context: Mapped[Optional[str]] = mapped_column(sa.Text, nullable=True)
    extraction_method: Mapped[Optional[str]] = mapped_column(
        sa.String(10), nullable=True
    )
    first_seen: Mapped[datetime] = mapped_column(
        sa.DateTime(timezone=True),
        nullable=False,
        default=lambda: datetime.now(timezone.utc),
    )
    last_seen: Mapped[datetime] = mapped_column(
        sa.DateTime(timezone=True),
        nullable=False,
        default=lambda: datetime.now(timezone.utc),
    )
    created_at: Mapped[datetime] = mapped_column(
        sa.DateTime(timezone=True),
        nullable=False,
        default=lambda: datetime.now(timezone.utc),
    )
    source_count: Mapped[int] = mapped_column(
        sa.Integer, server_default="1", default=1
    )
    corroborating_sources: Mapped[Optional[str]] = mapped_column(
        sa.Text, nullable=True
    )
    # NOTE(review): first_seen_at / last_seen_at sit alongside the required
    # first_seen / last_seen columns above; confirm which pair callers use.
    first_seen_at: Mapped[Optional[datetime]] = mapped_column(
        sa.DateTime(timezone=True),
        server_default=sa.func.now(),
        nullable=True,
    )
    last_seen_at: Mapped[Optional[datetime]] = mapped_column(
        sa.DateTime(timezone=True),
        server_default=sa.func.now(),
        nullable=True,
    )

    @property
    def context(self) -> Optional[str]:
        """Backward-compat alias for context_snippet (AGENTS.md: do not remove)."""
        return self.context_snippet

    @context.setter
    def context(self, value: Optional[str]) -> None:
        """Write through to ``context_snippet``."""
        self.context_snippet = value

    __table_args__ = (
        sa.Index("ix_entities_page_id", "page_id"),
        sa.Index("ix_entities_investigation_id", "investigation_id"),
        sa.Index("ix_entities_entity_type", "entity_type"),
        sa.Index("ix_entity_canonical", "entity_type", "canonical_value"),
    )

    page: Mapped["Page"] = relationship("Page", back_populates="entities")
    # The edge foreign keys (entity_a_id / entity_b_id) are NOT NULL. Without
    # a delete cascade the ORM reacts to an Entity delete by setting those
    # keys to NULL on the edge rows, which the database rejects. Cascading
    # the delete (and removing orphaned edges) matches Page.entities and
    # keeps ORM deletes of a Page or Entity working when edges exist.
    relationships_as_entity_a: Mapped[List["EntityRelationship"]] = relationship(
        "EntityRelationship",
        foreign_keys="EntityRelationship.entity_a_id",
        back_populates="entity_a",
        cascade="all, delete-orphan",
    )
    relationships_as_entity_b: Mapped[List["EntityRelationship"]] = relationship(
        "EntityRelationship",
        foreign_keys="EntityRelationship.entity_b_id",
        back_populates="entity_b",
        cascade="all, delete-orphan",
    )
452
+
453
+
454
+
455
class Page(Base):
    """
    A single scraped page, unique per URL, optionally attached to a source.
    """
    __tablename__ = "pages"

    id: Mapped[uuid.UUID] = mapped_column(
        sa.UUID(as_uuid=True), primary_key=True, default=uuid.uuid4
    )
    # Optional link to the owning source; the foreign key is ON DELETE SET NULL.
    source_id: Mapped[Optional[uuid.UUID]] = mapped_column(
        sa.UUID(as_uuid=True),
        sa.ForeignKey("sources.id", ondelete="SET NULL"),
        nullable=True,
    )
    url: Mapped[str] = mapped_column(sa.Text, nullable=False)
    raw_content_hash: Mapped[Optional[str]] = mapped_column(
        sa.String(64), nullable=True
    )
    cleaned_text: Mapped[Optional[str]] = mapped_column(sa.Text, nullable=True)
    scrape_timestamp: Mapped[datetime] = mapped_column(
        sa.DateTime(timezone=True),
        default=lambda: datetime.now(timezone.utc),
        nullable=False,
    )
    language: Mapped[Optional[str]] = mapped_column(sa.String(10), nullable=True)
    byte_size: Mapped[Optional[int]] = mapped_column(sa.Integer, nullable=True)
    posted_at: Mapped[Optional[datetime]] = mapped_column(
        sa.DateTime(timezone=True), nullable=True
    )
    created_at: Mapped[datetime] = mapped_column(
        sa.DateTime(timezone=True),
        default=lambda: datetime.now(timezone.utc),
        nullable=False,
    )

    # URL uniqueness plus lookup indexes on source, content hash and post date.
    __table_args__ = (
        sa.UniqueConstraint("url"),
        sa.Index("ix_pages_source_id", "source_id"),
        sa.Index("ix_pages_raw_content_hash", "raw_content_hash"),
        sa.Index("ix_pages_posted_at", "posted_at"),
    )

    source: Mapped[Optional["Source"]] = relationship(
        "Source", back_populates="pages"
    )
    # Entities are owned by the page: ORM deletes cascade to them, and an
    # entity removed from this collection is deleted.
    entities: Mapped[List["Entity"]] = relationship(
        "Entity", back_populates="page", cascade="all, delete-orphan"
    )
    # Edges that cite this page as their source_page_id.
    relationships_as_source: Mapped[List["EntityRelationship"]] = relationship(
        "EntityRelationship",
        foreign_keys="EntityRelationship.source_page_id",
        back_populates="source_page",
    )
506
+
507
+
508
class Source(Base):
    """
    Registry of canonical .onion domains, one row per unique address.
    """
    __tablename__ = "sources"

    id: Mapped[uuid.UUID] = mapped_column(
        sa.UUID(as_uuid=True), primary_key=True, default=uuid.uuid4
    )
    onion_address: Mapped[str] = mapped_column(
        sa.String(255), unique=True, nullable=False
    )
    first_seen: Mapped[datetime] = mapped_column(
        sa.DateTime(timezone=True),
        default=lambda: datetime.now(timezone.utc),
        nullable=False,
    )
    last_seen: Mapped[datetime] = mapped_column(
        sa.DateTime(timezone=True),
        default=lambda: datetime.now(timezone.utc),
        nullable=False,
    )
    # Plain string columns; the defaults are "unknown" and "search_result".
    status: Mapped[str] = mapped_column(
        sa.String(20), nullable=False, default="unknown", server_default="unknown"
    )
    source_type: Mapped[str] = mapped_column(
        sa.String(30),
        nullable=False,
        default="search_result",
        server_default="search_result",
    )

    # NOTE(review): onion_address is already declared unique=True above; this
    # explicit index looks redundant with it. Confirm before removing.
    __table_args__ = (
        sa.Index("ix_sources_onion_address", "onion_address"),
    )

    # NOTE(review): the ORM cascade here deletes pages together with their
    # source, while the pages.source_id foreign key is declared
    # ON DELETE SET NULL. Confirm which behavior is intended.
    pages: Mapped[List["Page"]] = relationship(
        "Page", back_populates="source", cascade="all, delete-orphan"
    )
    # Many-to-many through ``investigation_sources``.
    investigations: Mapped[List["Investigation"]] = relationship(
        "Investigation",
        secondary=investigation_sources,
        back_populates="sources",
        lazy="select",
    )
551
+
552
+
553
class EntityRelationship(Base):
    """
    A directed edge from ``entity_a`` to ``entity_b`` with a typed label.
    """
    __tablename__ = "entity_relationships"

    id: Mapped[uuid.UUID] = mapped_column(
        sa.UUID(as_uuid=True), primary_key=True, default=uuid.uuid4
    )
    # Both endpoints are required; the database removes the edge when either
    # endpoint entity is deleted (ON DELETE CASCADE).
    entity_a_id: Mapped[uuid.UUID] = mapped_column(
        sa.UUID(as_uuid=True),
        sa.ForeignKey("entities.id", ondelete="CASCADE"),
        nullable=False,
    )
    entity_b_id: Mapped[uuid.UUID] = mapped_column(
        sa.UUID(as_uuid=True),
        sa.ForeignKey("entities.id", ondelete="CASCADE"),
        nullable=False,
    )
    relationship_type: Mapped[str] = mapped_column(sa.String(50), nullable=False)
    # Optional page the edge was observed on; nulled if that page is deleted.
    source_page_id: Mapped[Optional[uuid.UUID]] = mapped_column(
        sa.UUID(as_uuid=True),
        sa.ForeignKey("pages.id", ondelete="SET NULL"),
        nullable=True,
    )
    confidence: Mapped[float] = mapped_column(
        sa.Float(), server_default="1.0", nullable=False
    )
    first_seen: Mapped[datetime] = mapped_column(
        sa.DateTime(timezone=True),
        default=lambda: datetime.now(timezone.utc),
        nullable=False,
    )
    # Optional owning investigation; nulled if that investigation is deleted.
    investigation_id: Mapped[Optional[uuid.UUID]] = mapped_column(
        sa.UUID(as_uuid=True),
        sa.ForeignKey("investigations.id", ondelete="SET NULL"),
        nullable=True,
    )

    # NOTE(review): ix_entity_relationships_source_target indexes a prefix of
    # ix_entity_relationships_lookup; confirm both are needed.
    __table_args__ = (
        sa.Index(
            "ix_entity_relationships_lookup",
            "entity_a_id",
            "entity_b_id",
            "relationship_type",
        ),
        sa.Index("ix_entity_relationships_investigation_id", "investigation_id"),
        sa.Index("ix_entity_relationships_source_target", "entity_a_id", "entity_b_id"),
    )

    # Two foreign keys point at entities, so each relationship names the
    # column it follows explicitly.
    entity_a: Mapped["Entity"] = relationship(
        "Entity",
        foreign_keys=[entity_a_id],
        back_populates="relationships_as_entity_a",
    )
    entity_b: Mapped["Entity"] = relationship(
        "Entity",
        foreign_keys=[entity_b_id],
        back_populates="relationships_as_entity_b",
    )
    source_page: Mapped[Optional["Page"]] = relationship(
        "Page",
        foreign_keys=[source_page_id],
        back_populates="relationships_as_source",
    )
618
+