nornweave-0.1.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. nornweave/__init__.py +3 -0
  2. nornweave/adapters/__init__.py +1 -0
  3. nornweave/adapters/base.py +5 -0
  4. nornweave/adapters/mailgun.py +196 -0
  5. nornweave/adapters/resend.py +510 -0
  6. nornweave/adapters/sendgrid.py +492 -0
  7. nornweave/adapters/ses.py +824 -0
  8. nornweave/cli.py +186 -0
  9. nornweave/core/__init__.py +26 -0
  10. nornweave/core/config.py +172 -0
  11. nornweave/core/exceptions.py +25 -0
  12. nornweave/core/interfaces.py +390 -0
  13. nornweave/core/storage.py +192 -0
  14. nornweave/core/utils.py +23 -0
  15. nornweave/huginn/__init__.py +10 -0
  16. nornweave/huginn/client.py +296 -0
  17. nornweave/huginn/config.py +52 -0
  18. nornweave/huginn/resources.py +165 -0
  19. nornweave/huginn/server.py +202 -0
  20. nornweave/models/__init__.py +113 -0
  21. nornweave/models/attachment.py +136 -0
  22. nornweave/models/event.py +275 -0
  23. nornweave/models/inbox.py +33 -0
  24. nornweave/models/message.py +284 -0
  25. nornweave/models/thread.py +172 -0
  26. nornweave/muninn/__init__.py +14 -0
  27. nornweave/muninn/tools.py +207 -0
  28. nornweave/search/__init__.py +1 -0
  29. nornweave/search/embeddings.py +1 -0
  30. nornweave/search/vector_store.py +1 -0
  31. nornweave/skuld/__init__.py +1 -0
  32. nornweave/skuld/rate_limiter.py +1 -0
  33. nornweave/skuld/scheduler.py +1 -0
  34. nornweave/skuld/sender.py +25 -0
  35. nornweave/skuld/webhooks.py +1 -0
  36. nornweave/storage/__init__.py +20 -0
  37. nornweave/storage/database.py +165 -0
  38. nornweave/storage/gcs.py +144 -0
  39. nornweave/storage/local.py +152 -0
  40. nornweave/storage/s3.py +164 -0
  41. nornweave/urdr/__init__.py +14 -0
  42. nornweave/urdr/adapters/__init__.py +16 -0
  43. nornweave/urdr/adapters/base.py +385 -0
  44. nornweave/urdr/adapters/postgres.py +50 -0
  45. nornweave/urdr/adapters/sqlite.py +51 -0
  46. nornweave/urdr/migrations/env.py +94 -0
  47. nornweave/urdr/migrations/script.py.mako +26 -0
  48. nornweave/urdr/migrations/versions/.gitkeep +0 -0
  49. nornweave/urdr/migrations/versions/20260131_0001_initial_schema.py +182 -0
  50. nornweave/urdr/migrations/versions/20260131_0002_extended_schema.py +241 -0
  51. nornweave/urdr/orm.py +641 -0
  52. nornweave/verdandi/__init__.py +45 -0
  53. nornweave/verdandi/attachments.py +471 -0
  54. nornweave/verdandi/content.py +420 -0
  55. nornweave/verdandi/headers.py +404 -0
  56. nornweave/verdandi/parser.py +25 -0
  57. nornweave/verdandi/sanitizer.py +9 -0
  58. nornweave/verdandi/threading.py +359 -0
  59. nornweave/yggdrasil/__init__.py +1 -0
  60. nornweave/yggdrasil/app.py +86 -0
  61. nornweave/yggdrasil/dependencies.py +190 -0
  62. nornweave/yggdrasil/middleware/__init__.py +1 -0
  63. nornweave/yggdrasil/middleware/auth.py +1 -0
  64. nornweave/yggdrasil/middleware/logging.py +1 -0
  65. nornweave/yggdrasil/routes/__init__.py +1 -0
  66. nornweave/yggdrasil/routes/v1/__init__.py +1 -0
  67. nornweave/yggdrasil/routes/v1/inboxes.py +124 -0
  68. nornweave/yggdrasil/routes/v1/messages.py +200 -0
  69. nornweave/yggdrasil/routes/v1/search.py +84 -0
  70. nornweave/yggdrasil/routes/v1/threads.py +142 -0
  71. nornweave/yggdrasil/routes/webhooks/__init__.py +1 -0
  72. nornweave/yggdrasil/routes/webhooks/mailgun.py +136 -0
  73. nornweave/yggdrasil/routes/webhooks/resend.py +344 -0
  74. nornweave/yggdrasil/routes/webhooks/sendgrid.py +15 -0
  75. nornweave/yggdrasil/routes/webhooks/ses.py +15 -0
  76. nornweave-0.1.2.dist-info/METADATA +324 -0
  77. nornweave-0.1.2.dist-info/RECORD +80 -0
  78. nornweave-0.1.2.dist-info/WHEEL +4 -0
  79. nornweave-0.1.2.dist-info/entry_points.txt +5 -0
  80. nornweave-0.1.2.dist-info/licenses/LICENSE +201 -0
@@ -0,0 +1,420 @@
1
+ """Content extraction: quote/signature removal using Talon.
2
+
3
+ NornWeave uses Mailgun's Talon library to extract clean reply content
4
+ from emails, removing quoted text and signatures. This powers the
5
+ extracted_text and extracted_html fields in the Message model.
6
+
7
+ Reference: https://github.com/mailgun/talon
8
+ """
9
+
10
+ import logging
11
+ from dataclasses import dataclass
12
+ from typing import cast
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+ # Flag to track if Talon has been initialized
17
+ _talon_initialized = False
18
+
19
+
20
def init_talon() -> bool:
    """
    Load Talon's ML classifiers used for signature extraction.

    Meant to run once at application startup. A module-level flag remembers
    a successful initialization, so repeated calls return immediately
    without calling into Talon again.

    Returns:
        True if Talon is (or already was) initialized, False if the library
        is not installed or its initialization raised.
    """
    global _talon_initialized

    if not _talon_initialized:
        try:
            # Imported lazily so the module still loads when Talon is absent.
            import talon

            talon.init()
            _talon_initialized = True
            logger.info("Talon ML classifiers initialized successfully")
        except ImportError:
            logger.warning(
                "Talon library not installed. Quote/signature extraction will be limited. "
                "Install with: pip install talon"
            )
            return False
        except Exception as exc:
            logger.warning(f"Failed to initialize Talon: {exc}")
            return False

    return True
50
+
51
+
52
+ @dataclass
53
+ class ExtractedContent:
54
+ """Result of content extraction."""
55
+
56
+ extracted_text: str
57
+ extracted_html: str | None
58
+ signature: str | None
59
+ preview: str
60
+
61
+
62
def extract_reply_text(body_plain: str) -> str:
    """
    Return only the newly written part of a plain text email.

    Quoted history such as "On Jan 31, 2026, Bob wrote: ..." is dropped.
    Talon does the work when it is importable; otherwise a basic
    line-oriented fallback is used.

    Args:
        body_plain: Raw plain text email body

    Returns:
        The reply without quoted text. An empty body yields "". The original
        body is returned when Talon yields nothing or raises.
    """
    result = ""

    if body_plain:
        try:
            from talon import quotations

            candidate = quotations.extract_from_plain(body_plain)
            # An empty extraction is treated as "nothing to strip".
            result = candidate.strip() if candidate else body_plain
        except ImportError:
            # Talon is unavailable: use the basic quote removal instead.
            result = _basic_quote_removal(body_plain)
        except Exception as exc:
            logger.warning(f"Quote extraction failed: {exc}")
            result = body_plain

    return result
88
+
89
+
90
def extract_reply_html(body_html: str) -> str:
    """
    Return only the newly written part of an HTML email.

    Quoted blocks are removed by Talon. There is no non-Talon fallback for
    HTML, so the input is returned unchanged when Talon is unavailable.

    Args:
        body_html: Raw HTML email body

    Returns:
        The HTML reply without quoted blocks. An empty body yields "". The
        original HTML is returned when Talon is missing, yields nothing, or
        raises.
    """
    result = ""

    if body_html:
        try:
            from talon import quotations

            candidate = quotations.extract_from_html(body_html)
            result = candidate.strip() if candidate else body_html
        except ImportError:
            # No fallback for HTML - hand back the original markup.
            result = body_html
        except Exception as exc:
            logger.warning(f"HTML quote extraction failed: {exc}")
            result = body_html

    return result
116
+
117
+
118
def extract_reply(body: str, content_type: str = "text/plain") -> str:
    """
    Extract the reply portion of a body, dispatching on its content type.

    Args:
        body: Email body content
        content_type: MIME content type ("text/plain" or "text/html")

    Returns:
        Talon's extraction result when Talon is importable. Without Talon,
        HTML bodies are returned unchanged and other bodies go through the
        basic quote removal. If extraction raises, the original body is
        returned.
    """
    try:
        from talon import quotations

        extracted = quotations.extract_from(body, content_type)
        return cast("str", extracted)
    except ImportError:
        # Only plain text has a fallback implementation.
        is_html = "html" in content_type.lower()
        return body if is_html else _basic_quote_removal(body)
    except Exception as exc:
        logger.warning(f"Content extraction failed: {exc}")
        return body
140
+
141
+
142
+ def remove_signature_bruteforce(text: str) -> tuple[str, str | None]:
143
+ """
144
+ Remove signature using brute-force method (~90% accuracy).
145
+
146
+ Fast and doesn't require ML models. Works by looking for
147
+ common signature delimiters like "--", "Best regards", etc.
148
+
149
+ Args:
150
+ text: Plain text email body
151
+
152
+ Returns:
153
+ Tuple of (text_without_signature, signature_if_found)
154
+ """
155
+ if not text:
156
+ return "", None
157
+
158
+ try:
159
+ from talon.signature.bruteforce import extract_signature
160
+
161
+ clean_text, sig = extract_signature(text)
162
+ return clean_text or text, sig
163
+ except ImportError:
164
+ # Fallback: look for common signature markers
165
+ return _basic_signature_removal(text)
166
+ except Exception as e:
167
+ logger.warning(f"Bruteforce signature extraction failed: {e}")
168
+ return text, None
169
+
170
+
171
+ def remove_signature_ml(text: str, sender_email: str | None = None) -> tuple[str, str | None]:
172
+ """
173
+ Remove signature using ML classifier (~98% accuracy).
174
+
175
+ Requires talon.init() to be called first.
176
+
177
+ Args:
178
+ text: Plain text email body
179
+ sender_email: Optional sender email for better accuracy
180
+
181
+ Returns:
182
+ Tuple of (text_without_signature, signature_if_found)
183
+ """
184
+ if not text:
185
+ return "", None
186
+
187
+ try:
188
+ from talon import signature as ml_signature
189
+
190
+ clean_text, sig = ml_signature.extract(text, sender=sender_email)
191
+ return clean_text or text, sig
192
+ except ImportError:
193
+ logger.warning("Talon ML signature extraction not available, using bruteforce")
194
+ return remove_signature_bruteforce(text)
195
+ except Exception as e:
196
+ logger.warning(f"ML signature extraction failed: {e}")
197
+ return remove_signature_bruteforce(text)
198
+
199
+
200
def extract_content(
    body_plain: str,
    body_html: str | None = None,
    sender_email: str | None = None,
    *,
    use_ml_signature: bool = True,
    preview_max_length: int = 100,
    fallback_to_original: bool = True,
) -> ExtractedContent:
    """
    Run the complete extraction pipeline over one message.

    Stages:
        1. Strip quoted text from the plain body
        2. Split off the signature
        3. Strip quoted blocks from the HTML body (when given)
        4. Build the preview from the cleaned text

    Each stage is guarded so a failure in one does not abort the others.

    Args:
        body_plain: Plain text body
        body_html: Optional HTML body
        sender_email: Sender email for ML signature detection
        use_ml_signature: Prefer the ML signature extractor; it is only
            used when sender_email is also provided
        preview_max_length: Maximum length for preview text
        fallback_to_original: On a reply/HTML extraction error, keep the
            original content instead of an empty value

    Returns:
        ExtractedContent with all processed fields
    """
    # Stage 1: quoted-text removal on the plain body.
    try:
        reply_text = extract_reply_text(body_plain)
    except Exception as exc:
        logger.warning(f"Reply extraction failed: {exc}")
        reply_text = body_plain if fallback_to_original else ""

    # Stage 2: signature removal (ML needs a sender, otherwise brute-force).
    try:
        if not (use_ml_signature and sender_email):
            clean_text, signature = remove_signature_bruteforce(reply_text)
        else:
            clean_text, signature = remove_signature_ml(reply_text, sender_email)
    except Exception as exc:
        logger.warning(f"Signature removal failed: {exc}")
        clean_text, signature = reply_text, None

    # Stage 3: quoted-block removal on the HTML body, if there is one.
    extracted_html = None
    if body_html:
        try:
            extracted_html = extract_reply_html(body_html)
        except Exception as exc:
            logger.warning(f"HTML extraction failed: {exc}")
            extracted_html = body_html if fallback_to_original else None

    # Stage 4: preview from the cleaned plain text.
    return ExtractedContent(
        extracted_text=clean_text,
        extracted_html=extracted_html,
        signature=signature,
        preview=generate_preview(clean_text, max_length=preview_max_length),
    )
264
+
265
+
266
def generate_preview(text: str, max_length: int = 100) -> str:
    """
    Generate a short preview of the message.

    Whitespace runs (including newlines) are collapsed to single spaces.
    Text longer than ``max_length`` is truncated, preferably at a word
    boundary, and suffixed with "...". The suffix counts towards
    ``max_length``, so the result never exceeds that length.

    Args:
        text: Clean text content
        max_length: Maximum preview length, including the "..." suffix

    Returns:
        Preview string, truncated with "..." if needed. Empty text or a
        non-positive ``max_length`` yields "".
    """
    if not text or max_length <= 0:
        # A non-positive limit cannot hold any characters; avoid the
        # negative-index slicing this would otherwise trigger.
        return ""

    # Collapse whitespace
    preview = " ".join(text.split())

    if len(preview) <= max_length:
        return preview

    suffix = "..."
    # Reserve room for the suffix so the final string honours max_length.
    budget = max_length - len(suffix)
    if budget <= 0:
        # No room for the suffix at all: hard-cut to the limit.
        return preview[:max_length]

    truncated = preview[:budget]
    # Cut back to the last space to avoid splitting a word, but only if
    # that keeps more than half of the available budget.
    last_space = truncated.rfind(" ")
    if last_space > budget * 0.5:
        truncated = truncated[:last_space]

    return truncated.rstrip() + suffix
294
+
295
+
296
+ def calculate_message_size(
297
+ text: str | None = None,
298
+ html: str | None = None,
299
+ headers: dict[str, str] | None = None,
300
+ attachments_size: int = 0,
301
+ ) -> int:
302
+ """
303
+ Calculate approximate message size in bytes.
304
+
305
+ Args:
306
+ text: Plain text body
307
+ html: HTML body
308
+ headers: Message headers
309
+ attachments_size: Total size of attachments
310
+
311
+ Returns:
312
+ Approximate message size in bytes
313
+ """
314
+ size = 0
315
+
316
+ if text:
317
+ size += len(text.encode("utf-8", errors="replace"))
318
+
319
+ if html:
320
+ size += len(html.encode("utf-8", errors="replace"))
321
+
322
+ if headers:
323
+ for key, value in headers.items():
324
+ size += len(key.encode("utf-8", errors="replace"))
325
+ size += len(value.encode("utf-8", errors="replace"))
326
+
327
+ size += attachments_size
328
+
329
+ return size
330
+
331
+
332
+ # -------------------------------------------------------------------------
333
+ # Fallback implementations (when Talon is not available)
334
+ # -------------------------------------------------------------------------
335
+
336
+
337
+ def _basic_quote_removal(text: str) -> str:
338
+ """
339
+ Basic quote removal without Talon.
340
+
341
+ Removes lines starting with > and common "On ... wrote:" patterns.
342
+ """
343
+ if not text:
344
+ return ""
345
+
346
+ lines = text.split("\n")
347
+ result_lines = []
348
+ in_quote = False
349
+
350
+ for line in lines:
351
+ # Check for "On ... wrote:" pattern
352
+ if _is_quote_header(line):
353
+ in_quote = True
354
+ continue
355
+
356
+ # Check for > quoted lines
357
+ stripped = line.strip()
358
+ if stripped.startswith(">"):
359
+ in_quote = True
360
+ continue
361
+
362
+ # Check for "-----Original Message-----"
363
+ if "-----Original Message-----" in line:
364
+ in_quote = True
365
+ continue
366
+
367
+ if not in_quote:
368
+ result_lines.append(line)
369
+ elif stripped and not stripped.startswith(">"):
370
+ # Non-empty, non-quoted line after quote section
371
+ # This might be a signature or footer, keep for now
372
+ result_lines.append(line)
373
+
374
+ return "\n".join(result_lines).strip()
375
+
376
+
377
+ def _is_quote_header(line: str) -> bool:
378
+ """Check if line is a quote header like 'On Jan 31, 2026, Bob wrote:'."""
379
+ line_lower = line.lower().strip()
380
+
381
+ # Gmail style: "On Mon, Jan 31, 2026 at 10:00 AM Bob wrote:"
382
+ if line_lower.startswith("on ") and "wrote:" in line_lower:
383
+ return True
384
+
385
+ # Outlook style (beginning of separator)
386
+ return bool(line.strip().startswith("From:") or line.strip().startswith("Sent:"))
387
+
388
+
389
+ def _basic_signature_removal(text: str) -> tuple[str, str | None]:
390
+ """
391
+ Basic signature removal without Talon.
392
+
393
+ Looks for common signature delimiters.
394
+ """
395
+ if not text:
396
+ return "", None
397
+
398
+ # Common signature delimiters
399
+ delimiters = [
400
+ "\n-- \n", # Standard signature delimiter
401
+ "\n--\n",
402
+ "\n___",
403
+ "\nBest regards",
404
+ "\nBest,",
405
+ "\nRegards,",
406
+ "\nThanks,",
407
+ "\nCheers,",
408
+ "\nSent from my iPhone",
409
+ "\nSent from my Android",
410
+ ]
411
+
412
+ for delimiter in delimiters:
413
+ if delimiter in text:
414
+ parts = text.split(delimiter, 1)
415
+ clean = parts[0].strip()
416
+ # Include the delimiter (without leading newline) in the signature
417
+ sig_content = delimiter.lstrip("\n") + (parts[1] if len(parts) > 1 else "")
418
+ return clean, sig_content.strip() if sig_content.strip() else None
419
+
420
+ return text, None