nornweave 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nornweave/__init__.py +3 -0
- nornweave/adapters/__init__.py +1 -0
- nornweave/adapters/base.py +5 -0
- nornweave/adapters/mailgun.py +196 -0
- nornweave/adapters/resend.py +510 -0
- nornweave/adapters/sendgrid.py +492 -0
- nornweave/adapters/ses.py +824 -0
- nornweave/cli.py +186 -0
- nornweave/core/__init__.py +26 -0
- nornweave/core/config.py +172 -0
- nornweave/core/exceptions.py +25 -0
- nornweave/core/interfaces.py +390 -0
- nornweave/core/storage.py +192 -0
- nornweave/core/utils.py +23 -0
- nornweave/huginn/__init__.py +10 -0
- nornweave/huginn/client.py +296 -0
- nornweave/huginn/config.py +52 -0
- nornweave/huginn/resources.py +165 -0
- nornweave/huginn/server.py +202 -0
- nornweave/models/__init__.py +113 -0
- nornweave/models/attachment.py +136 -0
- nornweave/models/event.py +275 -0
- nornweave/models/inbox.py +33 -0
- nornweave/models/message.py +284 -0
- nornweave/models/thread.py +172 -0
- nornweave/muninn/__init__.py +14 -0
- nornweave/muninn/tools.py +207 -0
- nornweave/search/__init__.py +1 -0
- nornweave/search/embeddings.py +1 -0
- nornweave/search/vector_store.py +1 -0
- nornweave/skuld/__init__.py +1 -0
- nornweave/skuld/rate_limiter.py +1 -0
- nornweave/skuld/scheduler.py +1 -0
- nornweave/skuld/sender.py +25 -0
- nornweave/skuld/webhooks.py +1 -0
- nornweave/storage/__init__.py +20 -0
- nornweave/storage/database.py +165 -0
- nornweave/storage/gcs.py +144 -0
- nornweave/storage/local.py +152 -0
- nornweave/storage/s3.py +164 -0
- nornweave/urdr/__init__.py +14 -0
- nornweave/urdr/adapters/__init__.py +16 -0
- nornweave/urdr/adapters/base.py +385 -0
- nornweave/urdr/adapters/postgres.py +50 -0
- nornweave/urdr/adapters/sqlite.py +51 -0
- nornweave/urdr/migrations/env.py +94 -0
- nornweave/urdr/migrations/script.py.mako +26 -0
- nornweave/urdr/migrations/versions/.gitkeep +0 -0
- nornweave/urdr/migrations/versions/20260131_0001_initial_schema.py +182 -0
- nornweave/urdr/migrations/versions/20260131_0002_extended_schema.py +241 -0
- nornweave/urdr/orm.py +641 -0
- nornweave/verdandi/__init__.py +45 -0
- nornweave/verdandi/attachments.py +471 -0
- nornweave/verdandi/content.py +420 -0
- nornweave/verdandi/headers.py +404 -0
- nornweave/verdandi/parser.py +25 -0
- nornweave/verdandi/sanitizer.py +9 -0
- nornweave/verdandi/threading.py +359 -0
- nornweave/yggdrasil/__init__.py +1 -0
- nornweave/yggdrasil/app.py +86 -0
- nornweave/yggdrasil/dependencies.py +190 -0
- nornweave/yggdrasil/middleware/__init__.py +1 -0
- nornweave/yggdrasil/middleware/auth.py +1 -0
- nornweave/yggdrasil/middleware/logging.py +1 -0
- nornweave/yggdrasil/routes/__init__.py +1 -0
- nornweave/yggdrasil/routes/v1/__init__.py +1 -0
- nornweave/yggdrasil/routes/v1/inboxes.py +124 -0
- nornweave/yggdrasil/routes/v1/messages.py +200 -0
- nornweave/yggdrasil/routes/v1/search.py +84 -0
- nornweave/yggdrasil/routes/v1/threads.py +142 -0
- nornweave/yggdrasil/routes/webhooks/__init__.py +1 -0
- nornweave/yggdrasil/routes/webhooks/mailgun.py +136 -0
- nornweave/yggdrasil/routes/webhooks/resend.py +344 -0
- nornweave/yggdrasil/routes/webhooks/sendgrid.py +15 -0
- nornweave/yggdrasil/routes/webhooks/ses.py +15 -0
- nornweave-0.1.2.dist-info/METADATA +324 -0
- nornweave-0.1.2.dist-info/RECORD +80 -0
- nornweave-0.1.2.dist-info/WHEEL +4 -0
- nornweave-0.1.2.dist-info/entry_points.txt +5 -0
- nornweave-0.1.2.dist-info/licenses/LICENSE +201 -0
|
@@ -0,0 +1,420 @@
|
|
|
1
|
+
"""Content extraction: quote/signature removal using Talon.
|
|
2
|
+
|
|
3
|
+
NornWeave uses Mailgun's Talon library to extract clean reply content
|
|
4
|
+
from emails, removing quoted text and signatures. This powers the
|
|
5
|
+
extracted_text and extracted_html fields in the Message model.
|
|
6
|
+
|
|
7
|
+
Reference: https://github.com/mailgun/talon
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import logging
|
|
11
|
+
from dataclasses import dataclass
|
|
12
|
+
from typing import cast
|
|
13
|
+
|
|
14
|
+
# Module-wide logger, named after this module's import path.
logger = logging.getLogger(__name__)

# Flipped to True by init_talon() once talon.init() has succeeded, so that
# repeated initialization calls return immediately.
_talon_initialized = False
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def init_talon() -> bool:
    """
    Load Talon's ML classifiers for signature extraction, at most once.

    Meant to be called during application startup. A successful run is
    remembered in the module-level ``_talon_initialized`` flag, so later
    calls return True without touching Talon again. Failures are logged
    as warnings and never raised to the caller.

    Returns:
        True if Talon is (or already was) initialized, False if the
        library is missing or ``talon.init()`` raised.
    """
    global _talon_initialized

    if not _talon_initialized:
        try:
            import talon

            talon.init()
            _talon_initialized = True
            logger.info("Talon ML classifiers initialized successfully")
        except ImportError:
            # Talon is an optional dependency; callers fall back to basic heuristics.
            logger.warning(
                "Talon library not installed. Quote/signature extraction will be limited. "
                "Install with: pip install talon"
            )
            return False
        except Exception as exc:
            logger.warning(f"Failed to initialize Talon: {exc}")
            return False

    return True
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
@dataclass
|
|
53
|
+
class ExtractedContent:
|
|
54
|
+
"""Result of content extraction."""
|
|
55
|
+
|
|
56
|
+
extracted_text: str
|
|
57
|
+
extracted_html: str | None
|
|
58
|
+
signature: str | None
|
|
59
|
+
preview: str
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def extract_reply_text(body_plain: str) -> str:
    """
    Return only the newly written part of a plain-text email.

    Quoted material such as "On Jan 31, 2026, Bob wrote: ..." is removed
    with Talon when it is installed; otherwise a basic line-based
    fallback is used.

    Args:
        body_plain: Raw plain text email body

    Returns:
        The stripped reply text. The unmodified body is returned when
        Talon yields nothing or raises; an empty body yields "".
    """
    if not body_plain:
        return ""

    try:
        from talon import quotations

        stripped_reply = quotations.extract_from_plain(body_plain)
        if stripped_reply:
            return stripped_reply.strip()
        # Talon produced nothing usable: hand back the input untouched.
        return body_plain
    except ImportError:
        # Talon not installed: use the built-in heuristic instead.
        return _basic_quote_removal(body_plain)
    except Exception as exc:
        logger.warning(f"Quote extraction failed: {exc}")
        return body_plain
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def extract_reply_html(body_html: str) -> str:
    """
    Return only the newly written part of an HTML email.

    Quoted blocks are removed with Talon. There is no HTML fallback:
    without Talon, or when Talon fails, the input is returned as-is.

    Args:
        body_html: Raw HTML email body

    Returns:
        The stripped reply HTML, the unmodified input when nothing could
        be extracted, or "" for an empty body.
    """
    if not body_html:
        return ""

    try:
        from talon import quotations

        stripped_reply = quotations.extract_from_html(body_html)
        if stripped_reply:
            return stripped_reply.strip()
        return body_html
    except ImportError:
        # No HTML heuristic exists in this module, so pass the markup through.
        return body_html
    except Exception as exc:
        logger.warning(f"HTML quote extraction failed: {exc}")
        return body_html
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def extract_reply(body: str, content_type: str = "text/plain") -> str:
    """
    Extract reply content based on content type.

    The content type may be a bare media type ("text/plain", "text/html")
    or a full header value with different casing and parameters, such as
    "Text/HTML; charset=utf-8". Only the bare, lowercased media type is
    handed to Talon.

    Args:
        body: Email body content
        content_type: MIME content type ("text/plain" or "text/html"),
            optionally followed by parameters

    Returns:
        Extracted reply content. Without Talon, HTML bodies are returned
        unchanged and other bodies go through the basic quote removal.
        If extraction raises, the original body is returned.
    """
    try:
        from talon import quotations

        # NOTE(review): Talon's extract_from appears to compare content_type
        # against the exact strings "text/plain" / "text/html" and to return
        # the body untouched otherwise - confirm against the installed
        # version. Drop parameters and casing so header values still match.
        media_type = content_type.split(";", 1)[0].strip().lower()
        return cast("str", quotations.extract_from(body, media_type))
    except ImportError:
        if "html" in content_type.lower():
            return body
        return _basic_quote_removal(body)
    except Exception as e:
        logger.warning(f"Content extraction failed: {e}")
        return body
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def remove_signature_bruteforce(text: str) -> tuple[str, str | None]:
|
|
143
|
+
"""
|
|
144
|
+
Remove signature using brute-force method (~90% accuracy).
|
|
145
|
+
|
|
146
|
+
Fast and doesn't require ML models. Works by looking for
|
|
147
|
+
common signature delimiters like "--", "Best regards", etc.
|
|
148
|
+
|
|
149
|
+
Args:
|
|
150
|
+
text: Plain text email body
|
|
151
|
+
|
|
152
|
+
Returns:
|
|
153
|
+
Tuple of (text_without_signature, signature_if_found)
|
|
154
|
+
"""
|
|
155
|
+
if not text:
|
|
156
|
+
return "", None
|
|
157
|
+
|
|
158
|
+
try:
|
|
159
|
+
from talon.signature.bruteforce import extract_signature
|
|
160
|
+
|
|
161
|
+
clean_text, sig = extract_signature(text)
|
|
162
|
+
return clean_text or text, sig
|
|
163
|
+
except ImportError:
|
|
164
|
+
# Fallback: look for common signature markers
|
|
165
|
+
return _basic_signature_removal(text)
|
|
166
|
+
except Exception as e:
|
|
167
|
+
logger.warning(f"Bruteforce signature extraction failed: {e}")
|
|
168
|
+
return text, None
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def remove_signature_ml(text: str, sender_email: str | None = None) -> tuple[str, str | None]:
|
|
172
|
+
"""
|
|
173
|
+
Remove signature using ML classifier (~98% accuracy).
|
|
174
|
+
|
|
175
|
+
Requires talon.init() to be called first.
|
|
176
|
+
|
|
177
|
+
Args:
|
|
178
|
+
text: Plain text email body
|
|
179
|
+
sender_email: Optional sender email for better accuracy
|
|
180
|
+
|
|
181
|
+
Returns:
|
|
182
|
+
Tuple of (text_without_signature, signature_if_found)
|
|
183
|
+
"""
|
|
184
|
+
if not text:
|
|
185
|
+
return "", None
|
|
186
|
+
|
|
187
|
+
try:
|
|
188
|
+
from talon import signature as ml_signature
|
|
189
|
+
|
|
190
|
+
clean_text, sig = ml_signature.extract(text, sender=sender_email)
|
|
191
|
+
return clean_text or text, sig
|
|
192
|
+
except ImportError:
|
|
193
|
+
logger.warning("Talon ML signature extraction not available, using bruteforce")
|
|
194
|
+
return remove_signature_bruteforce(text)
|
|
195
|
+
except Exception as e:
|
|
196
|
+
logger.warning(f"ML signature extraction failed: {e}")
|
|
197
|
+
return remove_signature_bruteforce(text)
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def extract_content(
    body_plain: str,
    body_html: str | None = None,
    sender_email: str | None = None,
    *,
    use_ml_signature: bool = True,
    preview_max_length: int = 100,
    fallback_to_original: bool = True,
) -> ExtractedContent:
    """
    Run the complete extraction pipeline over one message.

    Stages:
    1. Strip quoted text from the plain body
    2. Separate the signature from the remaining text
    3. Strip quoted blocks from the HTML body, if one was supplied
    4. Build a preview from the cleaned plain text

    Each stage is guarded on its own, so a failure in one is logged and
    does not abort the others.

    Args:
        body_plain: Plain text body
        body_html: Optional HTML body
        sender_email: Sender email; the ML signature path is only taken
            when this is provided
        use_ml_signature: Prefer the ML signature extractor over brute force
        preview_max_length: Maximum length passed to the preview generator
        fallback_to_original: When a quote-extraction stage raises, keep
            the original content instead of an empty value

    Returns:
        ExtractedContent with all processed fields
    """
    # Stage 1: plain-text reply without quoted history.
    try:
        reply_text = extract_reply_text(body_plain)
    except Exception as exc:
        logger.warning(f"Reply extraction failed: {exc}")
        reply_text = body_plain if fallback_to_original else ""

    # Stage 2: signature split. ML is used only with a known sender.
    try:
        if use_ml_signature and sender_email:
            clean_text, signature = remove_signature_ml(reply_text, sender_email)
        else:
            clean_text, signature = remove_signature_bruteforce(reply_text)
    except Exception as exc:
        logger.warning(f"Signature removal failed: {exc}")
        clean_text, signature = reply_text, None

    # Stage 3: HTML reply, attempted only when an HTML body exists.
    extracted_html = None
    if body_html:
        try:
            extracted_html = extract_reply_html(body_html)
        except Exception as exc:
            logger.warning(f"HTML extraction failed: {exc}")
            extracted_html = body_html if fallback_to_original else None

    # Stage 4: preview derived from the signature-free plain text.
    return ExtractedContent(
        extracted_text=clean_text,
        extracted_html=extracted_html,
        signature=signature,
        preview=generate_preview(clean_text, max_length=preview_max_length),
    )
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
def generate_preview(text: str, max_length: int = 100) -> str:
    """
    Generate a short, single-line preview of the message.

    All runs of whitespace (including newlines) are collapsed to single
    spaces. Text longer than ``max_length`` is cut, preferably at the
    last space, and "..." is appended. The ellipsis is added on top of
    the cut text, so the result can be up to ``max_length + 3`` characters.

    Args:
        text: Clean text content
        max_length: Maximum number of characters kept before the
            ellipsis; a non-positive value yields an empty preview

    Returns:
        Preview string, truncated with "..." if needed
    """
    # A negative limit would slice from the end of the text and return almost
    # the whole message; zero would return a bare "...". Treat both as "no preview".
    if not text or max_length <= 0:
        return ""

    # Collapse whitespace
    collapsed = " ".join(text.split())
    if len(collapsed) <= max_length:
        return collapsed

    cut = collapsed[:max_length]
    # Back up to the last space to avoid cutting a word, but only when that
    # keeps more than half of the allowed length.
    boundary = cut.rfind(" ")
    if boundary > max_length * 0.5:
        cut = cut[:boundary]

    return cut + "..."
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
def calculate_message_size(
|
|
297
|
+
text: str | None = None,
|
|
298
|
+
html: str | None = None,
|
|
299
|
+
headers: dict[str, str] | None = None,
|
|
300
|
+
attachments_size: int = 0,
|
|
301
|
+
) -> int:
|
|
302
|
+
"""
|
|
303
|
+
Calculate approximate message size in bytes.
|
|
304
|
+
|
|
305
|
+
Args:
|
|
306
|
+
text: Plain text body
|
|
307
|
+
html: HTML body
|
|
308
|
+
headers: Message headers
|
|
309
|
+
attachments_size: Total size of attachments
|
|
310
|
+
|
|
311
|
+
Returns:
|
|
312
|
+
Approximate message size in bytes
|
|
313
|
+
"""
|
|
314
|
+
size = 0
|
|
315
|
+
|
|
316
|
+
if text:
|
|
317
|
+
size += len(text.encode("utf-8", errors="replace"))
|
|
318
|
+
|
|
319
|
+
if html:
|
|
320
|
+
size += len(html.encode("utf-8", errors="replace"))
|
|
321
|
+
|
|
322
|
+
if headers:
|
|
323
|
+
for key, value in headers.items():
|
|
324
|
+
size += len(key.encode("utf-8", errors="replace"))
|
|
325
|
+
size += len(value.encode("utf-8", errors="replace"))
|
|
326
|
+
|
|
327
|
+
size += attachments_size
|
|
328
|
+
|
|
329
|
+
return size
|
|
330
|
+
|
|
331
|
+
|
|
332
|
+
# -------------------------------------------------------------------------
|
|
333
|
+
# Fallback implementations (when Talon is not available)
|
|
334
|
+
# -------------------------------------------------------------------------
|
|
335
|
+
|
|
336
|
+
|
|
337
|
+
def _basic_quote_removal(text: str) -> str:
    """
    Line-based quote removal used when Talon is unavailable.

    Drops quote header lines (as recognized by _is_quote_header), lines
    starting with ">" and "-----Original Message-----" separators. Once
    the first such line has been seen, blank lines are dropped as well,
    while other non-empty lines are still kept because they may be a
    signature or footer.
    """
    if not text:
        return ""

    kept: list[str] = []
    quote_seen = False

    for raw_line in text.split("\n"):
        content = raw_line.strip()
        starts_quote = (
            _is_quote_header(raw_line)
            or content.startswith(">")
            or "-----Original Message-----" in raw_line
        )
        if starts_quote:
            quote_seen = True
        elif content or not quote_seen:
            # Before any quote: keep everything. Afterwards: keep only non-empty lines.
            kept.append(raw_line)

    return "\n".join(kept).strip()
|
|
375
|
+
|
|
376
|
+
|
|
377
|
+
def _is_quote_header(line: str) -> bool:
    """Tell whether a line introduces quoted text, e.g. 'On Jan 31, 2026, Bob wrote:'."""
    normalized = line.lower().strip()

    # "On <date> ... <name> wrote:" attribution line, matched case-insensitively.
    if normalized.startswith("on ") and "wrote:" in normalized:
        return True

    # First lines of a "From:/Sent:" header block; this check is case-sensitive.
    return line.strip().startswith(("From:", "Sent:"))
|
|
387
|
+
|
|
388
|
+
|
|
389
|
+
def _basic_signature_removal(text: str) -> tuple[str, str | None]:
|
|
390
|
+
"""
|
|
391
|
+
Basic signature removal without Talon.
|
|
392
|
+
|
|
393
|
+
Looks for common signature delimiters.
|
|
394
|
+
"""
|
|
395
|
+
if not text:
|
|
396
|
+
return "", None
|
|
397
|
+
|
|
398
|
+
# Common signature delimiters
|
|
399
|
+
delimiters = [
|
|
400
|
+
"\n-- \n", # Standard signature delimiter
|
|
401
|
+
"\n--\n",
|
|
402
|
+
"\n___",
|
|
403
|
+
"\nBest regards",
|
|
404
|
+
"\nBest,",
|
|
405
|
+
"\nRegards,",
|
|
406
|
+
"\nThanks,",
|
|
407
|
+
"\nCheers,",
|
|
408
|
+
"\nSent from my iPhone",
|
|
409
|
+
"\nSent from my Android",
|
|
410
|
+
]
|
|
411
|
+
|
|
412
|
+
for delimiter in delimiters:
|
|
413
|
+
if delimiter in text:
|
|
414
|
+
parts = text.split(delimiter, 1)
|
|
415
|
+
clean = parts[0].strip()
|
|
416
|
+
# Include the delimiter (without leading newline) in the signature
|
|
417
|
+
sig_content = delimiter.lstrip("\n") + (parts[1] if len(parts) > 1 else "")
|
|
418
|
+
return clean, sig_content.strip() if sig_content.strip() else None
|
|
419
|
+
|
|
420
|
+
return text, None
|