fastemailparser 0.2.1__tar.gz → 0.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {fastemailparser-0.2.1/fastemailparser.egg-info → fastemailparser-0.2.2}/PKG-INFO +1 -1
- {fastemailparser-0.2.1 → fastemailparser-0.2.2/fastemailparser.egg-info}/PKG-INFO +1 -1
- {fastemailparser-0.2.1 → fastemailparser-0.2.2}/setup.py +1 -1
- {fastemailparser-0.2.1 → fastemailparser-0.2.2}/src/mime.c +2 -3
- {fastemailparser-0.2.1 → fastemailparser-0.2.2}/src/signature.c +54 -21
- {fastemailparser-0.2.1 → fastemailparser-0.2.2}/LICENSE +0 -0
- {fastemailparser-0.2.1 → fastemailparser-0.2.2}/MANIFEST.in +0 -0
- {fastemailparser-0.2.1 → fastemailparser-0.2.2}/README.md +0 -0
- {fastemailparser-0.2.1 → fastemailparser-0.2.2}/email.h +0 -0
- {fastemailparser-0.2.1 → fastemailparser-0.2.2}/emailparser.c +0 -0
- {fastemailparser-0.2.1 → fastemailparser-0.2.2}/fastemailparser.egg-info/SOURCES.txt +0 -0
- {fastemailparser-0.2.1 → fastemailparser-0.2.2}/fastemailparser.egg-info/dependency_links.txt +0 -0
- {fastemailparser-0.2.1 → fastemailparser-0.2.2}/fastemailparser.egg-info/top_level.txt +0 -0
- {fastemailparser-0.2.1 → fastemailparser-0.2.2}/setup.cfg +0 -0
- {fastemailparser-0.2.1 → fastemailparser-0.2.2}/src/body.c +0 -0
- {fastemailparser-0.2.1 → fastemailparser-0.2.2}/src/body.h +0 -0
- {fastemailparser-0.2.1 → fastemailparser-0.2.2}/src/buf.h +0 -0
- {fastemailparser-0.2.1 → fastemailparser-0.2.2}/src/email_iter.c +0 -0
- {fastemailparser-0.2.1 → fastemailparser-0.2.2}/src/email_iter.h +0 -0
- {fastemailparser-0.2.1 → fastemailparser-0.2.2}/src/headers.c +0 -0
- {fastemailparser-0.2.1 → fastemailparser-0.2.2}/src/headers.h +0 -0
- {fastemailparser-0.2.1 → fastemailparser-0.2.2}/src/html.c +0 -0
- {fastemailparser-0.2.1 → fastemailparser-0.2.2}/src/html.h +0 -0
- {fastemailparser-0.2.1 → fastemailparser-0.2.2}/src/mime.h +0 -0
- {fastemailparser-0.2.1 → fastemailparser-0.2.2}/src/signature.h +0 -0
- {fastemailparser-0.2.1 → fastemailparser-0.2.2}/src/standalone.c +0 -0
- {fastemailparser-0.2.1 → fastemailparser-0.2.2}/src/standalone.h +0 -0
|
@@ -28,7 +28,7 @@ module = Extension(
|
|
|
28
28
|
|
|
29
29
|
setup(
|
|
30
30
|
name="fastemailparser",
|
|
31
|
-
version="0.2.1",
|
|
31
|
+
version="0.2.2",
|
|
32
32
|
author="Julien Calenge @ Méthode",
|
|
33
33
|
author_email="julien.calenge@methode.dev",
|
|
34
34
|
description="Very fast email parsing tool, split emails, retrieve headers & signatures",
|
|
@@ -60,7 +60,8 @@ char *skip_mime_headers(char *raw) {
|
|
|
60
60
|
* description:
|
|
61
61
|
* if the buffer begins with email headers (not HTML), advances past
|
|
62
62
|
* the first blank line so chain-separator search starts from the
|
|
63
|
-
* actual body.
|
|
63
|
+
* actual body. scans until the first blank line with no byte limit —
|
|
64
|
+
* modern emails with ARC/DKIM chains can have headers beyond 16 KB.
|
|
64
65
|
*
|
|
65
66
|
* return: pointer to body start, equal to raw if nothing was skipped
|
|
66
67
|
*/
|
|
@@ -74,8 +75,6 @@ char *skip_mime_headers(char *raw) {
|
|
|
74
75
|
return p + 2;
|
|
75
76
|
if (p[0] == '\r' && p[1] == '\n' && p[2] == '\r' && p[3] == '\n')
|
|
76
77
|
return p + 4;
|
|
77
|
-
if ((size_t)(p - raw) > 8192)
|
|
78
|
-
break;
|
|
79
78
|
p++;
|
|
80
79
|
}
|
|
81
80
|
return raw;
|
|
@@ -211,30 +211,32 @@ static const char *block_start_from(const char *anchor_line,
|
|
|
211
211
|
return block_start;
|
|
212
212
|
}
|
|
213
213
|
|
|
214
|
-
static Py_ssize_t find_sig_by_contact(const char *text, size_t text_len) {
|
|
214
|
+
static Py_ssize_t find_contacts_in_range(const char *search_start,
|
|
215
|
+
const char *end,
|
|
216
|
+
const char *text_base,
|
|
217
|
+
const char *context_base) {
|
|
215
218
|
/*
|
|
216
|
-
*
|
|
217
|
-
*
|
|
219
|
+
* search_start: first line to check for contact anchors
|
|
220
|
+
* end: one-past-end of the search range
|
|
221
|
+
* text_base: start of the full segment (used by has_body_before)
|
|
222
|
+
* context_base: lower bound for block_start_from walkback
|
|
218
223
|
*
|
|
219
224
|
* description:
|
|
220
|
-
* scans
|
|
221
|
-
*
|
|
222
|
-
* to
|
|
223
|
-
*
|
|
224
|
-
*
|
|
225
|
+
* scans [search_start, end) for a contact-info anchor line (phone,
|
|
226
|
+
* email, URL, or labeled field). when found, walks backward within
|
|
227
|
+
* [context_base, anchor) to find the start of the whole block.
|
|
228
|
+
* separating the search range from the walkback bound lets callers
|
|
229
|
+
* scan either the tail context (normal chain emails) or the head
|
|
230
|
+
* of the document (MIME emails where the body appears before large
|
|
231
|
+
* base64 attachments push the tail window away).
|
|
225
232
|
*
|
|
226
|
-
* return: byte offset
|
|
233
|
+
* return: byte offset relative to text_base, or -1 if not found
|
|
227
234
|
*/
|
|
228
|
-
const char *context;
|
|
229
|
-
const char *p;
|
|
230
|
-
const char *end;
|
|
235
|
+
const char *p = search_start;
|
|
231
236
|
const char *ls;
|
|
232
237
|
size_t llen;
|
|
233
238
|
size_t clean;
|
|
234
239
|
|
|
235
|
-
context = find_context_start(text, text_len);
|
|
236
|
-
p = context;
|
|
237
|
-
end = text + text_len;
|
|
238
240
|
while (p < end) {
|
|
239
241
|
ls = p;
|
|
240
242
|
while (p < end && *p != '\n')
|
|
@@ -244,14 +246,32 @@ static Py_ssize_t find_sig_by_contact(const char *text, size_t text_len) {
|
|
|
244
246
|
if (p < end)
|
|
245
247
|
p++;
|
|
246
248
|
if (is_contact_line(ls, clean)) {
|
|
247
|
-
const char *start = block_start_from(ls, context);
|
|
248
|
-
if (has_body_before(start, text))
|
|
249
|
-
return (Py_ssize_t)(start - text);
|
|
249
|
+
const char *start = block_start_from(ls, context_base);
|
|
250
|
+
if (has_body_before(start, text_base))
|
|
251
|
+
return (Py_ssize_t)(start - text_base);
|
|
250
252
|
}
|
|
251
253
|
}
|
|
252
254
|
return -1;
|
|
253
255
|
}
|
|
254
256
|
|
|
257
|
+
static Py_ssize_t find_sig_by_contact(const char *text, size_t text_len) {
|
|
258
|
+
/*
|
|
259
|
+
* text: plain-text segment to search
|
|
260
|
+
* text_len: byte length of text
|
|
261
|
+
*
|
|
262
|
+
* description:
|
|
263
|
+
* scans the last SIG_CONTEXT_LINES lines for a contact-info anchor
|
|
264
|
+
* line (phone, email, URL, or labeled field). when found, walks backward
|
|
265
|
+
* to return the position of the first line of the block, not just the
|
|
266
|
+
* anchor. this handles signatures that begin directly with a name/title
|
|
267
|
+
* with no polite closing phrase.
|
|
268
|
+
*
|
|
269
|
+
* return: byte offset of signature block start, or -1 if not found
|
|
270
|
+
*/
|
|
271
|
+
const char *context = find_context_start(text, text_len);
|
|
272
|
+
return find_contacts_in_range(context, text + text_len, text, context);
|
|
273
|
+
}
|
|
274
|
+
|
|
255
275
|
static int is_separator_line(const char *line, size_t len) {
|
|
256
276
|
/*
|
|
257
277
|
* line: one trimmed line of text
|
|
@@ -449,6 +469,14 @@ static Py_ssize_t find_sig_in_plain(const char *text, size_t text_len) {
|
|
|
449
469
|
* signature) is near the top, followed by base64 attachment data that
|
|
450
470
|
* would push it outside the pass-1 context window.
|
|
451
471
|
*
|
|
472
|
+
* pass 3 — first SIG_FALLBACK_LINES lines, contact-anchor detection.
|
|
473
|
+
* same rationale as pass 2: raw MIME emails with attachments have the
|
|
474
|
+
* contact block near the top, not in the tail window.
|
|
475
|
+
*
|
|
476
|
+
* pass 4 — last SIG_CONTEXT_LINES lines, contact-anchor detection.
|
|
477
|
+
* handles long chain emails with no polite closing whose signature
|
|
478
|
+
* sits at the bottom of the segment (the normal case).
|
|
479
|
+
*
|
|
452
480
|
* return: byte offset of signature start relative to text, or -1
|
|
453
481
|
*/
|
|
454
482
|
const char *full_end;
|
|
@@ -462,7 +490,7 @@ static Py_ssize_t find_sig_in_plain(const char *text, size_t text_len) {
|
|
|
462
490
|
off = scan_lines(context, full_end, text, 0);
|
|
463
491
|
if (off >= 0)
|
|
464
492
|
return off;
|
|
465
|
-
/*
|
|
493
|
+
/* Compute first SIG_FALLBACK_LINES boundary */
|
|
466
494
|
fallback_end = text;
|
|
467
495
|
count = 0;
|
|
468
496
|
while (fallback_end < full_end && count < SIG_FALLBACK_LINES) {
|
|
@@ -471,12 +499,17 @@ static Py_ssize_t find_sig_in_plain(const char *text, size_t text_len) {
|
|
|
471
499
|
fallback_end++;
|
|
472
500
|
}
|
|
473
501
|
if (fallback_end <= context) {
|
|
502
|
+
/* Pass 2: first SIG_FALLBACK_LINES lines, strong patterns only */
|
|
474
503
|
off = scan_lines(text, fallback_end, text, 1);
|
|
475
504
|
if (off >= 0)
|
|
476
505
|
return off;
|
|
506
|
+
/* Pass 3: contact-anchor in first SIG_FALLBACK_LINES lines.
|
|
507
|
+
context_base = text so the walkback can reach the very first line. */
|
|
508
|
+
off = find_contacts_in_range(text, fallback_end, text, text);
|
|
509
|
+
if (off >= 0)
|
|
510
|
+
return off;
|
|
477
511
|
}
|
|
478
|
-
/* Pass
|
|
479
|
-
(phone number, email address, URL, or labeled field like "Tel:", "E:") */
|
|
512
|
+
/* Pass 4: contact-anchor in tail context (long segments, no closing phrase) */
|
|
480
513
|
return find_sig_by_contact(text, text_len);
|
|
481
514
|
}
|
|
482
515
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{fastemailparser-0.2.1 → fastemailparser-0.2.2}/fastemailparser.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|