fastemailparser 0.2.1__tar.gz → 0.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27):
  1. {fastemailparser-0.2.1/fastemailparser.egg-info → fastemailparser-0.2.2}/PKG-INFO +1 -1
  2. {fastemailparser-0.2.1 → fastemailparser-0.2.2/fastemailparser.egg-info}/PKG-INFO +1 -1
  3. {fastemailparser-0.2.1 → fastemailparser-0.2.2}/setup.py +1 -1
  4. {fastemailparser-0.2.1 → fastemailparser-0.2.2}/src/mime.c +2 -3
  5. {fastemailparser-0.2.1 → fastemailparser-0.2.2}/src/signature.c +54 -21
  6. {fastemailparser-0.2.1 → fastemailparser-0.2.2}/LICENSE +0 -0
  7. {fastemailparser-0.2.1 → fastemailparser-0.2.2}/MANIFEST.in +0 -0
  8. {fastemailparser-0.2.1 → fastemailparser-0.2.2}/README.md +0 -0
  9. {fastemailparser-0.2.1 → fastemailparser-0.2.2}/email.h +0 -0
  10. {fastemailparser-0.2.1 → fastemailparser-0.2.2}/emailparser.c +0 -0
  11. {fastemailparser-0.2.1 → fastemailparser-0.2.2}/fastemailparser.egg-info/SOURCES.txt +0 -0
  12. {fastemailparser-0.2.1 → fastemailparser-0.2.2}/fastemailparser.egg-info/dependency_links.txt +0 -0
  13. {fastemailparser-0.2.1 → fastemailparser-0.2.2}/fastemailparser.egg-info/top_level.txt +0 -0
  14. {fastemailparser-0.2.1 → fastemailparser-0.2.2}/setup.cfg +0 -0
  15. {fastemailparser-0.2.1 → fastemailparser-0.2.2}/src/body.c +0 -0
  16. {fastemailparser-0.2.1 → fastemailparser-0.2.2}/src/body.h +0 -0
  17. {fastemailparser-0.2.1 → fastemailparser-0.2.2}/src/buf.h +0 -0
  18. {fastemailparser-0.2.1 → fastemailparser-0.2.2}/src/email_iter.c +0 -0
  19. {fastemailparser-0.2.1 → fastemailparser-0.2.2}/src/email_iter.h +0 -0
  20. {fastemailparser-0.2.1 → fastemailparser-0.2.2}/src/headers.c +0 -0
  21. {fastemailparser-0.2.1 → fastemailparser-0.2.2}/src/headers.h +0 -0
  22. {fastemailparser-0.2.1 → fastemailparser-0.2.2}/src/html.c +0 -0
  23. {fastemailparser-0.2.1 → fastemailparser-0.2.2}/src/html.h +0 -0
  24. {fastemailparser-0.2.1 → fastemailparser-0.2.2}/src/mime.h +0 -0
  25. {fastemailparser-0.2.1 → fastemailparser-0.2.2}/src/signature.h +0 -0
  26. {fastemailparser-0.2.1 → fastemailparser-0.2.2}/src/standalone.c +0 -0
  27. {fastemailparser-0.2.1 → fastemailparser-0.2.2}/src/standalone.h +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: fastemailparser
3
- Version: 0.2.1
3
+ Version: 0.2.2
4
4
  Summary: Very fast email parsing tool, split emails, retrieve headers & signatures
5
5
  Home-page: https://github.com/Methode-dev/EmailParser
6
6
  Author: Julien Calenge @ Méthode
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: fastemailparser
3
- Version: 0.2.1
3
+ Version: 0.2.2
4
4
  Summary: Very fast email parsing tool, split emails, retrieve headers & signatures
5
5
  Home-page: https://github.com/Methode-dev/EmailParser
6
6
  Author: Julien Calenge @ Méthode
@@ -28,7 +28,7 @@ module = Extension(
28
28
 
29
29
  setup(
30
30
  name="fastemailparser",
31
- version="0.2.1",
31
+ version="0.2.2",
32
32
  author="Julien Calenge @ Méthode",
33
33
  author_email="julien.calenge@methode.dev",
34
34
  description="Very fast email parsing tool, split emails, retrieve headers & signatures",
@@ -60,7 +60,8 @@ char *skip_mime_headers(char *raw) {
60
60
  * description:
61
61
  * if the buffer begins with email headers (not HTML), advances past
62
62
  * the first blank line so chain-separator search starts from the
63
- * actual body. limited to the first 8 KB to avoid scanning large files.
63
+ * actual body. scans until the first blank line with no byte limit
64
+ * modern emails with ARC/DKIM chains can have headers beyond 16 KB.
64
65
  *
65
66
  * return: pointer to body start, equal to raw if nothing was skipped
66
67
  */
@@ -74,8 +75,6 @@ char *skip_mime_headers(char *raw) {
74
75
  return p + 2;
75
76
  if (p[0] == '\r' && p[1] == '\n' && p[2] == '\r' && p[3] == '\n')
76
77
  return p + 4;
77
- if ((size_t)(p - raw) > 8192)
78
- break;
79
78
  p++;
80
79
  }
81
80
  return raw;
@@ -211,30 +211,32 @@ static const char *block_start_from(const char *anchor_line,
211
211
  return block_start;
212
212
  }
213
213
 
214
- static Py_ssize_t find_sig_by_contact(const char *text, size_t text_len) {
214
+ static Py_ssize_t find_contacts_in_range(const char *search_start,
215
+ const char *end,
216
+ const char *text_base,
217
+ const char *context_base) {
215
218
  /*
216
- * text: plain-text segment to search
217
- * text_len: byte length of text
219
+ * search_start: first line to check for contact anchors
220
+ * end: one-past-end of the search range
221
+ * text_base: start of the full segment (used by has_body_before)
222
+ * context_base: lower bound for block_start_from walkback
218
223
  *
219
224
  * description:
220
- * scans the last SIG_CONTEXT_LINES lines for a contact-info anchor
221
- * line (phone, email, URL, or labeled field). when found, walks backward
222
- * to return the position of the first line of the block, not just the
223
- * anchor. this handles signatures that begin directly with a name/title
224
- * with no polite closing phrase.
225
+ * scans [search_start, end) for a contact-info anchor line (phone,
226
+ * email, URL, or labeled field). when found, walks backward within
227
+ * [context_base, anchor) to find the start of the whole block.
228
+ * separating the search range from the walkback bound lets callers
229
+ * scan either the tail context (normal chain emails) or the head
230
+ * of the document (MIME emails where the body appears before large
231
+ * base64 attachments push the tail window away).
225
232
  *
226
- * return: byte offset of signature block start, or -1 if not found
233
+ * return: byte offset relative to text_base, or -1 if not found
227
234
  */
228
- const char *context;
229
- const char *p;
230
- const char *end;
235
+ const char *p = search_start;
231
236
  const char *ls;
232
237
  size_t llen;
233
238
  size_t clean;
234
239
 
235
- context = find_context_start(text, text_len);
236
- p = context;
237
- end = text + text_len;
238
240
  while (p < end) {
239
241
  ls = p;
240
242
  while (p < end && *p != '\n')
@@ -244,14 +246,32 @@ static Py_ssize_t find_sig_by_contact(const char *text, size_t text_len) {
244
246
  if (p < end)
245
247
  p++;
246
248
  if (is_contact_line(ls, clean)) {
247
- const char *start = block_start_from(ls, context);
248
- if (has_body_before(start, text))
249
- return (Py_ssize_t)(start - text);
249
+ const char *start = block_start_from(ls, context_base);
250
+ if (has_body_before(start, text_base))
251
+ return (Py_ssize_t)(start - text_base);
250
252
  }
251
253
  }
252
254
  return -1;
253
255
  }
254
256
 
257
+ static Py_ssize_t find_sig_by_contact(const char *text, size_t text_len) {
258
+ /*
259
+ * text: plain-text segment to search
260
+ * text_len: byte length of text
261
+ *
262
+ * description:
263
+ * scans the last SIG_CONTEXT_LINES lines for a contact-info anchor
264
+ * line (phone, email, URL, or labeled field). when found, walks backward
265
+ * to return the position of the first line of the block, not just the
266
+ * anchor. this handles signatures that begin directly with a name/title
267
+ * with no polite closing phrase.
268
+ *
269
+ * return: byte offset of signature block start, or -1 if not found
270
+ */
271
+ const char *context = find_context_start(text, text_len);
272
+ return find_contacts_in_range(context, text + text_len, text, context);
273
+ }
274
+
255
275
  static int is_separator_line(const char *line, size_t len) {
256
276
  /*
257
277
  * line: one trimmed line of text
@@ -449,6 +469,14 @@ static Py_ssize_t find_sig_in_plain(const char *text, size_t text_len) {
449
469
  * signature) is near the top, followed by base64 attachment data that
450
470
  * would push it outside the pass-1 context window.
451
471
  *
472
+ * pass 3 — first SIG_FALLBACK_LINES lines, contact-anchor detection.
473
+ * same rationale as pass 2: raw MIME emails with attachments have the
474
+ * contact block near the top, not in the tail window.
475
+ *
476
+ * pass 4 — last SIG_CONTEXT_LINES lines, contact-anchor detection.
477
+ * handles long chain emails with no polite closing whose signature
478
+ * sits at the bottom of the segment (the normal case).
479
+ *
452
480
  * return: byte offset of signature start relative to text, or -1
453
481
  */
454
482
  const char *full_end;
@@ -462,7 +490,7 @@ static Py_ssize_t find_sig_in_plain(const char *text, size_t text_len) {
462
490
  off = scan_lines(context, full_end, text, 0);
463
491
  if (off >= 0)
464
492
  return off;
465
- /* Pass 2: scan first SIG_FALLBACK_LINES lines, strong patterns only */
493
+ /* Compute first SIG_FALLBACK_LINES boundary */
466
494
  fallback_end = text;
467
495
  count = 0;
468
496
  while (fallback_end < full_end && count < SIG_FALLBACK_LINES) {
@@ -471,12 +499,17 @@ static Py_ssize_t find_sig_in_plain(const char *text, size_t text_len) {
471
499
  fallback_end++;
472
500
  }
473
501
  if (fallback_end <= context) {
502
+ /* Pass 2: first SIG_FALLBACK_LINES lines, strong patterns only */
474
503
  off = scan_lines(text, fallback_end, text, 1);
475
504
  if (off >= 0)
476
505
  return off;
506
+ /* Pass 3: contact-anchor in first SIG_FALLBACK_LINES lines.
507
+ context_base = text so the walkback can reach the very first line. */
508
+ off = find_contacts_in_range(text, fallback_end, text, text);
509
+ if (off >= 0)
510
+ return off;
477
511
  }
478
- /* Pass 3: contact-anchor detection for signatures without a polite closing
479
- (phone number, email address, URL, or labeled field like "Tel:", "E:") */
512
+ /* Pass 4: contact-anchor in tail context (long segments, no closing phrase) */
480
513
  return find_sig_by_contact(text, text_len);
481
514
  }
482
515
 
File without changes
File without changes