fastemailparser 0.1.2__tar.gz → 0.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. {fastemailparser-0.1.2/fastemailparser.egg-info → fastemailparser-0.2.1}/PKG-INFO +19 -1
  2. {fastemailparser-0.1.2 → fastemailparser-0.2.1}/README.md +18 -0
  3. {fastemailparser-0.1.2 → fastemailparser-0.2.1/fastemailparser.egg-info}/PKG-INFO +19 -1
  4. {fastemailparser-0.1.2 → fastemailparser-0.2.1}/setup.py +1 -1
  5. {fastemailparser-0.1.2 → fastemailparser-0.2.1}/src/headers.c +13 -9
  6. fastemailparser-0.2.1/src/signature.c +619 -0
  7. fastemailparser-0.1.2/src/signature.c +0 -238
  8. {fastemailparser-0.1.2 → fastemailparser-0.2.1}/LICENSE +0 -0
  9. {fastemailparser-0.1.2 → fastemailparser-0.2.1}/MANIFEST.in +0 -0
  10. {fastemailparser-0.1.2 → fastemailparser-0.2.1}/email.h +0 -0
  11. {fastemailparser-0.1.2 → fastemailparser-0.2.1}/emailparser.c +0 -0
  12. {fastemailparser-0.1.2 → fastemailparser-0.2.1}/fastemailparser.egg-info/SOURCES.txt +0 -0
  13. {fastemailparser-0.1.2 → fastemailparser-0.2.1}/fastemailparser.egg-info/dependency_links.txt +0 -0
  14. {fastemailparser-0.1.2 → fastemailparser-0.2.1}/fastemailparser.egg-info/top_level.txt +0 -0
  15. {fastemailparser-0.1.2 → fastemailparser-0.2.1}/setup.cfg +0 -0
  16. {fastemailparser-0.1.2 → fastemailparser-0.2.1}/src/body.c +0 -0
  17. {fastemailparser-0.1.2 → fastemailparser-0.2.1}/src/body.h +0 -0
  18. {fastemailparser-0.1.2 → fastemailparser-0.2.1}/src/buf.h +0 -0
  19. {fastemailparser-0.1.2 → fastemailparser-0.2.1}/src/email_iter.c +0 -0
  20. {fastemailparser-0.1.2 → fastemailparser-0.2.1}/src/email_iter.h +0 -0
  21. {fastemailparser-0.1.2 → fastemailparser-0.2.1}/src/headers.h +0 -0
  22. {fastemailparser-0.1.2 → fastemailparser-0.2.1}/src/html.c +0 -0
  23. {fastemailparser-0.1.2 → fastemailparser-0.2.1}/src/html.h +0 -0
  24. {fastemailparser-0.1.2 → fastemailparser-0.2.1}/src/mime.c +0 -0
  25. {fastemailparser-0.1.2 → fastemailparser-0.2.1}/src/mime.h +0 -0
  26. {fastemailparser-0.1.2 → fastemailparser-0.2.1}/src/signature.h +0 -0
  27. {fastemailparser-0.1.2 → fastemailparser-0.2.1}/src/standalone.c +0 -0
  28. {fastemailparser-0.1.2 → fastemailparser-0.2.1}/src/standalone.h +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: fastemailparser
3
- Version: 0.1.2
3
+ Version: 0.2.1
4
4
  Summary: Very fast email parsing tool, split emails, retrieve headers & signatures
5
5
  Home-page: https://github.com/Methode-dev/EmailParser
6
6
  Author: Julien Calenge @ Méthode
@@ -55,6 +55,16 @@ Please find CONTRIBUTING.md, as it is a deep-dive documentation into the code.
55
55
  pip install emailparser
56
56
  ```
57
57
 
58
+ Quick Start:
59
+ ```python
60
+ from emailparser import Email
61
+
62
+ chain_mail = Email(open('mail.html', 'r').read())
63
+ print(next(chain_mail))
64
+ ```
65
+
66
+ See more detailed usage in [Section 8](#8-putting-it-all-together).
67
+
58
68
  ---
59
69
  ## 1. Building
60
70
 
@@ -281,6 +291,14 @@ Extracts the header fields from any segment string.
281
291
  | `"bcc"` | `list[str]` | `[]` |
282
292
  | `"subject"` | `str \| None` | `None` |
283
293
  | `"date"` | `str \| None` | `None` |
294
+ | `"message-id"` | `str \| None` | `None` |
295
+ | `"thread-index"` | `str \| None` | `None` |
296
+ | `"thread-topic"` | `str \| None` | `None` |
297
+
298
+ `message-id`, `thread-index`, and `thread-topic` are present in the outermost
299
+ MIME header block and are therefore most useful via `email.outer_headers`.
300
+ Inner quoted segments rarely carry these fields, so they will return `None`
301
+ for most `parse_headers()` calls on segments 1, 2, …
284
302
 
285
303
  Recognised field names (case-insensitive):
286
304
 
@@ -34,6 +34,16 @@ Please find CONTRIBUTING.md, as it is a deep-dive documentation into the code.
34
34
  pip install emailparser
35
35
  ```
36
36
 
37
+ Quick Start:
38
+ ```python
39
+ from emailparser import Email
40
+
41
+ chain_mail = Email(open('mail.html', 'r').read())
42
+ print(next(chain_mail))
43
+ ```
44
+
45
+ See more detailed usage in [Section 8](#8-putting-it-all-together).
46
+
37
47
  ---
38
48
  ## 1. Building
39
49
 
@@ -260,6 +270,14 @@ Extracts the header fields from any segment string.
260
270
  | `"bcc"` | `list[str]` | `[]` |
261
271
  | `"subject"` | `str \| None` | `None` |
262
272
  | `"date"` | `str \| None` | `None` |
273
+ | `"message-id"` | `str \| None` | `None` |
274
+ | `"thread-index"` | `str \| None` | `None` |
275
+ | `"thread-topic"` | `str \| None` | `None` |
276
+
277
+ `message-id`, `thread-index`, and `thread-topic` are present in the outermost
278
+ MIME header block and are therefore most useful via `email.outer_headers`.
279
+ Inner quoted segments rarely carry these fields, so they will return `None`
280
+ for most `parse_headers()` calls on segments 1, 2, …
263
281
 
264
282
  Recognised field names (case-insensitive):
265
283
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: fastemailparser
3
- Version: 0.1.2
3
+ Version: 0.2.1
4
4
  Summary: Very fast email parsing tool, split emails, retrieve headers & signatures
5
5
  Home-page: https://github.com/Methode-dev/EmailParser
6
6
  Author: Julien Calenge @ Méthode
@@ -55,6 +55,16 @@ Please find CONTRIBUTING.md, as it is a deep-dive documentation into the code.
55
55
  pip install emailparser
56
56
  ```
57
57
 
58
+ Quick Start:
59
+ ```python
60
+ from emailparser import Email
61
+
62
+ chain_mail = Email(open('mail.html', 'r').read())
63
+ print(next(chain_mail))
64
+ ```
65
+
66
+ See more detailed usage in [Section 8](#8-putting-it-all-together).
67
+
58
68
  ---
59
69
  ## 1. Building
60
70
 
@@ -281,6 +291,14 @@ Extracts the header fields from any segment string.
281
291
  | `"bcc"` | `list[str]` | `[]` |
282
292
  | `"subject"` | `str \| None` | `None` |
283
293
  | `"date"` | `str \| None` | `None` |
294
+ | `"message-id"` | `str \| None` | `None` |
295
+ | `"thread-index"` | `str \| None` | `None` |
296
+ | `"thread-topic"` | `str \| None` | `None` |
297
+
298
+ `message-id`, `thread-index`, and `thread-topic` are present in the outermost
299
+ MIME header block and are therefore most useful via `email.outer_headers`.
300
+ Inner quoted segments rarely carry these fields, so they will return `None`
301
+ for most `parse_headers()` calls on segments 1, 2, …
284
302
 
285
303
  Recognised field names (case-insensitive):
286
304
 
@@ -28,7 +28,7 @@ module = Extension(
28
28
 
29
29
  setup(
30
30
  name="fastemailparser",
31
- version="0.1.2",
31
+ version="0.2.1",
32
32
  author="Julien Calenge @ Méthode",
33
33
  author_email="julien.calenge@methode.dev",
34
34
  description="Very fast email parsing tool, split emails, retrieve headers & signatures",
@@ -25,11 +25,13 @@ const char *canonical_key(const char *name, size_t len) {
25
25
  const char *raw;
26
26
  const char *key;
27
27
  } MAP[] = {
28
- {"from", "from"}, {"reply-to", "from"}, {"to", "to"},
29
- {"cc", "cc"}, {"bcc", "bcc"}, {"subject", "subject"},
30
- {"date", "date"}, {"sent", "date"}, {"de", "from"},
31
- {"a", "to"}, {"\xc3\x80", "to"}, {"\xc3\xa0", "to"},
32
- {"objet", "subject"}, {"cci", "bcc"}, {"envoy\xc3\xa9", "date"},
28
+ {"from", "from"}, {"reply-to", "from"}, {"to", "to"},
29
+ {"cc", "cc"}, {"bcc", "bcc"}, {"subject", "subject"},
30
+ {"date", "date"}, {"sent", "date"}, {"de", "from"},
31
+ {"a", "to"}, {"\xc3\x80", "to"}, {"\xc3\xa0", "to"},
32
+ {"objet", "subject"}, {"cci", "bcc"}, {"envoy\xc3\xa9", "date"},
33
+ {"message-id", "message-id"}, {"thread-index", "thread-index"},
34
+ {"thread-topic", "thread-topic"},
33
35
  {NULL, NULL}};
34
36
  char lower[64];
35
37
  size_t n;
@@ -176,12 +178,14 @@ PyObject *py_parse_headers(PyObject *module, PyObject *args) {
176
178
  * all content is quoted-printable decoded before parsing.
177
179
  * lines are scanned until the first blank line (header/body separator).
178
180
  *
179
- * return: Python dict with keys "from", "to", "cc", "bcc",
180
- * "subject", "date" (string fields default to None,
181
- * list fields default to [])
181
+ * return: Python dict with keys "from", "to", "cc", "bcc", "subject",
182
+ * "date", "message-id", "thread-index", "thread-topic"
183
+ * (string fields default to None, list fields default to [])
182
184
  */
183
185
  static const char *LIST_KEYS[] = {"to", "cc", "bcc", NULL};
184
- static const char *STR_KEYS[] = {"from", "subject", "date", NULL};
186
+ static const char *STR_KEYS[] = {
187
+ "from", "subject", "date", "message-id", "thread-index",
188
+ "thread-topic", NULL};
185
189
  const char *text;
186
190
  Py_ssize_t text_len;
187
191
  int is_html;
@@ -0,0 +1,619 @@
1
+ #define PY_SSIZE_T_CLEAN
2
+ #include <Python.h>
3
+ #include "signature.h"
4
+ #include "html.h"
5
+ #include <string.h>
6
+ #include <stdlib.h>
7
+ #include <ctype.h>
8
+ #include <libxml/HTMLparser.h>
9
+ #include <libxml/tree.h>
10
+
11
/* Multi-word or highly specific closing phrases. A line matches when it
   starts with one of these and the phrase is followed by end-of-line or any
   non-alphabetic character. The table is NULL-terminated and read-only. */
static const char *const STRONG_PATTERNS[] = {
    "best regards", "kind regards", "warm regards",
    "with regards", "many thanks", "best wishes",
    "yours sincerely", "yours faithfully", "yours truly",
    "cordialement", "bien cordialement", "salutations",
    "thanks & best regards", "thanks & kind regards",
    "all the best", "thank you very much",
    NULL};

/* Short / ambiguous words. A line matches only when the word is the whole
   line or is immediately followed by a comma or a period. NULL-terminated
   and read-only. */
static const char *const WEAK_PATTERNS[] = {
    "thanks", "cheers", "regards", "sincerely", "merci",
    "best", "yours", "thank you", "thanks again",
    NULL};

/* Lines from the end to search in pass 1 (covers chain email segments) */
#define SIG_CONTEXT_LINES 40
/* Lines from the start to search in pass 2: fallback for single MIME emails
   whose body text appears near the top, followed by base64 attachments */
#define SIG_FALLBACK_LINES 100
/* Max lines to walk backward from a contact anchor to find block start */
#define MAX_SIG_BLOCK_LINES 15

static const char *find_context_start(const char *text, size_t len); /* fwd */
static int has_body_before(const char *match_start, const char *text_base); /* fwd */

/* Single-letter and common multi-letter contact field labels, all in lower
   case so they can be compared against a lower-cased line prefix.
   NULL-terminated and read-only. */
static const char *const CONTACT_LABELS[] = {
    "tel:", "t:", "tél:", "phone:", "ph:", "mobile:", "mob:", "m:",
    "cell:", "fax:", "f:", "email:", "e-mail:", "mail:", "e:", "w:",
    "web:", "website:", "adr:", "addr:", "address:", "a:", "dir:",
    "linkedin:", NULL};
44
+
45
static int looks_like_phone(const char *s, size_t len) {
    /*
     * s: line content to classify (callers pass it already trimmed)
     * len: number of bytes in s
     *
     * description:
     *   classifies s as a phone number. every byte must be a digit, one of
     *   the separators ' ', '-', '(', ')', '.', '\t', a '/' (tolerated but
     *   not counted as a separator), or a '+' in the very first position.
     *   the digit count must be between 7 and 15, and the string must
     *   either start with '+' or contain at least one counted separator.
     *   a slash-only date such as "06/04/2026" is therefore rejected.
     *
     * return: 1 if phone-like, 0 otherwise
     */
    int digit_count = 0;
    int saw_separator = 0;
    const int leading_plus = (len != 0 && s[0] == '+');

    for (size_t pos = 0; pos < len; pos++) {
        const unsigned char ch = (unsigned char)s[pos];

        if (isdigit(ch)) {
            digit_count++;
            continue;
        }
        switch (ch) {
        case ' ':
        case '-':
        case '(':
        case ')':
        case '.':
        case '\t':
            saw_separator = 1;
            break;
        case '/':
            /* tolerated, but does not count as a separator */
            break;
        case '+':
            if (pos != 0)
                return 0;
            break;
        default:
            return 0;
        }
    }
    if (digit_count < 7 || digit_count > 15)
        return 0;
    return leading_plus || saw_separator;
}
84
+
85
static int looks_like_email(const char *s, size_t len) {
    /*
     * s: line content to classify (no leading/trailing whitespace expected)
     * len: number of bytes in s
     *
     * description:
     *   classifies s as a bare email address. the string must contain
     *   exactly one '@' that is not the first byte, no whitespace at all,
     *   more than three bytes from the '@' to the end, and at least one '.'
     *   somewhere after the '@'.
     *
     * return: 1 if email-like, 0 otherwise
     */
    const char *const stop = s + len;
    const char *at = NULL;
    size_t at_pos;

    for (const char *c = s; c < stop; c++) {
        if (*c == '@') {
            if (at != NULL)
                return 0; /* second '@' */
            at = c;
        } else if (isspace((unsigned char)*c)) {
            return 0;
        }
    }
    if (at == NULL || at == s)
        return 0;
    at_pos = (size_t)(at - s);
    if (at_pos + 3 >= len)
        return 0; /* too little text after the '@' */
    return memchr(at + 1, '.', len - at_pos - 1) != NULL;
}
115
+
116
+ static int is_contact_line(const char *line, size_t len) {
117
+ /*
118
+ * line: one line of text
119
+ * len: byte length
120
+ *
121
+ * description:
122
+ * returns 1 if the line looks like a signature contact-info element:
123
+ * - URL (http/https/www)
124
+ * - known contact label prefix (Tel:, E:, T:, Email:, Web:, …)
125
+ * - bare phone number
126
+ * - bare email address
127
+ * the contact label alone is sufficient — no need to validate the value,
128
+ * since "E:", "T:", "A:" at the start of a line are highly specific to
129
+ * signature blocks.
130
+ *
131
+ * return: 1 if the line is a contact-info line, 0 otherwise
132
+ */
133
+ const char *s = line;
134
+ size_t n = len;
135
+ char lower[32];
136
+ size_t check;
137
+ int i;
138
+
139
+ while (n > 0 && isspace((unsigned char)*s)) { s++; n--; }
140
+ while (n > 0 && isspace((unsigned char)s[n - 1])) n--;
141
+ if (n == 0)
142
+ return 0;
143
+ if (n >= 7 && strncasecmp(s, "http://", 7) == 0)
144
+ return 1;
145
+ if (n >= 8 && strncasecmp(s, "https://", 8) == 0)
146
+ return 1;
147
+ if (n >= 4 && strncasecmp(s, "www.", 4) == 0)
148
+ return 1;
149
+ check = n < 31 ? n : 31;
150
+ for (size_t j = 0; j < check; j++)
151
+ lower[j] = (char)tolower((unsigned char)s[j]);
152
+ lower[check] = '\0';
153
+ for (i = 0; CONTACT_LABELS[i]; i++) {
154
+ size_t ll = strlen(CONTACT_LABELS[i]);
155
+ if (strncmp(lower, CONTACT_LABELS[i], ll) == 0)
156
+ return 1;
157
+ }
158
+ if (looks_like_phone(s, n))
159
+ return 1;
160
+ if (looks_like_email(s, n))
161
+ return 1;
162
+ return 0;
163
+ }
164
+
165
+ static const char *block_start_from(const char *anchor_line,
166
+ const char *context_start) {
167
+ /*
168
+ * anchor_line: pointer to the start of the contact-anchor line
169
+ * context_start: lower bound — do not walk before this point
170
+ *
171
+ * description:
172
+ * walks backward from anchor_line through consecutive non-blank lines
173
+ * (up to MAX_SIG_BLOCK_LINES) to find the first line of the signature
174
+ * block. stops when a blank line is encountered, which is the standard
175
+ * body/signature separator.
176
+ *
177
+ * return: pointer to the start of the earliest line in the block
178
+ */
179
+ const char *block_start = anchor_line;
180
+ const char *p = anchor_line;
181
+ int lines_back = 0;
182
+ const char *prev;
183
+ const char *prev_line;
184
+ size_t prev_len;
185
+ int blank;
186
+
187
+ while (p > context_start && lines_back < MAX_SIG_BLOCK_LINES) {
188
+ prev = p - 1;
189
+ /* p-1 is the \n ending the previous line — skip past it */
190
+ if (prev > context_start && *prev == '\n')
191
+ prev--;
192
+ while (prev > context_start && *prev != '\n')
193
+ prev--;
194
+ prev_line = (*prev == '\n') ? prev + 1 : context_start;
195
+ prev_len = (size_t)((p - 1) - prev_line);
196
+ if (prev_len > 0 && prev_line[prev_len - 1] == '\r')
197
+ prev_len--;
198
+ blank = 1;
199
+ for (size_t i = 0; i < prev_len; i++) {
200
+ if (!isspace((unsigned char)prev_line[i])) {
201
+ blank = 0;
202
+ break;
203
+ }
204
+ }
205
+ if (blank)
206
+ break;
207
+ block_start = prev_line;
208
+ p = prev_line;
209
+ lines_back++;
210
+ }
211
+ return block_start;
212
+ }
213
+
214
+ static Py_ssize_t find_sig_by_contact(const char *text, size_t text_len) {
215
+ /*
216
+ * text: plain-text segment to search
217
+ * text_len: byte length of text
218
+ *
219
+ * description:
220
+ * scans the last SIG_CONTEXT_LINES lines for a contact-info anchor
221
+ * line (phone, email, URL, or labeled field). when found, walks backward
222
+ * to return the position of the first line of the block, not just the
223
+ * anchor. this handles signatures that begin directly with a name/title
224
+ * with no polite closing phrase.
225
+ *
226
+ * return: byte offset of signature block start, or -1 if not found
227
+ */
228
+ const char *context;
229
+ const char *p;
230
+ const char *end;
231
+ const char *ls;
232
+ size_t llen;
233
+ size_t clean;
234
+
235
+ context = find_context_start(text, text_len);
236
+ p = context;
237
+ end = text + text_len;
238
+ while (p < end) {
239
+ ls = p;
240
+ while (p < end && *p != '\n')
241
+ p++;
242
+ llen = (size_t)(p - ls);
243
+ clean = (llen > 0 && ls[llen - 1] == '\r') ? llen - 1 : llen;
244
+ if (p < end)
245
+ p++;
246
+ if (is_contact_line(ls, clean)) {
247
+ const char *start = block_start_from(ls, context);
248
+ if (has_body_before(start, text))
249
+ return (Py_ssize_t)(start - text);
250
+ }
251
+ }
252
+ return -1;
253
+ }
254
+
255
static int is_separator_line(const char *line, size_t len) {
    /*
     * line: one line of text
     * len: number of bytes in line
     *
     * description:
     *   recognises visual divider lines: at least two bytes long and made of
     *   a single repeated character taken from '-', '_', '*', '=', '~'.
     *   one-character lines are rejected so that list bullets do not match.
     *   no trimming is done here; any other byte (including whitespace)
     *   makes the line a non-separator.
     *
     * return: 1 if line is a visual separator, 0 otherwise
     */
    char mark;

    if (len < 2)
        return 0;
    mark = line[0];
    switch (mark) {
    case '-':
    case '_':
    case '*':
    case '=':
    case '~':
        break;
    default:
        return 0;
    }
    for (const char *c = line + 1; c != line + len; c++) {
        if (*c != mark)
            return 0;
    }
    return 1;
}
281
+
282
+ static int is_signature_line(const char *line, size_t len) {
283
+ /*
284
+ * line: one line of text (without trailing \n)
285
+ * len: byte length of line
286
+ *
287
+ * description:
288
+ * returns 1 if the line is a signature indicator. detection order:
289
+ * 1. visual separator line (---, ___, ***): is_separator_line()
290
+ * 2. RFC 3676 delimiter: exactly "--" or "-- "
291
+ * 3. strong closing phrase: any non-alpha character after the phrase
292
+ * 4. weak closing phrase: must be followed by comma, period, or EOL
293
+ * only — never by a space-then-word to avoid mid-body false positives
294
+ * such as "Thanks for your help"
295
+ *
296
+ * return: 1 if line is a signature line, 0 otherwise
297
+ */
298
+ char lower[64];
299
+ size_t copy;
300
+ size_t plen;
301
+ int i;
302
+
303
+ if (is_separator_line(line, len))
304
+ return 1;
305
+ if (len >= 2 && line[0] == '-' && line[1] == '-')
306
+ if (len == 2 || (len == 3 && line[2] == ' '))
307
+ return 1;
308
+ copy = len < 63 ? len : 63;
309
+ for (size_t j = 0; j < copy; j++)
310
+ lower[j] = (char)tolower((unsigned char)line[j]);
311
+ lower[copy] = '\0';
312
+ for (i = 0; STRONG_PATTERNS[i]; i++) {
313
+ plen = strlen(STRONG_PATTERNS[i]);
314
+ if (strncmp(lower, STRONG_PATTERNS[i], plen) != 0)
315
+ continue;
316
+ if (copy == plen || !isalpha((unsigned char)lower[plen]))
317
+ return 1;
318
+ }
319
+ for (i = 0; WEAK_PATTERNS[i]; i++) {
320
+ plen = strlen(WEAK_PATTERNS[i]);
321
+ if (strncmp(lower, WEAK_PATTERNS[i], plen) != 0)
322
+ continue;
323
+ if (copy == plen || lower[plen] == ',' || lower[plen] == '.')
324
+ return 1;
325
+ }
326
+ return 0;
327
+ }
328
+
329
+ static const char *find_context_start(const char *text, size_t len) {
330
+ /*
331
+ * text: full plain-text content
332
+ * len: byte length of text
333
+ *
334
+ * description:
335
+ * walks backward from the end of text counting newlines. returns a
336
+ * pointer to the start of the last SIG_CONTEXT_LINES lines so that
337
+ * signature detection is restricted to the tail of the email, avoiding
338
+ * false positives from closing phrases appearing in the body.
339
+ *
340
+ * return: pointer into text at the start of the context window
341
+ */
342
+ const char *p;
343
+ int count;
344
+
345
+ p = text + len;
346
+ count = 0;
347
+ while (p > text) {
348
+ p--;
349
+ if (*p == '\n') {
350
+ count++;
351
+ if (count >= SIG_CONTEXT_LINES)
352
+ return p + 1;
353
+ }
354
+ }
355
+ return text;
356
+ }
357
+
358
+ /* Like is_signature_line but only checks separator lines and strong patterns.
359
+ Used in the beginning-fallback pass to keep false-positive risk low. */
360
+ static int is_strong_signature_line(const char *line, size_t len) {
361
+ char lower[64];
362
+ size_t copy;
363
+ size_t plen;
364
+ int i;
365
+
366
+ if (is_separator_line(line, len))
367
+ return 1;
368
+ if (len >= 2 && line[0] == '-' && line[1] == '-')
369
+ if (len == 2 || (len == 3 && line[2] == ' '))
370
+ return 1;
371
+ copy = len < 63 ? len : 63;
372
+ for (size_t j = 0; j < copy; j++)
373
+ lower[j] = (char)tolower((unsigned char)line[j]);
374
+ lower[copy] = '\0';
375
+ for (i = 0; STRONG_PATTERNS[i]; i++) {
376
+ plen = strlen(STRONG_PATTERNS[i]);
377
+ if (strncmp(lower, STRONG_PATTERNS[i], plen) != 0)
378
+ continue;
379
+ if (copy == plen || !isalpha((unsigned char)lower[plen]))
380
+ return 1;
381
+ }
382
+ return 0;
383
+ }
384
+
385
/* Returns 1 if the range [text_base, match_start) contains at least one
   non-whitespace byte, i.e. there is some non-blank line ahead of the
   candidate signature, so the signature is not the very first content.
   Line terminators ('\n', '\r') are whitespace and never count as body. */
static int has_body_before(const char *match_start, const char *text_base) {
    for (const char *c = text_base; c < match_start; c++) {
        if (!isspace((unsigned char)*c))
            return 1;
    }
    return 0;
}
409
+
410
+ static Py_ssize_t scan_lines(const char *search_start, const char *end,
411
+ const char *text_base, int strong_only) {
412
+ const char *p = search_start;
413
+ const char *ls;
414
+ size_t llen;
415
+ size_t clean;
416
+
417
+ while (p < end) {
418
+ ls = p;
419
+ while (p < end && *p != '\n')
420
+ p++;
421
+ llen = (size_t)(p - ls);
422
+ clean = (llen > 0 && ls[llen - 1] == '\r') ? llen - 1 : llen;
423
+ if ((strong_only ? is_strong_signature_line(ls, clean)
424
+ : is_signature_line(ls, clean)) &&
425
+ has_body_before(ls, text_base))
426
+ return (Py_ssize_t)(ls - text_base);
427
+ if (p < end)
428
+ p++;
429
+ }
430
+ return -1;
431
+ }
432
+
433
+ static Py_ssize_t find_sig_in_plain(const char *text, size_t text_len) {
434
+ /*
435
+ * text: plain-text segment to search
436
+ * text_len: byte length of text
437
+ *
438
+ * description:
439
+ * two-pass search to handle both chain email segments and single MIME
440
+ * emails whose signature sits near the top but has base64 attachments
441
+ * below it:
442
+ *
443
+ * pass 1 — last SIG_CONTEXT_LINES lines, full pattern set.
444
+ * covers chain email segments where the signature is near the end
445
+ * and avoids false positives from mid-body closing phrases.
446
+ *
447
+ * pass 2 — first SIG_FALLBACK_LINES lines, strong patterns only.
448
+ * fallback for raw MIME emails where the actual email body (and its
449
+ * signature) is near the top, followed by base64 attachment data that
450
+ * would push it outside the pass-1 context window.
451
+ *
452
+ * return: byte offset of signature start relative to text, or -1
453
+ */
454
+ const char *full_end;
455
+ const char *context;
456
+ const char *fallback_end;
457
+ Py_ssize_t off;
458
+ int count;
459
+
460
+ full_end = text + text_len;
461
+ context = find_context_start(text, text_len);
462
+ off = scan_lines(context, full_end, text, 0);
463
+ if (off >= 0)
464
+ return off;
465
+ /* Pass 2: scan first SIG_FALLBACK_LINES lines, strong patterns only */
466
+ fallback_end = text;
467
+ count = 0;
468
+ while (fallback_end < full_end && count < SIG_FALLBACK_LINES) {
469
+ if (*fallback_end == '\n')
470
+ count++;
471
+ fallback_end++;
472
+ }
473
+ if (fallback_end <= context) {
474
+ off = scan_lines(text, fallback_end, text, 1);
475
+ if (off >= 0)
476
+ return off;
477
+ }
478
+ /* Pass 3: contact-anchor detection for signatures without a polite closing
479
+ (phone number, email address, URL, or labeled field like "Tel:", "E:") */
480
+ return find_sig_by_contact(text, text_len);
481
+ }
482
+
483
+ static Py_ssize_t find_sig_in_html(const char *html, size_t html_len) {
484
+ /*
485
+ * html: raw HTML segment to search
486
+ * html_len: byte length of html
487
+ *
488
+ * description:
489
+ * extracts plain text from the HTML via html_to_plain_c, then runs
490
+ * the context-aware find_sig_in_plain on it. the signature phrase is
491
+ * then located in the original HTML bytes using memmem so that the
492
+ * returned offset points into the actual HTML source.
493
+ *
494
+ * return: byte offset of signature start in html, or -1 if not found
495
+ */
496
+ char *plain;
497
+ size_t plain_len;
498
+ Py_ssize_t plain_off;
499
+ const char *sig_start;
500
+ const char *sig_end;
501
+ size_t sig_len;
502
+ char *sig_text;
503
+ const char *pos;
504
+ Py_ssize_t off;
505
+
506
+ plain = html_to_plain_c(html);
507
+ if (!plain)
508
+ return -1;
509
+ plain_len = strlen(plain);
510
+ plain_off = find_sig_in_plain(plain, plain_len);
511
+ if (plain_off < 0) {
512
+ free(plain);
513
+ return -1;
514
+ }
515
+ sig_start = plain + plain_off;
516
+ sig_end = sig_start;
517
+ while (*sig_end && *sig_end != '\n')
518
+ sig_end++;
519
+ sig_len = (size_t)(sig_end - sig_start);
520
+ while (sig_len > 0 && isspace((unsigned char)sig_start[0])) {
521
+ sig_start++;
522
+ sig_len--;
523
+ }
524
+ while (sig_len > 0 && isspace((unsigned char)sig_start[sig_len - 1]))
525
+ sig_len--;
526
+ if (sig_len == 0) {
527
+ free(plain);
528
+ return -1;
529
+ }
530
+ sig_text = malloc(sig_len + 1);
531
+ if (!sig_text) {
532
+ free(plain);
533
+ return -1;
534
+ }
535
+ memcpy(sig_text, sig_start, sig_len);
536
+ sig_text[sig_len] = '\0';
537
+ free(plain); /* safe: sig_text is an independent copy */
538
+ pos = memmem(html, html_len, sig_text, sig_len);
539
+ off = pos ? (Py_ssize_t)(pos - html) : -1;
540
+ free(sig_text);
541
+ return off;
542
+ }
543
+
544
+ PyObject *py_find_signature(PyObject *module, PyObject *args) {
545
+ /*
546
+ * module: unused Python module argument
547
+ * args: Python tuple containing one string segment
548
+ *
549
+ * description:
550
+ * tries the HTML path first (html_to_plain_c + context-aware scan)
551
+ * if a '<' is found in the first 512 bytes; falls back to plain-text
552
+ * context-aware line scan. converts the resulting byte offset to a
553
+ * character offset for correct Python string slicing with UTF-8.
554
+ *
555
+ * return: Python int — character index of signature start, or -1
556
+ */
557
+ const char *text;
558
+ Py_ssize_t text_len;
559
+ Py_ssize_t byte_off;
560
+ Py_ssize_t i;
561
+ PyObject *prefix;
562
+ Py_ssize_t char_off;
563
+
564
+ if (!PyArg_ParseTuple(args, "s#", &text, &text_len))
565
+ return NULL;
566
+ byte_off = -1;
567
+ for (i = 0; i < text_len && i < 512; i++) {
568
+ if (text[i] == '<') {
569
+ byte_off = find_sig_in_html(text, (size_t)text_len);
570
+ break;
571
+ }
572
+ }
573
+ if (byte_off < 0)
574
+ byte_off = find_sig_in_plain(text, (size_t)text_len);
575
+ if (byte_off < 0)
576
+ return PyLong_FromLong(-1L);
577
+ prefix = PyUnicode_DecodeUTF8(text, byte_off, "replace");
578
+ if (!prefix)
579
+ return NULL;
580
+ char_off = PyUnicode_GetLength(prefix);
581
+ Py_DECREF(prefix);
582
+ return PyLong_FromSsize_t(char_off);
583
+ }
584
+
585
+ PyObject *py_strip_signature(PyObject *module, PyObject *args) {
586
+ /*
587
+ * module: unused Python module argument
588
+ * args: Python tuple containing one string segment
589
+ *
590
+ * description:
591
+ * calls py_find_signature to locate the signature start index, then
592
+ * returns a slice of the input up to (but not including) that index.
593
+ * if no signature is found, the full input string is returned unchanged.
594
+ *
595
+ * return: Python string with signature removed, or original if none found
596
+ */
597
+ const char *text;
598
+ Py_ssize_t text_len;
599
+ PyObject *idx_obj;
600
+ Py_ssize_t idx;
601
+ PyObject *full;
602
+ PyObject *result;
603
+
604
+ if (!PyArg_ParseTuple(args, "s#", &text, &text_len))
605
+ return NULL;
606
+ idx_obj = py_find_signature(module, args);
607
+ if (!idx_obj)
608
+ return NULL;
609
+ idx = PyLong_AsSsize_t(idx_obj);
610
+ Py_DECREF(idx_obj);
611
+ full = PyUnicode_DecodeUTF8(text, text_len, "replace");
612
+ if (!full)
613
+ return NULL;
614
+ if (idx < 0)
615
+ return full;
616
+ result = PySequence_GetSlice(full, 0, idx);
617
+ Py_DECREF(full);
618
+ return result;
619
+ }
@@ -1,238 +0,0 @@
1
#define PY_SSIZE_T_CLEAN
#include <Python.h>
#include "signature.h"
#include <ctype.h>
#include <limits.h>
#include <stdlib.h>
#include <string.h>
#include <libxml/HTMLparser.h>
#include <libxml/tree.h>
9
-
10
/*
 * lower-case closing phrases that mark the first line of a signature.
 * the array ends with a NULL sentinel so it can be walked without a length.
 */
static const char *CLOSING_PATTERNS[] = {
    /* English */
    "best regards",
    "kind regards",
    "warm regards",
    "with regards",
    "many thanks",
    "best wishes",
    "yours sincerely",
    "yours faithfully",
    "yours truly",
    "sincerely",
    "cheers",
    "thanks",
    "regards",
    /* French */
    "cordialement",
    "bien cordialement",
    "merci",
    "salutations",
    NULL,
};
17
-
18
- static int is_signature_line(const char *line, size_t len) {
19
- /*
20
- * line: one line of text (without trailing \n)
21
- * len: byte length of line
22
- *
23
- * description:
24
- * returns 1 if line is an RFC 3676 "--" delimiter or starts with
25
- * one of the known English/French formal closing phrases.
26
- * the match is case-insensitive and requires a non-alpha character
27
- * (comma, space …) or end-of-string after the closing phrase so
28
- * that "regardsomething" does not match.
29
- *
30
- * return: 1 if line is a signature line, 0 otherwise
31
- */
32
- char lower[64];
33
- size_t copy;
34
- size_t plen;
35
- int i;
36
-
37
- if (len >= 2 && line[0] == '-' && line[1] == '-')
38
- if (len == 2 || (len == 3 && line[2] == ' '))
39
- return 1;
40
- copy = len < 63 ? len : 63;
41
- for (size_t j = 0; j < copy; j++)
42
- lower[j] = (char)tolower((unsigned char)line[j]);
43
- lower[copy] = '\0';
44
- for (i = 0; CLOSING_PATTERNS[i]; i++) {
45
- plen = strlen(CLOSING_PATTERNS[i]);
46
- if (strncmp(lower, CLOSING_PATTERNS[i], plen) != 0)
47
- continue;
48
- if (copy == plen || !isalpha((unsigned char)lower[plen]))
49
- return 1;
50
- }
51
- return 0;
52
- }
53
-
54
- static char *find_sig_text_node(xmlNodePtr node) {
55
- /*
56
- * node: root of the XML/HTML subtree to search
57
- *
58
- * description:
59
- * walks the DOM looking for the first text node whose trimmed
60
- * content matches is_signature_line. returns a malloc'd copy of
61
- * that content, or NULL if none is found.
62
- *
63
- * return: malloc'd NUL-terminated signature text, or NULL; caller frees
64
- */
65
- xmlNodePtr cur;
66
- const char *s;
67
- size_t len;
68
- char *copy;
69
-
70
- for (cur = node; cur; cur = cur->next) {
71
- if (cur->type == XML_TEXT_NODE && cur->content) {
72
- s = (const char *)cur->content;
73
- while (isspace((unsigned char)*s))
74
- s++;
75
- len = strlen(s);
76
- while (len > 0 && isspace((unsigned char)s[len - 1]))
77
- len--;
78
- if (len > 0 && is_signature_line(s, len)) {
79
- copy = malloc(len + 1);
80
- if (!copy)
81
- return NULL;
82
- memcpy(copy, s, len);
83
- copy[len] = '\0';
84
- return copy;
85
- }
86
- }
87
- if (cur->children) {
88
- copy = find_sig_text_node(cur->children);
89
- if (copy)
90
- return copy;
91
- }
92
- }
93
- return NULL;
94
- }
95
-
96
- static Py_ssize_t find_sig_in_html(const char *html, size_t html_len) {
97
- /*
98
- * html: raw HTML segment to search
99
- * html_len: byte length of html
100
- *
101
- * description:
102
- * parses the HTML with libxml2, walks the DOM to find the first
103
- * signature text node, then locates that literal string in the
104
- * original HTML bytes with memmem.
105
- *
106
- * return: byte offset of signature start in html, or -1 if not found
107
- */
108
- htmlDocPtr doc;
109
- xmlNodePtr root;
110
- char *sig_text;
111
- const char *pos;
112
- Py_ssize_t off;
113
-
114
- doc = htmlReadMemory(html, (int)html_len, NULL, "UTF-8",
115
- HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
116
- if (!doc)
117
- return -1;
118
- root = xmlDocGetRootElement(doc);
119
- sig_text = root ? find_sig_text_node(root) : NULL;
120
- xmlFreeDoc(doc);
121
- if (!sig_text)
122
- return -1;
123
- pos = memmem(html, html_len, sig_text, strlen(sig_text));
124
- off = pos ? (Py_ssize_t)(pos - html) : -1;
125
- free(sig_text);
126
- return off;
127
- }
128
-
129
- static Py_ssize_t find_sig_in_plain(const char *text, size_t text_len) {
130
- /*
131
- * text: plain-text segment to search
132
- * text_len: byte length of text
133
- *
134
- * description:
135
- * scans text line by line, stripping trailing \r, and calls
136
- * is_signature_line on each. returns the byte offset of the
137
- * first matching line.
138
- *
139
- * return: byte offset of signature start, or -1 if not found
140
- */
141
- const char *p;
142
- const char *end;
143
- const char *ls;
144
- size_t llen;
145
- size_t clean;
146
-
147
- p = text;
148
- end = text + text_len;
149
- while (p < end) {
150
- ls = p;
151
- while (p < end && *p != '\n')
152
- p++;
153
- llen = (size_t)(p - ls);
154
- clean = (llen > 0 && ls[llen - 1] == '\r') ? llen - 1 : llen;
155
- if (is_signature_line(ls, clean))
156
- return (Py_ssize_t)(ls - text);
157
- if (p < end)
158
- p++;
159
- }
160
- return -1;
161
- }
162
-
163
- PyObject *py_find_signature(PyObject *module, PyObject *args) {
164
- /*
165
- * module: unused Python module argument
166
- * args: Python tuple containing one string segment
167
- *
168
- * description:
169
- * tries the HTML path first (DOM parsing via libxml2) if a '<' is
170
- * found in the first 512 bytes; falls back to plain-text line scan.
171
- * converts the resulting byte offset to a character offset so that
172
- * Python slicing works correctly with multi-byte UTF-8 characters.
173
- *
174
- * return: Python int — character index of signature start, or -1
175
- */
176
- const char *text;
177
- Py_ssize_t text_len;
178
- Py_ssize_t byte_off;
179
- Py_ssize_t i;
180
- PyObject *prefix;
181
- Py_ssize_t char_off;
182
-
183
- if (!PyArg_ParseTuple(args, "s#", &text, &text_len))
184
- return NULL;
185
- byte_off = -1;
186
- for (i = 0; i < text_len && i < 512; i++) {
187
- if (text[i] == '<') {
188
- byte_off = find_sig_in_html(text, (size_t)text_len);
189
- break;
190
- }
191
- }
192
- if (byte_off < 0)
193
- byte_off = find_sig_in_plain(text, (size_t)text_len);
194
- if (byte_off < 0)
195
- return PyLong_FromLong(-1L);
196
- prefix = PyUnicode_DecodeUTF8(text, byte_off, "replace");
197
- if (!prefix)
198
- return NULL;
199
- char_off = PyUnicode_GetLength(prefix);
200
- Py_DECREF(prefix);
201
- return PyLong_FromSsize_t(char_off);
202
- }
203
-
204
- PyObject *py_strip_signature(PyObject *module, PyObject *args) {
205
- /*
206
- * module: unused Python module argument
207
- * args: Python tuple containing one string segment
208
- *
209
- * description:
210
- * calls py_find_signature to locate the signature start index, then
211
- * returns a slice of the input up to (but not including) that index.
212
- * if no signature is found, the full input string is returned unchanged.
213
- *
214
- * return: Python string with signature removed, or original if none found
215
- */
216
- const char *text;
217
- Py_ssize_t text_len;
218
- PyObject *idx_obj;
219
- Py_ssize_t idx;
220
- PyObject *full;
221
- PyObject *result;
222
-
223
- if (!PyArg_ParseTuple(args, "s#", &text, &text_len))
224
- return NULL;
225
- idx_obj = py_find_signature(module, args);
226
- if (!idx_obj)
227
- return NULL;
228
- idx = PyLong_AsSsize_t(idx_obj);
229
- Py_DECREF(idx_obj);
230
- full = PyUnicode_DecodeUTF8(text, text_len, "replace");
231
- if (!full)
232
- return NULL;
233
- if (idx < 0)
234
- return full;
235
- result = PySequence_GetSlice(full, 0, idx);
236
- Py_DECREF(full);
237
- return result;
238
- }
File without changes
File without changes