email_reply_trimmer 0.1.6 → 0.1.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/Gemfile +4 -0
- data/Rakefile +1 -1
- data/email_reply_trimmer.gemspec +5 -1
- data/lib/email_reply_trimmer/delimiter_matcher.rb +1 -1
- data/lib/email_reply_trimmer/email_header_matcher.rb +25 -13
- data/lib/email_reply_trimmer/embedded_email_matcher.rb +52 -25
- data/lib/email_reply_trimmer/signature_matcher.rb +24 -10
- data/lib/email_reply_trimmer.rb +79 -35
- data/test/before/forwarded_apple.txt +1 -0
- data/test/before/forwarded_gmail.txt +1 -0
- data/test/elided/email_headers_5.txt +23 -0
- data/test/elided/embedded_ception.txt +3 -3
- data/test/elided/embedded_email_12.txt +2 -2
- data/test/elided/embedded_email_13.txt +9 -0
- data/test/elided/embedded_email_14.txt +11 -0
- data/test/elided/embedded_email_15.txt +4 -0
- data/test/elided/embedded_email_16.txt +4 -0
- data/test/elided/embedded_email_17.txt +2 -0
- data/test/elided/embedded_email_18.txt +1 -0
- data/test/elided/embedded_email_19.txt +0 -0
- data/test/elided/embedded_email_chinese.txt +4 -0
- data/test/elided/embedded_email_german_4.txt +15 -0
- data/test/elided/embedded_email_german_5.txt +20 -0
- data/test/elided/embedded_email_german_6.txt +8 -0
- data/test/elided/embedded_email_norwegian.txt +9 -0
- data/test/elided/{embedded_email_polish.txt → embedded_email_polish_1.txt} +0 -0
- data/test/elided/embedded_email_polish_2.txt +7 -0
- data/test/elided/embedded_email_quote_text.txt +5 -0
- data/test/elided/{embedded_email_russian.txt → embedded_email_russian_1.txt} +0 -0
- data/test/elided/embedded_email_russian_2.txt +23 -0
- data/test/elided/embedded_email_swedish.txt +8 -0
- data/test/elided/embedded_email_ukrainian.txt +17 -0
- data/test/elided/forwarded_apple.txt +15 -0
- data/test/elided/forwarded_gmail.txt +15 -0
- data/test/elided/signatures.txt +5 -0
- data/test/elided/spam_1.txt +75 -0
- data/test/elided/spam_2.txt +152 -0
- data/test/emails/email_headers_5.txt +37 -0
- data/test/emails/embedded_email_1.txt +1 -1
- data/test/emails/embedded_email_13.txt +14 -0
- data/test/emails/embedded_email_14.txt +16 -0
- data/test/emails/embedded_email_15.txt +9 -0
- data/test/emails/embedded_email_16.txt +16 -0
- data/test/emails/embedded_email_17.txt +38 -0
- data/test/emails/embedded_email_18.txt +7 -0
- data/test/emails/embedded_email_19.txt +13 -0
- data/test/emails/embedded_email_4.txt +13 -13
- data/test/emails/embedded_email_7.txt +4 -4
- data/test/emails/embedded_email_chinese.txt +7 -0
- data/test/emails/embedded_email_german_4.txt +18 -0
- data/test/emails/embedded_email_german_5.txt +23 -0
- data/test/emails/embedded_email_german_6.txt +14 -0
- data/test/emails/embedded_email_norwegian.txt +11 -0
- data/test/emails/{embedded_email_polish.txt → embedded_email_polish_1.txt} +0 -0
- data/test/emails/embedded_email_polish_2.txt +11 -0
- data/test/emails/{embedded_email_russian.txt → embedded_email_russian_1.txt} +0 -0
- data/test/emails/embedded_email_russian_2.txt +26 -0
- data/test/emails/embedded_email_swedish.txt +20 -0
- data/test/emails/embedded_email_ukrainian.txt +19 -0
- data/test/emails/forwarded_apple.txt +17 -0
- data/test/emails/forwarded_gmail.txt +17 -0
- data/test/emails/signatures.txt +5 -0
- data/test/emails/spam_1.txt +75 -0
- data/test/emails/spam_2.txt +174 -0
- data/test/embedded/forwarded_apple.txt +13 -0
- data/test/embedded/forwarded_gmail.txt +14 -0
- data/test/matchers/does_not_contain_embedded_email.txt +5 -0
- data/test/test_email_matcher.rb +15 -0
- data/test/test_email_reply_trimmer.rb +7 -3
- data/test/trimmed/email_headers_5.txt +11 -0
- data/test/trimmed/embedded_email_13.txt +3 -0
- data/test/trimmed/embedded_email_14.txt +3 -0
- data/test/trimmed/embedded_email_15.txt +3 -0
- data/test/trimmed/embedded_email_16.txt +11 -0
- data/test/trimmed/embedded_email_17.txt +35 -0
- data/test/trimmed/embedded_email_18.txt +5 -0
- data/test/trimmed/embedded_email_19.txt +13 -0
- data/test/trimmed/embedded_email_chinese.txt +2 -0
- data/test/trimmed/embedded_email_german_4.txt +1 -0
- data/test/trimmed/embedded_email_german_5.txt +1 -0
- data/test/trimmed/embedded_email_german_6.txt +4 -0
- data/test/trimmed/embedded_email_norwegian.txt +1 -0
- data/test/trimmed/{embedded_email_polish.txt → embedded_email_polish_1.txt} +0 -0
- data/test/trimmed/embedded_email_polish_2.txt +2 -0
- data/test/trimmed/embedded_email_quote_text.txt +0 -5
- data/test/trimmed/{embedded_email_russian.txt → embedded_email_russian_1.txt} +0 -0
- data/test/trimmed/embedded_email_russian_2.txt +1 -0
- data/test/trimmed/embedded_email_swedish.txt +9 -0
- data/test/trimmed/embedded_email_ukrainian.txt +1 -0
- data/test/trimmed/forwarded_apple.txt +1 -0
- data/test/trimmed/forwarded_gmail.txt +1 -0
- data/test/trimmed/spam_1.txt +0 -0
- data/test/trimmed/spam_2.txt +21 -0
- metadata +122 -11
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: c094ba472eabd06ed02276c4ec9e42cb32ccb873871e46e55c46cd4640425470
|
4
|
+
data.tar.gz: 968cee2c9b3de46ef86814dd84ee8449c400e9c5fa8d61363cbb90064204fadf
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6c71cad161a2aaa27e813ff63f91b1e10359d73ffc95d0d01df0374dc8cc9a2ba506985cd4e33d12e63b3f41cd9e6f66bbe42293feea3f6e968c7d90e79a88de
|
7
|
+
data.tar.gz: e7e3661094d8b1183f572b7e847cac62a71215a341d12b627e0b2d5cfedfdad161f858b8aba3d97463d5ffda5919211da2b268dbd75e7edc48a95811d1e5b481
|
data/Gemfile
ADDED
data/Rakefile
CHANGED
data/email_reply_trimmer.gemspec
CHANGED
@@ -15,6 +15,10 @@ Gem::Specification.new do |s|
|
|
15
15
|
s.license = "MIT"
|
16
16
|
|
17
17
|
s.require_paths = ["lib"]
|
18
|
-
s.files = Dir["**/*"].reject { |path| File.directory?(path) }
|
18
|
+
s.files = Dir["**/*"].reject { |path| File.directory?(path) || path =~ /.*\.gem$/ }
|
19
19
|
s.test_files = s.files.select { |path| path =~ /^test\/.+_test\.rb$/ }
|
20
|
+
|
21
|
+
s.add_development_dependency 'rake', '~> 12'
|
22
|
+
s.add_development_dependency 'minitest', '~> 5'
|
23
|
+
s.add_development_dependency 'rubocop', '~> 0.52.1'
|
20
24
|
end
|
@@ -1,12 +1,12 @@
|
|
1
1
|
class EmailHeaderMatcher
|
2
2
|
|
3
|
-
EMAIL_HEADERS_WITH_DATE_MARKERS
|
4
|
-
#
|
3
|
+
EMAIL_HEADERS_WITH_DATE_MARKERS ||= [
|
4
|
+
# Norwegian
|
5
5
|
["Sendt"],
|
6
6
|
# English
|
7
|
-
["Sent"],
|
7
|
+
["Sent", "Date"],
|
8
8
|
# French
|
9
|
-
["Date"],
|
9
|
+
["Date", "Le"],
|
10
10
|
# German
|
11
11
|
["Gesendet"],
|
12
12
|
# Portuguese
|
@@ -17,19 +17,25 @@ class EmailHeaderMatcher
|
|
17
17
|
["Fecha"],
|
18
18
|
# Italian
|
19
19
|
["Data"],
|
20
|
+
# Dutch
|
21
|
+
["Datum"],
|
22
|
+
# Swedish
|
23
|
+
["Skickat"],
|
24
|
+
# Chinese
|
25
|
+
["发送时间"],
|
20
26
|
]
|
21
27
|
|
22
|
-
EMAIL_HEADERS_WITH_DATE_REGEXES
|
23
|
-
/^[[:blank:]
|
28
|
+
EMAIL_HEADERS_WITH_DATE_REGEXES ||= EMAIL_HEADERS_WITH_DATE_MARKERS.map do |header|
|
29
|
+
/^[[:blank:]*]*(?:#{header.join("|")})[[:blank:]*]*:.*\d+/
|
24
30
|
end
|
25
31
|
|
26
|
-
EMAIL_HEADERS_WITH_TEXT_MARKERS
|
27
|
-
#
|
32
|
+
EMAIL_HEADERS_WITH_TEXT_MARKERS ||= [
|
33
|
+
# Norwegian
|
28
34
|
["Fra", "Til", "Emne"],
|
29
35
|
# English
|
30
36
|
["From", "To", "Cc", "Reply-To", "Subject"],
|
31
37
|
# French
|
32
|
-
["De", "À", "Répondre à", "Objet"],
|
38
|
+
["De", "Expéditeur", "À", "Destinataire", "Répondre à", "Objet"],
|
33
39
|
# German
|
34
40
|
["Von", "An", "Betreff"],
|
35
41
|
# Portuguese
|
@@ -37,14 +43,20 @@ class EmailHeaderMatcher
|
|
37
43
|
# Spanish
|
38
44
|
["De", "Para", "Asunto"],
|
39
45
|
# Italian
|
40
|
-
["Da", "Risposta", "A", "Oggetto"]
|
46
|
+
["Da", "Risposta", "A", "Oggetto"],
|
47
|
+
# Dutch
|
48
|
+
["Van", "Beantwoorden - Aan", "Aan", "Onderwerp"],
|
49
|
+
# Swedish
|
50
|
+
["Från", "Till", "Ämne"],
|
51
|
+
# Chinese
|
52
|
+
["发件人", "收件人", "主题"],
|
41
53
|
]
|
42
54
|
|
43
|
-
EMAIL_HEADERS_WITH_TEXT_REGEXES
|
44
|
-
/^[[:blank:]
|
55
|
+
EMAIL_HEADERS_WITH_TEXT_REGEXES ||= EMAIL_HEADERS_WITH_TEXT_MARKERS.map do |header|
|
56
|
+
/^[[:blank:]*]*(?:#{header.join("|")})[[:blank:]*]*:.*[[:word:]]+/i
|
45
57
|
end
|
46
58
|
|
47
|
-
EMAIL_HEADER_REGEXES
|
59
|
+
EMAIL_HEADER_REGEXES ||= [
|
48
60
|
EMAIL_HEADERS_WITH_DATE_REGEXES,
|
49
61
|
EMAIL_HEADERS_WITH_TEXT_REGEXES,
|
50
62
|
].flatten
|
@@ -10,38 +10,43 @@ class EmbeddedEmailMatcher
|
|
10
10
|
# Dnia 14 lip 2015 o godz. 00:25 Michael Downey <info@discourse.org> napisał(a):
|
11
11
|
# Em seg, 27 de jul de 2015 17:13, Neil Lalonde <info@discourse.org> escreveu:
|
12
12
|
# El jueves, 21 de noviembre de 2013, codinghorror escribió:
|
13
|
-
#
|
14
|
-
|
13
|
+
# At 6/16/2016 08:32 PM, you wrote:
|
14
|
+
ON_DATE_SOMEONE_WROTE_REGEXES ||= [
|
15
|
+
# Chinese
|
16
|
+
/^[[:blank:]<>-]*在 (?:(?!\b(?>在|写道)\b).)+?写道[[:blank:].:>-]*$/im,
|
15
17
|
# Dutch
|
16
|
-
[
|
18
|
+
/^[[:blank:]<>-]*Op (?:(?!\b(?>Op|het\svolgende\sgeschreven|schreef)\b).)+?(het\svolgende\sgeschreven|schreef[^:]+)[[:blank:].:>-]*$/im,
|
17
19
|
# English
|
18
|
-
[
|
20
|
+
/^[[:blank:]<>-]*In message (?:(?!\b(?>In message|writes)\b).)+?writes[[:blank:].:>-]*$/im,
|
21
|
+
/^[[:blank:]<>-]*(On|At) (?:(?!\b(?>On|wrote|writes|says|said)\b).)+?(wrote|writes|says|said)[[:blank:].:>-]*$/im,
|
19
22
|
# French
|
20
|
-
[
|
23
|
+
/^[[:blank:]<>-]*Le (?:(?!\b(?>Le|nous\sa\sdit|a\s+écrit)\b).)+?(nous\sa\sdit|a\s+écrit)[[:blank:].:>-]*$/im,
|
24
|
+
# German
|
25
|
+
/^[[:blank:]<>-]*Am (?:(?!\b(?>Am|schrieben\sSie)\b).)+?schrieben\sSie[[:blank:].:>-]*$/im,
|
26
|
+
/^[[:blank:]<>-]*Am (?:(?!\b(?>Am|geschrieben)\b).)+?(geschrieben|schrieb[^:]+)[[:blank:].:>-]*$/im,
|
21
27
|
# Italian
|
22
|
-
[
|
28
|
+
/^[[:blank:]<>-]*Il (?:(?!\b(?>Il|ha\sscritto)\b).)+?ha\sscritto[[:blank:].:>-]*$/im,
|
23
29
|
# Polish
|
24
|
-
[
|
30
|
+
/^[[:blank:]<>-]*(Dnia|Dňa) (?:(?!\b(?>Dnia|Dňa|napisał)\b).)+?napisał(\(a\))?[[:blank:].:>-]*$/im,
|
25
31
|
# Portuguese
|
26
|
-
[
|
32
|
+
/^[[:blank:]<>-]*Em (?:(?!\b(?>Em|escreveu)\b).)+?escreveu[[:blank:].:>-]*$/im,
|
27
33
|
# Spanish
|
28
|
-
[
|
29
|
-
# German
|
30
|
-
["Am", "schrieb"],
|
34
|
+
/^[[:blank:]<>-]*El (?:(?!\b(?>El|escribió)\b).)+?escribió[[:blank:].:>-]*$/im,
|
31
35
|
]
|
32
36
|
|
33
|
-
ON_DATE_SOMEONE_WROTE_REGEXES = ON_DATE_SOMEONE_WROTE_MARKERS.map do |on, wrote|
|
34
|
-
wrote.gsub!(/ +/, "[[:space:]]+") # the "wrote" part might span over multiple lines
|
35
|
-
/^([[:blank:]>\-]*#{on}\s(?:(?!#{on}\s|#{wrote}:?)[\s\S])*#{wrote}:?[[:blank:]\-]*)$/m
|
36
|
-
end
|
37
|
-
|
38
37
|
# Op 10 dec. 2015 18:35 schreef "Arpit Jalan" <info@discourse.org>:
|
39
38
|
# Am 18.09.2013 um 16:24 schrieb codinghorror <info@discourse.org>:
|
39
|
+
# Den 15. jun. 2016 kl. 20.42 skrev Jeff Atwood <info@discourse.org>:
|
40
|
+
# søn. 30. apr. 2017 kl. 00.26 skrev David Taylor <meta@discoursemail.com>:
|
40
41
|
ON_DATE_WROTE_SOMEONE_MARKERS = [
|
41
42
|
# Dutch
|
42
43
|
["Op", "schreef"],
|
43
44
|
# German
|
44
45
|
["Am", "schrieb"],
|
46
|
+
# Norwegian
|
47
|
+
["Den", "skrev"],
|
48
|
+
# Norwegian (date starting with an abbreviated weekday, e.g. "søn.")
|
49
|
+
["søn\.", "skrev"],
|
45
50
|
]
|
46
51
|
|
47
52
|
ON_DATE_WROTE_SOMEONE_REGEXES = ON_DATE_WROTE_SOMEONE_MARKERS.map do |on, wrote|
|
@@ -49,15 +54,31 @@ class EmbeddedEmailMatcher
|
|
49
54
|
end
|
50
55
|
|
51
56
|
# суббота, 14 марта 2015 г. пользователь etewiah написал:
|
57
|
+
# 23 mar 2017 21:25 "Neil Lalonde" <meta@discoursemail.com> napisał(a):
|
58
|
+
# 30 серп. 2016 р. 20:45 "Arpit" no-reply@example.com пише:
|
52
59
|
DATE_SOMEONE_WROTE_MARKERS = [
|
53
60
|
# Russian
|
54
61
|
["пользователь", "написал"],
|
62
|
+
# Polish
|
63
|
+
["", "napisał\\(a\\)"],
|
64
|
+
# Ukrainian
|
65
|
+
["", "пише"],
|
55
66
|
]
|
56
67
|
|
57
68
|
DATE_SOMEONE_WROTE_REGEXES = DATE_SOMEONE_WROTE_MARKERS.map do |user, wrote|
|
58
|
-
|
69
|
+
user.size == 0 ?
|
70
|
+
/\d{4}.{1,80}\n?.{0,80}?#{wrote}:/ :
|
71
|
+
/\d{4}.{1,80}#{user}.{0,80}\n?.{0,80}?#{wrote}:/
|
59
72
|
end
|
60
73
|
|
74
|
+
# Max Mustermann <try_discourse@discoursemail.com> schrieb am Fr., 28. Apr. 2017 um 11:53 Uhr:
|
75
|
+
SOMEONE_WROTE_ON_DATE_REGEXES ||= [
|
76
|
+
# English
|
77
|
+
/^.+\bwrote\b[[:space:]]+\bon\b.+[^:]+:/,
|
78
|
+
# German
|
79
|
+
/^.+\bschrieb\b[[:space:]]+\bam\b.+[^:]+:/,
|
80
|
+
]
|
81
|
+
|
61
82
|
# 2016-03-03 17:21 GMT+01:00 Some One
|
62
83
|
ISO_DATE_SOMEONE_REGEX = /^[[:blank:]>]*20\d\d-\d\d-\d\d \d\d:\d\d GMT\+\d\d:\d\d [\w[:blank:]]+$/
|
63
84
|
|
@@ -65,7 +86,7 @@ class EmbeddedEmailMatcher
|
|
65
86
|
# 2013/10/2 camilohollanda <info@discourse.org>
|
66
87
|
# вт, 5 янв. 2016 г. в 23:39, Erlend Sogge Heggen <info@discourse.org>:
|
67
88
|
# ср, 1 апр. 2015, 18:29, Denis Didkovsky <info@discourse.org>:
|
68
|
-
DATE_SOMEONE_EMAIL_REGEX =
|
89
|
+
DATE_SOMEONE_EMAIL_REGEX = /\d{4}.{1,80}\s?<[^@<>]+@[^@<>.]+\.[^@<>]+>:?$/
|
69
90
|
|
70
91
|
# codinghorror via Discourse Meta wrote:
|
71
92
|
# codinghorror via Discourse Meta <info@discourse.org> schrieb:
|
@@ -77,11 +98,12 @@ class EmbeddedEmailMatcher
|
|
77
98
|
]
|
78
99
|
|
79
100
|
SOMEONE_VIA_SOMETHING_WROTE_REGEXES = SOMEONE_VIA_SOMETHING_WROTE_MARKERS.map do |wrote|
|
80
|
-
|
101
|
+
/^.+ via .+ #{wrote}:?[[:blank:]]*$/
|
81
102
|
end
|
82
103
|
|
83
104
|
# Some One <info@discourse.org> wrote:
|
84
|
-
|
105
|
+
# Gavin Sinclair (gsinclair@soyabean.com.au) wrote:
|
106
|
+
SOMEONE_EMAIL_WROTE_REGEX = /^.+\b[\w.+-]+@[\w.-]+\.\w{2,}\b.+wrote:?$/
|
85
107
|
|
86
108
|
# Posted by mpalmer on 01/21/2016
|
87
109
|
POSTED_BY_SOMEONE_ON_DATE_REGEX = /^[[:blank:]>]*Posted by .+ on \d{2}\/\d{2}\/\d{4}$/i
|
@@ -92,17 +114,21 @@ class EmbeddedEmailMatcher
|
|
92
114
|
# ----- Original Message -----
|
93
115
|
# -----Original Message-----
|
94
116
|
# *----- Original Message -----*
|
117
|
+
# ----- Reply message -----
|
118
|
+
# ------------------ 原始邮件 ------------------
|
95
119
|
FORWARDED_EMAIL_REGEXES = [
|
96
120
|
# English
|
97
121
|
/^[[:blank:]>]*Begin forwarded message:/i,
|
98
|
-
/^[[:blank:]
|
99
|
-
/^[[:blank:]>\*]*-{2,}[[:blank:]]*(Forwarded|Original) Message[[:blank:]]*-{2,}/i,
|
122
|
+
/^[[:blank:]>*]*-{2,}[[:blank:]]*(Forwarded|Original|Reply) Message[[:blank:]]*-{2,}/i,
|
100
123
|
# French
|
101
|
-
/^[[:blank:]
|
124
|
+
/^[[:blank:]>]*Début du message transféré :/i,
|
125
|
+
/^[[:blank:]>*]*-{2,}[[:blank:]]*Message transféré[[:blank:]]*-{2,}/i,
|
102
126
|
# German
|
103
|
-
/^[[:blank:]
|
127
|
+
/^[[:blank:]>*]*-{2,}[[:blank:]]*Ursprüngliche Nachricht[[:blank:]]*-{2,}/i,
|
104
128
|
# Spanish
|
105
|
-
/^[[:blank:]
|
129
|
+
/^[[:blank:]>*]*-{2,}[[:blank:]]*Mensaje original[[:blank:]]*-{2,}/i,
|
130
|
+
# Chinese
|
131
|
+
/^[[:blank:]>*]*-{2,}[[:blank:]]*原始邮件[[:blank:]]*-{2,}/i,
|
106
132
|
]
|
107
133
|
|
108
134
|
EMBEDDED_REGEXES = [
|
@@ -110,6 +136,7 @@ class EmbeddedEmailMatcher
|
|
110
136
|
ON_DATE_WROTE_SOMEONE_REGEXES,
|
111
137
|
DATE_SOMEONE_WROTE_REGEXES,
|
112
138
|
DATE_SOMEONE_EMAIL_REGEX,
|
139
|
+
SOMEONE_WROTE_ON_DATE_REGEXES,
|
113
140
|
ISO_DATE_SOMEONE_REGEX,
|
114
141
|
SOMEONE_VIA_SOMETHING_WROTE_REGEXES,
|
115
142
|
SOMEONE_EMAIL_WROTE_REGEX,
|
@@ -13,24 +13,38 @@ class SignatureMatcher
|
|
13
13
|
# (sent from a phone)
|
14
14
|
# (Sent from mobile device)
|
15
15
|
# 從我的 iPhone 傳送
|
16
|
-
SIGNATURE_REGEXES
|
16
|
+
SIGNATURE_REGEXES ||= [
|
17
17
|
# Chinese
|
18
|
-
/^[[:blank:]
|
18
|
+
/^[[:blank:]]*從我的 iPhone 傳送/i,
|
19
19
|
# English
|
20
|
-
/^[[:blank:]
|
21
|
-
/^[[:blank:]
|
22
|
-
/^[[:blank:]
|
20
|
+
/^[[:blank:]]*[[:word:]]+ from mobile/i,
|
21
|
+
/^[[:blank:]]*[\(<]*Sent (from|via|with|by) .+[\)>]*/i,
|
22
|
+
/^[[:blank:]]*From my .{1,20}/i,
|
23
|
+
/^[[:blank:]]*Get Outlook for /i,
|
23
24
|
# French
|
24
|
-
/^[[:blank:]
|
25
|
+
/^[[:blank:]]*Envoyé depuis (mon|Yahoo Mail)/i,
|
25
26
|
# German
|
26
|
-
/^[[:blank:]
|
27
|
-
/^[[:blank:]
|
27
|
+
/^[[:blank:]]*Von meinem .+ gesendet/i,
|
28
|
+
/^[[:blank:]]*Diese Nachricht wurde von .+ gesendet/i,
|
29
|
+
# Italian
|
30
|
+
/^[[:blank:]]*Inviato da /i,
|
31
|
+
# Norwegian
|
32
|
+
/^[[:blank:]]*Sendt fra min /i,
|
33
|
+
# Portuguese
|
34
|
+
/^[[:blank:]]*Enviado do meu /i,
|
28
35
|
# Spanish
|
29
|
-
/^[[:blank:]
|
36
|
+
/^[[:blank:]]*Enviado desde mi /i,
|
37
|
+
# Dutch
|
38
|
+
/^[[:blank:]]*Verzonden met /i,
|
39
|
+
/^[[:blank:]]*Verstuurd vanaf mijn /i,
|
40
|
+
# Swedish
|
41
|
+
/^[[:blank:]]*från min /i,
|
30
42
|
]
|
31
43
|
|
32
44
|
def self.match?(line)
|
33
|
-
|
45
|
+
# remove any markdown links
|
46
|
+
stripped = line.gsub(/\[([^\]]+)\]\([^\)]+\)/) { $1 }
|
47
|
+
SIGNATURE_REGEXES.any? { |r| stripped =~ r }
|
34
48
|
end
|
35
49
|
|
36
50
|
end
|
data/lib/email_reply_trimmer.rb
CHANGED
@@ -6,7 +6,7 @@ require_relative "email_reply_trimmer/email_header_matcher"
|
|
6
6
|
require_relative "email_reply_trimmer/quote_matcher"
|
7
7
|
|
8
8
|
class EmailReplyTrimmer
|
9
|
-
VERSION = "0.1.
|
9
|
+
VERSION = "0.1.13"
|
10
10
|
|
11
11
|
DELIMITER = "d"
|
12
12
|
EMBEDDED = "b"
|
@@ -26,16 +26,11 @@ class EmailReplyTrimmer
|
|
26
26
|
return TEXT
|
27
27
|
end
|
28
28
|
|
29
|
-
def self.trim(text, split=false)
|
30
|
-
return if text.nil? || text =~ /\A[[:space:]]*\
|
29
|
+
def self.trim(text, split = false)
|
30
|
+
return if text.nil? || text =~ /\A[[:space:]]*\z/m
|
31
31
|
|
32
|
-
#
|
33
|
-
|
34
|
-
|
35
|
-
# fix embedded email markers that might span over multiple lines
|
36
|
-
EmbeddedEmailMatcher::ON_DATE_SOMEONE_WROTE_REGEXES.each do |r|
|
37
|
-
text.gsub!(r) { |m| m.gsub(/\n[[:space:]>\-]*/, " ") }
|
38
|
-
end
|
32
|
+
# do some cleanup
|
33
|
+
preprocess!(text)
|
39
34
|
|
40
35
|
# from now on, we'll work on a line-by-line basis
|
41
36
|
lines = text.split("\n")
|
@@ -59,8 +54,8 @@ class EmailReplyTrimmer
|
|
59
54
|
end
|
60
55
|
|
61
56
|
# when the reply is at the end of the email
|
62
|
-
if pattern
|
63
|
-
index = pattern =~ /t
|
57
|
+
if is_reply_at_end?(pattern)
|
58
|
+
index = pattern =~ /t[et]*$/
|
64
59
|
pattern = ""
|
65
60
|
lines = lines[index..-1]
|
66
61
|
end
|
@@ -75,12 +70,20 @@ class EmailReplyTrimmer
|
|
75
70
|
|
76
71
|
# if there is an embedded email marker, followed by a huge quote
|
77
72
|
# then take everything up to that marker
|
78
|
-
if pattern =~ /te*b[eqbh]*[te]
|
73
|
+
if pattern =~ /te*b[eqbh]*([te]*)$/ && $1.count("t") < 7
|
79
74
|
index = pattern =~ /te*b[eqbh]*[te]*$/
|
80
75
|
pattern = pattern[0..index]
|
81
76
|
lines = lines[0..index]
|
82
77
|
end
|
83
78
|
|
79
|
+
# if there is some text before a huge quote ending the email,
|
80
|
+
# then remove the quote
|
81
|
+
if pattern =~ /t?e*[qbe]+$/
|
82
|
+
index = pattern =~ /t?e*[qbe]+$/
|
83
|
+
pattern = pattern[0..index]
|
84
|
+
lines = lines[0..index]
|
85
|
+
end
|
86
|
+
|
84
87
|
# if there still are some embedded email markers, just remove them
|
85
88
|
while pattern =~ /b/
|
86
89
|
index = pattern =~ /b/
|
@@ -95,8 +98,8 @@ class EmailReplyTrimmer
|
|
95
98
|
size.times.each { |s| pattern[index + s] = EMAIL_HEADER }
|
96
99
|
end
|
97
100
|
|
98
|
-
# if there are at least 3 consecutive email headers,
|
99
|
-
# these headers
|
101
|
+
# if there are at least 3 consecutive email headers,
|
102
|
+
# take everything up to these headers
|
100
103
|
if pattern =~ /t[eq]*h{3,}/
|
101
104
|
index = pattern =~ /t[eq]*h{3,}/
|
102
105
|
pattern = pattern[0..index]
|
@@ -128,15 +131,10 @@ class EmailReplyTrimmer
|
|
128
131
|
end
|
129
132
|
|
130
133
|
def self.extract_embedded_email(text)
|
131
|
-
return if text.nil? || text =~ /\A[[:space:]]*\
|
134
|
+
return if text.nil? || text =~ /\A[[:space:]]*\z/m
|
132
135
|
|
133
|
-
#
|
134
|
-
|
135
|
-
|
136
|
-
# fix embedded email markers that might span over multiple lines
|
137
|
-
EmbeddedEmailMatcher::ON_DATE_SOMEONE_WROTE_REGEXES.each do |r|
|
138
|
-
text.gsub!(r) { |m| m.gsub(/\n[[:space:]>\-]*/, " ") }
|
139
|
-
end
|
136
|
+
# do some cleanup
|
137
|
+
preprocess!(text)
|
140
138
|
|
141
139
|
# from now on, we'll work on a line-by-line basis
|
142
140
|
lines = text.split("\n")
|
@@ -146,6 +144,12 @@ class EmailReplyTrimmer
|
|
146
144
|
|
147
145
|
if index = pattern =~ /(?:h[eqd]*?){3,}[tq]/
|
148
146
|
embedded = lines[index..-1].join("\n").strip
|
147
|
+
elsif index = pattern =~ /b(?:[eqd]*){3,}[tq]/
|
148
|
+
# Exception for email clients (macOS / iOS) which embed fwd emails in quotes.
|
149
|
+
embedded = lines[index + 1..-1].map { |l| l.gsub(/^>\s*/, '') }.join("\n").strip
|
150
|
+
end
|
151
|
+
|
152
|
+
if index
|
149
153
|
before = lines[0...(pattern[0...index] =~ /e*(b[eqd]*|b*[ed]*)$/)].join("\n").strip
|
150
154
|
return [embedded, before]
|
151
155
|
end
|
@@ -153,22 +157,62 @@ class EmailReplyTrimmer
|
|
153
157
|
|
154
158
|
private
|
155
159
|
|
156
|
-
|
157
|
-
|
160
|
+
def self.preprocess!(text)
|
161
|
+
# normalize line endings
|
162
|
+
text.gsub!("\r\n", "\n")
|
163
|
+
|
164
|
+
# remove PGP markers
|
165
|
+
text.gsub!(/\A-----BEGIN PGP SIGNED MESSAGE-----\n(?:Hash: \w+)?\s+/i, "")
|
166
|
+
text.gsub!(/^-----BEGIN PGP SIGNATURE-----$[\s\S]+^-----END PGP SIGNATURE-----/, "")
|
158
167
|
|
159
|
-
|
160
|
-
|
168
|
+
# remove unsubscribe links
|
169
|
+
text.gsub!(/^Unsubscribe: .+@.+(\n.+http:.+)?\s*\z/i, "")
|
161
170
|
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
171
|
+
# remove alias-style quotes marker
|
172
|
+
text.gsub!(/^.*>{5} "[^"\n]+" == .+ writes:/, "")
|
173
|
+
|
174
|
+
# change enclosed-style quotes format
|
175
|
+
text.gsub!(/^>>> ?(.+) ?>>>$\n([\s\S]+?)\n^<<< ?\1 ?<<<$/) { $2.gsub(/^/, "> ") }
|
176
|
+
text.gsub!(/^>{4,}[[:blank:]]*$\n([\s\S]+?)\n^<{4,}[[:blank:]]*$/) { $1.gsub(/^/, "> ") }
|
177
|
+
|
178
|
+
# fix all quotes formats
|
179
|
+
text.gsub!(/^((?:[[:blank:]]*[[:alpha:]]*[>|])+)/) { $1.gsub(/([[:alpha:]]+>|\|)/, ">") }
|
180
|
+
|
181
|
+
# fix embedded email markers that might span over multiple lines
|
182
|
+
(
|
183
|
+
EmbeddedEmailMatcher::ON_DATE_SOMEONE_WROTE_REGEXES +
|
184
|
+
EmbeddedEmailMatcher::SOMEONE_WROTE_ON_DATE_REGEXES +
|
185
|
+
EmbeddedEmailMatcher::DATE_SOMEONE_WROTE_REGEXES +
|
186
|
+
[EmbeddedEmailMatcher::DATE_SOMEONE_EMAIL_REGEX]
|
187
|
+
).each do |r|
|
188
|
+
text.gsub!(r) do |m|
|
189
|
+
m.count("\n") > 4 ? m : m.gsub(/\n+[[:space:]]*/, " ")
|
169
190
|
end
|
191
|
+
end
|
170
192
|
|
171
|
-
|
193
|
+
# remove leading/trailing whitespaces
|
194
|
+
text.strip!
|
195
|
+
end
|
196
|
+
|
197
|
+
def self.compute_elided(text, lines)
|
198
|
+
elided = []
|
199
|
+
|
200
|
+
t = 0
|
201
|
+
l = 0
|
202
|
+
|
203
|
+
while t < text.size
|
204
|
+
while l < lines.size && text[t] == lines[l]
|
205
|
+
t += 1
|
206
|
+
l += 1
|
207
|
+
end
|
208
|
+
elided << text[t]
|
209
|
+
t += 1
|
172
210
|
end
|
173
211
|
|
212
|
+
elided.join("\n").strip
|
213
|
+
end
|
214
|
+
|
215
|
+
def self.is_reply_at_end?(pattern)
|
216
|
+
pattern =~ /^b[^t]+t[et]*$/
|
217
|
+
end
|
174
218
|
end
|
@@ -0,0 +1 @@
|
|
1
|
+
This email has been forwarded from Apple Mail.
|
@@ -0,0 +1 @@
|
|
1
|
+
This email has been forwarded from Gmail.
|
@@ -0,0 +1,23 @@
|
|
1
|
+
From: Erlend Sogge Heggen <meta@discoursemail.com>
|
2
|
+
Reply-To: Erlend Sogge Heggen <meta+abcd@discoursemail.com>
|
3
|
+
Date: Wednesday, 5 April 2017 at 17:01
|
4
|
+
To: Jef <jef@bar.com>
|
5
|
+
Subject: [Discourse Meta] [PM] Discourse for Communities of Practice, educational organisation
|
6
|
+
|
7
|
+
|
8
|
+
erlend_sh<https://meta.discourse.org/u/erlend_sh> Erlend Sogge Heggen<https://meta.discourse.org/u/erlend_sh> Team
|
9
|
+
April 5
|
10
|
+
|
11
|
+
|
12
|
+
|
13
|
+
Hi Jef,
|
14
|
+
|
15
|
+
Is your University a legally recognised educational institution? Otherwise I'm afraid you're not eligible for this discount.
|
16
|
+
|
17
|
+
Sincerely,
|
18
|
+
|
19
|
+
Erlend
|
20
|
+
|
21
|
+
|
22
|
+
|
23
|
+
This email message and any attachments may contain confidential information and may be privileged. If you are not the intended recipient or otherwise not authorized to receive this message, you are prohibited to use, copy, disclose or take any action based on this email or any information contained herein. If you are not the intended recipient, please advise the sender immediately by replying to this email and permanently delete this message and any attachments from your system.
|
@@ -2,7 +2,7 @@ On Mon, Feb 1, 2016 at 6:32 PM, Jeff Atwood <info@discourse.org> wrote:
|
|
2
2
|
|
3
3
|
> This is Jeff's reply.
|
4
4
|
>
|
5
|
-
> On Mon, Feb 1, 2016 at 7:50 AM, Some One <foo@bar.com wrote:
|
5
|
+
> On Mon, Feb 1, 2016 at 7:50 AM, Some One <foo@bar.com > > wrote:
|
6
6
|
>
|
7
7
|
>> Great!
|
8
8
|
>>
|
@@ -14,7 +14,7 @@ On Mon, Feb 1, 2016 at 6:32 PM, Jeff Atwood <info@discourse.org> wrote:
|
|
14
14
|
>>
|
15
15
|
>>> WAT?
|
16
16
|
>>>
|
17
|
-
>>> On Wed, Jan 27, 2016 at 10:48 PM, Some One < foo@bar.com> wrote:
|
17
|
+
>>> On Wed, Jan 27, 2016 at 10:48 PM, Some One < >>> foo@bar.com> wrote:
|
18
18
|
>>>
|
19
19
|
>>>> Hi Team,
|
20
20
|
>>>>
|
@@ -22,7 +22,7 @@ On Mon, Feb 1, 2016 at 6:32 PM, Jeff Atwood <info@discourse.org> wrote:
|
|
22
22
|
>>>>
|
23
23
|
>>>> Some One
|
24
24
|
>>>>
|
25
|
-
>>>> On Wed, Jan 27, 2016 at 10:10 AM Discourse Team <team@discourse.org> wrote:
|
25
|
+
>>>> On Wed, Jan 27, 2016 at 10:10 AM Discourse Team <team@discourse.org> >>>> wrote:
|
26
26
|
>>>>
|
27
27
|
>>>>> Hello :waves_hand:
|
28
28
|
>>>>>
|
@@ -1,8 +1,8 @@
|
|
1
|
-
On Thu, 31 Mar 2016 at 11:16 Some One < mailto:Some One <some.one@foo.bar> wrote:
|
1
|
+
On Thu, 31 Mar 2016 at 11:16 Some One < mailto:Some One <some.one@foo.bar> > wrote:
|
2
2
|
|
3
3
|
|
4
4
|
Two 2
|
5
5
|
|
6
|
-
On Thu, 31 Mar 2016 at 10:05 Jeff Atwood < mailto:Jeff Atwood <team@discourse.org> wrote:
|
6
|
+
On Thu, 31 Mar 2016 at 10:05 Jeff Atwood < mailto:Jeff Atwood <team@discourse.org> > wrote:
|
7
7
|
|
8
8
|
Three 3
|
@@ -0,0 +1,9 @@
|
|
1
|
+
At 6/16/2016 08:32 PM, you wrote:
|
2
|
+
><https://meta.discourse.org/users/codinghorror>codinghorror
|
3
|
+
><https://meta.discourse.org/users/codinghorror>Jeff Atwood co-founder
|
4
|
+
>June 17
|
5
|
+
>
|
6
|
+
>Sorry I got a little mixed up with all the incoming replies. Are you
|
7
|
+
>able to log in?
|
8
|
+
>
|
9
|
+
>Use your email address and "forgot password" if you need it reset.
|
@@ -0,0 +1,11 @@
|
|
1
|
+
2016-10-24 15:36 GMT+02:00 Foo bar < info@foo.bar>:
|
2
|
+
|
3
|
+
> Thank you so much Erlend, very thanks!
|
4
|
+
>
|
5
|
+
> 2016-10-24 15:03 GMT+02:00 Erlend Sogge Heggen <meta@discoursemail.com>:
|
6
|
+
>
|
7
|
+
>> erlend_sh <https://meta.discourse.org/users/erlend_sh> Erlend Sogge
|
8
|
+
>> Heggen <https://meta.discourse.org/users/erlend_sh> team
|
9
|
+
>> October 24
|
10
|
+
>>
|
11
|
+
>> I received your application and I've replied with setup instructions.
|
@@ -0,0 +1 @@
|
|
1
|
+
On 8 May 2017 17:34, "Andy Jones" <Andy.Jones@jameshall.co.uk> wrote:
|
File without changes
|
@@ -0,0 +1,15 @@
|
|
1
|
+
Max Mustermann <try_discourse@discoursemail.com> schrieb am Fr., 28. Apr. 2017 um 11:53 Uhr:
|
2
|
+
|
3
|
+
> max_2 <http://try.discourse.org/u/cfstras_2>
|
4
|
+
> April 28
|
5
|
+
|
6
|
+
> Hi there! you should be getting a mail.
|
7
|
+
> ------------------------------
|
8
|
+
|
9
|
+
> Visit Topic <http://try.discourse.org/t/this-is-my-internal-test/716/2>
|
10
|
+
> or reply to this email to respond.
|
11
|
+
|
12
|
+
> To unsubscribe from these emails, click here
|
13
|
+
> <http://try.discourse.org/email/unsubscribe/badf00d>
|
14
|
+
> .
|
15
|
+
|