email_reply_trimmer 0.1.6 → 0.1.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. checksums.yaml +4 -4
  2. data/Gemfile +4 -0
  3. data/Gemfile.lock +37 -0
  4. data/Rakefile +1 -1
  5. data/email_reply_trimmer.gemspec +5 -1
  6. data/lib/email_reply_trimmer/delimiter_matcher.rb +1 -1
  7. data/lib/email_reply_trimmer/email_header_matcher.rb +25 -13
  8. data/lib/email_reply_trimmer/embedded_email_matcher.rb +52 -25
  9. data/lib/email_reply_trimmer/signature_matcher.rb +16 -9
  10. data/lib/email_reply_trimmer.rb +58 -23
  11. data/test/elided/email_headers_5.txt +23 -0
  12. data/test/elided/embedded_ception.txt +3 -3
  13. data/test/elided/embedded_email_12.txt +2 -2
  14. data/test/elided/embedded_email_13.txt +9 -0
  15. data/test/elided/embedded_email_14.txt +11 -0
  16. data/test/elided/embedded_email_15.txt +4 -0
  17. data/test/elided/embedded_email_16.txt +4 -0
  18. data/test/elided/embedded_email_17.txt +2 -0
  19. data/test/elided/embedded_email_18.txt +1 -0
  20. data/test/elided/embedded_email_19.txt +0 -0
  21. data/test/elided/embedded_email_chinese.txt +4 -0
  22. data/test/elided/embedded_email_german_4.txt +15 -0
  23. data/test/elided/embedded_email_german_5.txt +20 -0
  24. data/test/elided/embedded_email_german_6.txt +8 -0
  25. data/test/elided/embedded_email_norwegian.txt +9 -0
  26. data/test/elided/embedded_email_polish_2.txt +7 -0
  27. data/test/elided/embedded_email_quote_text.txt +5 -0
  28. data/test/elided/embedded_email_russian_2.txt +23 -0
  29. data/test/elided/embedded_email_swedish.txt +8 -0
  30. data/test/elided/embedded_email_ukrainian.txt +17 -0
  31. data/test/elided/signatures.txt +2 -0
  32. data/test/emails/email_headers_5.txt +37 -0
  33. data/test/emails/embedded_email_1.txt +1 -1
  34. data/test/emails/embedded_email_13.txt +14 -0
  35. data/test/emails/embedded_email_14.txt +16 -0
  36. data/test/emails/embedded_email_15.txt +9 -0
  37. data/test/emails/embedded_email_16.txt +16 -0
  38. data/test/emails/embedded_email_17.txt +38 -0
  39. data/test/emails/embedded_email_18.txt +7 -0
  40. data/test/emails/embedded_email_19.txt +13 -0
  41. data/test/emails/embedded_email_4.txt +13 -13
  42. data/test/emails/embedded_email_7.txt +4 -4
  43. data/test/emails/embedded_email_chinese.txt +7 -0
  44. data/test/emails/embedded_email_german_4.txt +18 -0
  45. data/test/emails/embedded_email_german_5.txt +23 -0
  46. data/test/emails/embedded_email_german_6.txt +14 -0
  47. data/test/emails/embedded_email_norwegian.txt +11 -0
  48. data/test/emails/embedded_email_polish_2.txt +11 -0
  49. data/test/emails/embedded_email_russian_2.txt +26 -0
  50. data/test/emails/embedded_email_swedish.txt +20 -0
  51. data/test/emails/embedded_email_ukrainian.txt +19 -0
  52. data/test/emails/signatures.txt +2 -0
  53. data/test/matchers/does_not_contain_embedded_email.txt +5 -0
  54. data/test/test_email_matcher.rb +15 -0
  55. data/test/test_email_reply_trimmer.rb +2 -2
  56. data/test/trimmed/email_headers_5.txt +11 -0
  57. data/test/trimmed/embedded_email_13.txt +3 -0
  58. data/test/trimmed/embedded_email_14.txt +3 -0
  59. data/test/trimmed/embedded_email_15.txt +3 -0
  60. data/test/trimmed/embedded_email_16.txt +11 -0
  61. data/test/trimmed/embedded_email_17.txt +35 -0
  62. data/test/trimmed/embedded_email_18.txt +5 -0
  63. data/test/trimmed/embedded_email_19.txt +13 -0
  64. data/test/trimmed/embedded_email_chinese.txt +2 -0
  65. data/test/trimmed/embedded_email_german_4.txt +1 -0
  66. data/test/trimmed/embedded_email_german_5.txt +1 -0
  67. data/test/trimmed/embedded_email_german_6.txt +4 -0
  68. data/test/trimmed/embedded_email_norwegian.txt +1 -0
  69. data/test/trimmed/embedded_email_polish_2.txt +2 -0
  70. data/test/trimmed/embedded_email_quote_text.txt +0 -5
  71. data/test/trimmed/embedded_email_russian_2.txt +1 -0
  72. data/test/trimmed/embedded_email_swedish.txt +9 -0
  73. data/test/trimmed/embedded_email_ukrainian.txt +1 -0
  74. metadata +107 -10
  75. /data/test/elided/{embedded_email_polish.txt → embedded_email_polish_1.txt} +0 -0
  76. /data/test/elided/{embedded_email_russian.txt → embedded_email_russian_1.txt} +0 -0
  77. /data/test/emails/{embedded_email_polish.txt → embedded_email_polish_1.txt} +0 -0
  78. /data/test/emails/{embedded_email_russian.txt → embedded_email_russian_1.txt} +0 -0
  79. /data/test/trimmed/{embedded_email_polish.txt → embedded_email_polish_1.txt} +0 -0
  80. /data/test/trimmed/{embedded_email_russian.txt → embedded_email_russian_1.txt} +0 -0
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: c6eabb8ce3f3327f3abe63f3ee2fe147fb161d96
4
- data.tar.gz: 7a07c2267ef47f4607a2ae2f05feac8c0a2584c1
3
+ metadata.gz: 7394892053e17114efa7dfb63444527df829dc6c
4
+ data.tar.gz: f598839360be70ef5c0ce255c4a00f1b1b73b2dd
5
5
  SHA512:
6
- metadata.gz: dba90e1fdc0b0a4f7032f9c2d1e9aabf575a390ee4cf39618037b9a820de8d7abad6338623b203211ef349dc3568b2e55440721dc89fa8ac144eda878cf0e463
7
- data.tar.gz: 19be9d9b0496d31e81f7f7adb296d273aeea84130a8243d669857de751748a4e923584a14d4c8f0a90f338428e7dd219fa4212fddce38575b44ec58a598c5455
6
+ metadata.gz: f27fda91801b09d266821ffa643421832efb3c729872b4e65ed1d7122858ff08bef7745be6216e8ab0b870a07b4ef0c7eeb87e3b1252cef60f52e2f1e0a31ac9
7
+ data.tar.gz: 1bd713d10c0294a6dd9fc1865e3d31a8c76dd4025807b1ca554c58bd52176f1d262eaf57ee029484c6c0dd0a700d071714c7c83d2a1b112820894846269b4eed
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in email_reply_trimmer.gemspec
4
+ gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,37 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ email_reply_trimmer (0.1.9)
5
+
6
+ GEM
7
+ remote: https://rubygems.org/
8
+ specs:
9
+ ast (2.4.0)
10
+ minitest (5.11.3)
11
+ parallel (1.12.1)
12
+ parser (2.4.0.2)
13
+ ast (~> 2.3)
14
+ powerpack (0.1.1)
15
+ rainbow (3.0.0)
16
+ rake (12.3.0)
17
+ rubocop (0.52.1)
18
+ parallel (~> 1.10)
19
+ parser (>= 2.4.0.2, < 3.0)
20
+ powerpack (~> 0.1)
21
+ rainbow (>= 2.2.2, < 4.0)
22
+ ruby-progressbar (~> 1.7)
23
+ unicode-display_width (~> 1.0, >= 1.0.1)
24
+ ruby-progressbar (1.9.0)
25
+ unicode-display_width (1.3.0)
26
+
27
+ PLATFORMS
28
+ ruby
29
+
30
+ DEPENDENCIES
31
+ email_reply_trimmer!
32
+ minitest (~> 5)
33
+ rake (~> 12)
34
+ rubocop (~> 0.52.1)
35
+
36
+ BUNDLED WITH
37
+ 1.16.1
data/Rakefile CHANGED
@@ -6,7 +6,7 @@ def version
6
6
  @version ||= File.read("lib/#{name}.rb")[/^\s*VERSION\s*=\s*['"](?'version'\d+\.\d+\.\d+)['"]/, "version"]
7
7
  end
8
8
 
9
- task :default => :test
9
+ task default: :test
10
10
 
11
11
  require "rake/testtask"
12
12
  Rake::TestTask.new(:test)
data/email_reply_trimmer.gemspec CHANGED
@@ -15,6 +15,10 @@ Gem::Specification.new do |s|
15
15
  s.license = "MIT"
16
16
 
17
17
  s.require_paths = ["lib"]
18
- s.files = Dir["**/*"].reject { |path| File.directory?(path) }
18
+ s.files = Dir["**/*"].reject { |path| File.directory?(path) || path =~ /.*\.gem$/ }
19
19
  s.test_files = s.files.select { |path| path =~ /^test\/.+_test\.rb$/ }
20
+
21
+ s.add_development_dependency 'rake', '~> 12'
22
+ s.add_development_dependency 'minitest', '~> 5'
23
+ s.add_development_dependency 'rubocop', '~> 0.52.1'
20
24
  end
data/lib/email_reply_trimmer/delimiter_matcher.rb CHANGED
@@ -1,6 +1,6 @@
1
1
  class DelimiterMatcher
2
2
 
3
- DELIMITER_CHARACTERS ||= "-_,=+~#*ᐧ"
3
+ DELIMITER_CHARACTERS ||= "-_,=+~#*ᐧ—"
4
4
  DELIMITER_REGEX ||= /^[[:blank:]]*[#{Regexp.escape(DELIMITER_CHARACTERS)}]+[[:blank:]]*$/
5
5
 
6
6
  def self.match?(line)
data/lib/email_reply_trimmer/email_header_matcher.rb CHANGED
@@ -1,12 +1,12 @@
1
1
  class EmailHeaderMatcher
2
2
 
3
- EMAIL_HEADERS_WITH_DATE_MARKERS = [
4
- # Dutch
3
+ EMAIL_HEADERS_WITH_DATE_MARKERS ||= [
4
+ # Norwegian
5
5
  ["Sendt"],
6
6
  # English
7
- ["Sent"],
7
+ ["Sent", "Date"],
8
8
  # French
9
- ["Date"],
9
+ ["Date", "Le"],
10
10
  # German
11
11
  ["Gesendet"],
12
12
  # Portuguese
@@ -17,19 +17,25 @@ class EmailHeaderMatcher
17
17
  ["Fecha"],
18
18
  # Italian
19
19
  ["Data"],
20
+ # Dutch
21
+ ["Datum"],
22
+ # Swedish
23
+ ["Skickat"],
24
+ # Chinese
25
+ ["发送时间"],
20
26
  ]
21
27
 
22
- EMAIL_HEADERS_WITH_DATE_REGEXES = EMAIL_HEADERS_WITH_DATE_MARKERS.map do |header|
23
- /^[[:blank:]>\*]*(?:#{header.join("|")})[[:blank:]\*]*:.*\d+/
28
+ EMAIL_HEADERS_WITH_DATE_REGEXES ||= EMAIL_HEADERS_WITH_DATE_MARKERS.map do |header|
29
+ /^[[:blank:]*]*(?:#{header.join("|")})[[:blank:]*]*:.*\d+/
24
30
  end
25
31
 
26
- EMAIL_HEADERS_WITH_TEXT_MARKERS = [
27
- # Dutch
32
+ EMAIL_HEADERS_WITH_TEXT_MARKERS ||= [
33
+ # Norwegian
28
34
  ["Fra", "Til", "Emne"],
29
35
  # English
30
36
  ["From", "To", "Cc", "Reply-To", "Subject"],
31
37
  # French
32
- ["De", "À", "Répondre à", "Objet"],
38
+ ["De", "Expéditeur", "À", "Destinataire", "Répondre à", "Objet"],
33
39
  # German
34
40
  ["Von", "An", "Betreff"],
35
41
  # Portuguese
@@ -37,14 +43,20 @@ class EmailHeaderMatcher
37
43
  # Spanish
38
44
  ["De", "Para", "Asunto"],
39
45
  # Italian
40
- ["Da", "Risposta", "A", "Oggetto"]
46
+ ["Da", "Risposta", "A", "Oggetto"],
47
+ # Dutch
48
+ ["Van", "Beantwoorden - Aan", "Aan", "Onderwerp"],
49
+ # Swedish
50
+ ["Från", "Till", "Ämne"],
51
+ # Chinese
52
+ ["发件人", "收件人", "主题"],
41
53
  ]
42
54
 
43
- EMAIL_HEADERS_WITH_TEXT_REGEXES = EMAIL_HEADERS_WITH_TEXT_MARKERS.map do |header|
44
- /^[[:blank:]>\*]*(?:#{header.join("|")})[[:blank:]\*]*:.*[[:word:]]+/
55
+ EMAIL_HEADERS_WITH_TEXT_REGEXES ||= EMAIL_HEADERS_WITH_TEXT_MARKERS.map do |header|
56
+ /^[[:blank:]*]*(?:#{header.join("|")})[[:blank:]*]*:.*[[:word:]]+/i
45
57
  end
46
58
 
47
- EMAIL_HEADER_REGEXES = [
59
+ EMAIL_HEADER_REGEXES ||= [
48
60
  EMAIL_HEADERS_WITH_DATE_REGEXES,
49
61
  EMAIL_HEADERS_WITH_TEXT_REGEXES,
50
62
  ].flatten
data/lib/email_reply_trimmer/embedded_email_matcher.rb CHANGED
@@ -10,38 +10,43 @@ class EmbeddedEmailMatcher
10
10
  # Dnia 14 lip 2015 o godz. 00:25 Michael Downey <info@discourse.org> napisał(a):
11
11
  # Em seg, 27 de jul de 2015 17:13, Neil Lalonde <info@discourse.org> escreveu:
12
12
  # El jueves, 21 de noviembre de 2013, codinghorror escribió:
13
- # Am 03.02.2016 3:35 nachm. schrieb Max Mustermann <mail@example.com>:
14
- ON_DATE_SOMEONE_WROTE_MARKERS = [
13
+ # At 6/16/2016 08:32 PM, you wrote:
14
+ ON_DATE_SOMEONE_WROTE_REGEXES ||= [
15
+ # Chinese
16
+ /^[[:blank:]<>-]*在 (?:(?!\b(?>在|写道)\b).)+?写道[[:blank:].:>-]*$/im,
15
17
  # Dutch
16
- ["Op","het volgende geschreven"],
18
+ /^[[:blank:]<>-]*Op (?:(?!\b(?>Op|het\svolgende\sgeschreven|schreef)\b).)+?(het\svolgende\sgeschreven|schreef[^:]+)[[:blank:].:>-]*$/im,
17
19
  # English
18
- ["On", "wrote"],
20
+ /^[[:blank:]<>-]*In message (?:(?!\b(?>In message|writes)\b).)+?writes[[:blank:].:>-]*$/im,
21
+ /^[[:blank:]<>-]*(On|At) (?:(?!\b(?>On|wrote|writes|says|said)\b).)+?(wrote|writes|says|said)[[:blank:].:>-]*$/im,
19
22
  # French
20
- ["Le", "a écrit "],
23
+ /^[[:blank:]<>-]*Le (?:(?!\b(?>Le|nous\sa\sdit|a\s+écrit)\b).)+?(nous\sa\sdit|a\s+écrit)[[:blank:].:>-]*$/im,
24
+ # German
25
+ /^[[:blank:]<>-]*Am (?:(?!\b(?>Am|schrieben\sSie)\b).)+?schrieben\sSie[[:blank:].:>-]*$/im,
26
+ /^[[:blank:]<>-]*Am (?:(?!\b(?>Am|geschrieben)\b).)+?(geschrieben|schrieb[^:]+)[[:blank:].:>-]*$/im,
21
27
  # Italian
22
- ["Il", "ha scritto"],
28
+ /^[[:blank:]<>-]*Il (?:(?!\b(?>Il|ha\sscritto)\b).)+?ha\sscritto[[:blank:].:>-]*$/im,
23
29
  # Polish
24
- ["Dnia", "napisał\\(a\\)"],
30
+ /^[[:blank:]<>-]*(Dnia|Dňa) (?:(?!\b(?>Dnia|Dňa|napisał)\b).)+?napisał(\(a\))?[[:blank:].:>-]*$/im,
25
31
  # Portuguese
26
- ["Em", "escreveu"],
32
+ /^[[:blank:]<>-]*Em (?:(?!\b(?>Em|escreveu)\b).)+?escreveu[[:blank:].:>-]*$/im,
27
33
  # Spanish
28
- ["El", "escribió"],
29
- # German
30
- ["Am", "schrieb"],
34
+ /^[[:blank:]<>-]*El (?:(?!\b(?>El|escribió)\b).)+?escribió[[:blank:].:>-]*$/im,
31
35
  ]
32
36
 
33
- ON_DATE_SOMEONE_WROTE_REGEXES = ON_DATE_SOMEONE_WROTE_MARKERS.map do |on, wrote|
34
- wrote.gsub!(/ +/, "[[:space:]]+") # the "wrote" part might span over multiple lines
35
- /^([[:blank:]>\-]*#{on}\s(?:(?!#{on}\s|#{wrote}:?)[\s\S])*#{wrote}:?[[:blank:]\-]*)$/m
36
- end
37
-
38
37
  # Op 10 dec. 2015 18:35 schreef "Arpit Jalan" <info@discourse.org>:
39
38
  # Am 18.09.2013 um 16:24 schrieb codinghorror <info@discourse.org>:
39
+ # Den 15. jun. 2016 kl. 20.42 skrev Jeff Atwood <info@discourse.org>:
40
+ # søn. 30. apr. 2017 kl. 00.26 skrev David Taylor <meta@discoursemail.com>:
40
41
  ON_DATE_WROTE_SOMEONE_MARKERS = [
41
42
  # Dutch
42
43
  ["Op", "schreef"],
43
44
  # German
44
45
  ["Am", "schrieb"],
46
+ # Norwegian
47
+ ["Den", "skrev"],
48
+ # Norwegian (date line starting with an abbreviated weekday, e.g. "søn.")
49
+ ["søn\.", "skrev"],
45
50
  ]
46
51
 
47
52
  ON_DATE_WROTE_SOMEONE_REGEXES = ON_DATE_WROTE_SOMEONE_MARKERS.map do |on, wrote|
@@ -49,15 +54,31 @@ class EmbeddedEmailMatcher
49
54
  end
50
55
 
51
56
  # суббота, 14 марта 2015 г. пользователь etewiah написал:
57
+ # 23 mar 2017 21:25 "Neil Lalonde" <meta@discoursemail.com> napisał(a):
58
+ # 30 серп. 2016 р. 20:45 "Arpit" no-reply@example.com пише:
52
59
  DATE_SOMEONE_WROTE_MARKERS = [
53
60
  # Russian
54
61
  ["пользователь", "написал"],
62
+ # Polish
63
+ ["", "napisał\\(a\\)"],
64
+ # Ukrainian
65
+ ["", "пише"],
55
66
  ]
56
67
 
57
68
  DATE_SOMEONE_WROTE_REGEXES = DATE_SOMEONE_WROTE_MARKERS.map do |user, wrote|
58
- /.+#{user}.+#{wrote}:/
69
+ user.size == 0 ?
70
+ /^.*\d{4}.+\n?.*#{wrote}:/ :
71
+ /^.*\d{4}.+#{user}.*\n?.*#{wrote}:/
59
72
  end
60
73
 
74
+ # Max Mustermann <try_discourse@discoursemail.com> schrieb am Fr., 28. Apr. 2017 um 11:53 Uhr:
75
+ SOMEONE_WROTE_ON_DATE_REGEXES ||= [
76
+ # English
77
+ /^.+\bwrote\b[[:space:]]+\bon\b.+[^:]+:/,
78
+ # German
79
+ /^.+\bschrieb\b[[:space:]]+\bam\b.+[^:]+:/,
80
+ ]
81
+
61
82
  # 2016-03-03 17:21 GMT+01:00 Some One
62
83
  ISO_DATE_SOMEONE_REGEX = /^[[:blank:]>]*20\d\d-\d\d-\d\d \d\d:\d\d GMT\+\d\d:\d\d [\w[:blank:]]+$/
63
84
 
@@ -65,7 +86,7 @@ class EmbeddedEmailMatcher
65
86
  # 2013/10/2 camilohollanda <info@discourse.org>
66
87
  # вт, 5 янв. 2016 г. в 23:39, Erlend Sogge Heggen <info@discourse.org>:
67
88
  # ср, 1 апр. 2015, 18:29, Denis Didkovsky <info@discourse.org>:
68
- DATE_SOMEONE_EMAIL_REGEX = /^[[:blank:]>]*.*\d{4}.+<[^@<>]+@[^@<>.]+\.[^@<>]+>:?$/
89
+ DATE_SOMEONE_EMAIL_REGEX = /^.*\d{4}.+\s?<[^@<>]+@[^@<>.]+\.[^@<>]+>:?$/
69
90
 
70
91
  # codinghorror via Discourse Meta wrote:
71
92
  # codinghorror via Discourse Meta <info@discourse.org> schrieb:
@@ -77,11 +98,12 @@ class EmbeddedEmailMatcher
77
98
  ]
78
99
 
79
100
  SOMEONE_VIA_SOMETHING_WROTE_REGEXES = SOMEONE_VIA_SOMETHING_WROTE_MARKERS.map do |wrote|
80
- /^[[:blank:]>]*.+ via .+ #{wrote}:?[[:blank:]]*$/
101
+ /^.+ via .+ #{wrote}:?[[:blank:]]*$/
81
102
  end
82
103
 
83
104
  # Some One <info@discourse.org> wrote:
84
- SOMEONE_EMAIL_WROTE_REGEX = /^[[:blank:]>]*.+ <.+@.+\..+> wrote:?/
105
+ # Gavin Sinclair (gsinclair@soyabean.com.au) wrote:
106
+ SOMEONE_EMAIL_WROTE_REGEX = /^.+\b[\w.+-]+@[\w.-]+\.\w{2,}\b.+wrote:?$/
85
107
 
86
108
  # Posted by mpalmer on 01/21/2016
87
109
  POSTED_BY_SOMEONE_ON_DATE_REGEX = /^[[:blank:]>]*Posted by .+ on \d{2}\/\d{2}\/\d{4}$/i
@@ -92,17 +114,21 @@ class EmbeddedEmailMatcher
92
114
  # ----- Original Message -----
93
115
  # -----Original Message-----
94
116
  # *----- Original Message -----*
117
+ # ----- Reply message -----
118
+ # ------------------ 原始邮件 ------------------
95
119
  FORWARDED_EMAIL_REGEXES = [
96
120
  # English
97
121
  /^[[:blank:]>]*Begin forwarded message:/i,
98
- /^[[:blank:]>]*Reply message/i,
99
- /^[[:blank:]>\*]*-{2,}[[:blank:]]*(Forwarded|Original) Message[[:blank:]]*-{2,}/i,
122
+ /^[[:blank:]>*]*-{2,}[[:blank:]]*(Forwarded|Original|Reply) Message[[:blank:]]*-{2,}/i,
100
123
  # French
101
- /^[[:blank:]>\*]*-{2,}[[:blank:]]*Message transféré[[:blank:]]*-{2,}/i,
124
+ /^[[:blank:]>]*Début du message transféré :/i,
125
+ /^[[:blank:]>*]*-{2,}[[:blank:]]*Message transféré[[:blank:]]*-{2,}/i,
102
126
  # German
103
- /^[[:blank:]>\*]*-{2,}[[:blank:]]*Ursprüngliche Nachricht[[:blank:]]*-{2,}/i,
127
+ /^[[:blank:]>*]*-{2,}[[:blank:]]*Ursprüngliche Nachricht[[:blank:]]*-{2,}/i,
104
128
  # Spanish
105
- /^[[:blank:]>\*]*-{2,}[[:blank:]]*Mensaje original[[:blank:]]*-{2,}/i,
129
+ /^[[:blank:]>*]*-{2,}[[:blank:]]*Mensaje original[[:blank:]]*-{2,}/i,
130
+ # Chinese
131
+ /^[[:blank:]>*]*-{2,}[[:blank:]]*原始邮件[[:blank:]]*-{2,}/i,
106
132
  ]
107
133
 
108
134
  EMBEDDED_REGEXES = [
@@ -110,6 +136,7 @@ class EmbeddedEmailMatcher
110
136
  ON_DATE_WROTE_SOMEONE_REGEXES,
111
137
  DATE_SOMEONE_WROTE_REGEXES,
112
138
  DATE_SOMEONE_EMAIL_REGEX,
139
+ SOMEONE_WROTE_ON_DATE_REGEXES,
113
140
  ISO_DATE_SOMEONE_REGEX,
114
141
  SOMEONE_VIA_SOMETHING_WROTE_REGEXES,
115
142
  SOMEONE_EMAIL_WROTE_REGEX,
data/lib/email_reply_trimmer/signature_matcher.rb CHANGED
@@ -13,20 +13,27 @@ class SignatureMatcher
13
13
  # (sent from a phone)
14
14
  # (Sent from mobile device)
15
15
  # 從我的 iPhone 傳送
16
- SIGNATURE_REGEXES = [
16
+ SIGNATURE_REGEXES ||= [
17
17
  # Chinese
18
- /^[[:blank:]>]*從我的 iPhone 傳送/i,
18
+ /^[[:blank:]]*從我的 iPhone 傳送/i,
19
19
  # English
20
- /^[[:blank:]>]*[[:word:]]+ from mobile/i,
21
- /^[[:blank:]>]*[\(<]*sent (?:from|via|with|by) .+[\)>]*/i,
22
- /^[[:blank:]>]*from my .{1,20}/i, # don't match too much
20
+ /^[[:blank:]]*[[:word:]]+ from mobile/i,
21
+ /^[[:blank:]]*[\(<]*Sent (from|via|with|by) .+[\)>]*/i,
22
+ /^[[:blank:]]*From my .{1,20}/i,
23
+ /^[[:blank:]]*Get Outlook for iOS/i,
23
24
  # French
24
- /^[[:blank:]>]*Envoyé depuis mon .+/i,
25
+ /^[[:blank:]]*Envoyé depuis (mon|Yahoo Mail)/i,
25
26
  # German
26
- /^[[:blank:]>]*Von meinem .+ gesendet/i,
27
- /^[[:blank:]>]*Diese Nachricht wurde von .+ gesendet/i,
27
+ /^[[:blank:]]*Von meinem .+ gesendet/i,
28
+ /^[[:blank:]]*Diese Nachricht wurde von .+ gesendet/i,
29
+ # Italian
30
+ /^[[:blank:]]*Inviato da /i,
31
+ # Norwegian
32
+ /^[[:blank:]]*Sendt fra min /i,
33
+ # Portuguese
34
+ /^[[:blank:]]*Enviado do meu /i,
28
35
  # Spanish
29
- /^[[:blank:]>]*Enviado desde mi .+/i,
36
+ /^[[:blank:]]*Enviado desde mi /i,
30
37
  ]
31
38
 
32
39
  def self.match?(line)
data/lib/email_reply_trimmer.rb CHANGED
@@ -6,7 +6,7 @@ require_relative "email_reply_trimmer/email_header_matcher"
6
6
  require_relative "email_reply_trimmer/quote_matcher"
7
7
 
8
8
  class EmailReplyTrimmer
9
- VERSION = "0.1.6"
9
+ VERSION = "0.1.10"
10
10
 
11
11
  DELIMITER = "d"
12
12
  EMBEDDED = "b"
@@ -26,16 +26,11 @@ class EmailReplyTrimmer
26
26
  return TEXT
27
27
  end
28
28
 
29
- def self.trim(text, split=false)
30
- return if text.nil? || text =~ /\A[[:space:]]*\Z/m
29
+ def self.trim(text, split = false)
30
+ return if text.nil? || text =~ /\A[[:space:]]*\z/m
31
31
 
32
- # normalize line endings
33
- text.gsub!("\r\n", "\n")
34
-
35
- # fix embedded email markers that might span over multiple lines
36
- EmbeddedEmailMatcher::ON_DATE_SOMEONE_WROTE_REGEXES.each do |r|
37
- text.gsub!(r) { |m| m.gsub(/\n[[:space:]>\-]*/, " ") }
38
- end
32
+ # do some cleanup
33
+ preprocess!(text)
39
34
 
40
35
  # from now on, we'll work on a line-by-line basis
41
36
  lines = text.split("\n")
@@ -59,8 +54,8 @@ class EmailReplyTrimmer
59
54
  end
60
55
 
61
56
  # when the reply is at the end of the email
62
- if pattern =~ /^b+q+[eq]*t[te]*$/
63
- index = pattern =~ /t/
57
+ if pattern =~ /^(b[^t]+)*b[bqeh]+t[et]*$/
58
+ index = pattern =~ /t[et]*$/
64
59
  pattern = ""
65
60
  lines = lines[index..-1]
66
61
  end
@@ -75,12 +70,20 @@ class EmailReplyTrimmer
75
70
 
76
71
  # if there is an embedded email marker, followed by a huge quote
77
72
  # then take everything up to that marker
78
- if pattern =~ /te*b[eqbh]*[te]*$/
73
+ if pattern =~ /te*b[eqbh]*([te]*)$/ && $1.count("t") < 7
79
74
  index = pattern =~ /te*b[eqbh]*[te]*$/
80
75
  pattern = pattern[0..index]
81
76
  lines = lines[0..index]
82
77
  end
83
78
 
79
+ # if there is some text before a huge quote ending the email,
80
+ # then remove the quote
81
+ if pattern =~ /te*[qbe]+$/
82
+ index = pattern =~ /te*[qbe]+$/
83
+ pattern = pattern[0..index]
84
+ lines = lines[0..index]
85
+ end
86
+
84
87
  # if there still are some embedded email markers, just remove them
85
88
  while pattern =~ /b/
86
89
  index = pattern =~ /b/
@@ -95,8 +98,8 @@ class EmailReplyTrimmer
95
98
  size.times.each { |s| pattern[index + s] = EMAIL_HEADER }
96
99
  end
97
100
 
98
- # if there are at least 3 consecutive email headers, take everything up to
99
- # these headers
101
+ # if there are at least 3 consecutive email headers,
102
+ # take everything up to these headers
100
103
  if pattern =~ /t[eq]*h{3,}/
101
104
  index = pattern =~ /t[eq]*h{3,}/
102
105
  pattern = pattern[0..index]
@@ -128,15 +131,10 @@ class EmailReplyTrimmer
128
131
  end
129
132
 
130
133
  def self.extract_embedded_email(text)
131
- return if text.nil? || text =~ /\A[[:space:]]*\Z/m
132
-
133
- # normalize line endings
134
- text.gsub!("\r\n", "\n")
134
+ return if text.nil? || text =~ /\A[[:space:]]*\z/m
135
135
 
136
- # fix embedded email markers that might span over multiple lines
137
- EmbeddedEmailMatcher::ON_DATE_SOMEONE_WROTE_REGEXES.each do |r|
138
- text.gsub!(r) { |m| m.gsub(/\n[[:space:]>\-]*/, " ") }
139
- end
136
+ # do some cleanup
137
+ preprocess!(text)
140
138
 
141
139
  # from now on, we'll work on a line-by-line basis
142
140
  lines = text.split("\n")
@@ -153,6 +151,43 @@ class EmailReplyTrimmer
153
151
 
154
152
  private
155
153
 
154
+ def self.preprocess!(text)
155
+ # normalize line endings
156
+ text.gsub!("\r\n", "\n")
157
+
158
+ # remove PGP markers
159
+ text.gsub!(/\A-----BEGIN PGP SIGNED MESSAGE-----\n(?:Hash: \w+)?\s+/i, "")
160
+ text.gsub!(/^-----BEGIN PGP SIGNATURE-----$[\s\S]+^-----END PGP SIGNATURE-----/, "")
161
+
162
+ # remove unsubscribe links
163
+ text.gsub!(/^Unsubscribe: .+@.+(\n.+http:.+)?\s*\z/i, "")
164
+
165
+ # remove alias-style quotes marker
166
+ text.gsub!(/^.*>{5} "[^"\n]+" == .+ writes:/, "")
167
+
168
+ # change enclosed-style quotes format
169
+ text.gsub!(/^>>> ?(.+) ?>>>$\n([\s\S]+?)\n^<<< ?\1 ?<<<$/) { $2.gsub(/^/, "> ") }
170
+ text.gsub!(/^>{4,}[[:blank:]]*$\n([\s\S]+?)\n^<{4,}[[:blank:]]*$/) { $1.gsub(/^/, "> ") }
171
+
172
+ # fix all quotes formats
173
+ text.gsub!(/^((?:[[:blank:]]*[[:alpha:]]*[>|])+)/) { $1.gsub(/([[:alpha:]]+>|\|)/, ">") }
174
+
175
+ # fix embedded email markers that might span over multiple lines
176
+ (
177
+ EmbeddedEmailMatcher::ON_DATE_SOMEONE_WROTE_REGEXES +
178
+ EmbeddedEmailMatcher::SOMEONE_WROTE_ON_DATE_REGEXES +
179
+ EmbeddedEmailMatcher::DATE_SOMEONE_WROTE_REGEXES +
180
+ [EmbeddedEmailMatcher::DATE_SOMEONE_EMAIL_REGEX]
181
+ ).each do |r|
182
+ text.gsub!(r) do |m|
183
+ m.count("\n") > 4 ? m : m.gsub(/\n+[[:space:]]*/, " ")
184
+ end
185
+ end
186
+
187
+ # remove leading/trailing whitespaces
188
+ text.strip!
189
+ end
190
+
156
191
  def self.compute_elided(text, lines)
157
192
  elided = []
158
193
 
data/test/elided/email_headers_5.txt ADDED
@@ -0,0 +1,23 @@
1
+ From: Erlend Sogge Heggen <meta@discoursemail.com>
2
+ Reply-To: Erlend Sogge Heggen <meta+abcd@discoursemail.com>
3
+ Date: Wednesday, 5 April 2017 at 17:01
4
+ To: Jef <jef@bar.com>
5
+ Subject: [Discourse Meta] [PM] Discourse for Communities of Practice, educational organisation
6
+
7
+
8
+ erlend_sh<https://meta.discourse.org/u/erlend_sh> Erlend Sogge Heggen<https://meta.discourse.org/u/erlend_sh> Team
9
+ April 5
10
+
11
+
12
+
13
+ Hi Jef,
14
+
15
+ Is your University a legally recognised educational institution? Otherwise I'm afraid you're not eligible for this discount.
16
+
17
+ Sincerely,
18
+
19
+ Erlend
20
+
21
+
22
+
23
+ This email message and any attachments may contain confidential information and may be privileged. If you are not the intended recipient or otherwise not authorized to receive this message, you are prohibited to use, copy, disclose or take any action based on this email or any information contained herein. If you are not the intended recipient, please advise the sender immediately by replying to this email and permanently delete this message and any attachments from your system.
data/test/elided/embedded_ception.txt CHANGED
@@ -2,7 +2,7 @@ On Mon, Feb 1, 2016 at 6:32 PM, Jeff Atwood <info@discourse.org> wrote:
2
2
 
3
3
  > This is Jeff's reply.
4
4
  >
5
- > On Mon, Feb 1, 2016 at 7:50 AM, Some One <foo@bar.com wrote:
5
+ > On Mon, Feb 1, 2016 at 7:50 AM, Some One <foo@bar.com > > wrote:
6
6
  >
7
7
  >> Great!
8
8
  >>
@@ -14,7 +14,7 @@ On Mon, Feb 1, 2016 at 6:32 PM, Jeff Atwood <info@discourse.org> wrote:
14
14
  >>
15
15
  >>> WAT?
16
16
  >>>
17
- >>> On Wed, Jan 27, 2016 at 10:48 PM, Some One < foo@bar.com> wrote:
17
+ >>> On Wed, Jan 27, 2016 at 10:48 PM, Some One < >>> foo@bar.com> wrote:
18
18
  >>>
19
19
  >>>> Hi Team,
20
20
  >>>>
@@ -22,7 +22,7 @@ On Mon, Feb 1, 2016 at 6:32 PM, Jeff Atwood <info@discourse.org> wrote:
22
22
  >>>>
23
23
  >>>> Some One
24
24
  >>>>
25
- >>>> On Wed, Jan 27, 2016 at 10:10 AM Discourse Team <team@discourse.org> wrote:
25
+ >>>> On Wed, Jan 27, 2016 at 10:10 AM Discourse Team <team@discourse.org> >>>> wrote:
26
26
  >>>>
27
27
  >>>>> Hello :waves_hand:
28
28
  >>>>>
data/test/elided/embedded_email_12.txt CHANGED
@@ -1,8 +1,8 @@
1
- On Thu, 31 Mar 2016 at 11:16 Some One < mailto:Some One <some.one@foo.bar> wrote:
1
+ On Thu, 31 Mar 2016 at 11:16 Some One < mailto:Some One <some.one@foo.bar> > wrote:
2
2
 
3
3
 
4
4
  Two 2
5
5
 
6
- On Thu, 31 Mar 2016 at 10:05 Jeff Atwood < mailto:Jeff Atwood <team@discourse.org> wrote:
6
+ On Thu, 31 Mar 2016 at 10:05 Jeff Atwood < mailto:Jeff Atwood <team@discourse.org> > wrote:
7
7
 
8
8
  Three 3
data/test/elided/embedded_email_13.txt ADDED
@@ -0,0 +1,9 @@
1
+ At 6/16/2016 08:32 PM, you wrote:
2
+ ><https://meta.discourse.org/users/codinghorror>codinghorror
3
+ ><https://meta.discourse.org/users/codinghorror>Jeff Atwood co-founder
4
+ >June 17
5
+ >
6
+ >Sorry I got a little mixed up with all the incoming replies. Are you
7
+ >able to log in?
8
+ >
9
+ >Use your email address and "forgot password" if you need it reset.
data/test/elided/embedded_email_14.txt ADDED
@@ -0,0 +1,11 @@
1
+ 2016-10-24 15:36 GMT+02:00 Foo bar < info@foo.bar>:
2
+
3
+ > Thank you so much Erlend, very thanks!
4
+ >
5
+ > 2016-10-24 15:03 GMT+02:00 Erlend Sogge Heggen <meta@discoursemail.com>:
6
+ >
7
+ >> erlend_sh <https://meta.discourse.org/users/erlend_sh> Erlend Sogge
8
+ >> Heggen <https://meta.discourse.org/users/erlend_sh> team
9
+ >> October 24
10
+ >>
11
+ >> I received your application and I've replied with setup instructions.
data/test/elided/embedded_email_15.txt ADDED
@@ -0,0 +1,4 @@
1
+ 2017-02-05 13:29 GMT+02:00 Very long author name < notifications@forum.some-discourse-host.local>:
2
+
3
+ > [Original Messages is quoted here]
4
+ > [...]
data/test/elided/embedded_email_16.txt ADDED
@@ -0,0 +1,4 @@
1
+ --
2
+ Eric Hodel - drbrain@segment7.net - http://segment7.net
3
+ All messages signed with fingerprint:
4
+ FEC2 57F1 D465 EB15 5D6E 7C11 332A 551C 796C 9F04
data/test/elided/embedded_email_17.txt ADDED
@@ -0,0 +1,2 @@
1
+ On 15 May 2017 19:32, "Nomadic Sprite" <nomadic.sprite01@gmail.com> wrote:
2
+ On 8 May 2017 6:51 pm, "Andy Jones" <Andy.Jones@jameshall.co.uk> wrote:
data/test/elided/embedded_email_18.txt ADDED
@@ -0,0 +1 @@
1
+ On 8 May 2017 17:34, "Andy Jones" <Andy.Jones@jameshall.co.uk> wrote:
data/test/elided/embedded_email_19.txt ADDED
File without changes
data/test/elided/embedded_email_chinese.txt ADDED
@@ -0,0 +1,4 @@
1
+ > 在 2016年12月12日,下午8:45,Erlend Sogge Heggen <meta@discoursemail.com> 写道:
2
+ > fu.zhang:
3
+ > Some random question
4
+ >
data/test/elided/embedded_email_german_4.txt ADDED
@@ -0,0 +1,15 @@
1
+ Max Mustermann <try_discourse@discoursemail.com> schrieb am Fr., 28. Apr. 2017 um 11:53 Uhr:
2
+
3
+ > max_2 <http://try.discourse.org/u/cfstras_2>
4
+ > April 28
5
+
6
+ > Hi there! you should be getting a mail.
7
+ > ------------------------------
8
+
9
+ > Visit Topic <http://try.discourse.org/t/this-is-my-internal-test/716/2>
10
+ > or reply to this email to respond.
11
+
12
+ > To unsubscribe from these emails, click here
13
+ > <http://try.discourse.org/email/unsubscribe/badf00d>
14
+ > .
15
+
data/test/elided/embedded_email_german_5.txt ADDED
@@ -0,0 +1,20 @@
1
+ Erlend Sogge Heggen <meta@discoursemail.com> schrieb am Di., 16. Aug. 2016 um 12:52 Uhr:
2
+
3
+ > erlend_sh <https://meta.discourse.org/users/erlend_sh> Erlend Sogge Heggen
4
+ > <https://meta.discourse.org/users/erlend_sh> team
5
+ > August 16
6
+ >
7
+ > Hi Bob,
8
+ >
9
+ > Sure, it sounds like your requirements would fit our Standard plan. I
10
+ > suggest you sign up for a free trial, as that will be the best way to see
11
+ > for yourself if we support the kind of custom styling
12
+ >
13
+ > Sincerely,
14
+ >
15
+ > Erlend
16
+ >
17
+ --
18
+ Viele Grüße / Best regards
19
+
20
+ Bob
data/test/elided/embedded_email_german_6.txt ADDED
@@ -0,0 +1,8 @@
1
+ Am 21.04.2016 17:48 schrieb "Einz Zwei" <noreply@discourse.pseuco.com >:
2
+
3
+ > einz.zwei <https://discourse.pseuco.com/users/einz.zwei> einz
4
+ > zwei <https://discourse.pseuco.com/users/einz.zwei>
5
+ > 21. April
6
+ >
7
+ > Vielleicht noch eine wichtige Frage:
8
+ > Wann sollten wir diese Trainingsvideos veröffentlichen, es gibt 2
data/test/elided/embedded_email_norwegian.txt ADDED
@@ -0,0 +1,9 @@
1
+ Sendt fra min iPad
2
+
3
+ Den 15. jun. 2016 kl. 20.42 skrev Jeff Atwood <info@discourse.org<mailto:info@discourse.org>>:
4
+
5
+ codinghorror<https://meta.discourse.org/users/codinghorror> Jeff Atwood<https://meta.discourse.org/users/codinghorror> co-founder
6
+ June 15
7
+
8
+
9
+ Enable tags in your admin, site settings.