tr_email_reply_parser 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (53) hide show
  1. checksums.yaml +15 -0
  2. data/LICENSE +22 -0
  3. data/README.md +68 -0
  4. data/Rakefile +135 -0
  5. data/lib/tr_email_reply_parser.rb +464 -0
  6. data/test/email_reply_parser_test.rb +441 -0
  7. data/test/emails/correct_sig.txt +4 -0
  8. data/test/emails/email_1_1.txt +13 -0
  9. data/test/emails/email_1_2.txt +51 -0
  10. data/test/emails/email_1_3.txt +55 -0
  11. data/test/emails/email_1_4.txt +5 -0
  12. data/test/emails/email_1_5.txt +15 -0
  13. data/test/emails/email_1_6.txt +15 -0
  14. data/test/emails/email_1_7.txt +12 -0
  15. data/test/emails/email_1_8.txt +6 -0
  16. data/test/emails/email_1_9.txt +9 -0
  17. data/test/emails/email_2_1.txt +25 -0
  18. data/test/emails/email_2_2.txt +10 -0
  19. data/test/emails/email_2_3.txt +14 -0
  20. data/test/emails/email_2_4.txt +14 -0
  21. data/test/emails/email_2_5.txt +15 -0
  22. data/test/emails/email_2_6.txt +11 -0
  23. data/test/emails/email_2_7.txt +5 -0
  24. data/test/emails/email_2_8.txt +4 -0
  25. data/test/emails/email_2_9.txt +9 -0
  26. data/test/emails/email_2nd_paragraph_starting_with_on.txt +12 -0
  27. data/test/emails/email_BlackBerry.txt +3 -0
  28. data/test/emails/email_bullets.txt +22 -0
  29. data/test/emails/email_from_address_in_quote_header.txt +12 -0
  30. data/test/emails/email_from_name_in_quote_header.txt +12 -0
  31. data/test/emails/email_hyphens.txt +5 -0
  32. data/test/emails/email_iPhone.txt +3 -0
  33. data/test/emails/email_mentions_own_email_address.txt +6 -0
  34. data/test/emails/email_mentions_own_name.txt +6 -0
  35. data/test/emails/email_multi_word_sent_from_my_mobile_device.txt +3 -0
  36. data/test/emails/email_multiline_quote_header_es_mx.txt +8 -0
  37. data/test/emails/email_multiline_quote_header_fr.txt +8 -0
  38. data/test/emails/email_multiline_quote_header_from_first.txt +11 -0
  39. data/test/emails/email_multiline_quote_header_from_replyto_date_to_subject.txt +12 -0
  40. data/test/emails/email_multiline_quote_header_from_to_date_subject.txt +11 -0
  41. data/test/emails/email_multiline_quote_header_none.txt +11 -0
  42. data/test/emails/email_multiline_quote_header_pt_br.txt +8 -0
  43. data/test/emails/email_multiline_quote_header_with_asterisks.txt +21 -0
  44. data/test/emails/email_multiline_quote_header_with_cc.txt +9 -0
  45. data/test/emails/email_multiline_quote_header_with_multiline_headers.txt +14 -0
  46. data/test/emails/email_no_signature_deliminator.txt +7 -0
  47. data/test/emails/email_no_signature_deliminator_adds_a_middle_initial.txt +7 -0
  48. data/test/emails/email_one_is_not_on.txt +10 -0
  49. data/test/emails/email_sent_from_my_not_signature.txt +3 -0
  50. data/test/emails/email_was_showing_as_nothing_visible.txt +13 -0
  51. data/test/emails/new_content/email_1_2.txt +28 -0
  52. data/tr_email_reply_parser.gemspec +123 -0
  53. metadata +143 -0
checksums.yaml ADDED
@@ -0,0 +1,15 @@
1
+ ---
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: !binary |-
4
+ OWFiZWI0NDQ0Zjk3NmU2MGJmZTc2ZGI4MmQ0NDgxMzAzNzNiNmU3Mg==
5
+ data.tar.gz: !binary |-
6
+ NDc0ZTM1NWRkMjViNmE1MjlhZDBiMGVjMWVlNzc2YjAwNWNjZTA2Nw==
7
+ !binary "U0hBNTEy":
8
+ metadata.gz: !binary |-
9
+ MWVhNjJkZTVmNWMyZTlkMmM4OTc1ZjBmZTc3MjNmMWJkNGQ1MjJiMzBmNDZm
10
+ Mjg2MTM4NDAzNGM0YTg3N2Q4Y2JkYmVhZDlkNjBhMjU1NWZiNjBlZmNlNGQ4
11
+ NTZiOGZkNjg4MDgyMWQ0ZDkyY2E2MmM4ZmFiMWMwZmFhYTFlZjY=
12
+ data.tar.gz: !binary |-
13
+ MDk2OGVhZmFiYTIxYTFiYmRiYjcyNDE1NTUzNmVmN2QwNzQxNTRhYjAzMDE2
14
+ NjNkMGE4ZmY0M2ExNzcyOTYwZWVlMzg3MWUzNmRlZGJjZDNhMTZjMzExODdh
15
+ OWUxMjY5MzIzYTkwNWE2MWY4N2E0OGIwZDVjMzJkOWQ0MWMyOTg=
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ The MIT License
2
+
3
+ Copyright (c) GitHub
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
22
+
data/README.md ADDED
@@ -0,0 +1,68 @@
1
+ # Email Reply Parser
2
+
3
+ [![Build Status](https://secure.travis-ci.org/lawrencepit/email_reply_parser.png?branch=master)](http://travis-ci.org/lawrencepit/email_reply_parser)
4
+ [![Code Climate](https://codeclimate.com/badge.png)](https://codeclimate.com/github/lawrencepit/email_reply_parser)
5
+ [![Gem Version](https://fury-badge.herokuapp.com/rb/email_reply_parser.png)](http://badge.fury.io/rb/email_reply_parser)
6
+
7
+ EmailReplyParser is a small library to parse plain text email content.
8
+
9
+ This is what GitHub uses to display comments that were created from
10
+ email replies. This code is being open sourced in an effort to
11
+ crowdsource the quality of our email representation.
12
+
13
+ ## Usage
14
+
15
+ To parse a reply body:
16
+
17
+ `parsed_body = EmailReplyParser.parse_reply(email_body, from_address)`
18
+
19
+ The `from_address` argument is optional. If it is given, the parser also tries to strip signatures based on the name in the from address (useful when the signature doesn't have a standard delimiter).
20
+
21
+ ## Installation
22
+
23
+ Get it from [GitHub][github] or `gem install email_reply_parser`. Run `rake` to run the tests.
24
+
25
+ [github]: https://github.com/github/email_reply_parser
26
+
27
+ ## Contribute
28
+
29
+ If you'd like to hack on EmailReplyParser, start by forking the repo on GitHub:
30
+
31
+ https://github.com/github/email_reply_parser
32
+
33
+ The best way to get your changes merged back into core is as follows:
34
+
35
+ * Clone down your fork
36
+ * Create a thoughtfully named topic branch to contain your change
37
+ * Hack away
38
+ * Add tests and make sure everything still passes by running rake
39
+ * If you are adding new functionality, document it in the README
40
+ * Do not change the version number, I will do that on my end
41
+ * If necessary, rebase your commits into logical chunks, without errors
42
+ * Push the branch up to GitHub
43
+ * Send a pull request to the `github/email_reply_parser` project.
44
+
45
+ ## Known Issues
46
+
47
+ ### Quoted Headers
48
+
49
+ Quoted headers like these currently don't work with other languages:
50
+
51
+ On <date>, <author> wrote:
52
+
53
+ > blah
54
+
55
+ ### Weird Signatures
56
+
57
+ Not everyone follows this convention:
58
+
59
+ Hello
60
+
61
+ Saludos!!!!!!!!!!!!!!
62
+ Galactic President Superstar Mc Awesomeville
63
+ GitHub
64
+
65
+ **********************DISCLAIMER***********************************
66
+ * Note: blah blah blah *
67
+ **********************DISCLAIMER***********************************
68
+
data/Rakefile ADDED
@@ -0,0 +1,135 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+ require 'date'
4
+
5
+ #############################################################################
6
+ #
7
+ # Helper functions
8
+ #
9
+ #############################################################################
10
+
11
# Gem name, taken from the first `*.gemspec` file found in the current
# directory (everything before the first dot). Memoized.
def name
  @name ||= Dir['*.gemspec'].first.split('.').first
end

# Version string, read from the `VERSION = "..."` line of `lib/<name>.rb`.
def version
  line = File.read("lib/#{name}.rb")[/^\s*VERSION\s*=\s*.*/]
  line.match(/.*VERSION\s*=\s*['"](.*)['"]/)[1]
end

# Today's date as a String, used for the gemspec `date` field.
def date
  Date.today.to_s
end

# RubyForge project name; the same as the gem name.
def rubyforge_project
  name
end

# File name of the gemspec.
def gemspec_file
  "#{name}.gemspec"
end

# File name of the built gem.
def gem_file
  "#{name}-#{version}.gem"
end

# Rewrites the `.<header_name> = '...'` assignment inside `head` in place,
# using the value returned by the helper method named `header_name`.
def replace_header(head, header_name)
  head.sub!(/(\.#{header_name}\s*= ').*'/) { "#{$1}#{send(header_name)}'"}
end

#############################################################################
#
# Standard tasks
#
#############################################################################

task :default => :test

require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/*_test.rb'
  test.verbose = true
end

desc "Open an irb session preloaded with this library"
task :console do
  # `-rubygems` (i.e. `-r ubygems`) is dropped here: ubygems.rb no longer
  # ships with Ruby, and RubyGems is loaded by default anyway.
  sh "irb -r ./lib/#{name}.rb"
end

#############################################################################
#
# Custom tasks (add your own tasks here)
#
#############################################################################



#############################################################################
#
# Packaging tasks
#
#############################################################################

desc "Create tag v#{version} and build and push #{gem_file} to Rubygems"
task :release => :build do
  unless `git branch` =~ /^\* master$/
    puts "You must be on the master branch to release!"
    exit!
  end
  sh "git commit --allow-empty -a -m 'Release #{version}'"
  sh "git tag v#{version}"
  sh "git push origin master"
  sh "git push origin v#{version}"
  sh "gem push pkg/#{name}-#{version}.gem"
end

desc "Build #{gem_file} into the pkg directory"
task :build => :gemspec do
  sh "mkdir -p pkg"
  sh "gem build #{gemspec_file}"
  sh "mv #{gem_file} pkg"
end

desc "Generate #{gemspec_file}"
task :gemspec => :validate do
  # read spec file and split out manifest section; the old manifest is
  # discarded because it is regenerated below
  spec = File.read(gemspec_file)
  head, _old_manifest, tail = spec.split(" # = MANIFEST =\n")

  # replace name version and date
  replace_header(head, :name)
  replace_header(head, :version)
  replace_header(head, :date)
  #comment this out if your rubyforge_project has a different name
  replace_header(head, :rubyforge_project)

  # determine file list from git ls-files
  files = `git ls-files`.
    split("\n").
    sort.
    reject { |file| file =~ /^\./ }.
    reject { |file| file =~ /^(rdoc|pkg)/ }.
    map { |file| " #{file}" }.
    join("\n")

  # piece file back together and write
  manifest = " s.files = %w[\n#{files}\n ]\n"
  spec = [head, manifest, tail].join(" # = MANIFEST =\n")
  File.open(gemspec_file, 'w') { |io| io.write(spec) }
  puts "Updated #{gemspec_file}"
end

desc "Validate #{gemspec_file}"
task :validate do
  libfiles = Dir['lib/*'] - ["lib/#{name}.rb", "lib/#{name}"]
  unless libfiles.empty?
    puts "Directory `lib` should only contain a `#{name}.rb` file and `#{name}` dir."
    exit!
  end
  unless Dir['VERSION*'].empty?
    puts "A `VERSION` file at root level violates Gem best practices."
    exit!
  end
end
135
+
@@ -0,0 +1,464 @@
1
+ require 'strscan'
2
+
3
# EmailReplyParser is a small library to parse plain text email content. The
# goal is to identify which fragments are quoted, part of a signature, or
# original body content. We want to support both top and bottom posters, so
# no simple "REPLY ABOVE HERE" content is used.
#
# Beyond RFC 5322 (which is handled by the [Ruby mail gem][mail]), there aren't
# any real standards for how emails are created. This attempts to parse out
# common conventions for things like replies:
#
#     this is some text
#
#     On <date>, <author> wrote:
#     > blah blah
#     > blah blah
#
# ... and signatures:
#
#     this is some text
#
#     --
#     Bob
#     http://homepage.com/~bob
#
# Each of these are parsed into Fragment objects.
#
# EmailReplyParser also attempts to figure out which of these blocks should
# be hidden from users.
#
# [mail]: https://github.com/mikel/mail
class EmailReplyParser
  VERSION = "0.6.0"

  # Public: Splits an email body into a list of Fragments.
  #
  # text         - A String email body.
  # from_address - from address of the email (optional)
  #
  # Returns an Email instance.
  def self.read(text, from_address = "")
    Email.new.read(text, from_address)
  end

  # Public: Get the text of the visible portions of the given email body.
  #
  # text         - A String email body (nil is treated as an empty body).
  # from_address - from address of the email (optional)
  #
  # Returns a String.
  def self.parse_reply(text, from_address = "")
    read(text.to_s, from_address).visible_text
  end

  # Public: Get only the newly written text of the given email body, i.e. the
  # fragments that are neither quoted, hidden, nor a signature.
  #
  # text         - A String email body (nil is treated as an empty body, the
  #                same way parse_reply treats it).
  # from_address - from address of the email (optional)
  #
  # Returns a String.
  def self.parse_new_content(text, from_address = "")
    read(text.to_s, from_address).new_content
  end

  ### Emails

  # An Email instance represents a parsed body String.
  class Email
    EMPTY = "".freeze

    COMMON_REPLY_HEADER_REGEXES = [
      /^On(.+)wrote:$/nm,
      /\A\d{4}\/\d{1,2}\/\d{1,2}\s+.{1,80}\s<[^@]+@[^@]+>\Z/,
    ].freeze

    # Line optionally starts with whitespace, contains two or more hyphens or
    # underscores, and ends with optional whitespace.
    # Example: '---' or '___' or '---   '
    MULTI_LINE_SIGNATURE_REGEX = /^\s*[-_]{2,}\s*$/

    # Line optionally starts with whitespace, followed by one hyphen, followed by a word character
    # Example: '-Sandro'
    ONE_LINE_SIGNATURE_REGEX = /^\s*-\w/

    ORIGINAL_MESSAGE_SIGNATURE_REGEX = /^[\s_-]+(Original Message)?[\s_-]+$/

    # "Sent from my", followed by one to three words and an optional
    # angle-bracketed suffix.
    # Example: "Sent from my iPhone 3G"
    SENT_FROM_REGEX = /^Sent from my (\s*\w+){1,3}(\s*<.*>)?$/

    # Union of all signature patterns. It is compiled without a fixed encoding
    # where the running Ruby supports that, because lines are matched as raw
    # bytes (see normalize_text).
    signature_source = Regexp.union(
      MULTI_LINE_SIGNATURE_REGEX,
      ONE_LINE_SIGNATURE_REGEX,
      ORIGINAL_MESSAGE_SIGNATURE_REGEX,
      SENT_FROM_REGEX
    ).source
    SIGNATURE_REGEX =
      if defined?(Regexp::NOENCODING)
        Regexp.new(signature_source, Regexp::NOENCODING)
      else
        Regexp.new(signature_source)
      end

    # TODO: refactor out in a i18n.yml file
    # Supports English, French, Es-Mexican, Pt-Brazilian
    # Maps a downcased label to its label-group
    QUOTE_HEADER_LABELS = {
      :from     => ["From", "De"],
      :to       => ["To", "Para", "A"],
      :cc       => ["CC"],
      :reply_to => ["Reply-To"],
      :date     => ["Date", "Sent", "Enviado", "Enviada em", "Fecha"],
      :subject  => ["Subject", "Assunto", "Asunto", "Objet"]
    }.inject({}) do |lookup, (group, labels)|
      labels.each { |label| lookup[label.downcase] = group }
      lookup
    end.freeze

    # Emails have an Array of Fragments.
    attr_reader :fragments

    def initialize
      @fragments = []
    end

    # Public: Gets the combined text of the visible fragments of the email body.
    #
    # Returns a String.
    def visible_text
      fragments.reject { |f| f.hidden? }.map { |f| f.to_s }.join("\n").rstrip
    end

    # Public: Gets the combined text of the fragments that are not quoted,
    # not hidden and not a signature.
    #
    # Returns a String.
    def new_content
      fresh = fragments.select { |f| !f.quoted? && !f.hidden? && !f.signature? }
      fresh.map { |f| f.to_s }.join("\n").rstrip
    end

    # Splits the given text into a list of Fragments. This is roughly done by
    # reversing the text and parsing from the bottom to the top. This way we
    # can check for 'On <date>, <author> wrote:' lines above quoted blocks.
    #
    # text         - A String email body (nil is treated as empty).
    # from_address - from address of the email (optional, nil is treated as
    #                empty)
    #
    # Returns this same Email instance.
    def read(text, from_address = "")
      # Coerce up front so a missing body or sender cannot raise further down.
      from_address = from_address.to_s

      # parse out the from name if one exists and save for use later
      @from_name_raw = parse_raw_name_from_address(from_address)
      @from_name_normalized = normalize_name(@from_name_raw)
      @from_email = parse_email_from_address(from_address)

      # The text is reversed initially due to the way we check for hidden
      # fragments.
      text = normalize_text(text.to_s).reverse

      # This determines if any 'visible' Fragment has been found. Once any
      # visible Fragment is found, stop looking for hidden ones.
      @found_visible = false

      # This instance variable points to the current Fragment. If the matched
      # line fits, it should be added to this Fragment. Otherwise, finish it
      # and start a new Fragment.
      @fragment = nil

      # Use the StringScanner to pull out each line of the email content.
      @scanner = StringScanner.new(text)
      while line = @scanner.scan_until(/\n/n)
        scan_line(line)
      end

      # Be sure to parse the last line of the email.
      last_line = @scanner.rest.to_s
      scan_line(last_line, true) if last_line.size > 0

      # Finish up the final fragment. Finishing a fragment will detect any
      # attributes (hidden, signature, reply), and join each line into a
      # string.
      finish_fragment

      @scanner = @fragment = nil

      # Now that parsing is done, reverse the order.
      @fragments.reverse!
      self
    end

    private

    # normalize text so it is easier to parse
    #
    # text - text to normalize
    #
    # Returns a String
    def normalize_text(text)
      # in 1.9 we want to operate on the raw bytes
      normalized = text.respond_to?(:force_encoding) ? text.dup.force_encoding('binary') : text

      # Normalize line endings.
      normalized.gsub!("\r\n", "\n")

      # Check for multi-line reply headers. Some clients break up
      # the "On DATE, NAME <EMAIL> wrote:" line into multiple lines.
      header = normalized.match(/^(On\s(.+)wrote:)$/m)
      # Remove all new lines from the reply header, as long as it contains no
      # double newline; if it does, we have grabbed something that is not
      # actually a reply header.
      if header && header[1] !~ /\n\n/
        normalized.gsub!(header[1], header[1].gsub("\n", " "))
      end

      # Some users may reply directly above a line of underscores.
      # In order to ensure that these fragments are split correctly,
      # make sure that all lines of underscores are preceded by
      # at least two newline characters.
      normalized.gsub!(/([^\n])(?=\n_{7}_+)$/m, "\\1\n")

      normalized
    end

    # Parse a person's name from an e-mail address
    #
    # address - email address, e.g. 'Smith, John <john@example.com>'.
    #
    # Returns a String.
    def parse_name_from_address(address)
      normalize_name(parse_raw_name_from_address(address))
    end

    # Extracts the display name that precedes the '<' of an address.
    #
    # address - email address String.
    #
    # Returns the stripped name, or an empty String when there is none.
    def parse_raw_name_from_address(address)
      match = address.match(/^["']*([\w\s,]+)["']*\s*</)
      match ? match[1].strip.to_s : EMPTY
    end

    # Extracts the bare address between '<' and '>'.
    #
    # address - email address String.
    #
    # Returns the bracketed part, or the input itself when it has no brackets.
    def parse_email_from_address(address)
      match = address.match(/<(.*)>/)
      match ? match[1] : address
    end

    # Normalize a name to First Last
    #
    # name - name to normalize.
    #
    # Returns a String.
    def normalize_name(name)
      name.include?(',') ? make_name_first_then_last(name) : name
    end

    # Turns "Last, First" into "First Last".
    #
    # If the part before the first comma already contains a space it is
    # returned as is. A missing first or last part (e.g. "Smith," or ",")
    # is tolerated instead of raising.
    #
    # name - name String containing a comma.
    #
    # Returns a String.
    def make_name_first_then_last(name)
      last_part, first_part = name.split(',')
      last_part = last_part.to_s
      return last_part if last_part.include?(" ")

      [first_part, last_part].map { |part| part.to_s.strip }.
        reject { |part| part.empty? }.
        join(" ")
    end

    ### Line-by-Line Parsing

    # Scans the given line of text and determines which fragment it belongs to.
    #
    # line - A String line of the reversed email text; it is modified in place.
    # last - true when this is the final line to scan.
    def scan_line(line, last = false)
      line.chomp!("\n")
      line.reverse!
      line.rstrip!

      # Mark the current Fragment as a signature if the current line is empty
      # and the Fragment starts with a common signature indicator.
      # Mark the current Fragment as a quote if the current line is empty
      # and the Fragment starts with a multiline quote header.
      scan_signature_or_quote if @fragment && line == EMPTY

      # We're looking for leading `>`'s to see if this line is part of a
      # quoted Fragment.
      is_quoted = !!(line =~ /^>+/n)

      # Note that a common reply header also counts as part of the quoted
      # Fragment, even though it doesn't start with `>`.
      unless @fragment &&
          ((@fragment.quoted? == is_quoted) ||
           (@fragment.quoted? && (line_is_reply_header?(line) || line == EMPTY)))
        finish_fragment
        @fragment = Fragment.new
        @fragment.quoted = is_quoted
      end

      @fragment.add_line(line)
      scan_signature_or_quote if last
    end

    # Finishes the current Fragment as a signature or as a quote when its
    # first line looks like a signature or its current block starts with a
    # multiline quote header. Otherwise the Fragment is left open.
    def scan_signature_or_quote
      if signature_line?(@fragment.lines.first)
        @fragment.signature = true
        finish_fragment
      elsif multiline_quote_header_in_fragment?
        @fragment.quoted = true
        finish_fragment
      end
    end

    # Returns +true+ if the current block in the current fragment has
    # a multiline quote header, +false+ otherwise.
    #
    # The quote header we're looking for is mainly generated by Outlook
    # clients. It's considered a quote header if the first 4 folded lines
    # have one of the following forms:
    #
    #     label: some text
    #     *label:* some text
    #
    # where a line like this:
    #
    #     label: some text
    #       possibly indented text that belongs to the previous line
    #
    # is folded into:
    #
    #     label: some text possibly indented text that belongs to the previous line
    #
    # and where label is a value from +QUOTE_HEADER_LABELS+ that appears
    # only once in the first 4 lines and where each group of a label
    # is represented at most once.
    def multiline_quote_header_in_fragment?
      folding = false
      label_groups = []
      @fragment.current_block.split("\n").each do |line|
        if line =~ /\A\s*\*?([^:]+):(\s|\*)/
          label = QUOTE_HEADER_LABELS[$1.downcase]
          if label
            return false if label_groups.include?(label)
            # A fourth distinct label group completes the header.
            return true if label_groups.length == 3
            label_groups << label
            folding = true
          elsif !folding
            return false
          end
        elsif !folding
          # A continuation line is only allowed after a recognised label.
          return false
        end
      end
      false
    end

    # Detects if a given line is the beginning of a signature
    #
    # line - A String line of text from the email.
    #
    # Returns a truthy value if the line is the beginning of a signature,
    # or a falsy value otherwise.
    def signature_line?(line)
      line =~ SIGNATURE_REGEX || line_is_signature_name?(line)
    end

    # Detects if a given line is a common reply header.
    #
    # line - A String line of text from the email.
    #
    # Returns true if the line is a valid header, or false.
    def line_is_reply_header?(line)
      COMMON_REPLY_HEADER_REGEXES.any? { |regex| line =~ regex }
    end

    # Detects if the @from name is a big part of a given line and therefore the beginning of a signature
    #
    # line - A String line of text from the email.
    #
    # Returns true if @from_name is a big part of the line, or false.
    def line_is_signature_name?(line)
      # Without a usable name the generated pattern would match every line.
      return false if @from_name_normalized.strip.empty?

      !!(line =~ generate_regexp_for_name) &&
        (@from_name_normalized.size.to_f / line.size) > 0.25
    end

    # Generates a case-insensitive regexp for the sender's name which allows
    # for additional words or initials between first and last names.
    #
    # Returns a Regexp.
    def generate_regexp_for_name
      separator = '[\w.\s]*'
      name_parts = @from_name_normalized.split(" ").map { |part| Regexp.escape(part) }
      Regexp.new(name_parts.join(separator), Regexp::IGNORECASE)
    end

    # Builds the fragment string, after all lines have been added.
    # It also checks to see if this Fragment is hidden. The hidden
    # Fragment check reads from the bottom to the top.
    #
    # Any quoted Fragments or signature Fragments are marked hidden if they
    # are below any visible Fragments. Visible Fragments are expected to
    # contain original content by the author. If they are below a quoted
    # Fragment, then the Fragment should be visible to give context to the
    # reply.
    #
    #     some original text (visible)
    #
    #     > do you have any two's? (quoted, visible)
    #
    #     Go fish! (visible)
    #
    #     > --
    #     > Player 1 (quoted, hidden)
    #
    #     --
    #     Player 2 (signature, hidden)
    #
    def finish_fragment
      if @fragment
        @fragment.finish
        unless @found_visible
          if @fragment.quoted? || @fragment.signature? ||
              @fragment.reply_header? || @fragment.to_s.strip == EMPTY
            @fragment.hidden = true
          else
            @found_visible = true
          end
        end
        @fragments << @fragment
      end
      @fragment = nil
    end
  end

  # Represents a group of paragraphs in the email sharing common attributes.
  # Paragraphs should get their own fragment if they are a quoted area or a
  # signature.
  class Fragment < Struct.new(:quoted, :signature, :reply_header, :hidden)
    # Array of string lines that make up the content of this fragment.
    # It is nil once the fragment has been finished.
    attr_reader :lines

    # This is reserved for the joined String that is built when this Fragment
    # is finished.
    attr_reader :content

    def initialize
      self.quoted = self.signature = self.reply_header = self.hidden = false
      @lines = []
      @current_block = []
      @content = nil
    end

    alias quoted? quoted
    alias signature? signature
    alias reply_header? reply_header
    alias hidden? hidden

    # Prepends a line to the fragment (the email is scanned bottom-up, so
    # prepending keeps the lines in reading order). An empty line starts a
    # new current block.
    #
    # line - A String line, or nil (ignored).
    def add_line(line)
      return unless line
      @lines.unshift(line)
      if line == ""
        @current_block.clear
      else
        @current_block.unshift(line)
      end
    end

    # The lines added since the most recent empty line, joined by newlines.
    # Only available until the fragment is finished.
    #
    # Returns a String.
    def current_block
      @current_block.join("\n")
    end

    # Builds the string content by joining the lines, then releases the
    # line buffers.
    def finish
      @content = @lines.join("\n")
      @lines = @current_block = nil
    end

    # Returns the fragment text: the joined lines while the fragment is still
    # open, the stored content once it is finished.
    def to_s
      @lines ? @lines.join("\n") : @content
    end

    def inspect
      "#{super.inspect} : #{to_s.inspect}"
    end
  end
end
464
+