tr_email_reply_parser 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. checksums.yaml +15 -0
  2. data/LICENSE +22 -0
  3. data/README.md +68 -0
  4. data/Rakefile +135 -0
  5. data/lib/tr_email_reply_parser.rb +464 -0
  6. data/test/email_reply_parser_test.rb +441 -0
  7. data/test/emails/correct_sig.txt +4 -0
  8. data/test/emails/email_1_1.txt +13 -0
  9. data/test/emails/email_1_2.txt +51 -0
  10. data/test/emails/email_1_3.txt +55 -0
  11. data/test/emails/email_1_4.txt +5 -0
  12. data/test/emails/email_1_5.txt +15 -0
  13. data/test/emails/email_1_6.txt +15 -0
  14. data/test/emails/email_1_7.txt +12 -0
  15. data/test/emails/email_1_8.txt +6 -0
  16. data/test/emails/email_1_9.txt +9 -0
  17. data/test/emails/email_2_1.txt +25 -0
  18. data/test/emails/email_2_2.txt +10 -0
  19. data/test/emails/email_2_3.txt +14 -0
  20. data/test/emails/email_2_4.txt +14 -0
  21. data/test/emails/email_2_5.txt +15 -0
  22. data/test/emails/email_2_6.txt +11 -0
  23. data/test/emails/email_2_7.txt +5 -0
  24. data/test/emails/email_2_8.txt +4 -0
  25. data/test/emails/email_2_9.txt +9 -0
  26. data/test/emails/email_2nd_paragraph_starting_with_on.txt +12 -0
  27. data/test/emails/email_BlackBerry.txt +3 -0
  28. data/test/emails/email_bullets.txt +22 -0
  29. data/test/emails/email_from_address_in_quote_header.txt +12 -0
  30. data/test/emails/email_from_name_in_quote_header.txt +12 -0
  31. data/test/emails/email_hyphens.txt +5 -0
  32. data/test/emails/email_iPhone.txt +3 -0
  33. data/test/emails/email_mentions_own_email_address.txt +6 -0
  34. data/test/emails/email_mentions_own_name.txt +6 -0
  35. data/test/emails/email_multi_word_sent_from_my_mobile_device.txt +3 -0
  36. data/test/emails/email_multiline_quote_header_es_mx.txt +8 -0
  37. data/test/emails/email_multiline_quote_header_fr.txt +8 -0
  38. data/test/emails/email_multiline_quote_header_from_first.txt +11 -0
  39. data/test/emails/email_multiline_quote_header_from_replyto_date_to_subject.txt +12 -0
  40. data/test/emails/email_multiline_quote_header_from_to_date_subject.txt +11 -0
  41. data/test/emails/email_multiline_quote_header_none.txt +11 -0
  42. data/test/emails/email_multiline_quote_header_pt_br.txt +8 -0
  43. data/test/emails/email_multiline_quote_header_with_asterisks.txt +21 -0
  44. data/test/emails/email_multiline_quote_header_with_cc.txt +9 -0
  45. data/test/emails/email_multiline_quote_header_with_multiline_headers.txt +14 -0
  46. data/test/emails/email_no_signature_deliminator.txt +7 -0
  47. data/test/emails/email_no_signature_deliminator_adds_a_middle_initial.txt +7 -0
  48. data/test/emails/email_one_is_not_on.txt +10 -0
  49. data/test/emails/email_sent_from_my_not_signature.txt +3 -0
  50. data/test/emails/email_was_showing_as_nothing_visible.txt +13 -0
  51. data/test/emails/new_content/email_1_2.txt +28 -0
  52. data/tr_email_reply_parser.gemspec +123 -0
  53. metadata +143 -0
checksums.yaml ADDED
@@ -0,0 +1,15 @@
1
+ ---
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: !binary |-
4
+ OWFiZWI0NDQ0Zjk3NmU2MGJmZTc2ZGI4MmQ0NDgxMzAzNzNiNmU3Mg==
5
+ data.tar.gz: !binary |-
6
+ NDc0ZTM1NWRkMjViNmE1MjlhZDBiMGVjMWVlNzc2YjAwNWNjZTA2Nw==
7
+ !binary "U0hBNTEy":
8
+ metadata.gz: !binary |-
9
+ MWVhNjJkZTVmNWMyZTlkMmM4OTc1ZjBmZTc3MjNmMWJkNGQ1MjJiMzBmNDZm
10
+ Mjg2MTM4NDAzNGM0YTg3N2Q4Y2JkYmVhZDlkNjBhMjU1NWZiNjBlZmNlNGQ4
11
+ NTZiOGZkNjg4MDgyMWQ0ZDkyY2E2MmM4ZmFiMWMwZmFhYTFlZjY=
12
+ data.tar.gz: !binary |-
13
+ MDk2OGVhZmFiYTIxYTFiYmRiYjcyNDE1NTUzNmVmN2QwNzQxNTRhYjAzMDE2
14
+ NjNkMGE4ZmY0M2ExNzcyOTYwZWVlMzg3MWUzNmRlZGJjZDNhMTZjMzExODdh
15
+ OWUxMjY5MzIzYTkwNWE2MWY4N2E0OGIwZDVjMzJkOWQ0MWMyOTg=
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ The MIT License
2
+
3
+ Copyright (c) GitHub
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
22
+
data/README.md ADDED
@@ -0,0 +1,68 @@
1
+ # Email Reply Parser
2
+
3
+ [![Build Status](https://secure.travis-ci.org/lawrencepit/email_reply_parser.png?branch=master)](http://travis-ci.org/lawrencepit/email_reply_parser)
4
+ [![Code Climate](https://codeclimate.com/badge.png)](https://codeclimate.com/github/lawrencepit/email_reply_parser)
5
+ [![Gem Version](https://fury-badge.herokuapp.com/rb/email_reply_parser.png)](http://badge.fury.io/rb/email_reply_parser)
6
+
7
+ EmailReplyParser is a small library to parse plain text email content.
8
+
9
+ This is what GitHub uses to display comments that were created from
10
+ email replies. This code is being open sourced in an effort to
11
+ crowdsource the quality of our email representation.
12
+
13
+ ## Usage
14
+
15
+ To parse reply body:
16
+
17
+ `parsed_body = EmailReplyParser.parse_reply(email_body, from_address)`
18
+
19
+ Argument `from_address` is optional. If included, the parser will attempt to strip signatures based on the name in the from address (useful when the signature doesn't have a standard delimiter).
20
+
21
+ ## Installation
22
+
23
+ Get it from [GitHub][github] or `gem install email_reply_parser`. Run `rake` to run the tests.
24
+
25
+ [github]: https://github.com/github/email_reply_parser
26
+
27
+ ## Contribute
28
+
29
+ If you'd like to hack on EmailReplyParser, start by forking the repo on GitHub:
30
+
31
+ https://github.com/github/email_reply_parser
32
+
33
+ The best way to get your changes merged back into core is as follows:
34
+
35
+ * Clone down your fork
36
+ * Create a thoughtfully named topic branch to contain your change
37
+ * Hack away
38
+ * Add tests and make sure everything still passes by running rake
39
+ * If you are adding new functionality, document it in the README
40
+ * Do not change the version number, I will do that on my end
41
+ * If necessary, rebase your commits into logical chunks, without errors
42
+ * Push the branch up to GitHub
43
+ * Send a pull request to the `github/email_reply_parser` project.
44
+
45
+ ## Known Issues
46
+
47
+ ### Quoted Headers
48
+
49
+ Quoted headers like these currently don't work with other languages:
50
+
51
+ On <date>, <author> wrote:
52
+
53
+ > blah
54
+
55
+ ### Weird Signatures
56
+
57
+ Not everyone follows this convention:
58
+
59
+ Hello
60
+
61
+ Saludos!!!!!!!!!!!!!!
62
+ Galactic President Superstar Mc Awesomeville
63
+ GitHub
64
+
65
+ **********************DISCLAIMER***********************************
66
+ * Note: blah blah blah *
67
+ **********************DISCLAIMER***********************************
68
+
data/Rakefile ADDED
@@ -0,0 +1,135 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+ require 'date'
4
+
5
+ #############################################################################
6
+ #
7
+ # Helper functions
8
+ #
9
+ #############################################################################
10
+
11
+ def name
12
+ @name ||= Dir['*.gemspec'].first.split('.').first
13
+ end
14
+
15
+ def version
16
+ line = File.read("lib/#{name}.rb")[/^\s*VERSION\s*=\s*.*/]
17
+ line.match(/.*VERSION\s*=\s*['"](.*)['"]/)[1]
18
+ end
19
+
20
+ def date
21
+ Date.today.to_s
22
+ end
23
+
24
+ def rubyforge_project
25
+ name
26
+ end
27
+
28
+ def gemspec_file
29
+ "#{name}.gemspec"
30
+ end
31
+
32
+ def gem_file
33
+ "#{name}-#{version}.gem"
34
+ end
35
+
36
+ def replace_header(head, header_name)
37
+ head.sub!(/(\.#{header_name}\s*= ').*'/) { "#{$1}#{send(header_name)}'"}
38
+ end
39
+
40
+ #############################################################################
41
+ #
42
+ # Standard tasks
43
+ #
44
+ #############################################################################
45
+
46
+ task :default => :test
47
+
48
+ require 'rake/testtask'
49
+ Rake::TestTask.new(:test) do |test|
50
+ test.libs << 'lib' << 'test'
51
+ test.pattern = 'test/*_test.rb'
52
+ test.verbose = true
53
+ end
54
+
55
+ desc "Open an irb session preloaded with this library"
56
+ task :console do
57
+ sh "irb -rubygems -r ./lib/#{name}.rb"
58
+ end
59
+
60
+ #############################################################################
61
+ #
62
+ # Custom tasks (add your own tasks here)
63
+ #
64
+ #############################################################################
65
+
66
+
67
+
68
+ #############################################################################
69
+ #
70
+ # Packaging tasks
71
+ #
72
+ #############################################################################
73
+
74
+ desc "Create tag v#{version} and build and push #{gem_file} to Rubygems"
75
+ task :release => :build do
76
+ unless `git branch` =~ /^\* master$/
77
+ puts "You must be on the master branch to release!"
78
+ exit!
79
+ end
80
+ sh "git commit --allow-empty -a -m 'Release #{version}'"
81
+ sh "git tag v#{version}"
82
+ sh "git push origin master"
83
+ sh "git push origin v#{version}"
84
+ sh "gem push pkg/#{name}-#{version}.gem"
85
+ end
86
+
87
+ desc "Build #{gem_file} into the pkg directory"
88
+ task :build => :gemspec do
89
+ sh "mkdir -p pkg"
90
+ sh "gem build #{gemspec_file}"
91
+ sh "mv #{gem_file} pkg"
92
+ end
93
+
94
+ desc "Generate #{gemspec_file}"
95
+ task :gemspec => :validate do
96
+ # read spec file and split out manifest section
97
+ spec = File.read(gemspec_file)
98
+ head, manifest, tail = spec.split(" # = MANIFEST =\n")
99
+
100
+ # replace name version and date
101
+ replace_header(head, :name)
102
+ replace_header(head, :version)
103
+ replace_header(head, :date)
104
+ #comment this out if your rubyforge_project has a different name
105
+ replace_header(head, :rubyforge_project)
106
+
107
+ # determine file list from git ls-files
108
+ files = `git ls-files`.
109
+ split("\n").
110
+ sort.
111
+ reject { |file| file =~ /^\./ }.
112
+ reject { |file| file =~ /^(rdoc|pkg)/ }.
113
+ map { |file| " #{file}" }.
114
+ join("\n")
115
+
116
+ # piece file back together and write
117
+ manifest = " s.files = %w[\n#{files}\n ]\n"
118
+ spec = [head, manifest, tail].join(" # = MANIFEST =\n")
119
+ File.open(gemspec_file, 'w') { |io| io.write(spec) }
120
+ puts "Updated #{gemspec_file}"
121
+ end
122
+
123
+ desc "Validate #{gemspec_file}"
124
+ task :validate do
125
+ libfiles = Dir['lib/*'] - ["lib/#{name}.rb", "lib/#{name}"]
126
+ unless libfiles.empty?
127
+ puts "Directory `lib` should only contain a `#{name}.rb` file and `#{name}` dir."
128
+ exit!
129
+ end
130
+ unless Dir['VERSION*'].empty?
131
+ puts "A `VERSION` file at root level violates Gem best practices."
132
+ exit!
133
+ end
134
+ end
135
+
@@ -0,0 +1,464 @@
1
+ require 'strscan'
2
+
3
+ # EmailReplyParser is a small library to parse plain text email content. The
4
+ # goal is to identify which fragments are quoted, part of a signature, or
5
+ # original body content. We want to support both top and bottom posters, so
6
+ # no simple "REPLY ABOVE HERE" content is used.
7
+ #
8
+ # Beyond RFC 5322 (which is handled by the [Ruby mail gem][mail]), there aren't
9
+ # any real standards for how emails are created. This attempts to parse out
10
+ # common conventions for things like replies:
11
+ #
12
+ # this is some text
13
+ #
14
+ # On <date>, <author> wrote:
15
+ # > blah blah
16
+ # > blah blah
17
+ #
18
+ # ... and signatures:
19
+ #
20
+ # this is some text
21
+ #
22
+ # --
23
+ # Bob
24
+ # http://homepage.com/~bob
25
+ #
26
+ # Each of these is parsed into a Fragment object.
27
+ #
28
+ # EmailReplyParser also attempts to figure out which of these blocks should
29
+ # be hidden from users.
30
+ #
31
+ # [mail]: https://github.com/mikel/mail
32
class EmailReplyParser
  VERSION = "0.6.0"

  # Public: Splits an email body into a list of Fragments.
  #
  # text         - A String email body.
  # from_address - from address of the email (optional)
  #
  # Returns an Email instance.
  def self.read(text, from_address = "")
    Email.new.read(text, from_address)
  end

  # Public: Get the text of the visible portions of the given email body.
  #
  # text         - A String email body (nil is treated as an empty body).
  # from_address - from address of the email (optional)
  #
  # Returns a String.
  def self.parse_reply(text, from_address = "")
    read(text.to_s, from_address).visible_text
  end

  # Public: Get only the text of fragments that are neither quoted, hidden
  # nor a signature.
  #
  # text         - A String email body (nil is treated as an empty body,
  #                consistent with parse_reply).
  # from_address - from address of the email (optional)
  #
  # Returns a String.
  def self.parse_new_content(text, from_address = "")
    read(text.to_s, from_address).new_content
  end

  ### Emails

  # An Email instance represents a parsed body String.
  class Email
    # Emails have an Array of Fragments.
    attr_reader :fragments

    def initialize
      @fragments = []
    end

    # Public: Gets the combined text of the visible fragments of the email body.
    #
    # Returns a String.
    def visible_text
      fragments.reject { |f| f.hidden? }.map { |f| f.to_s }.join("\n").rstrip
    end

    # Public: Gets the combined text of the fragments that are not quoted,
    # not hidden and not a signature.
    #
    # Returns a String.
    def new_content
      fresh = fragments.select { |f| !f.quoted? && !f.hidden? && !f.signature? }
      fresh.map { |f| f.to_s }.join("\n").rstrip
    end

    # Splits the given text into a list of Fragments. This is roughly done by
    # reversing the text and parsing from the bottom to the top. This way we
    # can check for 'On <date>, <author> wrote:' lines above quoted blocks.
    #
    # text         - A String email body (coerced with to_s).
    # from_address - from address of the email (optional, coerced with to_s)
    #
    # Returns this same Email instance.
    def read(text, from_address = "")
      # Tolerate nil (or other non-String) arguments instead of raising.
      from_address = from_address.to_s

      # Parse out the from name if one exists and save it for use later.
      @from_name_raw = parse_raw_name_from_address(from_address)
      @from_name_normalized = normalize_name(@from_name_raw)
      @from_email = parse_email_from_address(from_address)

      # The text is reversed initially due to the way we check for hidden
      # fragments.
      text = normalize_text(text.to_s).reverse

      # This determines if any 'visible' Fragment has been found. Once any
      # visible Fragment is found, stop looking for hidden ones.
      @found_visible = false

      # This instance variable points to the current Fragment. If the matched
      # line fits, it should be added to this Fragment. Otherwise, finish it
      # and start a new Fragment.
      @fragment = nil

      # Use the StringScanner to pull out each line of the email content.
      @scanner = StringScanner.new(text)
      while line = @scanner.scan_until(/\n/n)
        scan_line(line)
      end

      # Be sure to parse the last line of the email.
      if (last_line = @scanner.rest.to_s).size > 0
        scan_line(last_line, true)
      end

      # Finish up the final fragment. Finishing a fragment will detect any
      # attributes (hidden, signature, reply), and join each line into a
      # string.
      finish_fragment

      @scanner = @fragment = nil

      # Now that parsing is done, reverse the order.
      @fragments.reverse!
      self
    end

    EMPTY = "".freeze

    COMMON_REPLY_HEADER_REGEXES = [
      /^On(.+)wrote:$/nm,
      /\A\d{4}\/\d{1,2}\/\d{1,2}\s+.{1,80}\s<[^@]+@[^@]+>\Z/,
    ]

    # Line optionally starts with whitespace, contains two or more hyphens or
    # underscores, and ends with optional whitespace.
    # Example: '---' or '___' or '---   '
    MULTI_LINE_SIGNATURE_REGEX = /^\s*[-_]{2,}\s*$/

    # Line optionally starts with whitespace, followed by one hyphen, followed
    # by a word character.
    # Example: '-Sandro'
    ONE_LINE_SIGNATURE_REGEX = /^\s*-\w/

    # Line made only of whitespace, underscores and hyphens, optionally
    # wrapped around the words "Original Message".
    ORIGINAL_MESSAGE_SIGNATURE_REGEX = /^[\s_-]+(Original Message)?[\s_-]+$/

    # Line starting with "Sent from my", followed by one to three words and an
    # optional trailing <...> part.
    # Example: "Sent from my iPhone 3G"
    SENT_FROM_REGEX = /^Sent from my (\s*\w+){1,3}(\s*<.*>)?$/

    # Union of all signature patterns. Where the Ruby version supports it, the
    # regexp is compiled with NOENCODING.
    signature_source = Regexp.union(
      MULTI_LINE_SIGNATURE_REGEX,
      ONE_LINE_SIGNATURE_REGEX,
      ORIGINAL_MESSAGE_SIGNATURE_REGEX,
      SENT_FROM_REGEX
    ).source
    SIGNATURE_REGEX =
      if defined?(Regexp::NOENCODING)
        Regexp.new(signature_source, Regexp::NOENCODING)
      else
        Regexp.new(signature_source)
      end

    # TODO: refactor out in a i18n.yml file
    # Supports English, French, Es-Mexican, Pt-Brazilian
    # Maps a (downcased) label to a label-group
    QUOTE_HEADER_LABELS = {
      :from => ["From", "De"],
      :to => ["To", "Para", "A"],
      :cc => ["CC"],
      :reply_to => ["Reply-To"],
      :date => ["Date", "Sent", "Enviado", "Enviada em", "Fecha"],
      :subject => ["Subject", "Assunto", "Asunto", "Objet"]
    }.inject({}) do |lookup, (group, labels)|
      labels.each { |label| lookup[label.downcase] = group }
      lookup
    end

    private

    # Normalize text so it is easier to parse.
    #
    # text - text to normalize
    #
    # Returns a String.
    def normalize_text(text)
      # in 1.9 we want to operate on the raw bytes
      text = text.dup.force_encoding('binary') if text.respond_to?(:force_encoding)

      # Normalize line endings.
      text.gsub!("\r\n", "\n")

      # Check for multi-line reply headers. Some clients break up
      # the "On DATE, NAME <EMAIL> wrote:" line into multiple lines.
      if match = text.match(/^(On\s(.+)wrote:)$/m)
        header = match[1]
        # Remove all new lines from the reply header, as long as it has no
        # double newline; if it does, we have grabbed something that is not
        # actually a reply header.
        unless header =~ /\n\n/
          joined = header.gsub("\n", " ")
          # Block form: the replacement is inserted literally, so backslash
          # sequences inside the header are not treated as back-references.
          text.gsub!(header) { joined }
        end
      end

      # Some users may reply directly above a line of underscores.
      # In order to ensure that these fragments are split correctly,
      # make sure that all lines of underscores are preceded by
      # at least two newline characters.
      text.gsub!(/([^\n])(?=\n_{7}_+)$/m, "\\1\n")

      text
    end

    # Parse a person's name from an e-mail address.
    #
    # address - String from address, e.g. '"Last, First" <user@host>'.
    #
    # Returns the normalized name String ("" when no name is present).
    def parse_name_from_address(address)
      normalize_name(parse_raw_name_from_address(address))
    end

    # Extracts the display name that precedes the <...> part of an address.
    #
    # address - String from address.
    #
    # Returns the stripped name String, or "" when there is none.
    def parse_raw_name_from_address(address)
      match = address.match(/^["']*([\w\s,]+)["']*\s*</)
      match ? match[1].strip.to_s : EMPTY
    end

    # Extracts the bare e-mail address from between angle brackets.
    #
    # address - String from address.
    #
    # Returns the text inside <...>, or the address itself when there are no
    # angle brackets.
    def parse_email_from_address(address)
      match = address.match(/<(.*)>/)
      match ? match[1] : address
    end

    # Normalize a name to First Last.
    #
    # name - name to normalize.
    #
    # Returns a String.
    def normalize_name(name)
      name.include?(',') ? make_name_first_then_last(name) : name
    end

    # Turns "Last, First" into "First Last".
    #
    # When the part before the first comma already contains a space it is
    # returned as is. When nothing usable follows the comma (e.g. "Smith,")
    # the stripped first part is returned instead of raising.
    #
    # name - String containing at least one comma.
    #
    # Returns a String.
    def make_name_first_then_last(name)
      parts = name.split(',')
      before_comma = parts[0].to_s
      return before_comma if before_comma.include?(" ")

      after_comma = parts[1].to_s.strip
      return before_comma.strip if after_comma.empty?

      after_comma + " " + before_comma.strip
    end

    ### Line-by-Line Parsing

    # Scans the given line of text and determines which fragment it belongs to.
    #
    # line - A String line taken from the reversed text (mutated in place).
    # last - true when this is the final line handed to the parser.
    def scan_line(line, last = false)
      line.chomp!("\n")
      line.reverse!
      line.rstrip!

      # Mark the current Fragment as a signature if the current line is empty
      # and the Fragment starts with a common signature indicator.
      # Mark the current Fragment as a quote if the current line is empty
      # and the Fragment starts with a multiline quote header.
      scan_signature_or_quote if @fragment && line == EMPTY

      # We're looking for leading `>`'s to see if this line is part of a
      # quoted Fragment.
      is_quoted = !!(line =~ /^>+/n)

      unless continues_current_fragment?(line, is_quoted)
        finish_fragment
        @fragment = Fragment.new
        @fragment.quoted = is_quoted
      end

      @fragment.add_line(line)
      scan_signature_or_quote if last
    end

    # Decides whether the line belongs to the Fragment being built.
    #
    # Note that a common reply header (or an empty line) also counts as part
    # of a quoted Fragment, even though it doesn't start with `>`.
    def continues_current_fragment?(line, is_quoted)
      return false unless @fragment
      return true if @fragment.quoted? == is_quoted

      @fragment.quoted? && (line_is_reply_header?(line) || line == EMPTY)
    end

    # Flags the current Fragment as a signature or as a quote (and finishes
    # it) when its first line looks like a signature start or when its current
    # block opens with a multiline quote header.
    def scan_signature_or_quote
      if signature_line?(@fragment.lines.first)
        @fragment.signature = true
        finish_fragment
      elsif multiline_quote_header_in_fragment?
        @fragment.quoted = true
        finish_fragment
      end
    end

    # Returns +true+ if the current block in the current fragment has
    # a multiline quote header, +false+ otherwise.
    #
    # The quote header we're looking for is mainly generated by Outlook
    # clients. It's considered a quote header if the first 4 folded lines
    # have one of the following forms:
    #
    #   label: some text
    #   *label:* some text
    #
    # where a line like this:
    #
    #   label: some text
    #     possibly indented text that belongs to the previous line
    #
    # is folded into:
    #
    #   label: some text possibly indented text that belongs to the previous line
    #
    # and where label is a value from +QUOTE_HEADER_LABELS+ that appears
    # only once in the first 4 lines and where each group of a label
    # is represented at most once.
    def multiline_quote_header_in_fragment?
      folding = false
      label_groups = []
      @fragment.current_block.split("\n").each do |line|
        label = nil
        if line =~ /\A\s*\*?([^:]+):(\s|\*)/
          label = QUOTE_HEADER_LABELS[$1.downcase]
        end

        if label
          return false if label_groups.include?(label)
          return true if label_groups.length == 3
          label_groups << label
          folding = true
        elsif !folding
          # Neither a known header line nor a continuation of one.
          return false
        end
      end
      false
    end

    # Detects if a given line is the beginning of a signature.
    #
    # line - A String line of text from the email.
    #
    # Returns a truthy value if the line is the beginning of a signature.
    def signature_line?(line)
      line =~ SIGNATURE_REGEX || line_is_signature_name?(line)
    end

    # Detects if a given line is a common reply header.
    #
    # line - A String line of text from the email.
    #
    # Returns true if the line is a valid header, or false.
    def line_is_reply_header?(line)
      COMMON_REPLY_HEADER_REGEXES.any? { |regex| line =~ regex }
    end

    # Detects if the from name is a big part of a given line and therefore the
    # beginning of a signature.
    #
    # line - A String line of text from the email.
    #
    # Returns a truthy value if the from name matches the line and its length
    # is more than a quarter of the line's length.
    def line_is_signature_name?(line)
      return false if @from_name_normalized == ""

      (line =~ generate_regexp_for_name) &&
        ((@from_name_normalized.size.to_f / line.size) > 0.25)
    end

    # Generates a case-insensitive regexp that allows additional words or
    # initials between the parts of the from name.
    def generate_regexp_for_name
      separator = '[\w.\s]*'
      name_parts = @from_name_normalized.split(" ").map { |part| Regexp.escape(part) }
      Regexp.new(name_parts.join(separator), Regexp::IGNORECASE)
    end

    # Builds the fragment string, after all lines have been added.
    # It also checks to see if this Fragment is hidden. The hidden
    # Fragment check reads from the bottom to the top.
    #
    # Any quoted Fragments or signature Fragments are marked hidden if they
    # are below any visible Fragments. Visible Fragments are expected to
    # contain original content by the author. If they are below a quoted
    # Fragment, then the Fragment should be visible to give context to the
    # reply.
    #
    #     some original text (visible)
    #
    #     > do you have any two's? (quoted, visible)
    #
    #     Go fish! (visible)
    #
    #     > --
    #     > Player 1 (quoted, hidden)
    #
    #     --
    #     Player 2 (signature, hidden)
    #
    def finish_fragment
      if @fragment
        @fragment.finish
        if !@found_visible
          if @fragment.quoted? || @fragment.signature? ||
              @fragment.reply_header? || @fragment.to_s.strip == EMPTY
            @fragment.hidden = true
          else
            @found_visible = true
          end
        end
        @fragments << @fragment
      end
      @fragment = nil
    end
  end

  # Represents a group of paragraphs in the email sharing common attributes.
  # Paragraphs should get their own fragment if they are a quoted area or a
  # signature.
  class Fragment < Struct.new(:quoted, :signature, :reply_header, :hidden)
    # Array of string lines that make up the content of this fragment
    # (nil once the fragment is finished).
    attr_reader :lines

    # This is reserved for the joined String that is built when this Fragment
    # is finished.
    attr_reader :content

    def initialize
      self.quoted = self.signature = self.reply_header = self.hidden = false
      @lines = []
      @current_block = []
      @content = nil
    end

    alias quoted? quoted
    alias signature? signature
    alias reply_header? reply_header
    alias hidden? hidden

    # Prepends a line to the fragment (lines arrive bottom-up). An empty line
    # resets the current block; any other line is prepended to it.
    #
    # line - A String line, or nil (ignored).
    def add_line(line)
      return unless line
      @lines.unshift(line)
      if line == ""
        @current_block.clear
      else
        @current_block.unshift(line)
      end
    end

    # The lines added since the most recent empty line, joined with newlines.
    #
    # Returns a String.
    def current_block
      @current_block.join("\n")
    end

    # Builds the string content by joining the lines, then releases the line
    # buffers.
    def finish
      @content = @lines.join("\n")
      @lines = @current_block = nil
    end

    def to_s
      @lines ? @lines.join("\n") : @content
    end

    def inspect
      "#{super.inspect} : #{to_s.inspect}"
    end
  end
end
464
+