email_reply_parser-discourse 0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE +22 -0
  3. data/README.md +68 -0
  4. data/Rakefile +135 -0
  5. data/email_reply_parser.gemspec +122 -0
  6. data/lib/email_reply_parser.rb +456 -0
  7. data/test/email_reply_parser_test.rb +431 -0
  8. data/test/emails/correct_sig.txt +4 -0
  9. data/test/emails/email_1_1.txt +13 -0
  10. data/test/emails/email_1_2.txt +51 -0
  11. data/test/emails/email_1_3.txt +55 -0
  12. data/test/emails/email_1_4.txt +5 -0
  13. data/test/emails/email_1_5.txt +15 -0
  14. data/test/emails/email_1_6.txt +15 -0
  15. data/test/emails/email_1_7.txt +12 -0
  16. data/test/emails/email_1_8.txt +6 -0
  17. data/test/emails/email_1_9.txt +9 -0
  18. data/test/emails/email_2_1.txt +25 -0
  19. data/test/emails/email_2_2.txt +10 -0
  20. data/test/emails/email_2_3.txt +14 -0
  21. data/test/emails/email_2_4.txt +14 -0
  22. data/test/emails/email_2_5.txt +15 -0
  23. data/test/emails/email_2_6.txt +11 -0
  24. data/test/emails/email_2_7.txt +5 -0
  25. data/test/emails/email_2_8.txt +4 -0
  26. data/test/emails/email_2_9.txt +9 -0
  27. data/test/emails/email_2nd_paragraph_starting_with_on.txt +12 -0
  28. data/test/emails/email_BlackBerry.txt +3 -0
  29. data/test/emails/email_bullets.txt +22 -0
  30. data/test/emails/email_from_address_in_quote_header.txt +12 -0
  31. data/test/emails/email_from_name_in_quote_header.txt +12 -0
  32. data/test/emails/email_hyphens.txt +5 -0
  33. data/test/emails/email_iPhone.txt +3 -0
  34. data/test/emails/email_mentions_own_email_address.txt +6 -0
  35. data/test/emails/email_mentions_own_name.txt +6 -0
  36. data/test/emails/email_multi_word_sent_from_my_mobile_device.txt +3 -0
  37. data/test/emails/email_multiline_quote_header_es_mx.txt +8 -0
  38. data/test/emails/email_multiline_quote_header_fr.txt +8 -0
  39. data/test/emails/email_multiline_quote_header_from_first.txt +11 -0
  40. data/test/emails/email_multiline_quote_header_from_replyto_date_to_subject.txt +12 -0
  41. data/test/emails/email_multiline_quote_header_from_to_date_subject.txt +11 -0
  42. data/test/emails/email_multiline_quote_header_none.txt +11 -0
  43. data/test/emails/email_multiline_quote_header_pt_br.txt +8 -0
  44. data/test/emails/email_multiline_quote_header_with_asterisks.txt +21 -0
  45. data/test/emails/email_multiline_quote_header_with_cc.txt +9 -0
  46. data/test/emails/email_multiline_quote_header_with_multiline_headers.txt +14 -0
  47. data/test/emails/email_no_signature_deliminator.txt +7 -0
  48. data/test/emails/email_no_signature_deliminator_adds_a_middle_initial.txt +7 -0
  49. data/test/emails/email_one_is_not_on.txt +10 -0
  50. data/test/emails/email_sent_from_my_not_signature.txt +3 -0
  51. data/test/emails/email_was_showing_as_nothing_visible.txt +13 -0
  52. metadata +96 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: f3702e6f07642f5ffb98621fc3888d29c8395a22
4
+ data.tar.gz: 02c0a97683bca63070e9776c43e31b8b4106b089
5
+ SHA512:
6
+ metadata.gz: be413b85292da182617366a2fd4bc72de3cca647a6f048100d2a5bf027cd136067924916953a237263b8deeb4b34c8c7512cb9f464aa8f57215c90e3add3e977
7
+ data.tar.gz: 70c9789054b9ff4cda0103a79f7490dd907cfb103e9d4cb9bb05138694846bf17fe2c6b2833b44000b79700ec927e647fdabb2d3c3c5e4d1acebbd555ee60702
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ The MIT License
2
+
3
+ Copyright (c) GitHub
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
22
+
data/README.md ADDED
@@ -0,0 +1,68 @@
1
+ # Email Reply Parser
2
+
3
+ [![Build Status](https://secure.travis-ci.org/lawrencepit/email_reply_parser.png?branch=master)](http://travis-ci.org/lawrencepit/email_reply_parser)
4
+ [![Code Climate](https://codeclimate.com/badge.png)](https://codeclimate.com/github/lawrencepit/email_reply_parser)
5
+ [![Gem Version](https://fury-badge.herokuapp.com/rb/email_reply_parser.png)](http://badge.fury.io/rb/email_reply_parser)
6
+
7
+ EmailReplyParser is a small library to parse plain text email content.
8
+
9
+ This is what GitHub uses to display comments that were created from
10
+ email replies. This code is being open sourced in an effort to
11
+ crowdsource the quality of our email representation.
12
+
13
+ ## Usage
14
+
15
+ To parse reply body:
16
+
17
+ `parsed_body = EmailReplyParser.parse_reply(email_body, from_address)`
18
+
19
+ Argument `from_address` is optional. If included, the parser will attempt to strip signatures based on the name in the from address (useful when the signature doesn't have a standard delimiter).
20
+
21
+ ## Installation
22
+
23
+ Get it from [GitHub][github] or `gem install email_reply_parser`. Run `rake` to run the tests.
24
+
25
+ [github]: https://github.com/github/email_reply_parser
26
+
27
+ ## Contribute
28
+
29
+ If you'd like to hack on EmailReplyParser, start by forking the repo on GitHub:
30
+
31
+ https://github.com/github/email_reply_parser
32
+
33
+ The best way to get your changes merged back into core is as follows:
34
+
35
+ * Clone down your fork
36
+ * Create a thoughtfully named topic branch to contain your change
37
+ * Hack away
38
+ * Add tests and make sure everything still passes by running rake
39
+ * If you are adding new functionality, document it in the README
40
+ * Do not change the version number, I will do that on my end
41
+ * If necessary, rebase your commits into logical chunks, without errors
42
+ * Push the branch up to GitHub
43
+ * Send a pull request to the `github/email_reply_parser` project.
44
+
45
+ ## Known Issues
46
+
47
+ ### Quoted Headers
48
+
49
+ Quoted headers like these currently don't work with other languages:
50
+
51
+ On <date>, <author> wrote:
52
+
53
+ > blah
54
+
55
+ ### Weird Signatures
56
+
57
+ Not everyone follows this convention:
58
+
59
+ Hello
60
+
61
+ Saludos!!!!!!!!!!!!!!
62
+ Galactic President Superstar Mc Awesomeville
63
+ GitHub
64
+
65
+ **********************DISCLAIMER***********************************
66
+ * Note: blah blah blah *
67
+ **********************DISCLAIMER***********************************
68
+
data/Rakefile ADDED
@@ -0,0 +1,135 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+ require 'date'
4
+
5
+ #############################################################################
6
+ #
7
+ # Helper functions
8
+ #
9
+ #############################################################################
10
+
11
# Gem name used throughout this Rakefile.
#
# Derived from the first `*.gemspec` file found in the current directory by
# taking everything before the first dot of its file name. The result is
# memoized in @name once a name has been found.
#
# Returns a String.
def name
  @name ||= begin
    gemspec_path = Dir['*.gemspec'].first
    gemspec_path.split('.').first
  end
end
14
+
15
# Version string declared in the library's main file.
#
# Reads lib/<name>.rb, picks the first line that assigns VERSION and extracts
# the quoted value from it.
#
# Returns a String. Raises NoMethodError when no quoted VERSION assignment
# can be found in the file.
def version
  library_source = File.read("lib/#{name}.rb")
  declaration = library_source[/^\s*VERSION\s*=\s*.*/]
  declaration.match(/.*VERSION\s*=\s*['"](.*)['"]/)[1]
end
19
+
20
# Today's date rendered with Date#to_s (YYYY-MM-DD).
#
# Returns a String.
def date
  today = Date.today
  today.to_s
end
23
+
24
# RubyForge project name; identical to the gem name.
#
# Returns a String.
def rubyforge_project
  name
end

# File name of the gemspec: the gem name followed by ".gemspec".
#
# Returns a String.
def gemspec_file
  name + '.gemspec'
end

# File name of the packaged gem: "<name>-<version>.gem".
#
# Returns a String.
def gem_file
  [name, version].join('-') + '.gem'
end
35
+
36
# Rewrites the first single-quoted assignment `.<header_name> = '...'` found
# in head so that it holds the value returned by the helper method of the
# same name.
#
# head        - The String holding the top part of the gemspec; modified in
#               place.
# header_name - The Symbol naming both the gemspec attribute and the helper
#               method that supplies its new value.
#
# Returns head when a substitution was made, nil otherwise.
def replace_header(head, header_name)
  assignment = /(\.#{header_name}\s*= ').*'/
  head.sub!(assignment) do
    prefix = Regexp.last_match(1)
    "#{prefix}#{send(header_name)}'"
  end
end
39
+
40
+ #############################################################################
41
+ #
42
+ # Standard tasks
43
+ #
44
+ #############################################################################
45
+
46
+ task :default => :test
47
+
48
+ require 'rake/testtask'
49
+ Rake::TestTask.new(:test) do |test|
50
+ test.libs << 'lib' << 'test'
51
+ test.pattern = 'test/*_test.rb'
52
+ test.verbose = true
53
+ end
54
+
55
# Starts irb with the library's main file already required.
#
# The historical `-rubygems` switch (really `-r ubygems`) is not passed:
# ubygems.rb was removed in Ruby 2.5, so the flag makes irb abort with a
# LoadError there, and the library itself only needs the standard library.
desc "Open an irb session preloaded with this library"
task :console do
  sh "irb -r ./lib/#{name}.rb"
end
59
+
60
+ #############################################################################
61
+ #
62
+ # Custom tasks (add your own tasks here)
63
+ #
64
+ #############################################################################
65
+
66
+
67
+
68
+ #############################################################################
69
+ #
70
+ # Packaging tasks
71
+ #
72
+ #############################################################################
73
+
74
# Commits, tags and pushes the current version, then pushes the built gem.
# Refuses to run unless `git branch` reports master as the checked-out branch.
desc "Create tag v#{version} and build and push #{gem_file} to Rubygems"
task :release => :build do
  on_master = `git branch` =~ /^\* master$/
  unless on_master
    puts "You must be on the master branch to release!"
    exit!
  end

  sh "git commit --allow-empty -a -m 'Release #{version}'"
  sh "git tag v#{version}"
  sh "git push origin master"
  sh "git push origin v#{version}"
  sh "gem push pkg/#{name}-#{version}.gem"
end
86
+
87
# Regenerates the gemspec (via the :gemspec prerequisite), builds the gem and
# moves the resulting file into the pkg directory.
desc "Build #{gem_file} into the pkg directory"
task :build => :gemspec do
  sh "mkdir -p pkg"
  sh "gem build #{gemspec_file}"
  # `gem build` leaves the package in the current directory.
  sh "mv #{gem_file} pkg"
end
93
+
94
# Rewrites the gemspec in place: refreshes the header assignments and
# regenerates the file manifest from `git ls-files`.
desc "Generate #{gemspec_file}"
task :gemspec => :validate do
  # The spec file is cut at its manifest markers into the part before the
  # file list, the old file list, and the part after it.
  marker = " # = MANIFEST =\n"
  head, _old_manifest, tail = File.read(gemspec_file).split(marker)

  # Refresh name, version and date; drop :rubyforge_project from this list if
  # your rubyforge_project has a different name.
  [:name, :version, :date, :rubyforge_project].each do |header_name|
    replace_header(head, header_name)
  end

  # The manifest lists every tracked file except dotfiles and anything under
  # rdoc or pkg.
  tracked = `git ls-files`.split("\n").sort
  listed = tracked.reject { |file| file =~ /^\./ || file =~ /^(rdoc|pkg)/ }
  files = listed.map { |file| " #{file}" }.join("\n")

  # Piece the file back together and write it out.
  manifest = " s.files = %w[\n#{files}\n ]\n"
  spec = [head, manifest, tail].join(marker)
  File.open(gemspec_file, 'w') { |io| io.write(spec) }
  puts "Updated #{gemspec_file}"
end
122
+
123
# Checks the project layout before a gemspec is generated: `lib` may only
# hold `<name>.rb` and a `<name>` directory, and no `VERSION*` file may exist
# at the root. Prints a message and exits immediately on a violation.
desc "Validate #{gemspec_file}"
task :validate do
  allowed = ["lib/#{name}.rb", "lib/#{name}"]
  unexpected = Dir['lib/*'].reject { |path| allowed.include?(path) }
  unless unexpected.empty?
    puts "Directory `lib` should only contain a `#{name}.rb` file and `#{name}` dir."
    exit!
  end

  version_files = Dir['VERSION*']
  unless version_files.empty?
    puts "A `VERSION` file at root level violates Gem best practices."
    exit!
  end
end
135
+
@@ -0,0 +1,122 @@
1
+ $LOAD_PATH.unshift '.'
2
+ require 'lib/email_reply_parser'
3
+
4
+ ## This is the rakegem gemspec template. Make sure you read and understand
5
+ ## all of the comments. Some sections require modification, and others can
6
+ ## be deleted if you don't need them. Once you understand the contents of
7
+ ## this file, feel free to delete any comments that begin with two hash marks.
8
+ ## You can find comprehensive Gem::Specification documentation, at
9
+ ## http://docs.rubygems.org/read/chapter/20
10
+ Gem::Specification.new do |s|
11
+ s.specification_version = 2 if s.respond_to? :specification_version=
12
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
13
+ s.rubygems_version = '1.3.5'
14
+
15
+ ## Leave these as is; they will be modified for you by the rake gemspec task.
16
+ ## If your rubyforge_project name is different, then edit it and comment out
17
+ ## the sub! line in the Rakefile
18
+ s.name = 'email_reply_parser-discourse'
19
+ s.version = EmailReplyParser::VERSION
20
+ s.date = Time.now.strftime('%Y-%m-%d')
21
+ s.rubyforge_project = 'email_reply_parser'
22
+
23
+ ## Make sure your summary is short. The description may be as long
24
+ ## as you like.
25
+ s.summary = "Email Reply Parser"
26
+ s.description = "EmailReplyParser is a small library to parse plain text email content to return only the reply."
27
+
28
+ ## List the primary authors. If there are a bunch of authors, it's probably
29
+ ## better to set the email to an email list or something. If you don't have
30
+ ## a custom homepage, consider using your GitHub URL or the like.
31
+ s.authors = ["Rick Olson"]
32
+ s.email = 'technoweenie@gmail.com'
33
+ s.homepage = 'http://github.com/github/email_reply_parser'
34
+
35
+ ## This gets added to the $LOAD_PATH so that 'lib/NAME.rb' can be required as
36
+ ## require 'NAME.rb' or 'lib/NAME/file.rb' can be required as require 'NAME/file.rb'
37
+ s.require_paths = %w[lib]
38
+
39
+ ## This section is only necessary if you have C extensions.
40
+ #s.require_paths << 'ext'
41
+ #s.extensions = %w[ext/extconf.rb]
42
+
43
+ ## If your gem includes any executables, list them here.
44
+ #s.executables = ["name"]
45
+ #s.default_executable = 'name'
46
+
47
+ ## Specify any RDoc options here. You'll want to add your README and
48
+ ## LICENSE files to the extra_rdoc_files list.
49
+ s.rdoc_options = ["--charset=UTF-8"]
50
+ s.extra_rdoc_files = %w[README.md LICENSE]
51
+
52
+ ## List your runtime dependencies here. Runtime dependencies are those
53
+ ## that are needed for an end user to actually USE your code.
54
+ #s.add_dependency('DEPNAME', [">= 1.1.0", "< 2.0.0"])
55
+
56
+ ## List your development dependencies here. Development dependencies are
57
+ ## those that are only needed during development
58
+ #s.add_development_dependency('DEVDEPNAME', [">= 1.1.0", "< 2.0.0"])
59
+
60
+ ## Leave this section as-is. It will be automatically generated from the
61
+ ## contents of your Git repository via the gemspec task. DO NOT REMOVE
62
+ ## THE MANIFEST COMMENTS, they are used as delimiters by the task.
63
+ # = MANIFEST =
64
+ s.files = %w[
65
+ LICENSE
66
+ README.md
67
+ Rakefile
68
+ email_reply_parser.gemspec
69
+ lib/email_reply_parser.rb
70
+ test/email_reply_parser_test.rb
71
+ test/emails/correct_sig.txt
72
+ test/emails/email_1_1.txt
73
+ test/emails/email_1_2.txt
74
+ test/emails/email_1_3.txt
75
+ test/emails/email_1_4.txt
76
+ test/emails/email_1_5.txt
77
+ test/emails/email_1_6.txt
78
+ test/emails/email_1_7.txt
79
+ test/emails/email_1_8.txt
80
+ test/emails/email_1_9.txt
81
+ test/emails/email_2_1.txt
82
+ test/emails/email_2_2.txt
83
+ test/emails/email_2_3.txt
84
+ test/emails/email_2_4.txt
85
+ test/emails/email_2_5.txt
86
+ test/emails/email_2_6.txt
87
+ test/emails/email_2_7.txt
88
+ test/emails/email_2_8.txt
89
+ test/emails/email_2_9.txt
90
+ test/emails/email_2nd_paragraph_starting_with_on.txt
91
+ test/emails/email_BlackBerry.txt
92
+ test/emails/email_bullets.txt
93
+ test/emails/email_from_address_in_quote_header.txt
94
+ test/emails/email_from_name_in_quote_header.txt
95
+ test/emails/email_hyphens.txt
96
+ test/emails/email_iPhone.txt
97
+ test/emails/email_mentions_own_email_address.txt
98
+ test/emails/email_mentions_own_name.txt
99
+ test/emails/email_multi_word_sent_from_my_mobile_device.txt
100
+ test/emails/email_multiline_quote_header_es_mx.txt
101
+ test/emails/email_multiline_quote_header_fr.txt
102
+ test/emails/email_multiline_quote_header_from_first.txt
103
+ test/emails/email_multiline_quote_header_from_replyto_date_to_subject.txt
104
+ test/emails/email_multiline_quote_header_from_to_date_subject.txt
105
+ test/emails/email_multiline_quote_header_none.txt
106
+ test/emails/email_multiline_quote_header_pt_br.txt
107
+ test/emails/email_multiline_quote_header_with_asterisks.txt
108
+ test/emails/email_multiline_quote_header_with_cc.txt
109
+ test/emails/email_multiline_quote_header_with_multiline_headers.txt
110
+ test/emails/email_no_signature_deliminator.txt
111
+ test/emails/email_no_signature_deliminator_adds_a_middle_initial.txt
112
+ test/emails/email_one_is_not_on.txt
113
+ test/emails/email_sent_from_my_not_signature.txt
114
+ test/emails/email_was_showing_as_nothing_visible.txt
115
+ ]
116
+ # = MANIFEST =
117
+
118
+ ## Test files will be grabbed from the file list. Make sure the path glob
119
+ ## matches what you actually use.
120
+ s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
121
+ end
122
+
@@ -0,0 +1,456 @@
1
+ require 'strscan'
2
+
3
+ # EmailReplyParser is a small library to parse plain text email content. The
4
+ # goal is to identify which fragments are quoted, part of a signature, or
5
+ # original body content. We want to support both top and bottom posters, so
6
+ # no simple "REPLY ABOVE HERE" content is used.
7
+ #
8
+ # Beyond RFC 5322 (which is handled by the [Ruby mail gem][mail]), there aren't
9
+ # any real standards for how emails are created. This attempts to parse out
10
+ # common conventions for things like replies:
11
+ #
12
+ # this is some text
13
+ #
14
+ # On <date>, <author> wrote:
15
+ # > blah blah
16
+ # > blah blah
17
+ #
18
+ # ... and signatures:
19
+ #
20
+ # this is some text
21
+ #
22
+ # --
23
+ # Bob
24
+ # http://homepage.com/~bob
25
+ #
26
+ # Each of these is parsed into a Fragment object.
27
+ #
28
+ # EmailReplyParser also attempts to figure out which of these blocks should
29
+ # be hidden from users.
30
+ #
31
+ # [mail]: https://github.com/mikel/mail
32
+ class EmailReplyParser
33
+ VERSION = "0.6"
34
+
35
# Public: Parses an email body into an Email made up of Fragments.
#
# text         - A String email body.
# from_address - The String from address of the email (optional).
#
# Returns an Email instance.
def self.read(text, from_address = "")
  email = Email.new
  email.read(text, from_address)
end
44
+
45
# Public: Returns only the visible text of the given email body.
#
# text         - The email body; it is converted with #to_s before parsing.
# from_address - The String from address of the email (optional).
#
# Returns a String.
def self.parse_reply(text, from_address = "")
  email = read(text.to_s, from_address)
  email.visible_text
end
54
+
55
+ ### Emails
56
+
57
+ # An Email instance represents a parsed body String.
58
+ class Email
59
+ # Emails have an Array of Fragments.
60
+ attr_reader :fragments
61
+
62
+ def initialize
63
+ @fragments = []
64
+ end
65
+
66
# Public: Joins the text of every fragment that is not hidden, one fragment
# per line break, and strips trailing whitespace from the result.
#
# Returns a String.
def visible_text
  shown = fragments.reject { |fragment| fragment.hidden? }
  texts = shown.map { |fragment| fragment.to_s }
  texts.join("\n").rstrip
end
72
+
73
# Splits the given text into a list of Fragments. The text is reversed and
# parsed from the bottom to the top, so that 'On <date>, <author> wrote:'
# lines above quoted blocks can be checked for once the quoted block has
# been seen.
#
# text         - A String email body (nil is treated as an empty body).
# from_address - The String from address of the email (optional; nil is
#                treated as an empty address).
#
# Returns this same Email instance.
def read(text, from_address = "")
  # Callers may hand in nil when a message has no body or no From header;
  # coerce both so the String-only helpers below do not raise NoMethodError.
  from_address = from_address.to_s
  text = text.to_s

  # Parse out the sender's name and address and keep them for later use
  # (the normalized name drives name-based signature detection).
  @from_name_raw = parse_raw_name_from_address(from_address)
  @from_name_normalized = normalize_name(@from_name_raw)
  @from_email = parse_email_from_address(from_address)

  # The text is reversed because hidden fragments are detected bottom-up.
  reversed_text = normalize_text(text).reverse

  # Becomes true once any visible Fragment has been found; from then on no
  # further Fragment is marked hidden.
  @found_visible = false

  # The Fragment currently being filled. A line that fits is added to it;
  # otherwise it is finished and a new Fragment is started.
  @fragment = nil

  # Pull out each line of the email content.
  @scanner = StringScanner.new(reversed_text)
  while line = @scanner.scan_until(/\n/n)
    scan_line(line)
  end

  # Whatever is left has no trailing newline: it is the last line to parse.
  last_line = @scanner.rest.to_s
  scan_line(last_line, true) if last_line.size > 0

  # Finishing the final fragment detects its attributes and joins its lines
  # into a string.
  finish_fragment

  @scanner = @fragment = nil

  # Fragments were collected bottom-up; put them back in reading order.
  @fragments.reverse!
  self
end
124
+
125
+ private
126
# Shared frozen empty String used for comparisons and as a default value.
EMPTY = "".freeze

# Patterns for the header line mail clients put above a quoted reply, such as
# "On <date>, <author> wrote:" or "2013/01/02 Name <user@example.com>".
COMMON_REPLY_HEADER_REGEXES = [
  /^On(.+)wrote:$/nm,
  /\A\d{4}\/\d{1,2}\/\d{1,2}\s+.{1,80}\s<[^@]+@[^@]+>\Z/,
]

# Line optionally starts with whitespace, contains two or more hyphens or
# underscores, and ends with optional whitespace.
# Example: '---' or '___' or '--- '
MULTI_LINE_SIGNATURE_REGEX = /^\s*[-_]{2,}\s*$/

# Line optionally starts with whitespace, followed by one hyphen, followed by
# a word character.
# Example: '-Sandro'
ONE_LINE_SIGNATURE_REGEX = /^\s*-\w/

# Line made of whitespace, underscores and hyphens, optionally wrapped around
# the words "Original Message".
# Example: '----- Original Message -----'
ORIGINAL_MESSAGE_SIGNATURE_REGEX = /^[\s_-]+(Original Message)?[\s_-]+$/

# Line starting with "Sent from my", followed by one to three words and an
# optional trailing <...> part.
# Example: "Sent from my iPhone 3G"
SENT_FROM_REGEX = /^Sent from my (\s*\w+){1,3}(\s*<.*>)?$/

# All signature patterns combined into one expression. It is built with
# Regexp::NOENCODING when the running Ruby defines that flag.
signature_source = Regexp.union(
  MULTI_LINE_SIGNATURE_REGEX,
  ONE_LINE_SIGNATURE_REGEX,
  ORIGINAL_MESSAGE_SIGNATURE_REGEX,
  SENT_FROM_REGEX
).source
SIGNATURE_REGEX =
  if defined?(Regexp::NOENCODING)
    Regexp.new(signature_source, Regexp::NOENCODING)
  else
    Regexp.new(signature_source)
  end

# TODO: refactor out in a i18n.yml file
# Supports English, French, Es-Mexican, Pt-Brazilian
# Maps a downcased header label to its label group.
QUOTE_HEADER_LABELS = {}
[
  [:from,     ["From", "De"]],
  [:to,       ["To", "Para", "A"]],
  [:cc,       ["CC"]],
  [:reply_to, ["Reply-To"]],
  [:date,     ["Date", "Sent", "Enviado", "Enviada em", "Fecha"]],
  [:subject,  ["Subject", "Assunto", "Asunto", "Objet"]]
].each do |group, labels|
  labels.each { |label| QUOTE_HEADER_LABELS[label.downcase] = group }
end
165
+
166
# Normalizes text so it is easier to parse. Works on a copy: the String
# passed in is never modified.
#
# text - The String text to normalize.
#
# Returns a new String (binary-encoded where the runtime supports encodings).
def normalize_text(text)
  # Always copy, so the caller's String is left untouched on every runtime.
  text = text.dup

  # In 1.9+ we want to operate on the raw bytes.
  text.force_encoding('binary') if text.respond_to?(:force_encoding)

  # Normalize line endings.
  text.gsub!("\r\n", "\n")

  # Check for multi-line reply headers. Some clients break up
  # the "On DATE, NAME <EMAIL> wrote:" line into multiple lines.
  if match = text.match(/^(On\s(.+)wrote:)$/m)
    header = match[1]
    # A blank line inside the match means we grabbed something that is not
    # actually a reply header, so leave the text alone in that case.
    unless header =~ /\n\n/
      flattened = header.gsub("\n", " ")
      # Block form: the replacement is inserted verbatim. With a String
      # replacement, backslash sequences in the header (e.g. "\0") would be
      # expanded as back-references and corrupt the text.
      text.gsub!(header) { flattened }
    end
  end

  # Some users may reply directly above a line of underscores.
  # In order to ensure that these fragments are split correctly,
  # make sure that all lines of underscores are preceded by
  # at least two newline characters.
  text.gsub!(/([^\n])(?=\n_{7}_+)$/m, "\\1\n")

  text
end
194
+
195
# Extracts the person's name from a from address and normalizes it.
#
# address - The String from address, e.g. 'Name <user@example.com>'.
#
# Returns a String.
def parse_name_from_address(address)
  raw_name = parse_raw_name_from_address(address)
  normalize_name(raw_name)
end
203
+
204
# Extracts the display name that precedes the '<' of a from address.
#
# Leading and trailing quote characters are skipped; the name itself may
# contain word characters, whitespace and commas.
#
# address - The String from address, e.g. '"Smith, John" <j@example.com>'.
#
# Returns the stripped name String, or EMPTY when the address has no such name.
def parse_raw_name_from_address(address)
  captured = address[/^["']*([\w\s,]+)["']*\s*</, 1]
  captured ? captured.strip : EMPTY
end
208
+
209
# Extracts the part between '<' and '>' of a from address.
#
# address - The String from address, e.g. 'Name <user@example.com>'.
#
# Returns the bracketed String, or address itself when it has no <...> part.
def parse_email_from_address(address)
  bracketed = address[/<(.*)>/, 1]
  bracketed || address
end
213
+
214
# Normalizes a name to "First Last" order.
#
# Names containing a comma are rearranged by make_name_first_then_last; any
# other name is returned as given.
#
# name - The String name to normalize.
#
# Returns a String.
def normalize_name(name)
  name.include?(',') ? make_name_first_then_last(name) : name
end
226
+
227
# Turns a comma-separated "Last, First" name into "First Last".
#
# When the text before the first comma already contains a space it is taken
# to be the full name and returned as is. Names with nothing before or after
# the comma (e.g. "Smith," or ",") are handled without raising.
#
# name - The String name containing a comma.
#
# Returns a String.
def make_name_first_then_last(name)
  last_name, first_name = name.split(',')

  # Nothing but commas: String#split returns an empty Array.
  return '' if last_name.nil?

  # Text before the comma that contains a space is treated as the full name.
  return last_name.to_s if last_name.include?(' ')

  # Trailing comma with nothing after it: only one name part is available.
  return last_name.strip if first_name.nil?

  first_name.strip + ' ' + last_name.strip
end
235
+
236
+ ### Line-by-Line Parsing
237
+
238
# Takes one line of the reversed email text, restores its reading order and
# files it under the current Fragment, or under a new one when it no longer
# fits.
#
# line - A String line as produced by the scanner (still reversed, possibly
#        ending in "\n"). It is modified in place.
# last - true when this is the final line handed to the parser.
def scan_line(line, last = false)
  # Undo the reversal applied to the whole text and drop trailing whitespace.
  line.chomp!("\n")
  line.reverse!
  line.rstrip!

  # On an empty line, mark the current Fragment as a signature if it starts
  # with a common signature indicator, or as a quote if it starts with a
  # multiline quote header.
  scan_signature_or_quote if @fragment && line == EMPTY

  # Leading `>`'s mean this line is part of a quoted Fragment.
  quoted_line = !!(line =~ /^>+/n)

  # The line stays in the current Fragment when both agree on being quoted.
  # A common reply header or an empty line also counts as part of a quoted
  # Fragment, even though it doesn't start with `>`.
  # (@fragment may have just been cleared by scan_signature_or_quote.)
  belongs_to_fragment = @fragment && (
    @fragment.quoted? == quoted_line ||
    (@fragment.quoted? && (line_is_reply_header?(line) || line == EMPTY))
  )

  unless belongs_to_fragment
    finish_fragment
    @fragment = Fragment.new
    @fragment.quoted = quoted_line
  end

  @fragment.add_line(line)
  scan_signature_or_quote if last
end
267
+
268
# Classifies the current Fragment and finishes it when it is recognised.
#
# The Fragment is flagged as a signature when its first line looks like the
# start of one; otherwise it is flagged as quoted when its current block is a
# multiline quote header. In both cases the Fragment is finished. When
# neither applies, nothing happens.
#
# Returns nil.
def scan_signature_or_quote
  if signature_line?(@fragment.lines.first)
    @fragment.signature = true
  elsif multiline_quote_header_in_fragment?
    @fragment.quoted = true
  else
    return
  end

  finish_fragment
end
277
+
278
# Returns +true+ if the current block of the current fragment starts with a
# multiline quote header, +false+ otherwise.
#
# Such headers are mainly generated by Outlook clients. The block counts as a
# quote header when its leading lines have one of these forms:
#
#   label: some text
#   *label:* some text
#
# where label is a key of +QUOTE_HEADER_LABELS+. Lines that follow a labelled
# line and carry no known label are treated as continuations of it:
#
#   label: some text
#     possibly indented text that belongs to the previous line
#
# The answer is +true+ as soon as a fourth distinct label group is seen, and
# +false+ when a label group repeats, when the block does not begin with a
# known label, or when the block ends before four groups were found.
def multiline_quote_header_in_fragment?
  seen_groups = []

  @fragment.current_block.split("\n").each do |line|
    label_text = line[/\A\s*\*?([^:]+):(\s|\*)/, 1]
    group = label_text && QUOTE_HEADER_LABELS[label_text.downcase]

    if group.nil?
      # Before the first label this cannot be a header; after it, the line is
      # a continuation of the previous header line.
      return false if seen_groups.empty?
      next
    end

    return false if seen_groups.include?(group)
    return true if seen_groups.length == 3
    seen_groups << group
  end

  false
end
322
+
323
# Detects if a given line is the beginning of a signature, either because it
# matches SIGNATURE_REGEX or because it largely consists of the sender's name.
#
# line - A String line of text from the email.
#
# Returns a truthy value (the match position, or true) if the line is the
# beginning of a signature, and a falsy value otherwise.
def signature_line?(line)
  match_position = line =~ SIGNATURE_REGEX
  match_position || line_is_signature_name?(line)
end
331
+
332
# Detects if a given line is a common reply header, i.e. matches any of the
# COMMON_REPLY_HEADER_REGEXES.
#
# line - A String line of text from the email.
#
# Returns true if the line is a valid header, or false.
def line_is_reply_header?(line)
  COMMON_REPLY_HEADER_REGEXES.any? { |header_regex| line =~ header_regex }
end
343
+
344
# Detects if the sender's name makes up a big part of a given line, which is
# taken to mean the line is the beginning of a signature.
#
# line - A String line of text from the email.
#
# Returns true if the normalized from name matches the line and is longer
# than a quarter of it; false if the name is empty or too small a part of the
# line; nil if the name does not match the line at all.
def line_is_signature_name?(line)
  name_pattern = generate_regexp_for_name
  return false if @from_name_normalized == ""
  return nil unless line =~ name_pattern

  name_share = @from_name_normalized.size.to_f / line.size
  name_share > 0.25
end
353
+
354
# Builds a case-insensitive pattern from the normalized from name that allows
# additional words or initials (word characters, dots and whitespace) between
# the parts of the name.
#
# Returns a Regexp.
def generate_regexp_for_name
  pattern = @from_name_normalized.split(" ").join('[\w.\s]*')
  Regexp.new(pattern, Regexp::IGNORECASE)
end
360
+
361
# Finishes the current Fragment (if any), decides whether it is hidden and
# appends it to the list of fragments. Because the email is read from the
# bottom to the top, this check also runs bottom-up.
#
# Until the first visible Fragment has been found, every Fragment that is
# quoted, a signature, a reply header or blank is marked hidden. The first
# Fragment that is none of these is visible, and from then on nothing else is
# hidden, so quoted text above original content stays visible to give
# context to the reply:
#
#   some original text (visible)
#
#   > do you have any two's? (quoted, visible)
#
#   Go fish! (visible)
#
#   > --
#   > Player 1 (quoted, hidden)
#
#   --
#   Player 2 (signature, hidden)
#
# Returns nil.
def finish_fragment
  if @fragment
    @fragment.finish

    unless @found_visible
      hideable = @fragment.quoted? || @fragment.signature? ||
                 @fragment.reply_header? || @fragment.to_s.strip == EMPTY
      if hideable
        @fragment.hidden = true
      else
        @found_visible = true
      end
    end

    @fragments << @fragment
  end

  @fragment = nil
end
398
+ end
399
+
400
# Represents a group of paragraphs in the email sharing common attributes.
# Paragraphs should get their own fragment if they are a quoted area or a
# signature.
#
# The four Struct members (quoted, signature, reply_header, hidden) are flags
# that all start out as false.
class Fragment < Struct.new(:quoted, :signature, :reply_header, :hidden)
  # Array of String lines that make up the content of this fragment, in
  # reading order. nil once the fragment has been finished.
  attr_reader :lines

  # The joined String that is built when this Fragment is finished; nil
  # before that.
  attr_reader :content

  def initialize
    self.quoted = self.signature = self.reply_header = self.hidden = false
    @lines = []
    @current_block = []
    @content = nil
  end

  alias quoted? quoted
  alias signature? signature
  alias reply_header? reply_header
  alias hidden? hidden

  # Adds a line at the top of this fragment. Lines arrive bottom-up, so each
  # one is placed in front of those already collected.
  #
  # line - The String line to add; nil is ignored.
  def add_line(line)
    return unless line
    @lines.insert(0, line)
    if line == ""
      # An empty line separates blocks: start collecting a new one.
      @current_block.clear
    else
      @current_block.insert(0, line)
    end
  end

  # The lines added since the most recent empty line, i.e. the block at the
  # top of the fragment that is still being processed.
  #
  # Returns a String with the lines joined by "\n".
  def current_block
    @current_block.join("\n")
  end

  # Builds the string content by joining the lines, then releases the line
  # buffers. After this, only #content and #to_s give access to the text.
  def finish
    @content = @lines.join("\n")
    @lines = @current_block = nil
  end

  # Returns the text of this fragment as a String, whether or not it has been
  # finished yet.
  def to_s
    @lines ? @lines.join("\n") : @content
  end

  def inspect
    "#{super.inspect} : #{to_s.inspect}"
  end
end
455
+ end
456
+