lp_email_reply_parser 0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +22 -0
- data/README.md +68 -0
- data/Rakefile +135 -0
- data/lib/email_reply_parser.rb +448 -0
- data/lp_email_reply_parser.gemspec +77 -0
- data/test/email_reply_parser_test.rb +431 -0
- data/test/emails/correct_sig.txt +4 -0
- data/test/emails/email_1_1.txt +13 -0
- data/test/emails/email_1_2.txt +51 -0
- data/test/emails/email_1_3.txt +55 -0
- data/test/emails/email_1_4.txt +5 -0
- data/test/emails/email_1_5.txt +15 -0
- data/test/emails/email_1_6.txt +15 -0
- data/test/emails/email_1_7.txt +12 -0
- data/test/emails/email_1_8.txt +6 -0
- data/test/emails/email_1_9.txt +9 -0
- data/test/emails/email_2_1.txt +25 -0
- data/test/emails/email_2_2.txt +10 -0
- data/test/emails/email_2_3.txt +14 -0
- data/test/emails/email_2_4.txt +14 -0
- data/test/emails/email_2_5.txt +15 -0
- data/test/emails/email_2_6.txt +11 -0
- data/test/emails/email_2_7.txt +5 -0
- data/test/emails/email_2_8.txt +4 -0
- data/test/emails/email_2_9.txt +9 -0
- data/test/emails/email_2nd_paragraph_starting_with_on.txt +12 -0
- data/test/emails/email_BlackBerry.txt +3 -0
- data/test/emails/email_bullets.txt +22 -0
- data/test/emails/email_from_address_in_quote_header.txt +12 -0
- data/test/emails/email_from_name_in_quote_header.txt +12 -0
- data/test/emails/email_hyphens.txt +5 -0
- data/test/emails/email_iPhone.txt +3 -0
- data/test/emails/email_mentions_own_email_address.txt +6 -0
- data/test/emails/email_mentions_own_name.txt +6 -0
- data/test/emails/email_multi_word_sent_from_my_mobile_device.txt +3 -0
- data/test/emails/email_multiline_quote_header_es_mx.txt +8 -0
- data/test/emails/email_multiline_quote_header_fr.txt +8 -0
- data/test/emails/email_multiline_quote_header_from_first.txt +11 -0
- data/test/emails/email_multiline_quote_header_from_replyto_date_to_subject.txt +12 -0
- data/test/emails/email_multiline_quote_header_from_to_date_subject.txt +11 -0
- data/test/emails/email_multiline_quote_header_none.txt +11 -0
- data/test/emails/email_multiline_quote_header_pt_br.txt +8 -0
- data/test/emails/email_multiline_quote_header_with_asterisks.txt +21 -0
- data/test/emails/email_multiline_quote_header_with_cc.txt +9 -0
- data/test/emails/email_multiline_quote_header_with_multiline_headers.txt +14 -0
- data/test/emails/email_no_signature_deliminator.txt +7 -0
- data/test/emails/email_no_signature_deliminator_adds_a_middle_initial.txt +7 -0
- data/test/emails/email_one_is_not_on.txt +10 -0
- data/test/emails/email_sent_from_my_not_signature.txt +3 -0
- data/test/emails/email_was_showing_as_nothing_visible.txt +13 -0
- metadata +143 -0
data/LICENSE
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
The MIT License
|
2
|
+
|
3
|
+
Copyright (c) GitHub
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
22
|
+
|
data/README.md
ADDED
@@ -0,0 +1,68 @@
|
|
1
|
+
# Email Reply Parser
|
2
|
+
|
3
|
+
[Build Status](http://travis-ci.org/lawrencepit/email_reply_parser)
|
4
|
+
[Code Climate](https://codeclimate.com/github/lawrencepit/email_reply_parser)
|
5
|
+
[Gem Version](http://badge.fury.io/rb/email_reply_parser)
|
6
|
+
|
7
|
+
EmailReplyParser is a small library to parse plain text email content.
|
8
|
+
|
9
|
+
This is what GitHub uses to display comments that were created from
|
10
|
+
email replies. This code is being open sourced in an effort to
|
11
|
+
crowdsource the quality of our email representation.
|
12
|
+
|
13
|
+
## Usage
|
14
|
+
|
15
|
+
To parse reply body:
|
16
|
+
|
17
|
+
`parsed_body = EmailReplyParser.parse_reply(email_body, from_address)`
|
18
|
+
|
19
|
+
Argument `from_address` is optional. If included, it will attempt to parse out signatures based on the name in the from address (for signatures that don't have a standard delimiter).
|
20
|
+
|
21
|
+
## Installation
|
22
|
+
|
23
|
+
Get it from [GitHub][github] or `gem install email_reply_parser`. Run `rake` to run the tests.
|
24
|
+
|
25
|
+
[github]: https://github.com/github/email_reply_parser
|
26
|
+
|
27
|
+
## Contribute
|
28
|
+
|
29
|
+
If you'd like to hack on EmailReplyParser, start by forking the repo on GitHub:
|
30
|
+
|
31
|
+
https://github.com/github/email_reply_parser
|
32
|
+
|
33
|
+
The best way to get your changes merged back into core is as follows:
|
34
|
+
|
35
|
+
* Clone down your fork
|
36
|
+
* Create a thoughtfully named topic branch to contain your change
|
37
|
+
* Hack away
|
38
|
+
* Add tests and make sure everything still passes by running rake
|
39
|
+
* If you are adding new functionality, document it in the README
|
40
|
+
* Do not change the version number, I will do that on my end
|
41
|
+
* If necessary, rebase your commits into logical chunks, without errors
|
42
|
+
* Push the branch up to GitHub
|
43
|
+
* Send a pull request to the `github/email_reply_parser` project.
|
44
|
+
|
45
|
+
## Known Issues
|
46
|
+
|
47
|
+
### Quoted Headers
|
48
|
+
|
49
|
+
Quoted headers like these currently don't work with other languages:
|
50
|
+
|
51
|
+
On <date>, <author> wrote:
|
52
|
+
|
53
|
+
> blah
|
54
|
+
|
55
|
+
### Weird Signatures
|
56
|
+
|
57
|
+
Not everyone follows this convention:
|
58
|
+
|
59
|
+
Hello
|
60
|
+
|
61
|
+
Saludos!!!!!!!!!!!!!!
|
62
|
+
Galactic President Superstar Mc Awesomeville
|
63
|
+
GitHub
|
64
|
+
|
65
|
+
**********************DISCLAIMER***********************************
|
66
|
+
* Note: blah blah blah *
|
67
|
+
**********************DISCLAIMER***********************************
|
68
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,135 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'rake'
|
3
|
+
require 'date'
|
4
|
+
|
5
|
+
#############################################################################
|
6
|
+
#
|
7
|
+
# Helper functions
|
8
|
+
#
|
9
|
+
#############################################################################
|
10
|
+
|
11
|
+
def name
|
12
|
+
@name ||= Dir['*.gemspec'].first.split('.').first
|
13
|
+
end
|
14
|
+
|
15
|
+
def version
|
16
|
+
line = File.read("lib/#{name}.rb")[/^\s*VERSION\s*=\s*.*/]
|
17
|
+
line.match(/.*VERSION\s*=\s*['"](.*)['"]/)[1]
|
18
|
+
end
|
19
|
+
|
20
|
+
def date
|
21
|
+
Date.today.to_s
|
22
|
+
end
|
23
|
+
|
24
|
+
def rubyforge_project
|
25
|
+
name
|
26
|
+
end
|
27
|
+
|
28
|
+
def gemspec_file
|
29
|
+
"#{name}.gemspec"
|
30
|
+
end
|
31
|
+
|
32
|
+
def gem_file
|
33
|
+
"#{name}-#{version}.gem"
|
34
|
+
end
|
35
|
+
|
36
|
+
def replace_header(head, header_name)
|
37
|
+
head.sub!(/(\.#{header_name}\s*= ').*'/) { "#{$1}#{send(header_name)}'"}
|
38
|
+
end
|
39
|
+
|
40
|
+
#############################################################################
|
41
|
+
#
|
42
|
+
# Standard tasks
|
43
|
+
#
|
44
|
+
#############################################################################
|
45
|
+
|
46
|
+
task :default => :test
|
47
|
+
|
48
|
+
require 'rake/testtask'
|
49
|
+
Rake::TestTask.new(:test) do |test|
|
50
|
+
test.libs << 'lib' << 'test'
|
51
|
+
test.pattern = 'test/*_test.rb'
|
52
|
+
test.verbose = true
|
53
|
+
end
|
54
|
+
|
55
|
+
desc "Open an irb session preloaded with this library"
task :console do
  # The old `-rubygems` switch asked irb to require `ubygems.rb`, which no
  # longer exists on Ruby 2.5+, so it is dropped here. The library itself only
  # needs `strscan` from the standard library.
  # NOTE(review): `name` is derived from the gemspec filename; confirm that
  # lib/<name>.rb actually exists for this gem.
  sh "irb -r ./lib/#{name}.rb"
end
|
59
|
+
|
60
|
+
#############################################################################
|
61
|
+
#
|
62
|
+
# Custom tasks (add your own tasks here)
|
63
|
+
#
|
64
|
+
#############################################################################
|
65
|
+
|
66
|
+
|
67
|
+
|
68
|
+
#############################################################################
|
69
|
+
#
|
70
|
+
# Packaging tasks
|
71
|
+
#
|
72
|
+
#############################################################################
|
73
|
+
|
74
|
+
desc "Create tag v#{version} and build and push #{gem_file} to Rubygems"
|
75
|
+
task :release => :build do
|
76
|
+
unless `git branch` =~ /^\* master$/
|
77
|
+
puts "You must be on the master branch to release!"
|
78
|
+
exit!
|
79
|
+
end
|
80
|
+
sh "git commit --allow-empty -a -m 'Release #{version}'"
|
81
|
+
sh "git tag v#{version}"
|
82
|
+
sh "git push origin master"
|
83
|
+
sh "git push origin v#{version}"
|
84
|
+
sh "gem push pkg/#{name}-#{version}.gem"
|
85
|
+
end
|
86
|
+
|
87
|
+
desc "Build #{gem_file} into the pkg directory"
|
88
|
+
task :build => :gemspec do
|
89
|
+
sh "mkdir -p pkg"
|
90
|
+
sh "gem build #{gemspec_file}"
|
91
|
+
sh "mv #{gem_file} pkg"
|
92
|
+
end
|
93
|
+
|
94
|
+
desc "Generate #{gemspec_file}"
|
95
|
+
task :gemspec => :validate do
|
96
|
+
# read spec file and split out manifest section
|
97
|
+
spec = File.read(gemspec_file)
|
98
|
+
head, manifest, tail = spec.split(" # = MANIFEST =\n")
|
99
|
+
|
100
|
+
# replace name version and date
|
101
|
+
replace_header(head, :name)
|
102
|
+
replace_header(head, :version)
|
103
|
+
replace_header(head, :date)
|
104
|
+
#comment this out if your rubyforge_project has a different name
|
105
|
+
replace_header(head, :rubyforge_project)
|
106
|
+
|
107
|
+
# determine file list from git ls-files
|
108
|
+
files = `git ls-files`.
|
109
|
+
split("\n").
|
110
|
+
sort.
|
111
|
+
reject { |file| file =~ /^\./ }.
|
112
|
+
reject { |file| file =~ /^(rdoc|pkg)/ }.
|
113
|
+
map { |file| " #{file}" }.
|
114
|
+
join("\n")
|
115
|
+
|
116
|
+
# piece file back together and write
|
117
|
+
manifest = " s.files = %w[\n#{files}\n ]\n"
|
118
|
+
spec = [head, manifest, tail].join(" # = MANIFEST =\n")
|
119
|
+
File.open(gemspec_file, 'w') { |io| io.write(spec) }
|
120
|
+
puts "Updated #{gemspec_file}"
|
121
|
+
end
|
122
|
+
|
123
|
+
desc "Validate #{gemspec_file}"
|
124
|
+
task :validate do
|
125
|
+
libfiles = Dir['lib/*'] - ["lib/#{name}.rb", "lib/#{name}"]
|
126
|
+
unless libfiles.empty?
|
127
|
+
puts "Directory `lib` should only contain a `#{name}.rb` file and `#{name}` dir."
|
128
|
+
exit!
|
129
|
+
end
|
130
|
+
unless Dir['VERSION*'].empty?
|
131
|
+
puts "A `VERSION` file at root level violates Gem best practices."
|
132
|
+
exit!
|
133
|
+
end
|
134
|
+
end
|
135
|
+
|
@@ -0,0 +1,448 @@
|
|
1
|
+
require 'strscan'
|
2
|
+
|
3
|
+
# EmailReplyParser is a small library to parse plain text email content. The
|
4
|
+
# goal is to identify which fragments are quoted, part of a signature, or
|
5
|
+
# original body content. We want to support both top and bottom posters, so
|
6
|
+
# no simple "REPLY ABOVE HERE" content is used.
|
7
|
+
#
|
8
|
+
# Beyond RFC 5322 (which is handled by the [Ruby mail gem][mail]), there aren't
|
9
|
+
# any real standards for how emails are created. This attempts to parse out
|
10
|
+
# common conventions for things like replies:
|
11
|
+
#
|
12
|
+
# this is some text
|
13
|
+
#
|
14
|
+
# On <date>, <author> wrote:
|
15
|
+
# > blah blah
|
16
|
+
# > blah blah
|
17
|
+
#
|
18
|
+
# ... and signatures:
|
19
|
+
#
|
20
|
+
# this is some text
|
21
|
+
#
|
22
|
+
# --
|
23
|
+
# Bob
|
24
|
+
# http://homepage.com/~bob
|
25
|
+
#
|
26
|
+
# Each of these is parsed into a Fragment object.
|
27
|
+
#
|
28
|
+
# EmailReplyParser also attempts to figure out which of these blocks should
|
29
|
+
# be hidden from users.
|
30
|
+
#
|
31
|
+
# [mail]: https://github.com/mikel/mail
|
32
|
+
class EmailReplyParser
|
33
|
+
VERSION = "0.6"
|
34
|
+
|
35
|
+
# Public: Splits an email body into a list of Fragments.
|
36
|
+
#
|
37
|
+
# text - A String email body.
|
38
|
+
# from_address - from address of the email (optional)
|
39
|
+
#
|
40
|
+
# Returns an Email instance.
|
41
|
+
def self.read(text, from_address = "")
|
42
|
+
Email.new.read(text, from_address)
|
43
|
+
end
|
44
|
+
|
45
|
+
# Public: Get the text of the visible portions of the given email body.
|
46
|
+
#
|
47
|
+
# text - A String email body.
|
48
|
+
# from_address - from address of the email (optional)
|
49
|
+
#
|
50
|
+
# Returns a String.
|
51
|
+
def self.parse_reply(text, from_address = "")
|
52
|
+
self.read(text.to_s, from_address).visible_text
|
53
|
+
end
|
54
|
+
|
55
|
+
### Emails
|
56
|
+
|
57
|
+
# An Email instance represents a parsed body String.
|
58
|
+
class Email
|
59
|
+
# Emails have an Array of Fragments.
|
60
|
+
attr_reader :fragments
|
61
|
+
|
62
|
+
# Public: Gets the combined text of the visible fragments of the email body.
|
63
|
+
#
|
64
|
+
# Returns a String.
|
65
|
+
def visible_text
|
66
|
+
fragments.select{|f| !f.hidden?}.map{|f| f.to_s}.join("\n").rstrip
|
67
|
+
end
|
68
|
+
|
69
|
+
# Splits the given text into a list of Fragments. This is roughly done by
|
70
|
+
# reversing the text and parsing from the bottom to the top. This way we
|
71
|
+
# can check for 'On <date>, <author> wrote:' lines above quoted blocks.
|
72
|
+
#
|
73
|
+
# text - A String email body.
|
74
|
+
# from_address - from address of the email (optional)
|
75
|
+
#
|
76
|
+
# Returns this same Email instance.
|
77
|
+
def read(text, from_address = "")
|
78
|
+
@fragments = []
|
79
|
+
|
80
|
+
# parse out the from name if one exists and save for use later
|
81
|
+
@from_name_raw = parse_raw_name_from_address(from_address)
|
82
|
+
@from_name_normalized = normalize_name(@from_name_raw)
|
83
|
+
@from_email = parse_email_from_address(from_address)
|
84
|
+
|
85
|
+
text = normalize_text(text)
|
86
|
+
|
87
|
+
# The text is reversed initially due to the way we check for hidden
|
88
|
+
# fragments.
|
89
|
+
text = text.reverse
|
90
|
+
|
91
|
+
# This determines if any 'visible' Fragment has been found. Once any
|
92
|
+
# visible Fragment is found, stop looking for hidden ones.
|
93
|
+
@found_visible = false
|
94
|
+
|
95
|
+
# This instance variable points to the current Fragment. If the matched
|
96
|
+
# line fits, it should be added to this Fragment. Otherwise, finish it
|
97
|
+
# and start a new Fragment.
|
98
|
+
@fragment = nil
|
99
|
+
|
100
|
+
# Use the StringScanner to pull out each line of the email content.
|
101
|
+
@scanner = StringScanner.new(text)
|
102
|
+
while line = @scanner.scan_until(/\n/n)
|
103
|
+
scan_line(line)
|
104
|
+
end
|
105
|
+
|
106
|
+
# Be sure to parse the last line of the email.
|
107
|
+
if (last_line = @scanner.rest.to_s).size > 0
|
108
|
+
scan_line(last_line, true)
|
109
|
+
end
|
110
|
+
|
111
|
+
# Finish up the final fragment. Finishing a fragment will detect any
|
112
|
+
# attributes (hidden, signature, reply), and join each line into a
|
113
|
+
# string.
|
114
|
+
finish_fragment
|
115
|
+
|
116
|
+
@scanner = @fragment = nil
|
117
|
+
|
118
|
+
self
|
119
|
+
end
|
120
|
+
|
121
|
+
private
|
122
|
+
EMPTY = "".freeze
|
123
|
+
|
124
|
+
COMMON_REPLY_HEADER_REGEXES = [
|
125
|
+
/^On(.+)wrote:$/nm,
|
126
|
+
/\A\d{4}\/\d{1,2}\/\d{1,2}\s+.{1,80}\s<[^@]+@[^@]+>\Z/,
|
127
|
+
]
|
128
|
+
|
129
|
+
# Line optionally starts with whitespace, contains two or more hyphens or
|
130
|
+
# underscores, and ends with optional whitespace.
|
131
|
+
# Example: '---' or '___' or '--- '
|
132
|
+
MULTI_LINE_SIGNATURE_REGEX = /^\s*[-_]{2,}\s*$/
|
133
|
+
|
134
|
+
# Line optionally starts with whitespace, followed by one hyphen, followed by a word character
|
135
|
+
# Example: '-Sandro'
|
136
|
+
ONE_LINE_SIGNATURE_REGEX = /^\s*-\w/
|
137
|
+
|
138
|
+
ORIGINAL_MESSAGE_SIGNATURE_REGEX = /^[\s_-]+(Original Message)?[\s_-]+$/
|
139
|
+
|
140
|
+
# "Sent from my" at the start of the line, followed by one to three words, optionally followed by text in angle brackets.
|
141
|
+
# Example: "Sent from my iPhone 3G"
|
142
|
+
SENT_FROM_REGEX = /^Sent from my (\s*\w+){1,3}(\s*<.*>)?$/
|
143
|
+
|
144
|
+
SIGNATURE_REGEX = Regexp.new(Regexp.union(MULTI_LINE_SIGNATURE_REGEX, ONE_LINE_SIGNATURE_REGEX, ORIGINAL_MESSAGE_SIGNATURE_REGEX, SENT_FROM_REGEX).source, Regexp::NOENCODING)
|
145
|
+
|
146
|
+
# TODO: refactor out in a i18n.yml file
|
147
|
+
# Supports English, French, Es-Mexican, Pt-Brazilian
|
148
|
+
# Maps a label to a label-group
|
149
|
+
QUOTE_HEADER_LABELS = Hash[*{
|
150
|
+
:from => ["From", "De"],
|
151
|
+
:to => ["To", "Para", "A"],
|
152
|
+
:cc => ["CC"],
|
153
|
+
:reply_to => ["Reply-To"],
|
154
|
+
:date => ["Date", "Sent", "Enviado", "Enviada em", "Fecha"],
|
155
|
+
:subject => ["Subject", "Assunto", "Asunto", "Objet"]
|
156
|
+
}.map {|group, labels| labels.map {|label| [label.downcase, group]}}.flatten]
|
157
|
+
|
158
|
+
# normalize text so it is easier to parse
|
159
|
+
#
|
160
|
+
# text - text to normalize
|
161
|
+
#
|
162
|
+
# Returns a String
|
163
|
+
def normalize_text(text)
|
164
|
+
# in 1.9 we want to operate on the raw bytes
|
165
|
+
text = text.dup.force_encoding('binary') if text.respond_to?(:force_encoding)
|
166
|
+
|
167
|
+
# Normalize line endings.
|
168
|
+
text.gsub!("\r\n", "\n")
|
169
|
+
|
170
|
+
# Check for multi-line reply headers. Some clients break up
|
171
|
+
# the "On DATE, NAME <EMAIL> wrote:" line into multiple lines.
|
172
|
+
if match = text.match(/^(On\s(.+)wrote:)$/m)
|
173
|
+
# Remove all newlines from the reply header, unless it contains a double newline.
|
174
|
+
# If it does, we have grabbed something that is not actually a reply header.
|
175
|
+
text.gsub! match[1], match[1].gsub("\n", " ") unless match[1] =~ /\n\n/
|
176
|
+
end
|
177
|
+
|
178
|
+
# Some users may reply directly above a line of underscores.
|
179
|
+
# In order to ensure that these fragments are split correctly,
|
180
|
+
# make sure that all lines of underscores are preceded by
|
181
|
+
# at least two newline characters.
|
182
|
+
text.gsub!(/([^\n])(?=\n_{7}_+)$/m, "\\1\n")
|
183
|
+
|
184
|
+
text
|
185
|
+
end
|
186
|
+
|
187
|
+
# Parse a person's name from an e-mail address
|
188
|
+
#
|
189
|
+
# address - email address String.
|
190
|
+
#
|
191
|
+
# Returns a String.
|
192
|
+
def parse_name_from_address(address)
|
193
|
+
normalize_name(parse_raw_name_from_address(address))
|
194
|
+
end
|
195
|
+
|
196
|
+
# Extracts the display name that precedes the "<email>" part of an address.
#
# address - A String such as '"Smith, John" <john@example.com>'. nil is
#           treated as an empty address instead of raising NoMethodError.
#
# Returns the stripped name String, or an empty String when the address does
# not start with a name made of word characters, whitespace and commas
# followed by "<".
def parse_raw_name_from_address(address)
  match = address.to_s.match(/^["']*([\w\s,]+)["']*\s*</)
  match ? match[1].strip : EMPTY
end

# Extracts the bare email from an address of the form "Name <email>".
#
# address - A String address. nil is treated as an empty address.
#
# Returns the text between "<" and ">" when present, otherwise the address
# itself (as a String).
def parse_email_from_address(address)
  address = address.to_s
  match = address.match(/<(.*)>/)
  match ? match[1] : address
end
|
205
|
+
|
206
|
+
# Normalize a name to First Last
|
207
|
+
#
|
208
|
+
# name - name to normalize.
|
209
|
+
#
|
210
|
+
# Returns a String.
|
211
|
+
def normalize_name(name)
|
212
|
+
if name.include?(',')
|
213
|
+
make_name_first_then_last(name)
|
214
|
+
else
|
215
|
+
name
|
216
|
+
end
|
217
|
+
end
|
218
|
+
|
219
|
+
# Reorders a comma-separated "Last, First" name into "First Last".
#
# name - A String name containing a comma.
#
# If the text before the first comma already contains a space, it is treated
# as a complete name and returned as is. If one side of the comma is missing
# or blank (e.g. "Smith,"), the other side is returned on its own rather than
# raising NoMethodError.
#
# Returns a String.
def make_name_first_then_last(name)
  # String#split drops trailing empty fields, so either piece may be nil.
  # Only the first two comma-separated pieces are considered.
  last, first = name.split(',')
  last = last.to_s
  return last if last.include?(' ')

  [first, last].map { |part| part.to_s.strip }.reject(&:empty?).join(' ')
end
|
227
|
+
|
228
|
+
### Line-by-Line Parsing
|
229
|
+
|
230
|
+
# Scans the given line of text and determines which fragment it belongs to.
|
231
|
+
def scan_line(line, last = false)
|
232
|
+
line.chomp!("\n")
|
233
|
+
line.reverse!
|
234
|
+
line.rstrip!
|
235
|
+
|
236
|
+
# Mark the current Fragment as a signature if the current line is empty
|
237
|
+
# and the Fragment starts with a common signature indicator.
|
238
|
+
# Mark the current Fragment as a quote if the current line is empty
|
239
|
+
# and the Fragment starts with a multiline quote header.
|
240
|
+
scan_signature_or_quote if @fragment && line == EMPTY
|
241
|
+
|
242
|
+
# We're looking for leading `>`'s to see if this line is part of a
|
243
|
+
# quoted Fragment.
|
244
|
+
is_quoted = !!(line =~ /^>+/n)
|
245
|
+
|
246
|
+
# Note that a common reply header also counts as part of the quoted
|
247
|
+
# Fragment, even though it doesn't start with `>`.
|
248
|
+
unless @fragment &&
|
249
|
+
((@fragment.quoted? == is_quoted) ||
|
250
|
+
(@fragment.quoted? && (line_is_reply_header?(line) || line == EMPTY)))
|
251
|
+
finish_fragment
|
252
|
+
@fragment = Fragment.new
|
253
|
+
@fragment.quoted = is_quoted
|
254
|
+
end
|
255
|
+
|
256
|
+
@fragment.add_line(line)
|
257
|
+
scan_signature_or_quote if last
|
258
|
+
end
|
259
|
+
|
260
|
+
def scan_signature_or_quote
|
261
|
+
if signature_line?(@fragment.lines.first)
|
262
|
+
@fragment.signature = true
|
263
|
+
finish_fragment
|
264
|
+
elsif multiline_quote_header_in_fragment?
|
265
|
+
@fragment.quoted = true
|
266
|
+
finish_fragment
|
267
|
+
end
|
268
|
+
end
|
269
|
+
|
270
|
+
# Returns +true+ if the current block in the current fragment has
|
271
|
+
# a multiline quote header, +false+ otherwise.
|
272
|
+
#
|
273
|
+
# The quote header we're looking for is mainly generated by Outlook
|
274
|
+
# clients. It's considered a quote header if the first 4 folded lines
|
275
|
+
# have one of the following forms:
|
276
|
+
#
|
277
|
+
# label: some text
|
278
|
+
# *label:* some text
|
279
|
+
#
|
280
|
+
# where a line like this:
|
281
|
+
#
|
282
|
+
# label: some text
|
283
|
+
# possibly indented text that belongs to the previous line
|
284
|
+
#
|
285
|
+
# is folded into:
|
286
|
+
#
|
287
|
+
# label: some text possibly indented text that belongs to the previous line
|
288
|
+
#
|
289
|
+
# and where label is a value from +QUOTE_HEADER_LABELS+ that appears
|
290
|
+
# only once in the first 4 lines and where each group of a label
|
291
|
+
# is represented at most once.
|
292
|
+
def multiline_quote_header_in_fragment?
|
293
|
+
folding = false
|
294
|
+
label_groups = []
|
295
|
+
@fragment.current_block.split("\n").each do |line|
|
296
|
+
if line =~ /\A\s*\*?([^:]+):(\s|\*)/
|
297
|
+
label = QUOTE_HEADER_LABELS[$1.downcase]
|
298
|
+
if label
|
299
|
+
return false if label_groups.include?(label)
|
300
|
+
return true if label_groups.length == 3
|
301
|
+
label_groups << label
|
302
|
+
folding = true
|
303
|
+
elsif !folding
|
304
|
+
return false
|
305
|
+
end
|
306
|
+
elsif !folding
|
307
|
+
return false
|
308
|
+
else
|
309
|
+
folding = true
|
310
|
+
end
|
311
|
+
end
|
312
|
+
return false
|
313
|
+
end
|
314
|
+
|
315
|
+
# Detects if a given line is the beginning of a signature
|
316
|
+
#
|
317
|
+
# line - A String line of text from the email.
|
318
|
+
#
|
319
|
+
# Returns true if the line is the beginning of a signature, or false.
|
320
|
+
def signature_line?(line)
|
321
|
+
line =~ SIGNATURE_REGEX || line_is_signature_name?(line)
|
322
|
+
end
|
323
|
+
|
324
|
+
# Detects if a given line is a common reply header.
|
325
|
+
#
|
326
|
+
# line - A String line of text from the email.
|
327
|
+
#
|
328
|
+
# Returns true if the line is a valid header, or false.
|
329
|
+
def line_is_reply_header?(line)
|
330
|
+
COMMON_REPLY_HEADER_REGEXES.each do |regex|
|
331
|
+
return true if line =~ regex
|
332
|
+
end
|
333
|
+
false
|
334
|
+
end
|
335
|
+
|
336
|
+
# Detects if the @from name is a big part of a given line and therefore the beginning of a signature
|
337
|
+
#
|
338
|
+
# line - A String line of text from the email.
|
339
|
+
#
|
340
|
+
# Returns true if @from_name is a big part of the line, or false.
|
341
|
+
def line_is_signature_name?(line)
  # Without a sender name there is nothing to compare against, so skip
  # building the regexp entirely.
  return false if @from_name_normalized.to_s.empty?
  return false unless line =~ generate_regexp_for_name

  # The name must account for more than a quarter of the line's length.
  (@from_name_normalized.size.to_f / line.size) > 0.25
end

# Builds a case-insensitive Regexp for the normalized sender name that allows
# additional words or initials between the name parts.
# Each name part is escaped so it is always matched literally.
#
# Returns a Regexp.
def generate_regexp_for_name
  separator = '[\w.\s]*'
  escaped_parts = @from_name_normalized.to_s.split(" ").map { |part| Regexp.escape(part) }
  Regexp.new(escaped_parts.join(separator), Regexp::IGNORECASE)
end
|
352
|
+
|
353
|
+
# Builds the fragment string, after all lines have been added.
|
354
|
+
# It also checks to see if this Fragment is hidden. The hidden
|
355
|
+
# Fragment check reads from the bottom to the top.
|
356
|
+
#
|
357
|
+
# Any quoted Fragments or signature Fragments are marked hidden if they
|
358
|
+
# are below any visible Fragments. Visible Fragments are expected to
|
359
|
+
# contain original content by the author. If they are below a quoted
|
360
|
+
# Fragment, then the Fragment should be visible to give context to the
|
361
|
+
# reply.
|
362
|
+
#
|
363
|
+
# some original text (visible)
|
364
|
+
#
|
365
|
+
# > do you have any two's? (quoted, visible)
|
366
|
+
#
|
367
|
+
# Go fish! (visible)
|
368
|
+
#
|
369
|
+
# > --
|
370
|
+
# > Player 1 (quoted, hidden)
|
371
|
+
#
|
372
|
+
# --
|
373
|
+
# Player 2 (signature, hidden)
|
374
|
+
#
|
375
|
+
def finish_fragment
|
376
|
+
if @fragment
|
377
|
+
@fragment.finish
|
378
|
+
if !@found_visible
|
379
|
+
if @fragment.quoted? || @fragment.signature? ||
|
380
|
+
@fragment.reply_header? || @fragment.to_s.strip == EMPTY
|
381
|
+
@fragment.hidden = true
|
382
|
+
else
|
383
|
+
@found_visible = true
|
384
|
+
end
|
385
|
+
end
|
386
|
+
@fragments.insert(0, @fragment)
|
387
|
+
end
|
388
|
+
@fragment = nil
|
389
|
+
end
|
390
|
+
end
|
391
|
+
|
392
|
+
# Represents a group of paragraphs in the email sharing common attributes.
|
393
|
+
# Paragraphs should get their own fragment if they are a quoted area or a
|
394
|
+
# signature.
|
395
|
+
class Fragment < Struct.new(:quoted, :signature, :reply_header, :hidden)
  # Array of String lines that make up the content of this fragment. Lines
  # are prepended as they are added, so the most recently added line comes
  # first. Set to nil once the fragment is finished.
  attr_reader :lines

  # The joined String that is built when this Fragment is finished; nil
  # until then.
  attr_reader :content

  # Creates an empty fragment with every flag set to false.
  def initialize
    self.quoted = self.signature = self.reply_header = self.hidden = false
    @lines = []
    @current_block = []
    @content = nil
  end

  alias quoted? quoted
  alias signature? signature
  alias reply_header? reply_header
  alias hidden? hidden

  # Prepends a line to the fragment.
  #
  # line - A String line; nil is ignored.
  #
  # An empty line resets the current block, otherwise the line is prepended
  # to the current block as well.
  def add_line(line)
    return unless line
    @lines.unshift(line)
    if line == ""
      @current_block.clear
    else
      @current_block.unshift(line)
    end
  end

  # The lines added since the last empty line, joined with newlines.
  #
  # Returns a String (empty once the fragment has been finished).
  def current_block
    Array(@current_block).join("\n")
  end

  # Builds the string content by joining the lines, then releases the line
  # buffers.
  def finish
    @content = @lines.join("\n")
    @lines = @current_block = nil
  end

  # Returns the joined lines while the fragment is still being built, or the
  # finished content afterwards.
  def to_s
    @lines ? @lines.join("\n") : @content
  end

  # Returns the Struct description of the flags followed by the inspected
  # text of the fragment.
  def inspect
    "#{super} : #{to_s.inspect}"
  end
end
|
447
|
+
end
|
448
|
+
|