tr_email_reply_parser 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +15 -0
- data/LICENSE +22 -0
- data/README.md +68 -0
- data/Rakefile +135 -0
- data/lib/tr_email_reply_parser.rb +464 -0
- data/test/email_reply_parser_test.rb +441 -0
- data/test/emails/correct_sig.txt +4 -0
- data/test/emails/email_1_1.txt +13 -0
- data/test/emails/email_1_2.txt +51 -0
- data/test/emails/email_1_3.txt +55 -0
- data/test/emails/email_1_4.txt +5 -0
- data/test/emails/email_1_5.txt +15 -0
- data/test/emails/email_1_6.txt +15 -0
- data/test/emails/email_1_7.txt +12 -0
- data/test/emails/email_1_8.txt +6 -0
- data/test/emails/email_1_9.txt +9 -0
- data/test/emails/email_2_1.txt +25 -0
- data/test/emails/email_2_2.txt +10 -0
- data/test/emails/email_2_3.txt +14 -0
- data/test/emails/email_2_4.txt +14 -0
- data/test/emails/email_2_5.txt +15 -0
- data/test/emails/email_2_6.txt +11 -0
- data/test/emails/email_2_7.txt +5 -0
- data/test/emails/email_2_8.txt +4 -0
- data/test/emails/email_2_9.txt +9 -0
- data/test/emails/email_2nd_paragraph_starting_with_on.txt +12 -0
- data/test/emails/email_BlackBerry.txt +3 -0
- data/test/emails/email_bullets.txt +22 -0
- data/test/emails/email_from_address_in_quote_header.txt +12 -0
- data/test/emails/email_from_name_in_quote_header.txt +12 -0
- data/test/emails/email_hyphens.txt +5 -0
- data/test/emails/email_iPhone.txt +3 -0
- data/test/emails/email_mentions_own_email_address.txt +6 -0
- data/test/emails/email_mentions_own_name.txt +6 -0
- data/test/emails/email_multi_word_sent_from_my_mobile_device.txt +3 -0
- data/test/emails/email_multiline_quote_header_es_mx.txt +8 -0
- data/test/emails/email_multiline_quote_header_fr.txt +8 -0
- data/test/emails/email_multiline_quote_header_from_first.txt +11 -0
- data/test/emails/email_multiline_quote_header_from_replyto_date_to_subject.txt +12 -0
- data/test/emails/email_multiline_quote_header_from_to_date_subject.txt +11 -0
- data/test/emails/email_multiline_quote_header_none.txt +11 -0
- data/test/emails/email_multiline_quote_header_pt_br.txt +8 -0
- data/test/emails/email_multiline_quote_header_with_asterisks.txt +21 -0
- data/test/emails/email_multiline_quote_header_with_cc.txt +9 -0
- data/test/emails/email_multiline_quote_header_with_multiline_headers.txt +14 -0
- data/test/emails/email_no_signature_deliminator.txt +7 -0
- data/test/emails/email_no_signature_deliminator_adds_a_middle_initial.txt +7 -0
- data/test/emails/email_one_is_not_on.txt +10 -0
- data/test/emails/email_sent_from_my_not_signature.txt +3 -0
- data/test/emails/email_was_showing_as_nothing_visible.txt +13 -0
- data/test/emails/new_content/email_1_2.txt +28 -0
- data/tr_email_reply_parser.gemspec +123 -0
- metadata +143 -0
checksums.yaml
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
---
|
2
|
+
!binary "U0hBMQ==":
|
3
|
+
metadata.gz: !binary |-
|
4
|
+
OWFiZWI0NDQ0Zjk3NmU2MGJmZTc2ZGI4MmQ0NDgxMzAzNzNiNmU3Mg==
|
5
|
+
data.tar.gz: !binary |-
|
6
|
+
NDc0ZTM1NWRkMjViNmE1MjlhZDBiMGVjMWVlNzc2YjAwNWNjZTA2Nw==
|
7
|
+
!binary "U0hBNTEy":
|
8
|
+
metadata.gz: !binary |-
|
9
|
+
MWVhNjJkZTVmNWMyZTlkMmM4OTc1ZjBmZTc3MjNmMWJkNGQ1MjJiMzBmNDZm
|
10
|
+
Mjg2MTM4NDAzNGM0YTg3N2Q4Y2JkYmVhZDlkNjBhMjU1NWZiNjBlZmNlNGQ4
|
11
|
+
NTZiOGZkNjg4MDgyMWQ0ZDkyY2E2MmM4ZmFiMWMwZmFhYTFlZjY=
|
12
|
+
data.tar.gz: !binary |-
|
13
|
+
MDk2OGVhZmFiYTIxYTFiYmRiYjcyNDE1NTUzNmVmN2QwNzQxNTRhYjAzMDE2
|
14
|
+
NjNkMGE4ZmY0M2ExNzcyOTYwZWVlMzg3MWUzNmRlZGJjZDNhMTZjMzExODdh
|
15
|
+
OWUxMjY5MzIzYTkwNWE2MWY4N2E0OGIwZDVjMzJkOWQ0MWMyOTg=
|
data/LICENSE
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
The MIT License
|
2
|
+
|
3
|
+
Copyright (c) GitHub
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
22
|
+
|
data/README.md
ADDED
@@ -0,0 +1,68 @@
|
|
1
|
+
# Email Reply Parser
|
2
|
+
|
3
|
+
[![Build Status](https://secure.travis-ci.org/lawrencepit/email_reply_parser.png?branch=master)](http://travis-ci.org/lawrencepit/email_reply_parser)
|
4
|
+
[![Code Climate](https://codeclimate.com/badge.png)](https://codeclimate.com/github/lawrencepit/email_reply_parser)
|
5
|
+
[![Gem Version](https://fury-badge.herokuapp.com/rb/email_reply_parser.png)](http://badge.fury.io/rb/email_reply_parser)
|
6
|
+
|
7
|
+
EmailReplyParser is a small library to parse plain text email content.
|
8
|
+
|
9
|
+
This is what GitHub uses to display comments that were created from
|
10
|
+
email replies. This code is being open sourced in an effort to
|
11
|
+
crowdsource the quality of our email representation.
|
12
|
+
|
13
|
+
## Usage
|
14
|
+
|
15
|
+
To parse reply body:
|
16
|
+
|
17
|
+
`parsed_body = EmailReplyParser.parse_reply(email_body, from_address)`
|
18
|
+
|
19
|
+
The `from_address` argument is optional. If it is included, the parser will attempt to strip signatures based on the name in the from address (for signatures that do not use a standard delimiter).
|
20
|
+
|
21
|
+
## Installation
|
22
|
+
|
23
|
+
Get it from [GitHub][github] or `gem install email_reply_parser`. Run `rake` to run the tests.
|
24
|
+
|
25
|
+
[github]: https://github.com/github/email_reply_parser
|
26
|
+
|
27
|
+
## Contribute
|
28
|
+
|
29
|
+
If you'd like to hack on EmailReplyParser, start by forking the repo on GitHub:
|
30
|
+
|
31
|
+
https://github.com/github/email_reply_parser
|
32
|
+
|
33
|
+
The best way to get your changes merged back into core is as follows:
|
34
|
+
|
35
|
+
* Clone down your fork
|
36
|
+
* Create a thoughtfully named topic branch to contain your change
|
37
|
+
* Hack away
|
38
|
+
* Add tests and make sure everything still passes by running rake
|
39
|
+
* If you are adding new functionality, document it in the README
|
40
|
+
* Do not change the version number, I will do that on my end
|
41
|
+
* If necessary, rebase your commits into logical chunks, without errors
|
42
|
+
* Push the branch up to GitHub
|
43
|
+
* Send a pull request to the `github/email_reply_parser` project.
|
44
|
+
|
45
|
+
## Known Issues
|
46
|
+
|
47
|
+
### Quoted Headers
|
48
|
+
|
49
|
+
Quoted headers like these currently don't work with other languages:
|
50
|
+
|
51
|
+
On <date>, <author> wrote:
|
52
|
+
|
53
|
+
> blah
|
54
|
+
|
55
|
+
### Weird Signatures
|
56
|
+
|
57
|
+
Not everyone follows this convention:
|
58
|
+
|
59
|
+
Hello
|
60
|
+
|
61
|
+
Saludos!!!!!!!!!!!!!!
|
62
|
+
Galactic President Superstar Mc Awesomeville
|
63
|
+
GitHub
|
64
|
+
|
65
|
+
**********************DISCLAIMER***********************************
|
66
|
+
* Note: blah blah blah *
|
67
|
+
**********************DISCLAIMER***********************************
|
68
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,135 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'rake'
|
3
|
+
require 'date'
|
4
|
+
|
5
|
+
#############################################################################
|
6
|
+
#
|
7
|
+
# Helper functions
|
8
|
+
#
|
9
|
+
#############################################################################
|
10
|
+
|
11
|
+
def name
|
12
|
+
@name ||= Dir['*.gemspec'].first.split('.').first
|
13
|
+
end
|
14
|
+
|
15
|
+
def version
|
16
|
+
line = File.read("lib/#{name}.rb")[/^\s*VERSION\s*=\s*.*/]
|
17
|
+
line.match(/.*VERSION\s*=\s*['"](.*)['"]/)[1]
|
18
|
+
end
|
19
|
+
|
20
|
+
def date
|
21
|
+
Date.today.to_s
|
22
|
+
end
|
23
|
+
|
24
|
+
def rubyforge_project
|
25
|
+
name
|
26
|
+
end
|
27
|
+
|
28
|
+
def gemspec_file
|
29
|
+
"#{name}.gemspec"
|
30
|
+
end
|
31
|
+
|
32
|
+
def gem_file
|
33
|
+
"#{name}-#{version}.gem"
|
34
|
+
end
|
35
|
+
|
36
|
+
def replace_header(head, header_name)
|
37
|
+
head.sub!(/(\.#{header_name}\s*= ').*'/) { "#{$1}#{send(header_name)}'"}
|
38
|
+
end
|
39
|
+
|
40
|
+
#############################################################################
|
41
|
+
#
|
42
|
+
# Standard tasks
|
43
|
+
#
|
44
|
+
#############################################################################
|
45
|
+
|
46
|
+
task :default => :test
|
47
|
+
|
48
|
+
require 'rake/testtask'
|
49
|
+
Rake::TestTask.new(:test) do |test|
|
50
|
+
test.libs << 'lib' << 'test'
|
51
|
+
test.pattern = 'test/*_test.rb'
|
52
|
+
test.verbose = true
|
53
|
+
end
|
54
|
+
|
55
|
+
desc "Open an irb session preloaded with this library"
|
56
|
+
task :console do
|
57
|
+
sh "irb -rubygems -r ./lib/#{name}.rb"
|
58
|
+
end
|
59
|
+
|
60
|
+
#############################################################################
|
61
|
+
#
|
62
|
+
# Custom tasks (add your own tasks here)
|
63
|
+
#
|
64
|
+
#############################################################################
|
65
|
+
|
66
|
+
|
67
|
+
|
68
|
+
#############################################################################
|
69
|
+
#
|
70
|
+
# Packaging tasks
|
71
|
+
#
|
72
|
+
#############################################################################
|
73
|
+
|
74
|
+
desc "Create tag v#{version} and build and push #{gem_file} to Rubygems"
|
75
|
+
task :release => :build do
|
76
|
+
unless `git branch` =~ /^\* master$/
|
77
|
+
puts "You must be on the master branch to release!"
|
78
|
+
exit!
|
79
|
+
end
|
80
|
+
sh "git commit --allow-empty -a -m 'Release #{version}'"
|
81
|
+
sh "git tag v#{version}"
|
82
|
+
sh "git push origin master"
|
83
|
+
sh "git push origin v#{version}"
|
84
|
+
sh "gem push pkg/#{name}-#{version}.gem"
|
85
|
+
end
|
86
|
+
|
87
|
+
desc "Build #{gem_file} into the pkg directory"
|
88
|
+
task :build => :gemspec do
|
89
|
+
sh "mkdir -p pkg"
|
90
|
+
sh "gem build #{gemspec_file}"
|
91
|
+
sh "mv #{gem_file} pkg"
|
92
|
+
end
|
93
|
+
|
94
|
+
desc "Generate #{gemspec_file}"
|
95
|
+
task :gemspec => :validate do
|
96
|
+
# read spec file and split out manifest section
|
97
|
+
spec = File.read(gemspec_file)
|
98
|
+
head, manifest, tail = spec.split(" # = MANIFEST =\n")
|
99
|
+
|
100
|
+
# replace name version and date
|
101
|
+
replace_header(head, :name)
|
102
|
+
replace_header(head, :version)
|
103
|
+
replace_header(head, :date)
|
104
|
+
#comment this out if your rubyforge_project has a different name
|
105
|
+
replace_header(head, :rubyforge_project)
|
106
|
+
|
107
|
+
# determine file list from git ls-files
|
108
|
+
files = `git ls-files`.
|
109
|
+
split("\n").
|
110
|
+
sort.
|
111
|
+
reject { |file| file =~ /^\./ }.
|
112
|
+
reject { |file| file =~ /^(rdoc|pkg)/ }.
|
113
|
+
map { |file| " #{file}" }.
|
114
|
+
join("\n")
|
115
|
+
|
116
|
+
# piece file back together and write
|
117
|
+
manifest = " s.files = %w[\n#{files}\n ]\n"
|
118
|
+
spec = [head, manifest, tail].join(" # = MANIFEST =\n")
|
119
|
+
File.open(gemspec_file, 'w') { |io| io.write(spec) }
|
120
|
+
puts "Updated #{gemspec_file}"
|
121
|
+
end
|
122
|
+
|
123
|
+
desc "Validate #{gemspec_file}"
|
124
|
+
task :validate do
|
125
|
+
libfiles = Dir['lib/*'] - ["lib/#{name}.rb", "lib/#{name}"]
|
126
|
+
unless libfiles.empty?
|
127
|
+
puts "Directory `lib` should only contain a `#{name}.rb` file and `#{name}` dir."
|
128
|
+
exit!
|
129
|
+
end
|
130
|
+
unless Dir['VERSION*'].empty?
|
131
|
+
puts "A `VERSION` file at root level violates Gem best practices."
|
132
|
+
exit!
|
133
|
+
end
|
134
|
+
end
|
135
|
+
|
@@ -0,0 +1,464 @@
|
|
1
|
+
require 'strscan'
|
2
|
+
|
3
|
+
# EmailReplyParser is a small library to parse plain text email content. The
|
4
|
+
# goal is to identify which fragments are quoted, part of a signature, or
|
5
|
+
# original body content. We want to support both top and bottom posters, so
|
6
|
+
# no simple "REPLY ABOVE HERE" content is used.
|
7
|
+
#
|
8
|
+
# Beyond RFC 5322 (which is handled by the [Ruby mail gem][mail]), there aren't
|
9
|
+
# any real standards for how emails are created. This attempts to parse out
|
10
|
+
# common conventions for things like replies:
|
11
|
+
#
|
12
|
+
# this is some text
|
13
|
+
#
|
14
|
+
# On <date>, <author> wrote:
|
15
|
+
# > blah blah
|
16
|
+
# > blah blah
|
17
|
+
#
|
18
|
+
# ... and signatures:
|
19
|
+
#
|
20
|
+
# this is some text
|
21
|
+
#
|
22
|
+
# --
|
23
|
+
# Bob
|
24
|
+
# http://homepage.com/~bob
|
25
|
+
#
|
26
|
+
# Each of these is parsed into a Fragment object.
|
27
|
+
#
|
28
|
+
# EmailReplyParser also attempts to figure out which of these blocks should
|
29
|
+
# be hidden from users.
|
30
|
+
#
|
31
|
+
# [mail]: https://github.com/mikel/mail
|
32
|
+
class EmailReplyParser
|
33
|
+
VERSION = "0.6.0"
|
34
|
+
|
35
|
+
# Public: Splits an email body into a list of Fragments.
|
36
|
+
#
|
37
|
+
# text - A String email body.
|
38
|
+
# from_address - from address of the email (optional)
|
39
|
+
#
|
40
|
+
# Returns an Email instance.
|
41
|
+
def self.read(text, from_address = "")
|
42
|
+
Email.new.read(text, from_address)
|
43
|
+
end
|
44
|
+
|
45
|
+
# Public: Get the text of the visible portions of the given email body.
|
46
|
+
#
|
47
|
+
# text - A String email body.
|
48
|
+
# from_address - from address of the email (optional)
|
49
|
+
#
|
50
|
+
# Returns a String.
|
51
|
+
def self.parse_reply(text, from_address = "")
|
52
|
+
self.read(text.to_s, from_address).visible_text
|
53
|
+
end
|
54
|
+
|
55
|
+
# Public: Get only the newly written content of the given email body, i.e.
# the fragments that are neither quoted, hidden, nor part of a signature.
#
# text - A String email body. Coerced with #to_s, as in parse_reply,
# so a nil body is treated as an empty one.
# from_address - from address of the email (optional)
#
# Returns a String.
def self.parse_new_content(text, from_address = "")
  self.read(text.to_s, from_address).new_content
end
|
58
|
+
|
59
|
+
### Emails
|
60
|
+
|
61
|
+
# An Email instance represents a parsed body String.
|
62
|
+
class Email
|
63
|
+
# Emails have an Array of Fragments.
|
64
|
+
attr_reader :fragments
|
65
|
+
|
66
|
+
def initialize
|
67
|
+
@fragments = []
|
68
|
+
end
|
69
|
+
|
70
|
+
# Public: Gets the combined text of the visible fragments of the email body.
|
71
|
+
#
|
72
|
+
# Returns a String.
|
73
|
+
def visible_text
|
74
|
+
fragments.select{|f| !f.hidden?}.map{|f| f.to_s}.join("\n").rstrip
|
75
|
+
end
|
76
|
+
|
77
|
+
def new_content
|
78
|
+
fragments.select{|f| !f.quoted? && !f.hidden? && !f.signature?}.map{|f| f.to_s}.join("\n").rstrip
|
79
|
+
end
|
80
|
+
|
81
|
+
# Splits the given text into a list of Fragments. This is roughly done by
|
82
|
+
# reversing the text and parsing from the bottom to the top. This way we
|
83
|
+
# can check for 'On <date>, <author> wrote:' lines above quoted blocks.
|
84
|
+
#
|
85
|
+
# text - A String email body.
|
86
|
+
# from_address - from address of the email (optional)
|
87
|
+
#
|
88
|
+
# Returns this same Email instance.
|
89
|
+
def read(text, from_address = "")
|
90
|
+
# parse out the from name if one exists and save for use later
|
91
|
+
@from_name_raw = parse_raw_name_from_address(from_address)
|
92
|
+
@from_name_normalized = normalize_name(@from_name_raw)
|
93
|
+
@from_email = parse_email_from_address(from_address)
|
94
|
+
|
95
|
+
text = normalize_text(text)
|
96
|
+
|
97
|
+
# The text is reversed initially due to the way we check for hidden
|
98
|
+
# fragments.
|
99
|
+
text = text.reverse
|
100
|
+
|
101
|
+
# This determines if any 'visible' Fragment has been found. Once any
|
102
|
+
# visible Fragment is found, stop looking for hidden ones.
|
103
|
+
@found_visible = false
|
104
|
+
|
105
|
+
# This instance variable points to the current Fragment. If the matched
|
106
|
+
# line fits, it should be added to this Fragment. Otherwise, finish it
|
107
|
+
# and start a new Fragment.
|
108
|
+
@fragment = nil
|
109
|
+
|
110
|
+
# Use the StringScanner to pull out each line of the email content.
|
111
|
+
@scanner = StringScanner.new(text)
|
112
|
+
while line = @scanner.scan_until(/\n/n)
|
113
|
+
scan_line(line)
|
114
|
+
end
|
115
|
+
|
116
|
+
# Be sure to parse the last line of the email.
|
117
|
+
if (last_line = @scanner.rest.to_s).size > 0
|
118
|
+
scan_line(last_line, true)
|
119
|
+
end
|
120
|
+
|
121
|
+
# Finish up the final fragment. Finishing a fragment will detect any
|
122
|
+
# attributes (hidden, signature, reply), and join each line into a
|
123
|
+
# string.
|
124
|
+
finish_fragment
|
125
|
+
|
126
|
+
@scanner = @fragment = nil
|
127
|
+
|
128
|
+
# Now that parsing is done, reverse the order.
|
129
|
+
@fragments.reverse!
|
130
|
+
self
|
131
|
+
end
|
132
|
+
|
133
|
+
private
|
134
|
+
EMPTY = "".freeze
|
135
|
+
|
136
|
+
COMMON_REPLY_HEADER_REGEXES = [
|
137
|
+
/^On(.+)wrote:$/nm,
|
138
|
+
/\A\d{4}\/\d{1,2}\/\d{1,2}\s+.{1,80}\s<[^@]+@[^@]+>\Z/,
|
139
|
+
]
|
140
|
+
|
141
|
+
# Line optionally starts with whitespace, contains two or more hyphens or
|
142
|
+
# underscores, and ends with optional whitespace.
|
143
|
+
# Example: '---' or '___' or '--- '
|
144
|
+
MULTI_LINE_SIGNATURE_REGEX = /^\s*[-_]{2,}\s*$/
|
145
|
+
|
146
|
+
# Line optionally starts with whitespace, followed by one hyphen, followed by a word character
|
147
|
+
# Example: '-Sandro'
|
148
|
+
ONE_LINE_SIGNATURE_REGEX = /^\s*-\w/
|
149
|
+
|
150
|
+
ORIGINAL_MESSAGE_SIGNATURE_REGEX = /^[\s_-]+(Original Message)?[\s_-]+$/
|
151
|
+
|
152
|
+
# No block-quotes (> or <): the line starts with "Sent from my", followed by one to three words.
|
153
|
+
# Example: "Sent from my iPhone 3G"
|
154
|
+
SENT_FROM_REGEX = /^Sent from my (\s*\w+){1,3}(\s*<.*>)?$/
|
155
|
+
|
156
|
+
if defined?(Regexp::NOENCODING)
|
157
|
+
SIGNATURE_REGEX = Regexp.new(Regexp.union(MULTI_LINE_SIGNATURE_REGEX, ONE_LINE_SIGNATURE_REGEX, ORIGINAL_MESSAGE_SIGNATURE_REGEX, SENT_FROM_REGEX).source, Regexp::NOENCODING)
|
158
|
+
else
|
159
|
+
SIGNATURE_REGEX = Regexp.new(Regexp.union(MULTI_LINE_SIGNATURE_REGEX, ONE_LINE_SIGNATURE_REGEX, ORIGINAL_MESSAGE_SIGNATURE_REGEX, SENT_FROM_REGEX).source)
|
160
|
+
end
|
161
|
+
|
162
|
+
# TODO: refactor out into an i18n.yml file
|
163
|
+
# Supports English, French, Es-Mexican, Pt-Brazilian
|
164
|
+
# Maps a label to a label-group
|
165
|
+
QUOTE_HEADER_LABELS = Hash[*{
|
166
|
+
:from => ["From", "De"],
|
167
|
+
:to => ["To", "Para", "A"],
|
168
|
+
:cc => ["CC"],
|
169
|
+
:reply_to => ["Reply-To"],
|
170
|
+
:date => ["Date", "Sent", "Enviado", "Enviada em", "Fecha"],
|
171
|
+
:subject => ["Subject", "Assunto", "Asunto", "Objet"]
|
172
|
+
}.map {|group, labels| labels.map {|label| [label.downcase, group]}}.flatten]
|
173
|
+
|
174
|
+
# normalize text so it is easier to parse
|
175
|
+
#
|
176
|
+
# text - text to normalize
|
177
|
+
#
|
178
|
+
# Returns a String
|
179
|
+
def normalize_text(text)
|
180
|
+
# in 1.9 we want to operate on the raw bytes
|
181
|
+
text = text.dup.force_encoding('binary') if text.respond_to?(:force_encoding)
|
182
|
+
|
183
|
+
# Normalize line endings.
|
184
|
+
text.gsub!("\r\n", "\n")
|
185
|
+
|
186
|
+
# Check for multi-line reply headers. Some clients break up
|
187
|
+
# the "On DATE, NAME <EMAIL> wrote:" line into multiple lines.
|
188
|
+
if match = text.match(/^(On\s(.+)wrote:)$/m)
|
189
|
+
# Remove all newlines from the reply header, as long as we don't have any double newline;
|
190
|
+
# if we do, then we have grabbed something that is not actually a reply header
|
191
|
+
text.gsub! match[1], match[1].gsub("\n", " ") unless match[1] =~ /\n\n/
|
192
|
+
end
|
193
|
+
|
194
|
+
# Some users may reply directly above a line of underscores.
|
195
|
+
# In order to ensure that these fragments are split correctly,
|
196
|
+
# make sure that all lines of underscores are preceded by
|
197
|
+
# at least two newline characters.
|
198
|
+
text.gsub!(/([^\n])(?=\n_{7}_+)$/m, "\\1\n")
|
199
|
+
|
200
|
+
text
|
201
|
+
end
|
202
|
+
|
203
|
+
# Parse a person's name from an e-mail address
|
204
|
+
#
|
205
|
+
# address - A String from address (for example "Name <user@example.com>").
|
206
|
+
#
|
207
|
+
# Returns a String.
|
208
|
+
def parse_name_from_address(address)
|
209
|
+
normalize_name(parse_raw_name_from_address(address))
|
210
|
+
end
|
211
|
+
|
212
|
+
# Extracts the display name that precedes the "<...>" part of a from
# address, without normalizing it.
#
# address - A String from address, or nil.
#
# Returns the stripped name String, or EMPTY when the address has no
# leading name made of word characters, whitespace and commas.
def parse_raw_name_from_address(address)
  # to_s lets a missing (nil) from address behave like an empty one.
  match = address.to_s.match(/^["']*([\w\s,]+)["']*\s*</)
  match ? match[1].strip : EMPTY
end
|
216
|
+
|
217
|
+
# Extracts the email address enclosed in angle brackets from a from
# address.
#
# address - A String from address, or nil.
#
# Returns the text between "<" and ">" when present, otherwise the
# address argument unchanged.
def parse_email_from_address(address)
  # Explicit parentheses avoid Ruby's "ambiguous first argument" warning;
  # to_s keeps a nil address from raising.
  match = address.to_s.match(/<(.*)>/)
  match ? match[1] : address
end
|
221
|
+
|
222
|
+
# Normalize a name to First Last
|
223
|
+
#
|
224
|
+
# name - name to normalize.
|
225
|
+
#
|
226
|
+
# Returns a String.
|
227
|
+
def normalize_name(name)
|
228
|
+
if name.include?(',')
|
229
|
+
make_name_first_then_last(name)
|
230
|
+
else
|
231
|
+
name
|
232
|
+
end
|
233
|
+
end
|
234
|
+
|
235
|
+
# Reorders a comma separated "Last, First" name into "First Last".
#
# name - A String name containing a comma.
#
# If the part before the first comma already contains a space it is assumed
# to be a full name (e.g. "John Smith, PhD") and is returned as is.
# Otherwise the second part is placed in front of the first. A missing part
# (e.g. "Smith," or ",") is skipped instead of raising.
#
# Returns a String.
def make_name_first_then_last(name)
  last_part, first_part = name.split(',')
  # split drops trailing empty fields, so either part may be nil.
  last_part = last_part.to_s
  return last_part if last_part.include?(" ")

  [first_part, last_part].map { |part| part.to_s.strip }.reject { |part| part.empty? }.join(" ")
end
|
243
|
+
|
244
|
+
### Line-by-Line Parsing
|
245
|
+
|
246
|
+
# Scans the given line of text and determines which fragment it belongs to.
|
247
|
+
def scan_line(line, last = false)
|
248
|
+
line.chomp!("\n")
|
249
|
+
line.reverse!
|
250
|
+
line.rstrip!
|
251
|
+
|
252
|
+
# Mark the current Fragment as a signature if the current line is empty
|
253
|
+
# and the Fragment starts with a common signature indicator.
|
254
|
+
# Mark the current Fragment as a quote if the current line is empty
|
255
|
+
# and the Fragment starts with a multiline quote header.
|
256
|
+
scan_signature_or_quote if @fragment && line == EMPTY
|
257
|
+
|
258
|
+
# We're looking for leading `>`'s to see if this line is part of a
|
259
|
+
# quoted Fragment.
|
260
|
+
is_quoted = !!(line =~ /^>+/n)
|
261
|
+
|
262
|
+
# Note that a common reply header also counts as part of the quoted
|
263
|
+
# Fragment, even though it doesn't start with `>`.
|
264
|
+
unless @fragment &&
|
265
|
+
((@fragment.quoted? == is_quoted) ||
|
266
|
+
(@fragment.quoted? && (line_is_reply_header?(line) || line == EMPTY)))
|
267
|
+
finish_fragment
|
268
|
+
@fragment = Fragment.new
|
269
|
+
@fragment.quoted = is_quoted
|
270
|
+
end
|
271
|
+
|
272
|
+
@fragment.add_line(line)
|
273
|
+
scan_signature_or_quote if last
|
274
|
+
end
|
275
|
+
|
276
|
+
def scan_signature_or_quote
|
277
|
+
if signature_line?(@fragment.lines.first)
|
278
|
+
@fragment.signature = true
|
279
|
+
finish_fragment
|
280
|
+
elsif multiline_quote_header_in_fragment?
|
281
|
+
@fragment.quoted = true
|
282
|
+
finish_fragment
|
283
|
+
end
|
284
|
+
end
|
285
|
+
|
286
|
+
# Returns +true+ if the current block in the current fragment has
|
287
|
+
# a multiline quote header, +false+ otherwise.
|
288
|
+
#
|
289
|
+
# The quote header we're looking for is mainly generated by Outlook
|
290
|
+
# clients. It's considered a quote header if the first 4 folded lines
|
291
|
+
# have one of the following forms:
|
292
|
+
#
|
293
|
+
# label: some text
|
294
|
+
# *label:* some text
|
295
|
+
#
|
296
|
+
# where a line like this:
|
297
|
+
#
|
298
|
+
# label: some text
|
299
|
+
# possibly indented text that belongs to the previous line
|
300
|
+
#
|
301
|
+
# is folded into:
|
302
|
+
#
|
303
|
+
# label: some text possibly indented text that belongs to the previous line
|
304
|
+
#
|
305
|
+
# and where label is a value from +QUOTE_HEADER_LABELS+ that appears
|
306
|
+
# only once in the first 4 lines and where each group of a label
|
307
|
+
# is represented at most once.
|
308
|
+
def multiline_quote_header_in_fragment?
|
309
|
+
folding = false
|
310
|
+
label_groups = []
|
311
|
+
@fragment.current_block.split("\n").each do |line|
|
312
|
+
if line =~ /\A\s*\*?([^:]+):(\s|\*)/
|
313
|
+
label = QUOTE_HEADER_LABELS[$1.downcase]
|
314
|
+
if label
|
315
|
+
return false if label_groups.include?(label)
|
316
|
+
return true if label_groups.length == 3
|
317
|
+
label_groups << label
|
318
|
+
folding = true
|
319
|
+
elsif !folding
|
320
|
+
return false
|
321
|
+
end
|
322
|
+
elsif !folding
|
323
|
+
return false
|
324
|
+
else
|
325
|
+
folding = true
|
326
|
+
end
|
327
|
+
end
|
328
|
+
return false
|
329
|
+
end
|
330
|
+
|
331
|
+
# Detects if a given line is the beginning of a signature
|
332
|
+
#
|
333
|
+
# line - A String line of text from the email.
|
334
|
+
#
|
335
|
+
# Returns true if the line is the beginning of a signature, or false.
|
336
|
+
def signature_line?(line)
|
337
|
+
line =~ SIGNATURE_REGEX || line_is_signature_name?(line)
|
338
|
+
end
|
339
|
+
|
340
|
+
# Detects if a given line is a common reply header.
|
341
|
+
#
|
342
|
+
# line - A String line of text from the email.
|
343
|
+
#
|
344
|
+
# Returns true if the line is a valid header, or false.
|
345
|
+
def line_is_reply_header?(line)
|
346
|
+
COMMON_REPLY_HEADER_REGEXES.each do |regex|
|
347
|
+
return true if line =~ regex
|
348
|
+
end
|
349
|
+
false
|
350
|
+
end
|
351
|
+
|
352
|
+
# Detects if the @from name is a big part of a given line and therefore the beginning of a signature
|
353
|
+
#
|
354
|
+
# line - A String line of text from the email.
|
355
|
+
#
|
356
|
+
# Returns true if @from_name is a big part of the line, or false.
|
357
|
+
def line_is_signature_name?(line)
|
358
|
+
regexp = generate_regexp_for_name()
|
359
|
+
@from_name_normalized != "" && (line =~ regexp) && ((@from_name_normalized.size.to_f / line.size) > 0.25)
|
360
|
+
end
|
361
|
+
|
362
|
+
# Generates a case-insensitive regexp for the normalized from name that
# allows additional words or initials between the parts of the name, so
# that "John Smith" also matches "John Q. Smith".
#
# Returns a Regexp.
def generate_regexp_for_name
  # Escape each part defensively so that a name is always matched literally.
  name_parts = @from_name_normalized.split(" ").map { |part| Regexp.escape(part) }
  separator = '[\w.\s]*'
  Regexp.new(name_parts.join(separator), Regexp::IGNORECASE)
end
|
368
|
+
|
369
|
+
# Builds the fragment string, after all lines have been added.
|
370
|
+
# It also checks to see if this Fragment is hidden. The hidden
|
371
|
+
# Fragment check reads from the bottom to the top.
|
372
|
+
#
|
373
|
+
# Any quoted Fragments or signature Fragments are marked hidden if they
|
374
|
+
# are below any visible Fragments. Visible Fragments are expected to
|
375
|
+
# contain original content by the author. If they are below a quoted
|
376
|
+
# Fragment, then the Fragment should be visible to give context to the
|
377
|
+
# reply.
|
378
|
+
#
|
379
|
+
# some original text (visible)
|
380
|
+
#
|
381
|
+
# > do you have any two's? (quoted, visible)
|
382
|
+
#
|
383
|
+
# Go fish! (visible)
|
384
|
+
#
|
385
|
+
# > --
|
386
|
+
# > Player 1 (quoted, hidden)
|
387
|
+
#
|
388
|
+
# --
|
389
|
+
# Player 2 (signature, hidden)
|
390
|
+
#
|
391
|
+
def finish_fragment
|
392
|
+
if @fragment
|
393
|
+
@fragment.finish
|
394
|
+
if !@found_visible
|
395
|
+
if @fragment.quoted? || @fragment.signature? ||
|
396
|
+
@fragment.reply_header? || @fragment.to_s.strip == EMPTY
|
397
|
+
@fragment.hidden = true
|
398
|
+
else
|
399
|
+
@found_visible = true
|
400
|
+
end
|
401
|
+
end
|
402
|
+
@fragments << @fragment
|
403
|
+
end
|
404
|
+
@fragment = nil
|
405
|
+
end
|
406
|
+
end
|
407
|
+
|
408
|
+
# Represents a group of paragraphs in the email sharing common attributes.
# Paragraphs should get their own fragment if they are a quoted area or a
# signature.
#
# The struct members (quoted, signature, reply_header, hidden) are boolean
# flags that all start out false.
class Fragment < Struct.new(:quoted, :signature, :reply_header, :hidden)
  # Array of String lines that make up the content of this fragment.
  # Lines are prepended as they are added; set to nil by #finish.
  attr_reader :lines

  # This is reserved for the joined String that is built when this Fragment
  # is finished. It is nil until #finish has been called.
  attr_reader :content

  def initialize
    self.quoted = self.signature = self.reply_header = self.hidden = false
    @lines = []
    @current_block = []
    @content = nil
  end

  alias quoted? quoted
  alias signature? signature
  alias reply_header? reply_header
  alias hidden? hidden

  # Prepends a line to this fragment. An empty line starts a new block, so
  # the current block is cleared; any other line is prepended to it.
  #
  # line - A String line of text, or nil (ignored).
  #
  # Returns nothing meaningful.
  def add_line(line)
    return unless line
    @lines.insert(0, line)
    if line == ""
      @current_block.clear
    else
      @current_block.insert(0, line)
    end
  end

  # The lines added since the last empty line, joined with newlines.
  # Only valid before #finish, which discards the block.
  #
  # Returns a String.
  def current_block
    @current_block.join("\n")
  end

  # Builds the string content by joining the lines, then releases the line
  # buffers.
  def finish
    @content = @lines.join("\n")
    @lines = @current_block = nil
  end

  # Returns the joined lines while the fragment is still being built, or
  # the finished content afterwards.
  def to_s
    @lines ? @lines.join("\n") : @content
  end

  # Returns the struct description followed by the inspected text content.
  def inspect
    # Struct#inspect already returns a String; inspecting it again would
    # wrap it in quotes and escape it.
    "#{super} : #{to_s.inspect}"
  end
end
|
463
|
+
end
|
464
|
+
|