mail_extract 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +41 -0
- data/.rspec +3 -0
- data/README.md +54 -0
- data/Rakefile +11 -0
- data/lib/mail_extract.rb +12 -0
- data/lib/mail_extract/line.rb +36 -0
- data/lib/mail_extract/parser.rb +60 -0
- data/lib/mail_extract/version.rb +3 -0
- data/mail_extract.gemspec +19 -0
- data/spec/fixtures/result_simple.txt +7 -0
- data/spec/fixtures/result_simple_with_quotes.txt +28 -0
- data/spec/fixtures/simple.txt +13 -0
- data/spec/fixtures/simple_with_quotes.txt +51 -0
- data/spec/line_spec.rb +30 -0
- data/spec/parser_spec.rb +13 -0
- data/spec/spec_helper.rb +17 -0
- metadata +102 -0
data/.gitignore
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
!.gitignore
|
2
|
+
*.gem
|
3
|
+
*.rbc
|
4
|
+
*.sw[a-p]
|
5
|
+
*.tmproj
|
6
|
+
*.tmproject
|
7
|
+
*.un~
|
8
|
+
*~
|
9
|
+
.DS_Store
|
10
|
+
.Spotlight-V100
|
11
|
+
.Trashes
|
12
|
+
._*
|
13
|
+
.bundle
|
14
|
+
.config
|
15
|
+
.directory
|
16
|
+
.elc
|
17
|
+
.redcar
|
18
|
+
.yardoc
|
19
|
+
/.emacs.desktop
|
20
|
+
/.emacs.desktop.lock
|
21
|
+
Desktop.ini
|
22
|
+
Gemfile.lock
|
23
|
+
Icon?
|
24
|
+
InstalledFiles
|
25
|
+
Session.vim
|
26
|
+
Thumbs.db
|
27
|
+
\#*\#
|
28
|
+
_yardoc
|
29
|
+
auto-save-list
|
30
|
+
coverage
|
31
|
+
doc/
|
32
|
+
lib/bundler/man
|
33
|
+
pkg
|
34
|
+
pkg/*
|
35
|
+
rdoc
|
36
|
+
spec/reports
|
37
|
+
test/tmp
|
38
|
+
test/version_tmp
|
39
|
+
tmp
|
40
|
+
tmtags
|
41
|
+
tramp
|
data/.rspec
ADDED
data/README.md
ADDED
@@ -0,0 +1,54 @@
|
|
1
|
+
# MailExtract
|
2
|
+
|
3
|
+
MailExtract is a small ruby library to parse plain-text email contents.
|
4
|
+
|
5
|
+
It removes all quoted text and signatures leaving only original text.
|
6
|
+
|
7
|
+
## Installation
|
8
|
+
|
9
|
+
gem install mail_extract
|
10
|
+
|
11
|
+
## Usage
|
12
|
+
|
13
|
+
### General usage
|
14
|
+
|
15
|
+
require 'mail_extract'
|
16
|
+
|
17
|
+
body = MailExtract::Parser.new('MESSAGE').body
|
18
|
+
|
19
|
+
# or via shortcut
|
20
|
+
body = MailExtract.new('MESSAGE').body
|
21
|
+
|
22
|
+
### Using with Mail gem
|
23
|
+
|
24
|
+
require 'mail'
|
25
|
+
require 'mail_extract'
|
26
|
+
|
27
|
+
mail = Mail.read_from_string(YOUR_MESSAGE_BODY)
|
28
|
+
|
29
|
+
# find only plain-text parts
|
30
|
+
if mail.multipart?
|
31
|
+
part = mail.parts.select { |p| p.content_type =~ /text\/plain/ }.first rescue nil
|
32
|
+
unless part.nil?
|
33
|
+
message = part.body.decoded
|
34
|
+
end
|
35
|
+
else
|
36
|
+
message = mail.body.decoded
|
37
|
+
end
|
38
|
+
|
39
|
+
clean_message = MailExtract.new(message).body
|
40
|
+
|
41
|
+
## Known issues
|
42
|
+
|
43
|
+
- Non-standard signature patterns (those that do not follow the --, ___ conventions)
|
44
|
+
- Non-standard quote patterns (those that do not start with >)
|
45
|
+
|
46
|
+
## License
|
47
|
+
|
48
|
+
Copyright © 2011 Dan Sosedoff.
|
49
|
+
|
50
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
51
|
+
|
52
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
53
|
+
|
54
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/Rakefile
ADDED
data/lib/mail_extract.rb
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
module MailExtract
  # A single line of a plain-text email together with its detected type.
  #
  # The type is one of :text, :quote or :signature and is derived only from
  # the content of the line itself (no surrounding context is considered).
  class Line
    # body - the string the line was built from, unmodified
    # type - :text, :quote or :signature
    attr_reader :body, :type

    # Patterns mapped to the line type they indicate. They are tried in
    # turn by #detect_type and the first one that matches decides the type.
    PATTERNS = {
      /^[>]+\s?/    => :quote,
      /^--/         => :signature,
      /^-- /        => :signature,
      /^[_]{2,}\n?/ => :signature,
      /^[-]{2,}\n?/ => :signature
    }

    # Initialize a new MailExtract::Line object
    # str - Line content (may include the trailing newline)
    #
    def initialize(str)
      @body = str
      @type = detect_type(str)
    end

    private

    # Returns the type of the given line: :quote for a reply header or a
    # line matching a quote pattern, :signature for a signature delimiter,
    # :text otherwise.
    #
    def detect_type(line)
      return :quote if quote_header?(line)

      matched = PATTERNS.find { |pattern, _type| line =~ pattern }
      matched ? matched.last : :text
    end

    # Detects the start line of quoted text, e.g.
    # "On Tue, 2011-03-01 at 18:02 +0530, somebody wrote:".
    #
    # The word boundary after "On" keeps ordinary sentences that merely
    # begin with those two letters ("Only ...", "Once ...") from being
    # treated as a reply header.
    #
    def quote_header?(line)
      stripped = line.strip
      return false unless stripped =~ /^On\b/
      return false unless line =~ /at [\d:]+/

      !(stripped =~ /wrote:?\z/).nil?
    end
  end
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
require 'strscan'
|
2
|
+
|
3
|
+
module MailExtract
  # Extracts the original text of a plain-text email by dropping lines that
  # belong to quoted replies or to a signature.
  class Parser
    # The extracted message text (String, stripped of surrounding whitespace).
    attr_reader :body

    # Initialize a new MailExtract::Parser object
    # text - Email message body. nil is treated as an empty message so that
    #        callers that found no plain-text part get an empty body instead
    #        of a NoMethodError.
    #
    def initialize(text)
      @lines     = []
      @text      = text.to_s.strip
      @body      = ""
      @last_type = :text  # type of the previously processed line
      @type      = :text  # section the parser is currently in
      parse
    end

    private

    # Process email message body line by line and build the resulting body
    # from the lines that were kept.
    #
    def parse
      # each_line yields every newline-terminated line and then the
      # unterminated remainder, if any.
      @text.each_line { |str| parse_line(str) }
      @body = @lines.join("\n").strip
    end

    # Process a single line
    #
    # Updates the current section (@type) from the type of this line and of
    # the previous line, then keeps the line only while the current section
    # is :text.
    #
    def parse_line(str)
      line = MailExtract::Line.new(str)

      case line.type
      when :quote
        @type = :quote if @last_type == :text
      when :text
        @type = :text if @last_type == :quote
        # Text directly following a signature line stays part of the signature.
        @type = :signature if @last_type == :signature
      when :signature
        if @last_type == :text
          @type = :signature
        elsif @last_type == :quote
          @type = :quote
        end
      end

      @last_type = line.type
      @lines << line.body.strip if @type == :text
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
require File.expand_path('../lib/mail_extract/version', __FILE__)
|
3
|
+
|
4
|
+
# Gem specification for mail_extract. File lists are taken from the files
# tracked by git at build time.
Gem::Specification.new do |spec|
  spec.name        = 'mail_extract'
  spec.version     = MailExtract::VERSION.dup
  spec.author      = 'Dan Sosedoff'
  spec.email       = 'dan.sosedoff@gmail.com'
  spec.homepage    = 'https://github.com/sosedoff/mail_extract'
  spec.summary     = 'Extracts email message body'
  spec.description = 'Email body parser that strips out all quotes and signatures.'

  tracked_files = `git ls-files`.split("\n")
  tracked_tests = `git ls-files -- {test,spec,features}/*`.split("\n")
  tracked_bins  = `git ls-files -- bin/*`.split("\n")

  spec.files         = tracked_files
  spec.test_files    = tracked_tests
  spec.executables   = tracked_bins.map { |path| File.basename(path) }
  spec.require_paths = ['lib']

  spec.add_development_dependency 'rspec', '~> 2.6'
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
Hi,
|
2
|
+
|
3
|
+
You can list the keys for the bucket and call delete for each. Or if you
|
4
|
+
put the keys (and kept track of them in your test) you can delete them
|
5
|
+
one at a time (without incurring the cost of calling list first.)
|
6
|
+
|
7
|
+
Something like:
|
8
|
+
|
9
|
+
String bucket = "my_bucket";
|
10
|
+
BucketResponse bucketResponse = riakClient.listBucket(bucket);
|
11
|
+
RiakBucketInfo bucketInfo = bucketResponse.getBucketInfo();
|
12
|
+
|
13
|
+
for(String key : bucketInfo.getKeys()) {
|
14
|
+
riakClient.delete(bucket, key);
|
15
|
+
}
|
16
|
+
|
17
|
+
|
18
|
+
would do it.
|
19
|
+
|
20
|
+
See also
|
21
|
+
|
22
|
+
http://wiki.basho.com/REST-API.html#Bucket-operations
|
23
|
+
|
24
|
+
which says
|
25
|
+
|
26
|
+
"At the moment there is no straightforward way to delete an entire
|
27
|
+
Bucket. There is, however, an open ticket for the feature. To delete all
|
28
|
+
the keys in a bucket, you’ll need to delete them all individually."
|
@@ -0,0 +1,13 @@
|
|
1
|
+
Hi folks
|
2
|
+
|
3
|
+
What is the best way to clear a Riak bucket of all key, values after
|
4
|
+
running a test?
|
5
|
+
I am currently using the Java HTTP API.
|
6
|
+
|
7
|
+
-Abhishek Kona
|
8
|
+
|
9
|
+
|
10
|
+
_______________________________________________
|
11
|
+
riak-users mailing list
|
12
|
+
riak-users@lists.basho.com
|
13
|
+
http://lists.basho.com/mailman/listinfo/riak-users_lists.basho.com
|
@@ -0,0 +1,51 @@
|
|
1
|
+
Hi,
|
2
|
+
On Tue, 2011-03-01 at 18:02 +0530, Abhishek Kona wrote:
|
3
|
+
> Hi folks
|
4
|
+
>
|
5
|
+
> What is the best way to clear a Riak bucket of all key, values after
|
6
|
+
> running a test?
|
7
|
+
> I am currently using the Java HTTP API.
|
8
|
+
|
9
|
+
You can list the keys for the bucket and call delete for each. Or if you
|
10
|
+
put the keys (and kept track of them in your test) you can delete them
|
11
|
+
one at a time (without incurring the cost of calling list first.)
|
12
|
+
|
13
|
+
Something like:
|
14
|
+
|
15
|
+
String bucket = "my_bucket";
|
16
|
+
BucketResponse bucketResponse = riakClient.listBucket(bucket);
|
17
|
+
RiakBucketInfo bucketInfo = bucketResponse.getBucketInfo();
|
18
|
+
|
19
|
+
for(String key : bucketInfo.getKeys()) {
|
20
|
+
riakClient.delete(bucket, key);
|
21
|
+
}
|
22
|
+
|
23
|
+
|
24
|
+
would do it.
|
25
|
+
|
26
|
+
See also
|
27
|
+
|
28
|
+
http://wiki.basho.com/REST-API.html#Bucket-operations
|
29
|
+
|
30
|
+
which says
|
31
|
+
|
32
|
+
"At the moment there is no straightforward way to delete an entire
|
33
|
+
Bucket. There is, however, an open ticket for the feature. To delete all
|
34
|
+
the keys in a bucket, you’ll need to delete them all individually."
|
35
|
+
|
36
|
+
>
|
37
|
+
> -Abhishek Kona
|
38
|
+
>
|
39
|
+
>
|
40
|
+
> _______________________________________________
|
41
|
+
> riak-users mailing list
|
42
|
+
> riak-users@lists.basho.com
|
43
|
+
> http://lists.basho.com/mailman/listinfo/riak-users_lists.basho.com
|
44
|
+
|
45
|
+
|
46
|
+
|
47
|
+
|
48
|
+
_______________________________________________
|
49
|
+
riak-users mailing list
|
50
|
+
riak-users@lists.basho.com
|
51
|
+
http://lists.basho.com/mailman/listinfo/riak-users_lists.basho.com
|
data/spec/line_spec.rb
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Verifies the type MailExtract::Line assigns to individual lines.
describe 'MailExtract::Line' do
  # Returns the detected type for the given string.
  def type_of(str)
    MailExtract::Line.new(str).type
  end

  it 'detects quote start by date' do
    type_of('On Tue, 2011-03-01 at 18:02 +0530, somebody wrote:').should == :quote
    type_of('On 2011-03-01 at 18:02 somebody wrote').should == :quote
    type_of('On some day somebody wrote').should == :text
  end

  it 'detects quote' do
    ['> this is a quote', '> >> this is a quote'].each do |sample|
      type_of(sample).should == :quote
    end
  end

  it 'detects signature' do
    samples = ["--\nUsername", "-- \nUsername", "_______\nSome text"]

    samples.each do |sample|
      type_of(sample).should == :signature
    end
  end
end
|
data/spec/parser_spec.rb
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Compares the parser output for each input fixture with the matching
# "result_" fixture.
describe 'MailExtract::Parser' do
  it 'parses an email' do
    expected = result_fixture('simple.txt')
    MailExtract.new(fixture('simple.txt')).body.should == expected
  end

  it 'parses an email with quotes' do
    expected = result_fixture('simple_with_quotes.txt')
    MailExtract.new(fixture('simple_with_quotes.txt')).body.should == expected
  end
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
$:.unshift File.expand_path("../..", __FILE__)
|
2
|
+
|
3
|
+
require 'mail_extract'
|
4
|
+
|
5
|
+
# Returns the absolute path of the fixtures directory located next to this
# file, or of +file+ inside that directory when a name is given.
def fixture_path(file=nil)
  base = File.expand_path("../fixtures", __FILE__)
  file.nil? ? base : File.join(base, file)
end
|
10
|
+
|
11
|
+
# Reads and returns the contents of the named file from the fixtures
# directory.
def fixture(file)
  full_path = File.join(fixture_path, file)
  File.read(full_path)
end
|
14
|
+
|
15
|
+
# Reads the expected-output fixture for +file+, i.e. the fixture whose name
# is +file+ prefixed with "result_".
def result_fixture(file)
  expected_name = "result_#{file}"
  fixture(expected_name)
end
|
metadata
ADDED
@@ -0,0 +1,102 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: mail_extract
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 27
|
5
|
+
prerelease:
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 1
|
9
|
+
- 0
|
10
|
+
version: 0.1.0
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- Dan Sosedoff
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2011-07-21 00:00:00 -05:00
|
19
|
+
default_executable:
|
20
|
+
dependencies:
|
21
|
+
- !ruby/object:Gem::Dependency
|
22
|
+
name: rspec
|
23
|
+
prerelease: false
|
24
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ~>
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
hash: 15
|
30
|
+
segments:
|
31
|
+
- 2
|
32
|
+
- 6
|
33
|
+
version: "2.6"
|
34
|
+
type: :development
|
35
|
+
version_requirements: *id001
|
36
|
+
description: Email body parser that strips out all quotes and signatures.
|
37
|
+
email: dan.sosedoff@gmail.com
|
38
|
+
executables: []
|
39
|
+
|
40
|
+
extensions: []
|
41
|
+
|
42
|
+
extra_rdoc_files: []
|
43
|
+
|
44
|
+
files:
|
45
|
+
- .gitignore
|
46
|
+
- .rspec
|
47
|
+
- README.md
|
48
|
+
- Rakefile
|
49
|
+
- lib/mail_extract.rb
|
50
|
+
- lib/mail_extract/line.rb
|
51
|
+
- lib/mail_extract/parser.rb
|
52
|
+
- lib/mail_extract/version.rb
|
53
|
+
- mail_extract.gemspec
|
54
|
+
- spec/fixtures/result_simple.txt
|
55
|
+
- spec/fixtures/result_simple_with_quotes.txt
|
56
|
+
- spec/fixtures/simple.txt
|
57
|
+
- spec/fixtures/simple_with_quotes.txt
|
58
|
+
- spec/line_spec.rb
|
59
|
+
- spec/parser_spec.rb
|
60
|
+
- spec/spec_helper.rb
|
61
|
+
has_rdoc: true
|
62
|
+
homepage: https://github.com/sosedoff/mail_extract
|
63
|
+
licenses: []
|
64
|
+
|
65
|
+
post_install_message:
|
66
|
+
rdoc_options: []
|
67
|
+
|
68
|
+
require_paths:
|
69
|
+
- lib
|
70
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
71
|
+
none: false
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
hash: 3
|
76
|
+
segments:
|
77
|
+
- 0
|
78
|
+
version: "0"
|
79
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
80
|
+
none: false
|
81
|
+
requirements:
|
82
|
+
- - ">="
|
83
|
+
- !ruby/object:Gem::Version
|
84
|
+
hash: 3
|
85
|
+
segments:
|
86
|
+
- 0
|
87
|
+
version: "0"
|
88
|
+
requirements: []
|
89
|
+
|
90
|
+
rubyforge_project:
|
91
|
+
rubygems_version: 1.6.2
|
92
|
+
signing_key:
|
93
|
+
specification_version: 3
|
94
|
+
summary: Extracts email message body
|
95
|
+
test_files:
|
96
|
+
- spec/fixtures/result_simple.txt
|
97
|
+
- spec/fixtures/result_simple_with_quotes.txt
|
98
|
+
- spec/fixtures/simple.txt
|
99
|
+
- spec/fixtures/simple_with_quotes.txt
|
100
|
+
- spec/line_spec.rb
|
101
|
+
- spec/parser_spec.rb
|
102
|
+
- spec/spec_helper.rb
|