mail_extract 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,41 @@
1
+ !.gitignore
2
+ *.gem
3
+ *.rbc
4
+ *.sw[a-p]
5
+ *.tmproj
6
+ *.tmproject
7
+ *.un~
8
+ *~
9
+ .DS_Store
10
+ .Spotlight-V100
11
+ .Trashes
12
+ ._*
13
+ .bundle
14
+ .config
15
+ .directory
16
+ .elc
17
+ .redcar
18
+ .yardoc
19
+ /.emacs.desktop
20
+ /.emacs.desktop.lock
21
+ Desktop.ini
22
+ Gemfile.lock
23
+ Icon?
24
+ InstalledFiles
25
+ Session.vim
26
+ Thumbs.db
27
+ \#*\#
28
+ _yardoc
29
+ auto-save-list
30
+ coverage
31
+ doc/
32
+ lib/bundler/man
33
+ pkg
34
+ pkg/*
35
+ rdoc
36
+ spec/reports
37
+ test/tmp
38
+ test/version_tmp
39
+ tmp
40
+ tmtags
41
+ tramp
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --color
2
+ --format=nested
3
+ --backtrace
data/README.md ADDED
@@ -0,0 +1,54 @@
1
+ # MailExtract
2
+
3
+ MailExtract is a small ruby library to parse plain-text email contents.
4
+
5
+ It removes all quoted text and signatures leaving only original text.
6
+
7
+ ## Installation
8
+
9
+ gem install mail_extract
10
+
11
+ ## Usage
12
+
13
+ ### General usage
14
+
15
+ require 'mail_extract'
16
+
17
+ body = MailExtract::Parser.new('MESSAGE').body
18
+
19
+ # or via shortcut
20
+ body = MailExtract.new('MESSAGE').body
21
+
22
+ ### Using with Mail gem
23
+
24
+ require 'mail'
25
+ require 'mail_extract'
26
+
27
+ mail = Mail.read_from_string(YOUR_MESSAGE_BODY)
28
+
29
+ # find only plain-text parts
30
+ if mail.multipart?
31
+ part = mail.parts.select { |p| p.content_type =~ /text\/plain/ }.first rescue nil
32
+ unless part.nil?
33
+ message = part.body.decoded
34
+ end
35
+ else
36
+ message = mail.body.decoded
37
+ end
38
+
39
+ clean_message = MailExtract.new(message).body
40
+
41
+ ## Known issues
42
+
43
+ - Non-standard signature patterns (those that do not start with -- or __) are not detected
44
+ - Non-standard quote patterns (those that do not start with >) are not detected
45
+
46
+ ## License
47
+
48
+ Copyright © 2011 Dan Sosedoff.
49
+
50
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
51
+
52
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
53
+
54
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/Rakefile ADDED
@@ -0,0 +1,11 @@
1
+ #!/usr/bin/env rake
2
+
3
+ require "bundler"
4
+ require "rspec/core/rake_task"
5
+
6
# Define the `spec` task; it runs every spec file directly under spec/.
RSpec::Core::RakeTask.new(:spec) { |t| t.pattern = 'spec/*_spec.rb' }

# Plain `rake` (the default task) and `rake test` both delegate to `spec`.
[:default, :test].each { |name| task name => :spec }
@@ -0,0 +1,12 @@
1
+ require 'mail_extract/line'
2
+ require 'mail_extract/parser'
3
+
4
module MailExtract
  # Convenience constructor so callers can write MailExtract.new(body)
  # instead of MailExtract::Parser.new(body).
  #
  # body - Email message body, handed to the parser unchanged
  #
  # Returns the new MailExtract::Parser instance.
  def self.new(body)
    MailExtract::Parser.new(body)
  end
end
@@ -0,0 +1,36 @@
1
module MailExtract
  # A single line of an email body together with its detected type.
  #
  # body - the string exactly as it was given to the constructor
  # type - one of :text, :quote or :signature
  class Line
    attr_reader :body, :type

    # Prefix patterns, checked in order; the first one that matches decides
    # the type. /^--/ also covers the conventional "-- " delimiter and any
    # longer run of dashes, so those need no entries of their own.
    # Note that ^ matches at the start of any line inside the string.
    PATTERNS = {
      /^[>]+\s?/    => :quote,
      /^--/         => :signature,
      /^[_]{2,}\n?/ => :signature
    }.freeze

    # str - the line to classify
    def initialize(str)
      @body = str
      detect_type(str)
    end

    private

    # Sets @type for the given line: :quote for a quote header or a line
    # matching a quote pattern, :signature for a line matching a signature
    # pattern, :text otherwise.
    def detect_type(line)
      if quote_header?(line)
        @type = :quote
        return
      end

      matched = PATTERNS.find { |pattern, _type| line =~ pattern }
      @type = matched ? matched.last : :text
    end

    # True-ish when the line looks like the attribution that introduces quoted
    # text, e.g. "On Tue, 2011-03-01 at 18:02 +0530, somebody wrote:".
    # All three parts are required: a leading "On", an "at <time>" fragment
    # and a trailing "wrote" with an optional colon.
    def quote_header?(line)
      stripped = line.strip
      stripped =~ /^On/ && line =~ /at [\d:]+/ && stripped =~ /wrote:?\z/
    end
  end
end
@@ -0,0 +1,60 @@
1
+ require 'strscan'
2
+
3
module MailExtract
  # Extracts the original text of a plain-text email by dropping quoted
  # lines and everything that follows a signature delimiter.
  #
  # The parser walks the message line by line and keeps a small state:
  # @type is the kind of section currently being read and @last_type is
  # the type of the previous line. Only lines read while @type is :text
  # end up in #body.
  class Parser
    attr_reader :body

    # Initialize a new MailExtract::Parser object and parse immediately.
    #
    # text - Email message body. nil is treated as an empty message, so a
    #        mail without a usable plain-text part yields an empty body
    #        instead of raising.
    #
    def initialize(text)
      @lines     = []
      @text      = text.to_s.strip
      @body      = ""
      @last_type = :text
      @type      = :text
      parse
    end

    private

    # Process email message body: feed every line to parse_line, then join
    # the kept lines and trim surrounding whitespace.
    #
    def parse
      @text.each_line { |str| parse_line(str) }
      @body = @lines.join("\n").strip
    end

    # Process a single line: update the section state and keep the line
    # (stripped) when it belongs to the original text.
    #
    def parse_line(str)
      line = MailExtract::Line.new(str)
      @type = next_type(line.type)
      @last_type = line.type
      @lines << line.body.strip if @type == :text
    end

    # Returns the section type after seeing a line of +line_type+, based on
    # the previous line's type. Any combination not listed leaves the
    # current section unchanged.
    #
    def next_type(line_type)
      case [line_type, @last_type]
      when [:quote, :text]          then :quote     # quoted block starts
      when [:text, :quote]          then :text      # quoted block ended
      when [:text, :signature]      then :signature # text after delimiter is signature
      when [:signature, :text]      then :signature # signature starts
      when [:signature, :quote]     then :quote     # delimiter right after a quote
      else @type
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
module MailExtract
  # Gem version. The guard leaves an already-defined VERSION untouched, so
  # evaluating this file a second time does not reassign the constant.
  unless defined?(::MailExtract::VERSION)
    VERSION = "0.1.0".freeze
  end
end
@@ -0,0 +1,19 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require File.expand_path('../lib/mail_extract/version', __FILE__)
3
+
4
# Gem specification for mail_extract. The file lists are taken from git,
# so the gem must be built from inside a git checkout.
Gem::Specification.new do |spec|
  # Identity
  spec.name    = 'mail_extract'
  spec.version = MailExtract::VERSION.dup

  # Authorship and project links
  spec.author   = 'Dan Sosedoff'
  spec.email    = 'dan.sosedoff@gmail.com'
  spec.homepage = 'https://github.com/sosedoff/mail_extract'

  # Descriptions
  spec.summary     = %q{Extracts email message body}
  spec.description = %q{Email body parser that strips out all quotes and signatures.}

  # Packaged files, as tracked by git
  tracked_files = `git ls-files`.split("\n")
  spec_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  bin_files     = `git ls-files -- bin/*`.split("\n")

  spec.files         = tracked_files
  spec.test_files    = spec_files
  spec.executables   = bin_files.map { |path| File.basename(path) }
  spec.require_paths = ['lib']

  spec.add_development_dependency 'rspec', '~> 2.6'
end
@@ -0,0 +1,7 @@
1
+ Hi folks
2
+
3
+ What is the best way to clear a Riak bucket of all key, values after
4
+ running a test?
5
+ I am currently using the Java HTTP API.
6
+
7
+ -Abhishek Kona
@@ -0,0 +1,28 @@
1
+ Hi,
2
+
3
+ You can list the keys for the bucket and call delete for each. Or if you
4
+ put the keys (and kept track of them in your test) you can delete them
5
+ one at a time (without incurring the cost of calling list first.)
6
+
7
+ Something like:
8
+
9
+ String bucket = "my_bucket";
10
+ BucketResponse bucketResponse = riakClient.listBucket(bucket);
11
+ RiakBucketInfo bucketInfo = bucketResponse.getBucketInfo();
12
+
13
+ for(String key : bucketInfo.getKeys()) {
14
+ riakClient.delete(bucket, key);
15
+ }
16
+
17
+
18
+ would do it.
19
+
20
+ See also
21
+
22
+ http://wiki.basho.com/REST-API.html#Bucket-operations
23
+
24
+ which says
25
+
26
+ "At the moment there is no straightforward way to delete an entire
27
+ Bucket. There is, however, an open ticket for the feature. To delete all
28
+ the keys in a bucket, you’ll need to delete them all individually."
@@ -0,0 +1,13 @@
1
+ Hi folks
2
+
3
+ What is the best way to clear a Riak bucket of all key, values after
4
+ running a test?
5
+ I am currently using the Java HTTP API.
6
+
7
+ -Abhishek Kona
8
+
9
+
10
+ _______________________________________________
11
+ riak-users mailing list
12
+ riak-users@lists.basho.com
13
+ http://lists.basho.com/mailman/listinfo/riak-users_lists.basho.com
@@ -0,0 +1,51 @@
1
+ Hi,
2
+ On Tue, 2011-03-01 at 18:02 +0530, Abhishek Kona wrote:
3
+ > Hi folks
4
+ >
5
+ > What is the best way to clear a Riak bucket of all key, values after
6
+ > running a test?
7
+ > I am currently using the Java HTTP API.
8
+
9
+ You can list the keys for the bucket and call delete for each. Or if you
10
+ put the keys (and kept track of them in your test) you can delete them
11
+ one at a time (without incurring the cost of calling list first.)
12
+
13
+ Something like:
14
+
15
+ String bucket = "my_bucket";
16
+ BucketResponse bucketResponse = riakClient.listBucket(bucket);
17
+ RiakBucketInfo bucketInfo = bucketResponse.getBucketInfo();
18
+
19
+ for(String key : bucketInfo.getKeys()) {
20
+ riakClient.delete(bucket, key);
21
+ }
22
+
23
+
24
+ would do it.
25
+
26
+ See also
27
+
28
+ http://wiki.basho.com/REST-API.html#Bucket-operations
29
+
30
+ which says
31
+
32
+ "At the moment there is no straightforward way to delete an entire
33
+ Bucket. There is, however, an open ticket for the feature. To delete all
34
+ the keys in a bucket, you’ll need to delete them all individually."
35
+
36
+ >
37
+ > -Abhishek Kona
38
+ >
39
+ >
40
+ > _______________________________________________
41
+ > riak-users mailing list
42
+ > riak-users@lists.basho.com
43
+ > http://lists.basho.com/mailman/listinfo/riak-users_lists.basho.com
44
+
45
+
46
+
47
+
48
+ _______________________________________________
49
+ riak-users mailing list
50
+ riak-users@lists.basho.com
51
+ http://lists.basho.com/mailman/listinfo/riak-users_lists.basho.com
data/spec/line_spec.rb ADDED
@@ -0,0 +1,30 @@
1
+ require 'spec_helper'
2
+
3
describe 'MailExtract::Line' do
  # Type detected by MailExtract::Line for the given string.
  def type_of(str)
    MailExtract::Line.new(str).type
  end

  it 'detects quote start by date' do
    expectations = [
      ['On Tue, 2011-03-01 at 18:02 +0530, somebody wrote:', :quote],
      ['On 2011-03-01 at 18:02 somebody wrote',              :quote],
      ['On some day somebody wrote',                         :text]
    ]

    expectations.each do |text, expected|
      type_of(text).should == expected
    end
  end

  it 'detects quote' do
    ['> this is a quote', '> >> this is a quote'].each do |text|
      type_of(text).should == :quote
    end
  end

  it 'detects signature' do
    samples = ["--\nUsername", "-- \nUsername", "_______\nSome text"]

    samples.each do |text|
      type_of(text).should == :signature
    end
  end
end
@@ -0,0 +1,13 @@
1
+ require 'spec_helper'
2
+
3
describe 'MailExtract::Parser' do
  # Each example parses a fixture and compares the extracted body with the
  # matching result_* fixture.
  [
    ['parses an email',             'simple.txt'],
    ['parses an email with quotes', 'simple_with_quotes.txt']
  ].each do |description, file|
    it description do
      parsed = MailExtract.new(fixture(file)).body
      parsed.should == result_fixture(file)
    end
  end
end
@@ -0,0 +1,17 @@
1
+ $:.unshift File.expand_path("../..", __FILE__)
2
+
3
+ require 'mail_extract'
4
+
5
# Absolute path of the fixtures directory next to this file, or of +file+
# inside that directory when a file name is given.
def fixture_path(file=nil)
  base = File.expand_path("../fixtures", __FILE__)
  file.nil? ? base : File.join(base, file)
end
10
+
11
# Contents of the named file in the fixtures directory.
def fixture(file)
  full_path = File.join(fixture_path, file)
  File.read(full_path)
end
14
+
15
# Contents of the expected-result fixture for +file+, i.e. the fixture
# whose name is +file+ prefixed with "result_".
def result_fixture(file)
  expected_name = "result_#{file}"
  fixture(expected_name)
end
metadata ADDED
@@ -0,0 +1,102 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: mail_extract
3
+ version: !ruby/object:Gem::Version
4
+ hash: 27
5
+ prerelease:
6
+ segments:
7
+ - 0
8
+ - 1
9
+ - 0
10
+ version: 0.1.0
11
+ platform: ruby
12
+ authors:
13
+ - Dan Sosedoff
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2011-07-21 00:00:00 -05:00
19
+ default_executable:
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ name: rspec
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ~>
28
+ - !ruby/object:Gem::Version
29
+ hash: 15
30
+ segments:
31
+ - 2
32
+ - 6
33
+ version: "2.6"
34
+ type: :development
35
+ version_requirements: *id001
36
+ description: Email body parser that strips out all quotes and signatures.
37
+ email: dan.sosedoff@gmail.com
38
+ executables: []
39
+
40
+ extensions: []
41
+
42
+ extra_rdoc_files: []
43
+
44
+ files:
45
+ - .gitignore
46
+ - .rspec
47
+ - README.md
48
+ - Rakefile
49
+ - lib/mail_extract.rb
50
+ - lib/mail_extract/line.rb
51
+ - lib/mail_extract/parser.rb
52
+ - lib/mail_extract/version.rb
53
+ - mail_extract.gemspec
54
+ - spec/fixtures/result_simple.txt
55
+ - spec/fixtures/result_simple_with_quotes.txt
56
+ - spec/fixtures/simple.txt
57
+ - spec/fixtures/simple_with_quotes.txt
58
+ - spec/line_spec.rb
59
+ - spec/parser_spec.rb
60
+ - spec/spec_helper.rb
61
+ has_rdoc: true
62
+ homepage: https://github.com/sosedoff/mail_extract
63
+ licenses: []
64
+
65
+ post_install_message:
66
+ rdoc_options: []
67
+
68
+ require_paths:
69
+ - lib
70
+ required_ruby_version: !ruby/object:Gem::Requirement
71
+ none: false
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ hash: 3
76
+ segments:
77
+ - 0
78
+ version: "0"
79
+ required_rubygems_version: !ruby/object:Gem::Requirement
80
+ none: false
81
+ requirements:
82
+ - - ">="
83
+ - !ruby/object:Gem::Version
84
+ hash: 3
85
+ segments:
86
+ - 0
87
+ version: "0"
88
+ requirements: []
89
+
90
+ rubyforge_project:
91
+ rubygems_version: 1.6.2
92
+ signing_key:
93
+ specification_version: 3
94
+ summary: Extracts email message body
95
+ test_files:
96
+ - spec/fixtures/result_simple.txt
97
+ - spec/fixtures/result_simple_with_quotes.txt
98
+ - spec/fixtures/simple.txt
99
+ - spec/fixtures/simple_with_quotes.txt
100
+ - spec/line_spec.rb
101
+ - spec/parser_spec.rb
102
+ - spec/spec_helper.rb