chronicle-email 0.1.1 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 85cc6d5cd7ef25487ecb960c67f12e287c5a2c28894c6cba0b075bd7f0511725
4
- data.tar.gz: 39969678f5abf0104df8dce01dd6e30aeee2878fbe9082e7055f9ffd94c7ade5
3
+ metadata.gz: f9096dfd9082ccbbf59651fbeb3f7d8bbd6b00a246b1cff5094ce196f2a201ec
4
+ data.tar.gz: f3a702a0f56ce3301345c7be20cf2fdff5722354ef4e6f34aa83c1e0fb73a928
5
5
  SHA512:
6
- metadata.gz: 50ac6a07fdc6324430a54a6739e7493b72583f2e35220b6a92c5949cc5731b6276aaea9fbebcac012f4f32e57ef34f10ad069bdda43afad9f3b9ee15a6e832ed
7
- data.tar.gz: 32f9fb63a319f0b37745dc297d229dcf1113f68bef528d13a4dd21dfb4dd83d74a41ff38e906ff6dda284fb8d34b066cb7bfbcafa642ad7e190f56173fb6f018
6
+ metadata.gz: f522c605c2277ac3552670aa67eafbd8ffe258717ac81bfafb64f4d22db0ec63891ce47faa85e761216ba7d4a2d2cb10138c67aaf28228a0cd3f5c53d8f3f8a5
7
+ data.tar.gz: 700aa4b3162b5b566f83354232303873cda6377c0ead42d57c96d7db7fec41b620ed153a0d4af81ac29e5c1a0e253a6e74bb99ccfca25ab824db39238d36d5ed
data/.gitignore CHANGED
@@ -7,6 +7,8 @@
7
7
  /spec/reports/
8
8
  /tmp/
9
9
 
10
+ Gemfile.lock
11
+
10
12
  # rspec failure tracking
11
13
  .rspec_status
12
14
  .DS_Store
data/README.md CHANGED
@@ -1,16 +1,36 @@
1
1
  # Chronicle::Email
2
+ [![Gem Version](https://badge.fury.io/rb/chronicle-email.svg)](https://badge.fury.io/rb/chronicle-email)
2
3
 
3
- Email importer for [chronicle-etl](https://github.com/chronicle-app/chronicle-etl)
4
-
5
- ## Available classes
6
- - `email:mbox` - Extractor for processing .mbox files
7
- - `email:chronicle` - Transformer that converts an email into a chronicle schema
4
+ Extract and work with your email using the command line with this plugin for [chronicle-etl](https://github.com/chronicle-app/chronicle-etl).
8
5
 
9
6
  ## Usage
10
7
 
11
- ```bash
12
- gem install chronicle-etl
13
- gem install chronicle-email
8
+ ```sh
9
+ # Install chronicle-etl and this plugin
10
+ $ gem install chronicle-etl
11
+ $ chronicle-etl plugins:install email
12
+
13
+ # Process emails from an mbox file
14
+ $ chronicle-etl --extractor email:mbox -i test.mbox --transformer email --fields subject
15
+ ```
16
+
17
+ ## Available Connectors
18
+ ### Extractors
19
+
20
+ #### `mbox`
21
+ Extractor for importing emails from an mbox file
22
+
23
+ ##### Settings
24
+ - `input`: A path to an .mbox file
25
+
26
+ ### Transformers
27
+
28
+ #### `email`
29
+ Transform an email (in the form of a string) into Chronicle Schema
30
+
31
+ ##### Settings
32
+ - `body_as_markdown`: (default: false) Whether to convert the email body into markdown
33
+ - `remove_signature`: (default: true) Whether to attempt to strip out the email signature (using the [`email_reply_parser`](https://github.com/github/email_reply_parser) gem)
14
34
 
15
- chronicle-etl --extractor email:mbox --extractor-opts filename:"./mail.mbox" --transformer email:chronicle --loader stdout
16
- ```
35
+ ## Roadmap
36
+ - Add an IMAP (and Gmail) extractor #1
@@ -36,12 +36,12 @@ Gem::Specification.new do |spec|
36
36
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
37
37
  spec.require_paths = ["lib"]
38
38
 
39
- spec.add_dependency "chronicle-etl", "~> 0.2.2"
39
+ spec.add_dependency "chronicle-etl", "~> 0.4.4"
40
40
  spec.add_dependency "mail", "~> 2.7"
41
41
  spec.add_dependency 'email_reply_parser', '~> 0.5'
42
+ spec.add_dependency 'reverse_markdown', '~> 2.0'
42
43
 
43
44
  spec.add_development_dependency "bundler", "~> 2.1"
44
45
  spec.add_development_dependency "rake", "~> 13.0"
45
46
  spec.add_development_dependency "rspec", "~> 3.9"
46
- spec.add_development_dependency "pry-byebug", "~> 3.9"
47
47
  end
@@ -0,0 +1,122 @@
1
+ require 'chronicle/etl'
2
+ require 'mail'
3
+ require 'timeout'
4
+ require 'email_reply_parser'
5
+ require 'reverse_markdown'
6
+
7
+ module Chronicle
8
+ module Email
9
+ class EmailTransformer < Chronicle::ETL::Transformer
10
+ register_connector do |r|
11
+ r.description = 'an email object'
12
+ r.provider = 'email'
13
+ r.identifier = 'email'
14
+ end
15
+
16
+ setting :body_as_markdown, default: false
17
+ setting :remove_signature, default: true
18
+
19
+ def transform
20
+ build_messaged
21
+ end
22
+
23
+ def id
24
+ message.message_id || raise(Chronicle::ETL::UntransformableRecordError, "Email doesn't have an ID")
25
+ end
26
+
27
+ def timestamp
28
+ message.date&.to_time || raise(Chronicle::ETL::UntransformableRecordError, "Email doesn't have a timestamp")
29
+ end
30
+
31
+ private
32
+
33
+ def message
34
+ @message ||= Mail.new(@extraction.data[:email])
35
+ end
36
+
37
+ def build_messaged
38
+ record = ::Chronicle::ETL::Models::Activity.new
39
+ record.verb = 'messaged'
40
+ record.provider = 'email'
41
+ record.provider_id = id
42
+ record.end_at = timestamp
43
+
44
+ record.dedupe_on << [:verb, :provider, :provider_id]
45
+
46
+ record.actor = build_actor
47
+ record.involved = build_message
48
+ record
49
+ end
50
+
51
+ def build_actor
52
+ # sometimes the From: field is malformed and we can't build an
53
+ # actor out of it.
54
+ raise(Chronicle::ETL::UntransformableRecordError, "Can't determine email sender") unless message[:from]&.addrs&.any?
55
+
56
+ record = ::Chronicle::ETL::Models::Entity.new
57
+ record.represents = 'identity'
58
+ record.provider = 'email'
59
+ record.slug = message[:from].addrs.first.address
60
+ record.title = message[:from].addrs.first.display_name
61
+
62
+ record.dedupe_on << [:represents, :provider, :slug]
63
+
64
+ record
65
+ end
66
+
67
+ def build_message
68
+ record = ::Chronicle::ETL::Models::Entity.new
69
+ record.represents = 'message'
70
+ record.title = clean_subject(message.subject)
71
+ record.body = clean_body(message)
72
+ record.provider = 'email'
73
+ record.provider_id = id
74
+
75
+ # TODO: handle consumer
76
+ # TODO: handle email references
77
+ # TODO: handle email account owner
78
+ # TODO: handle attachments
79
+
80
+ record
81
+ end
82
+
83
+ def clean_subject(subject)
84
+ subject&.encode("UTF-8", invalid: :replace, undef: :replace)
85
+ end
86
+
87
+ def clean_body message
88
+ # FIXME: this all needs to be refactored
89
+
90
+ if message.multipart?
91
+ body = message.text_part&.decoded rescue Mail::UnknownEncodingType
92
+ else
93
+ body = message.body&.decoded rescue Mail::UnknownEncodingType
94
+ body = body_to_markdown if @config.body_as_markdown
95
+ end
96
+
97
+ body = body_without_signature(body) if @config.remove_signature
98
+
99
+ # Force UTF-8 encoding
100
+ body.encode("UTF-8", invalid: :replace, undef: :replace)
101
+ end
102
+
103
+ def body_to_markdown(body)
104
+ ReverseMarkdown.convert(body)
105
+ rescue StandardError
106
+ # Fall back to unparsed body? Raise Untransformable error?
107
+ end
108
+
109
+ def body_without_signature(body)
110
+ return unless body && body != ""
111
+
112
+ # FIXME: regex in EmailReplyParser gem seems to get into infinite loops
113
+ # with certain long bodies that have binary data
114
+ parsed_body = Timeout::timeout(5) do
115
+ EmailReplyParser.parse_reply(body)
116
+ end
117
+ rescue Timeout::Error, StandardError => e
118
+ return body
119
+ end
120
+ end
121
+ end
122
+ end
@@ -5,35 +5,55 @@ require 'tempfile'
5
5
  module Chronicle
6
6
  module Email
7
7
  class MboxExtractor < Chronicle::ETL::Extractor
8
+ register_connector do |r|
9
+ r.provider = 'email'
10
+ r.description = 'an .mbox file'
11
+ r.identifier = 'mbox'
12
+ end
13
+
14
+ setting :input, required: true
15
+
8
16
  # mbox format is a bunch of emails concatenated together, separated
9
17
  # by a line that starts with "From "
10
18
  NEW_EMAIL_REGEX = Regexp.new('^From [^\s]+ .{24}')
11
19
 
12
20
  def results_count
13
- file = File.open(@options[:filename])
14
- count = 0
15
- file.each do |line|
16
- count += 1 if line =~ NEW_EMAIL_REGEX
21
+ File.foreach(@filename).sum do |line|
22
+ line.scan(NEW_EMAIL_REGEX).count
17
23
  end
18
- return count
24
+ end
25
+
26
+ def prepare
27
+ @filename = @config.input.first
19
28
  end
20
29
 
21
30
  def extract
22
- file = File.open(@options[:filename])
23
- tmp = Tempfile.new('chronicile-mbox')
31
+ file = File.open(@filename)
32
+ tmp = Tempfile.new('chronicle-mbox')
24
33
 
34
+ # Read the .mbox file line by line and look for a header that indicates
35
+ # the start of a new email. As we read line by line, we save to a tmp
36
+ # file and then read it back when we notice the next header.
37
+ # Doing it this way is a lot faster than saving each line to a
38
+ # variable, especially when we're reading emails with large binary
39
+ # attachments.
40
+ #
41
+ # TODO: make this thread-safe (one tmp file per email?)
25
42
  file.each do |line|
26
43
  if line =~ NEW_EMAIL_REGEX
27
44
  if File.size(tmp) > 0
28
45
  tmp.rewind
29
46
  email = tmp.read
30
- yield email
47
+ yield Chronicle::ETL::Extraction.new(data: { email: email} )
31
48
  tmp.truncate(0)
32
49
  tmp.rewind
33
50
  end
34
51
  end
35
52
  tmp.write(line)
36
53
  end
54
+ ensure
55
+ tmp.close
56
+ tmp.unlink
37
57
  file.close
38
58
  end
39
59
  end
@@ -1,5 +1,5 @@
1
1
  module Chronicle
2
2
  module Email
3
- VERSION = "0.1.1"
3
+ VERSION = "0.2.0"
4
4
  end
5
5
  end
@@ -1,9 +1,8 @@
1
1
  require "chronicle/email/version"
2
- require "chronicle/email/chronicle_transformer"
2
+ require "chronicle/email/email_transformer"
3
3
  require "chronicle/email/mbox_extractor"
4
4
 
5
5
  module Chronicle
6
6
  module Email
7
- PROVIDER_NAME = "email"
8
7
  end
9
8
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: chronicle-email
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Louis
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-08-29 00:00:00.000000000 Z
11
+ date: 2022-03-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: chronicle-etl
@@ -16,14 +16,14 @@ dependencies:
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: 0.2.2
19
+ version: 0.4.4
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: 0.2.2
26
+ version: 0.4.4
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: mail
29
29
  requirement: !ruby/object:Gem::Requirement
@@ -53,49 +53,49 @@ dependencies:
53
53
  - !ruby/object:Gem::Version
54
54
  version: '0.5'
55
55
  - !ruby/object:Gem::Dependency
56
- name: bundler
56
+ name: reverse_markdown
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
59
  - - "~>"
60
60
  - !ruby/object:Gem::Version
61
- version: '2.1'
62
- type: :development
61
+ version: '2.0'
62
+ type: :runtime
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
- version: '2.1'
68
+ version: '2.0'
69
69
  - !ruby/object:Gem::Dependency
70
- name: rake
70
+ name: bundler
71
71
  requirement: !ruby/object:Gem::Requirement
72
72
  requirements:
73
73
  - - "~>"
74
74
  - !ruby/object:Gem::Version
75
- version: '13.0'
75
+ version: '2.1'
76
76
  type: :development
77
77
  prerelease: false
78
78
  version_requirements: !ruby/object:Gem::Requirement
79
79
  requirements:
80
80
  - - "~>"
81
81
  - !ruby/object:Gem::Version
82
- version: '13.0'
82
+ version: '2.1'
83
83
  - !ruby/object:Gem::Dependency
84
- name: rspec
84
+ name: rake
85
85
  requirement: !ruby/object:Gem::Requirement
86
86
  requirements:
87
87
  - - "~>"
88
88
  - !ruby/object:Gem::Version
89
- version: '3.9'
89
+ version: '13.0'
90
90
  type: :development
91
91
  prerelease: false
92
92
  version_requirements: !ruby/object:Gem::Requirement
93
93
  requirements:
94
94
  - - "~>"
95
95
  - !ruby/object:Gem::Version
96
- version: '3.9'
96
+ version: '13.0'
97
97
  - !ruby/object:Gem::Dependency
98
- name: pry-byebug
98
+ name: rspec
99
99
  requirement: !ruby/object:Gem::Requirement
100
100
  requirements:
101
101
  - - "~>"
@@ -117,11 +117,9 @@ extra_rdoc_files: []
117
117
  files:
118
118
  - ".gitignore"
119
119
  - ".rspec"
120
- - ".ruby-version"
121
120
  - ".travis.yml"
122
121
  - CODE_OF_CONDUCT.md
123
122
  - Gemfile
124
- - Gemfile.lock
125
123
  - LICENSE.txt
126
124
  - README.md
127
125
  - Rakefile
@@ -129,7 +127,7 @@ files:
129
127
  - bin/setup
130
128
  - chronicle-email.gemspec
131
129
  - lib/chronicle/email.rb
132
- - lib/chronicle/email/chronicle_transformer.rb
130
+ - lib/chronicle/email/email_transformer.rb
133
131
  - lib/chronicle/email/mbox_extractor.rb
134
132
  - lib/chronicle/email/version.rb
135
133
  homepage: https://github.com/chronicle-app/chronicle-email
@@ -154,7 +152,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
154
152
  - !ruby/object:Gem::Version
155
153
  version: '0'
156
154
  requirements: []
157
- rubygems_version: 3.1.2
155
+ rubygems_version: 3.3.3
158
156
  signing_key:
159
157
  specification_version: 4
160
158
  summary: Email importer for Chronicle
data/.ruby-version DELETED
@@ -1 +0,0 @@
1
- 2.7.1
data/Gemfile.lock DELETED
@@ -1,85 +0,0 @@
1
- PATH
2
- remote: .
3
- specs:
4
- chronicle-email (0.1.1)
5
- chronicle-etl (~> 0.2.2)
6
- email_reply_parser (~> 0.5)
7
- mail (~> 2.7)
8
-
9
- GEM
10
- remote: https://rubygems.org/
11
- specs:
12
- byebug (11.1.3)
13
- chronicle-etl (0.2.2)
14
- colorize (~> 0.8.1)
15
- thor (~> 0.20)
16
- tty-progressbar (~> 0.17)
17
- tty-table (~> 0.11)
18
- coderay (1.1.3)
19
- colorize (0.8.1)
20
- diff-lcs (1.4.4)
21
- email_reply_parser (0.5.10)
22
- equatable (0.6.1)
23
- mail (2.7.1)
24
- mini_mime (>= 0.1.1)
25
- method_source (1.0.0)
26
- mini_mime (1.0.2)
27
- necromancer (0.6.0)
28
- pastel (0.7.4)
29
- equatable (~> 0.6)
30
- tty-color (~> 0.5)
31
- pry (0.13.1)
32
- coderay (~> 1.1)
33
- method_source (~> 1.0)
34
- pry-byebug (3.9.0)
35
- byebug (~> 11.0)
36
- pry (~> 0.13.0)
37
- rake (13.0.1)
38
- rspec (3.9.0)
39
- rspec-core (~> 3.9.0)
40
- rspec-expectations (~> 3.9.0)
41
- rspec-mocks (~> 3.9.0)
42
- rspec-core (3.9.2)
43
- rspec-support (~> 3.9.3)
44
- rspec-expectations (3.9.2)
45
- diff-lcs (>= 1.2.0, < 2.0)
46
- rspec-support (~> 3.9.0)
47
- rspec-mocks (3.9.1)
48
- diff-lcs (>= 1.2.0, < 2.0)
49
- rspec-support (~> 3.9.0)
50
- rspec-support (3.9.3)
51
- strings (0.1.8)
52
- strings-ansi (~> 0.1)
53
- unicode-display_width (~> 1.5)
54
- unicode_utils (~> 1.4)
55
- strings-ansi (0.1.0)
56
- thor (0.20.3)
57
- tty-color (0.5.2)
58
- tty-cursor (0.7.1)
59
- tty-progressbar (0.17.0)
60
- strings-ansi (~> 0.1.0)
61
- tty-cursor (~> 0.7)
62
- tty-screen (~> 0.7)
63
- unicode-display_width (~> 1.6)
64
- tty-screen (0.8.1)
65
- tty-table (0.11.0)
66
- equatable (~> 0.6)
67
- necromancer (~> 0.5)
68
- pastel (~> 0.7.2)
69
- strings (~> 0.1.5)
70
- tty-screen (~> 0.7)
71
- unicode-display_width (1.7.0)
72
- unicode_utils (1.4.0)
73
-
74
- PLATFORMS
75
- ruby
76
-
77
- DEPENDENCIES
78
- bundler (~> 2.1)
79
- chronicle-email!
80
- pry-byebug (~> 3.9)
81
- rake (~> 13.0)
82
- rspec (~> 3.9)
83
-
84
- BUNDLED WITH
85
- 2.1.4
@@ -1,150 +0,0 @@
1
- require 'chronicle/etl'
2
- require 'mail'
3
- require 'timeout'
4
- require 'email_reply_parser'
5
-
6
- module Chronicle
7
- module Email
8
- class ChronicleTransformer < Chronicle::ETL::Transformer
9
- def transform
10
- message = Mail.new(@data.b)
11
- build_messaged(message)
12
- end
13
-
14
- def build_messaged message
15
- {
16
- type: 'activities',
17
- attributes: {
18
- verb: 'messaged',
19
- end_at: message.date,
20
- provider: 'email',
21
- provider_id: message.message_id,
22
- },
23
- meta: { dedupe_on: 'verb,provider,provider_id'},
24
- relationships: {
25
- actor: { data: build_actor(message) },
26
- involved: { data: build_message(message) }
27
- }
28
- }
29
- end
30
-
31
- def build_actor message
32
- # sometimes From: fields are malformed and we can't build an
33
- # actor out of it.
34
- return unless message[:from] && message[:from].addrs && message[:from].addrs.any?
35
-
36
- {
37
- type: 'entities',
38
- attributes: {
39
- represents: 'identity',
40
- provider: 'email',
41
- slug: message[:from].addrs.first.address,
42
- title: message[:from].addrs.first.display_name
43
- },
44
- meta: { dedupe_on: 'represents,provider,slug'}
45
- }
46
- end
47
-
48
- def build_message message
49
- {
50
- type: 'entities',
51
- attributes: {
52
- represents: 'message',
53
- title: clean_subject(message.subject),
54
- body: clean_body(message),
55
- provider: 'email',
56
- provider_id: message.message_id
57
- },
58
- meta: { dedupe_on: 'represents,provider,provider_id'},
59
- relationships: {
60
- consumers: { data: build_consumers(message) },
61
- antecedents: { data: build_references(message) },
62
- owners: { data: build_account(message) },
63
- # contains: { data: build_attachments(message) }
64
- }
65
- }
66
- end
67
-
68
- def build_account message
69
- return unless account_email = [message.header['delivered-to']].flatten[0]&.value
70
-
71
- {
72
- type: 'entities',
73
- attributes: {
74
- represents: 'identity',
75
- provider: 'email',
76
- slug: account_email
77
- },
78
- meta: { dedupe_on: 'provider,slug,represents' }
79
- }
80
- end
81
-
82
- def build_consumers(message)
83
- to = []
84
- to += message[:to].addrs if message[:to]
85
- to += message[:cc].addrs.flatten.compact if message[:cc]
86
-
87
- to.collect do |consumer|
88
- {
89
- type: 'entities',
90
- attributes: {
91
- represents: 'identity',
92
- provider: 'email',
93
- slug: consumer.address,
94
- title: consumer.display_name
95
- },
96
- meta: { dedupe_on: 'provider,slug' }
97
- }
98
- end
99
- end
100
-
101
- def build_references(message)
102
- references = [message.references].flatten.compact
103
- references.collect{|reference|
104
- {
105
- type: 'entities',
106
- attributes: {
107
- represents: 'message',
108
- provider: 'email',
109
- provider_id: reference
110
- },
111
- meta: { dedupe_on: 'represents,provider,provider_id' }
112
- }
113
- }
114
- end
115
-
116
- def clean_subject(subject)
117
- subject&.encode("UTF-8", invalid: :replace, undef: :replace)
118
- end
119
-
120
- def clean_body message
121
- # FIXME: this all needs to be refactored
122
-
123
- if message.multipart?
124
- body = message.text_part&.decoded rescue Mail::UnknownEncodingType
125
- else
126
- body = message.body&.decoded rescue Mail::UnknownEncodingType
127
- end
128
-
129
- if body && body != ""
130
- begin
131
- # regex in EmailReplyParse gem seems to get into infinite loops with
132
- # certain long bodies that have binary data
133
- parsed_body = Timeout::timeout(5) do
134
- EmailReplyParser.parse_reply(body)
135
- end
136
- rescue Timeout::Error => e
137
- return nil
138
- rescue StandardError => e # Whackamole game with these parsing / encoding problems
139
- return nil
140
- end
141
-
142
- # Force UTF-8 encoding
143
- return parsed_body.encode("UTF-8", invalid: :replace, undef: :replace)
144
- else
145
- return nil
146
- end
147
- end
148
- end
149
- end
150
- end