chronicle-email 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +2 -0
- data/README.md +30 -10
- data/chronicle-email.gemspec +2 -2
- data/lib/chronicle/email/email_transformer.rb +122 -0
- data/lib/chronicle/email/mbox_extractor.rb +28 -8
- data/lib/chronicle/email/version.rb +1 -1
- data/lib/chronicle/email.rb +1 -2
- metadata +17 -19
- data/.ruby-version +0 -1
- data/Gemfile.lock +0 -85
- data/lib/chronicle/email/chronicle_transformer.rb +0 -150
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: f9096dfd9082ccbbf59651fbeb3f7d8bbd6b00a246b1cff5094ce196f2a201ec
|
|
4
|
+
data.tar.gz: f3a702a0f56ce3301345c7be20cf2fdff5722354ef4e6f34aa83c1e0fb73a928
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: f522c605c2277ac3552670aa67eafbd8ffe258717ac81bfafb64f4d22db0ec63891ce47faa85e761216ba7d4a2d2cb10138c67aaf28228a0cd3f5c53d8f3f8a5
|
|
7
|
+
data.tar.gz: 700aa4b3162b5b566f83354232303873cda6377c0ead42d57c96d7db7fec41b620ed153a0d4af81ac29e5c1a0e253a6e74bb99ccfca25ab824db39238d36d5ed
|
data/.gitignore
CHANGED
data/README.md
CHANGED
|
@@ -1,16 +1,36 @@
|
|
|
1
1
|
# Chronicle::Email
|
|
2
|
+
[![Gem Version](https://badge.fury.io/rb/chronicle-email.svg)](https://badge.fury.io/rb/chronicle-email)
|
|
2
3
|
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
## Available classes
|
|
6
|
-
- `email:mbox` - Extractor for processing .mbox files
|
|
7
|
-
- `email:chronicle` - Transformer that converts an email into a chronicle schema
|
|
4
|
+
Extract and work with your email using the command line with this plugin for [chronicle-etl](https://github.com/chronicle-app/chronicle-etl).
|
|
8
5
|
|
|
9
6
|
## Usage
|
|
10
7
|
|
|
11
|
-
```
|
|
12
|
-
|
|
13
|
-
gem install chronicle-
|
|
8
|
+
```sh
|
|
9
|
+
# Install chronicle-etl and this plugin
|
|
10
|
+
$ gem install chronicle-etl
|
|
11
|
+
$ chronicle-etl plugins:install email
|
|
12
|
+
|
|
13
|
+
# Process emails from an mbox file
|
|
14
|
+
$ chronicle-etl --extractor email:mbox -i test.mbox --transformer email --fields subject
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
## Available Connectors
|
|
18
|
+
### Extractors
|
|
19
|
+
|
|
20
|
+
#### `mbox`
|
|
21
|
+
Extractor for importing emails from an mbox file
|
|
22
|
+
|
|
23
|
+
##### Settings
|
|
24
|
+
- `input`: A path to an .mbox file
|
|
25
|
+
|
|
26
|
+
### Transformers
|
|
27
|
+
|
|
28
|
+
#### `email`
|
|
29
|
+
Transform an email (in the form of a string) into Chronicle Schema
|
|
30
|
+
|
|
31
|
+
##### Settings
|
|
32
|
+
- `body_as_markdown`: (default: false) Whether to convert the email body into markdown
|
|
33
|
+
- `remove_signature`: (default: true) Whether to attempt to strip out the email signature (using the [`email_reply_parser`](https://github.com/github/email_reply_parser) gem)
|
|
14
34
|
|
|
15
|
-
|
|
16
|
-
|
|
35
|
+
## Roadmap
|
|
36
|
+
- Add an IMAP (and gmail) extractor #1
|
data/chronicle-email.gemspec
CHANGED
|
@@ -36,12 +36,12 @@ Gem::Specification.new do |spec|
|
|
|
36
36
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
|
37
37
|
spec.require_paths = ["lib"]
|
|
38
38
|
|
|
39
|
-
spec.add_dependency "chronicle-etl", "~> 0.2.2"
|
|
39
|
+
spec.add_dependency "chronicle-etl", "~> 0.4.4"
|
|
40
40
|
spec.add_dependency "mail", "~> 2.7"
|
|
41
41
|
spec.add_dependency 'email_reply_parser', '~> 0.5'
|
|
42
|
+
spec.add_dependency 'reverse_markdown', '~> 2.0'
|
|
42
43
|
|
|
43
44
|
spec.add_development_dependency "bundler", "~> 2.1"
|
|
44
45
|
spec.add_development_dependency "rake", "~> 13.0"
|
|
45
46
|
spec.add_development_dependency "rspec", "~> 3.9"
|
|
46
|
-
spec.add_development_dependency "pry-byebug", "~> 3.9"
|
|
47
47
|
end
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
require 'chronicle/etl'
|
|
2
|
+
require 'mail'
|
|
3
|
+
require 'timeout'
|
|
4
|
+
require 'email_reply_parser'
|
|
5
|
+
require 'reverse_markdown'
|
|
6
|
+
|
|
7
|
+
module Chronicle
|
|
8
|
+
module Email
|
|
9
|
+
class EmailTransformer < Chronicle::ETL::Transformer
|
|
10
|
+
register_connector do |r|
|
|
11
|
+
r.description = 'an email object'
|
|
12
|
+
r.provider = 'email'
|
|
13
|
+
r.identifier = 'email'
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
setting :body_as_markdown, default: false
|
|
17
|
+
setting :remove_signature, default: true
|
|
18
|
+
|
|
19
|
+
def transform
|
|
20
|
+
build_messaged
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
def id
|
|
24
|
+
message.message_id || raise(Chronicle::ETL::UntransformableRecordError, "Email doesn't have an ID")
|
|
25
|
+
end
|
|
26
|
+
|
|
27
|
+
def timestamp
|
|
28
|
+
message.date&.to_time || raise(Chronicle::ETL::UntransformableRecordError, "Email doesn't have a timestamp")
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
private
|
|
32
|
+
|
|
33
|
+
def message
|
|
34
|
+
@message ||= Mail.new(@extraction.data[:email])
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
def build_messaged
|
|
38
|
+
record = ::Chronicle::ETL::Models::Activity.new
|
|
39
|
+
record.verb = 'messaged'
|
|
40
|
+
record.provider = 'email'
|
|
41
|
+
record.provider_id = id
|
|
42
|
+
record.end_at = timestamp
|
|
43
|
+
|
|
44
|
+
record.dedupe_on << [:verb, :provider, :provider_id]
|
|
45
|
+
|
|
46
|
+
record.actor = build_actor
|
|
47
|
+
record.involved = build_message
|
|
48
|
+
record
|
|
49
|
+
end
|
|
50
|
+
|
|
51
|
+
def build_actor
|
|
52
|
+
# sometimes From: fields are malformed and we can't build an
|
|
53
|
+
# actor out of it.
|
|
54
|
+
raise(Chronicle::ETL::UntransformableRecordError, "Can't determine email sender") unless message[:from]&.addrs&.any?
|
|
55
|
+
|
|
56
|
+
record = ::Chronicle::ETL::Models::Entity.new
|
|
57
|
+
record.represents = 'identity'
|
|
58
|
+
record.provider = 'email'
|
|
59
|
+
record.slug = message[:from].addrs.first.address
|
|
60
|
+
record.title = message[:from].addrs.first.display_name
|
|
61
|
+
|
|
62
|
+
record.dedupe_on << [:represents, :provider, :slug]
|
|
63
|
+
|
|
64
|
+
record
|
|
65
|
+
end
|
|
66
|
+
|
|
67
|
+
def build_message
|
|
68
|
+
record = ::Chronicle::ETL::Models::Entity.new
|
|
69
|
+
record.represents = 'message'
|
|
70
|
+
record.title = clean_subject(message.subject)
|
|
71
|
+
record.body = clean_body(message)
|
|
72
|
+
record.provider = 'email'
|
|
73
|
+
record.provider_id = id
|
|
74
|
+
|
|
75
|
+
# TODO: handle consumer
|
|
76
|
+
# TODO: handle email references
|
|
77
|
+
# TODO: handle email account owner
|
|
78
|
+
# TODO: handle attachments
|
|
79
|
+
|
|
80
|
+
record
|
|
81
|
+
end
|
|
82
|
+
|
|
83
|
+
def clean_subject(subject)
|
|
84
|
+
subject&.encode("UTF-8", invalid: :replace, undef: :replace)
|
|
85
|
+
end
|
|
86
|
+
|
|
87
|
+
def clean_body message
|
|
88
|
+
# FIXME: this all needs to be refactored
|
|
89
|
+
|
|
90
|
+
if message.multipart?
|
|
91
|
+
body = message.text_part&.decoded rescue Mail::UnknownEncodingType
|
|
92
|
+
else
|
|
93
|
+
body = message.body&.decoded rescue Mail::UnknownEncodingType
|
|
94
|
+
body = body_to_markdown if @config.body_as_markdown
|
|
95
|
+
end
|
|
96
|
+
|
|
97
|
+
body = body_without_signature(body) if @config.remove_signature
|
|
98
|
+
|
|
99
|
+
# Force UTF-8 encoding
|
|
100
|
+
body.encode("UTF-8", invalid: :replace, undef: :replace)
|
|
101
|
+
end
|
|
102
|
+
|
|
103
|
+
def body_to_markdown(body)
|
|
104
|
+
ReverseMarkdown.convert(body)
|
|
105
|
+
rescue StandardError
|
|
106
|
+
# Fall back to unparsed body? Raise Untransformable error?
|
|
107
|
+
end
|
|
108
|
+
|
|
109
|
+
def body_without_signature(body)
|
|
110
|
+
return unless body && body != ""
|
|
111
|
+
|
|
112
|
+
# FIXME: regex in EmailReplyParser gem seems to get into infinite loops
|
|
113
|
+
# with certain long bodies that have binary data
|
|
114
|
+
parsed_body = Timeout::timeout(5) do
|
|
115
|
+
EmailReplyParser.parse_reply(body)
|
|
116
|
+
end
|
|
117
|
+
rescue Timeout::Error, StandardError => e
|
|
118
|
+
return body
|
|
119
|
+
end
|
|
120
|
+
end
|
|
121
|
+
end
|
|
122
|
+
end
|
|
@@ -5,35 +5,55 @@ require 'tempfile'
|
|
|
5
5
|
module Chronicle
|
|
6
6
|
module Email
|
|
7
7
|
class MboxExtractor < Chronicle::ETL::Extractor
|
|
8
|
+
register_connector do |r|
|
|
9
|
+
r.provider = 'email'
|
|
10
|
+
r.description = 'an .mbox file'
|
|
11
|
+
r.identifier = 'mbox'
|
|
12
|
+
end
|
|
13
|
+
|
|
14
|
+
setting :input, required: true
|
|
15
|
+
|
|
8
16
|
# mbox format is a bunch of emails concatenated together, separated
|
|
9
17
|
# by a line that starts with "From "
|
|
10
18
|
NEW_EMAIL_REGEX = Regexp.new('^From [^\s]+ .{24}')
|
|
11
19
|
|
|
12
20
|
def results_count
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
file.each do |line|
|
|
16
|
-
count += 1 if line =~ NEW_EMAIL_REGEX
|
|
21
|
+
File.foreach(@filename).sum do |line|
|
|
22
|
+
line.scan(NEW_EMAIL_REGEX).count
|
|
17
23
|
end
|
|
18
|
-
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
def prepare
|
|
27
|
+
@filename = @config.input.first
|
|
19
28
|
end
|
|
20
29
|
|
|
21
30
|
def extract
|
|
22
|
-
file = File.open(@
|
|
23
|
-
tmp = Tempfile.new('
|
|
31
|
+
file = File.open(@filename)
|
|
32
|
+
tmp = Tempfile.new('chronicle-mbox')
|
|
24
33
|
|
|
34
|
+
# Read the .mbox file line by line and look for a header that indicates
|
|
35
|
+
# the start of a new email. As we read line by line, we save to a tmp
|
|
36
|
+
# file and then read it back when we notice the next header.
|
|
37
|
+
# Doing it this way is a lot faster than saving each line to a
|
|
38
|
+
# variable, especially when we're reading emails with large binary
|
|
39
|
+
# attachments.
|
|
40
|
+
#
|
|
41
|
+
# TODO: make this thread-safe (one tmp file per email?)
|
|
25
42
|
file.each do |line|
|
|
26
43
|
if line =~ NEW_EMAIL_REGEX
|
|
27
44
|
if File.size(tmp) > 0
|
|
28
45
|
tmp.rewind
|
|
29
46
|
email = tmp.read
|
|
30
|
-
yield email
|
|
47
|
+
yield Chronicle::ETL::Extraction.new(data: { email: email} )
|
|
31
48
|
tmp.truncate(0)
|
|
32
49
|
tmp.rewind
|
|
33
50
|
end
|
|
34
51
|
end
|
|
35
52
|
tmp.write(line)
|
|
36
53
|
end
|
|
54
|
+
ensure
|
|
55
|
+
tmp.close
|
|
56
|
+
tmp.unlink
|
|
37
57
|
file.close
|
|
38
58
|
end
|
|
39
59
|
end
|
data/lib/chronicle/email.rb
CHANGED
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: chronicle-email
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.1
|
|
4
|
+
version: 0.2.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Andrew Louis
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date:
|
|
11
|
+
date: 2022-03-20 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: chronicle-etl
|
|
@@ -16,14 +16,14 @@ dependencies:
|
|
|
16
16
|
requirements:
|
|
17
17
|
- - "~>"
|
|
18
18
|
- !ruby/object:Gem::Version
|
|
19
|
-
version: 0.2.2
|
|
19
|
+
version: 0.4.4
|
|
20
20
|
type: :runtime
|
|
21
21
|
prerelease: false
|
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
|
23
23
|
requirements:
|
|
24
24
|
- - "~>"
|
|
25
25
|
- !ruby/object:Gem::Version
|
|
26
|
-
version: 0.2.2
|
|
26
|
+
version: 0.4.4
|
|
27
27
|
- !ruby/object:Gem::Dependency
|
|
28
28
|
name: mail
|
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -53,49 +53,49 @@ dependencies:
|
|
|
53
53
|
- !ruby/object:Gem::Version
|
|
54
54
|
version: '0.5'
|
|
55
55
|
- !ruby/object:Gem::Dependency
|
|
56
|
-
name:
|
|
56
|
+
name: reverse_markdown
|
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
|
58
58
|
requirements:
|
|
59
59
|
- - "~>"
|
|
60
60
|
- !ruby/object:Gem::Version
|
|
61
|
-
version: '2.
|
|
62
|
-
type: :
|
|
61
|
+
version: '2.0'
|
|
62
|
+
type: :runtime
|
|
63
63
|
prerelease: false
|
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
|
65
65
|
requirements:
|
|
66
66
|
- - "~>"
|
|
67
67
|
- !ruby/object:Gem::Version
|
|
68
|
-
version: '2.
|
|
68
|
+
version: '2.0'
|
|
69
69
|
- !ruby/object:Gem::Dependency
|
|
70
|
-
name:
|
|
70
|
+
name: bundler
|
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
|
72
72
|
requirements:
|
|
73
73
|
- - "~>"
|
|
74
74
|
- !ruby/object:Gem::Version
|
|
75
|
-
version: '
|
|
75
|
+
version: '2.1'
|
|
76
76
|
type: :development
|
|
77
77
|
prerelease: false
|
|
78
78
|
version_requirements: !ruby/object:Gem::Requirement
|
|
79
79
|
requirements:
|
|
80
80
|
- - "~>"
|
|
81
81
|
- !ruby/object:Gem::Version
|
|
82
|
-
version: '
|
|
82
|
+
version: '2.1'
|
|
83
83
|
- !ruby/object:Gem::Dependency
|
|
84
|
-
name:
|
|
84
|
+
name: rake
|
|
85
85
|
requirement: !ruby/object:Gem::Requirement
|
|
86
86
|
requirements:
|
|
87
87
|
- - "~>"
|
|
88
88
|
- !ruby/object:Gem::Version
|
|
89
|
-
version: '
|
|
89
|
+
version: '13.0'
|
|
90
90
|
type: :development
|
|
91
91
|
prerelease: false
|
|
92
92
|
version_requirements: !ruby/object:Gem::Requirement
|
|
93
93
|
requirements:
|
|
94
94
|
- - "~>"
|
|
95
95
|
- !ruby/object:Gem::Version
|
|
96
|
-
version: '
|
|
96
|
+
version: '13.0'
|
|
97
97
|
- !ruby/object:Gem::Dependency
|
|
98
|
-
name:
|
|
98
|
+
name: rspec
|
|
99
99
|
requirement: !ruby/object:Gem::Requirement
|
|
100
100
|
requirements:
|
|
101
101
|
- - "~>"
|
|
@@ -117,11 +117,9 @@ extra_rdoc_files: []
|
|
|
117
117
|
files:
|
|
118
118
|
- ".gitignore"
|
|
119
119
|
- ".rspec"
|
|
120
|
-
- ".ruby-version"
|
|
121
120
|
- ".travis.yml"
|
|
122
121
|
- CODE_OF_CONDUCT.md
|
|
123
122
|
- Gemfile
|
|
124
|
-
- Gemfile.lock
|
|
125
123
|
- LICENSE.txt
|
|
126
124
|
- README.md
|
|
127
125
|
- Rakefile
|
|
@@ -129,7 +127,7 @@ files:
|
|
|
129
127
|
- bin/setup
|
|
130
128
|
- chronicle-email.gemspec
|
|
131
129
|
- lib/chronicle/email.rb
|
|
132
|
-
- lib/chronicle/email/chronicle_transformer.rb
|
|
130
|
+
- lib/chronicle/email/email_transformer.rb
|
|
133
131
|
- lib/chronicle/email/mbox_extractor.rb
|
|
134
132
|
- lib/chronicle/email/version.rb
|
|
135
133
|
homepage: https://github.com/chronicle-app/chronicle-email
|
|
@@ -154,7 +152,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
154
152
|
- !ruby/object:Gem::Version
|
|
155
153
|
version: '0'
|
|
156
154
|
requirements: []
|
|
157
|
-
rubygems_version: 3.
|
|
155
|
+
rubygems_version: 3.3.3
|
|
158
156
|
signing_key:
|
|
159
157
|
specification_version: 4
|
|
160
158
|
summary: Email importer for Chronicle
|
data/.ruby-version
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
2.7.1
|
data/Gemfile.lock
DELETED
|
@@ -1,85 +0,0 @@
|
|
|
1
|
-
PATH
|
|
2
|
-
remote: .
|
|
3
|
-
specs:
|
|
4
|
-
chronicle-email (0.1.1)
|
|
5
|
-
chronicle-etl (~> 0.2.2)
|
|
6
|
-
email_reply_parser (~> 0.5)
|
|
7
|
-
mail (~> 2.7)
|
|
8
|
-
|
|
9
|
-
GEM
|
|
10
|
-
remote: https://rubygems.org/
|
|
11
|
-
specs:
|
|
12
|
-
byebug (11.1.3)
|
|
13
|
-
chronicle-etl (0.2.2)
|
|
14
|
-
colorize (~> 0.8.1)
|
|
15
|
-
thor (~> 0.20)
|
|
16
|
-
tty-progressbar (~> 0.17)
|
|
17
|
-
tty-table (~> 0.11)
|
|
18
|
-
coderay (1.1.3)
|
|
19
|
-
colorize (0.8.1)
|
|
20
|
-
diff-lcs (1.4.4)
|
|
21
|
-
email_reply_parser (0.5.10)
|
|
22
|
-
equatable (0.6.1)
|
|
23
|
-
mail (2.7.1)
|
|
24
|
-
mini_mime (>= 0.1.1)
|
|
25
|
-
method_source (1.0.0)
|
|
26
|
-
mini_mime (1.0.2)
|
|
27
|
-
necromancer (0.6.0)
|
|
28
|
-
pastel (0.7.4)
|
|
29
|
-
equatable (~> 0.6)
|
|
30
|
-
tty-color (~> 0.5)
|
|
31
|
-
pry (0.13.1)
|
|
32
|
-
coderay (~> 1.1)
|
|
33
|
-
method_source (~> 1.0)
|
|
34
|
-
pry-byebug (3.9.0)
|
|
35
|
-
byebug (~> 11.0)
|
|
36
|
-
pry (~> 0.13.0)
|
|
37
|
-
rake (13.0.1)
|
|
38
|
-
rspec (3.9.0)
|
|
39
|
-
rspec-core (~> 3.9.0)
|
|
40
|
-
rspec-expectations (~> 3.9.0)
|
|
41
|
-
rspec-mocks (~> 3.9.0)
|
|
42
|
-
rspec-core (3.9.2)
|
|
43
|
-
rspec-support (~> 3.9.3)
|
|
44
|
-
rspec-expectations (3.9.2)
|
|
45
|
-
diff-lcs (>= 1.2.0, < 2.0)
|
|
46
|
-
rspec-support (~> 3.9.0)
|
|
47
|
-
rspec-mocks (3.9.1)
|
|
48
|
-
diff-lcs (>= 1.2.0, < 2.0)
|
|
49
|
-
rspec-support (~> 3.9.0)
|
|
50
|
-
rspec-support (3.9.3)
|
|
51
|
-
strings (0.1.8)
|
|
52
|
-
strings-ansi (~> 0.1)
|
|
53
|
-
unicode-display_width (~> 1.5)
|
|
54
|
-
unicode_utils (~> 1.4)
|
|
55
|
-
strings-ansi (0.1.0)
|
|
56
|
-
thor (0.20.3)
|
|
57
|
-
tty-color (0.5.2)
|
|
58
|
-
tty-cursor (0.7.1)
|
|
59
|
-
tty-progressbar (0.17.0)
|
|
60
|
-
strings-ansi (~> 0.1.0)
|
|
61
|
-
tty-cursor (~> 0.7)
|
|
62
|
-
tty-screen (~> 0.7)
|
|
63
|
-
unicode-display_width (~> 1.6)
|
|
64
|
-
tty-screen (0.8.1)
|
|
65
|
-
tty-table (0.11.0)
|
|
66
|
-
equatable (~> 0.6)
|
|
67
|
-
necromancer (~> 0.5)
|
|
68
|
-
pastel (~> 0.7.2)
|
|
69
|
-
strings (~> 0.1.5)
|
|
70
|
-
tty-screen (~> 0.7)
|
|
71
|
-
unicode-display_width (1.7.0)
|
|
72
|
-
unicode_utils (1.4.0)
|
|
73
|
-
|
|
74
|
-
PLATFORMS
|
|
75
|
-
ruby
|
|
76
|
-
|
|
77
|
-
DEPENDENCIES
|
|
78
|
-
bundler (~> 2.1)
|
|
79
|
-
chronicle-email!
|
|
80
|
-
pry-byebug (~> 3.9)
|
|
81
|
-
rake (~> 13.0)
|
|
82
|
-
rspec (~> 3.9)
|
|
83
|
-
|
|
84
|
-
BUNDLED WITH
|
|
85
|
-
2.1.4
|
|
@@ -1,150 +0,0 @@
|
|
|
1
|
-
require 'chronicle/etl'
|
|
2
|
-
require 'mail'
|
|
3
|
-
require 'timeout'
|
|
4
|
-
require 'email_reply_parser'
|
|
5
|
-
|
|
6
|
-
module Chronicle
|
|
7
|
-
module Email
|
|
8
|
-
class ChronicleTransformer < Chronicle::ETL::Transformer
|
|
9
|
-
def transform
|
|
10
|
-
message = Mail.new(@data.b)
|
|
11
|
-
build_messaged(message)
|
|
12
|
-
end
|
|
13
|
-
|
|
14
|
-
def build_messaged message
|
|
15
|
-
{
|
|
16
|
-
type: 'activities',
|
|
17
|
-
attributes: {
|
|
18
|
-
verb: 'messaged',
|
|
19
|
-
end_at: message.date,
|
|
20
|
-
provider: 'email',
|
|
21
|
-
provider_id: message.message_id,
|
|
22
|
-
},
|
|
23
|
-
meta: { dedupe_on: 'verb,provider,provider_id'},
|
|
24
|
-
relationships: {
|
|
25
|
-
actor: { data: build_actor(message) },
|
|
26
|
-
involved: { data: build_message(message) }
|
|
27
|
-
}
|
|
28
|
-
}
|
|
29
|
-
end
|
|
30
|
-
|
|
31
|
-
def build_actor message
|
|
32
|
-
# sometimes From: fields are malformed and we can't build an
|
|
33
|
-
# actor out of it.
|
|
34
|
-
return unless message[:from] && message[:from].addrs && message[:from].addrs.any?
|
|
35
|
-
|
|
36
|
-
{
|
|
37
|
-
type: 'entities',
|
|
38
|
-
attributes: {
|
|
39
|
-
represents: 'identity',
|
|
40
|
-
provider: 'email',
|
|
41
|
-
slug: message[:from].addrs.first.address,
|
|
42
|
-
title: message[:from].addrs.first.display_name
|
|
43
|
-
},
|
|
44
|
-
meta: { dedupe_on: 'represents,provider,slug'}
|
|
45
|
-
}
|
|
46
|
-
end
|
|
47
|
-
|
|
48
|
-
def build_message message
|
|
49
|
-
{
|
|
50
|
-
type: 'entities',
|
|
51
|
-
attributes: {
|
|
52
|
-
represents: 'message',
|
|
53
|
-
title: clean_subject(message.subject),
|
|
54
|
-
body: clean_body(message),
|
|
55
|
-
provider: 'email',
|
|
56
|
-
provider_id: message.message_id
|
|
57
|
-
},
|
|
58
|
-
meta: { dedupe_on: 'represents,provider,provider_id'},
|
|
59
|
-
relationships: {
|
|
60
|
-
consumers: { data: build_consumers(message) },
|
|
61
|
-
antecedents: { data: build_references(message) },
|
|
62
|
-
owners: { data: build_account(message) },
|
|
63
|
-
# contains: { data: build_attachments(message) }
|
|
64
|
-
}
|
|
65
|
-
}
|
|
66
|
-
end
|
|
67
|
-
|
|
68
|
-
def build_account message
|
|
69
|
-
return unless account_email = [message.header['delivered-to']].flatten[0]&.value
|
|
70
|
-
|
|
71
|
-
{
|
|
72
|
-
type: 'entities',
|
|
73
|
-
attributes: {
|
|
74
|
-
represents: 'identity',
|
|
75
|
-
provider: 'email',
|
|
76
|
-
slug: account_email
|
|
77
|
-
},
|
|
78
|
-
meta: { dedupe_on: 'provider,slug,represents' }
|
|
79
|
-
}
|
|
80
|
-
end
|
|
81
|
-
|
|
82
|
-
def build_consumers(message)
|
|
83
|
-
to = []
|
|
84
|
-
to += message[:to].addrs if message[:to]
|
|
85
|
-
to += message[:cc].addrs.flatten.compact if message[:cc]
|
|
86
|
-
|
|
87
|
-
to.collect do |consumer|
|
|
88
|
-
{
|
|
89
|
-
type: 'entities',
|
|
90
|
-
attributes: {
|
|
91
|
-
represents: 'identity',
|
|
92
|
-
provider: 'email',
|
|
93
|
-
slug: consumer.address,
|
|
94
|
-
title: consumer.display_name
|
|
95
|
-
},
|
|
96
|
-
meta: { dedupe_on: 'provider,slug' }
|
|
97
|
-
}
|
|
98
|
-
end
|
|
99
|
-
end
|
|
100
|
-
|
|
101
|
-
def build_references(message)
|
|
102
|
-
references = [message.references].flatten.compact
|
|
103
|
-
references.collect{|reference|
|
|
104
|
-
{
|
|
105
|
-
type: 'entities',
|
|
106
|
-
attributes: {
|
|
107
|
-
represents: 'message',
|
|
108
|
-
provider: 'email',
|
|
109
|
-
provider_id: reference
|
|
110
|
-
},
|
|
111
|
-
meta: { dedupe_on: 'represents,provider,provider_id' }
|
|
112
|
-
}
|
|
113
|
-
}
|
|
114
|
-
end
|
|
115
|
-
|
|
116
|
-
def clean_subject(subject)
|
|
117
|
-
subject&.encode("UTF-8", invalid: :replace, undef: :replace)
|
|
118
|
-
end
|
|
119
|
-
|
|
120
|
-
def clean_body message
|
|
121
|
-
# FIXME: this all needs to be refactored
|
|
122
|
-
|
|
123
|
-
if message.multipart?
|
|
124
|
-
body = message.text_part&.decoded rescue Mail::UnknownEncodingType
|
|
125
|
-
else
|
|
126
|
-
body = message.body&.decoded rescue Mail::UnknownEncodingType
|
|
127
|
-
end
|
|
128
|
-
|
|
129
|
-
if body && body != ""
|
|
130
|
-
begin
|
|
131
|
-
# regex in EmailReplyParser gem seems to get into infinite loops with
|
|
132
|
-
# certain long bodies that have binary data
|
|
133
|
-
parsed_body = Timeout::timeout(5) do
|
|
134
|
-
EmailReplyParser.parse_reply(body)
|
|
135
|
-
end
|
|
136
|
-
rescue Timeout::Error => e
|
|
137
|
-
return nil
|
|
138
|
-
rescue StandardError => e # Whackamole game with these parsing / encoding problems
|
|
139
|
-
return nil
|
|
140
|
-
end
|
|
141
|
-
|
|
142
|
-
# Force UTF-8 encoding
|
|
143
|
-
return parsed_body.encode("UTF-8", invalid: :replace, undef: :replace)
|
|
144
|
-
else
|
|
145
|
-
return nil
|
|
146
|
-
end
|
|
147
|
-
end
|
|
148
|
-
end
|
|
149
|
-
end
|
|
150
|
-
end
|