chronicle-email 0.1.1 → 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +2 -0
- data/README.md +66 -10
- data/chronicle-email.gemspec +2 -2
- data/lib/chronicle/email/email_transformer.rb +122 -0
- data/lib/chronicle/email/imap_extractor.rb +89 -0
- data/lib/chronicle/email/mbox_extractor.rb +28 -8
- data/lib/chronicle/email/version.rb +1 -1
- data/lib/chronicle/email.rb +2 -2
- metadata +18 -19
- data/.ruby-version +0 -1
- data/Gemfile.lock +0 -85
- data/lib/chronicle/email/chronicle_transformer.rb +0 -150
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e37ee5bdabd254e03a06962503f5875ab75d2aa0b271aff0b946c73f673d5cc3
|
4
|
+
data.tar.gz: c351c35f1b5545f0d0f9e6b4375076b159dab88eb454cf3e946fdab84ddc03a4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fedb95cf29c21d0dac17d6cbeca81da0a5318ab41ea8a5eaf355e8996606b03398ba70c4233e4591a78ceaef5ce5242b59c5e227e79a6310ba4474c245476e57
|
7
|
+
data.tar.gz: 7a06834ea16425d456da704b3272c96d6e0f427044e46e60a477dbce1cab7f9621056f4f18f7b90a964f83e3e85281a981fdf155155ebf4cb8b06871cd9884ab
|
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -1,16 +1,72 @@
|
|
1
1
|
# Chronicle::Email
|
2
|
+
[![Gem Version](https://badge.fury.io/rb/chronicle-email.svg)](https://badge.fury.io/rb/chronicle-email)
|
2
3
|
|
3
|
-
|
4
|
-
|
5
|
-
## Available classes
|
6
|
-
- `email:mbox` - Extractor for processing .mbox files
|
7
|
-
- `email:chronicle` - Transformer that converts an email into a chronicle schema
|
4
|
+
Extract and work with your email using the command line with this plugin for [chronicle-etl](https://github.com/chronicle-app/chronicle-etl).
|
8
5
|
|
9
6
|
## Usage
|
10
7
|
|
11
|
-
```
|
12
|
-
|
13
|
-
gem install chronicle-
|
8
|
+
```sh
|
9
|
+
# Install chronicle-etl and this plugin
|
10
|
+
$ gem install chronicle-etl
|
11
|
+
$ chronicle-etl plugins:install email
|
12
|
+
```
|
13
|
+
|
14
|
+
### Extracting email from IMAP
|
15
|
+
|
16
|
+
For Gmail accounts, you can create an [app password](https://myaccount.google.com/apppasswords); your email address is your username.
|
17
|
+
|
18
|
+
```sh
|
19
|
+
# Save username and password
|
20
|
+
$ chronicle-etl secrets:set imap username foo@gmail.com
|
21
|
+
$ chronicle-etl secrets:set imap password APPPASSWORD
|
22
|
+
|
23
|
+
# Then, retrieve your email from the last five days
|
24
|
+
$ chronicle-etl --extractor email:imap --transformer email --since 5d --loader json
|
25
|
+
|
26
|
+
# If you don't want to save your credentials as a secret, you can just pass
|
27
|
+
# them to the extractor directly
|
28
|
+
$ chronicle-etl --extractor email:imap --transformer email --since 5d --loader json \
|
29
|
+
--extractor-opts username:foo@gmail.com password:APPPASSWORD
|
30
|
+
```
|
31
|
+
|
32
|
+
### Processing email from an .mbox file
|
33
|
+
The MBOX format is used to archive an email mailbox. [Google Takeout](https://takeout.google.com/settings/takeout) exports emails from Gmail in this format.
|
34
|
+
|
35
|
+
```sh
|
36
|
+
# Retrieve the subject lines of all emails in inbox.mbox
|
37
|
+
$ chronicle-etl --extractor email:mbox --input inbox.mbox --transformer email --fields subject
|
38
|
+
```
|
39
|
+
|
40
|
+
## Available Connectors
|
41
|
+
### Extractors
|
42
|
+
|
43
|
+
#### `imap`
|
44
|
+
Extractor for importing recent emails from an IMAP server.
|
45
|
+
|
46
|
+
##### Settings
|
47
|
+
|
48
|
+
- `since`: Retrieve emails since this date
|
49
|
+
- `until`: Retrieve emails until this date
|
50
|
+
- `username`
|
51
|
+
- `password`
|
52
|
+
- `host`: (default: imap.gmail.com)
|
53
|
+
- `port`: (default: 993) Use 143 for unencrypted connections
|
54
|
+
- `mailbox`: (default: "[Gmail]/All Mail")
|
55
|
+
- `search_query`: When using Gmail, you can pass in a search query (`from:foo has:attachment`) to filter messages
|
56
|
+
|
57
|
+
For accessing Gmail, you can create a one-time [app password](https://myaccount.google.com/apppasswords). Your email address is your username.
|
58
|
+
|
59
|
+
#### `mbox`
|
60
|
+
Extractor for importing emails from an MBOX file
|
61
|
+
|
62
|
+
##### Settings
|
63
|
+
- `input`: A path to an .mbox file
|
64
|
+
|
65
|
+
### Transformers
|
66
|
+
|
67
|
+
#### `email`
|
68
|
+
Transform an email (in the form of a string) into Chronicle Schema
|
14
69
|
|
15
|
-
|
16
|
-
|
70
|
+
##### Settings
|
71
|
+
- `body_as_markdown`: (default: false) Whether to convert the email body into markdown
|
72
|
+
- `remove_signature`: (default: true) Whether to attempt to strip out the email signature (using the [`email_reply_parser`](https://github.com/github/email_reply_parser) gem)
|
data/chronicle-email.gemspec
CHANGED
@@ -36,12 +36,12 @@ Gem::Specification.new do |spec|
|
|
36
36
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
37
37
|
spec.require_paths = ["lib"]
|
38
38
|
|
39
|
-
spec.add_dependency "chronicle-etl", "~> 0.
|
39
|
+
spec.add_dependency "chronicle-etl", "~> 0.5"
|
40
40
|
spec.add_dependency "mail", "~> 2.7"
|
41
41
|
spec.add_dependency 'email_reply_parser', '~> 0.5'
|
42
|
+
spec.add_dependency 'reverse_markdown', '~> 2.0'
|
42
43
|
|
43
44
|
spec.add_development_dependency "bundler", "~> 2.1"
|
44
45
|
spec.add_development_dependency "rake", "~> 13.0"
|
45
46
|
spec.add_development_dependency "rspec", "~> 3.9"
|
46
|
-
spec.add_development_dependency "pry-byebug", "~> 3.9"
|
47
47
|
end
|
@@ -0,0 +1,122 @@
|
|
1
|
+
require 'chronicle/etl'
|
2
|
+
require 'mail'
|
3
|
+
require 'timeout'
|
4
|
+
require 'email_reply_parser'
|
5
|
+
require 'reverse_markdown'
|
6
|
+
|
7
|
+
module Chronicle
|
8
|
+
module Email
|
9
|
+
class EmailTransformer < Chronicle::ETL::Transformer
|
10
|
+
register_connector do |r|
|
11
|
+
r.description = 'an email object'
|
12
|
+
r.provider = 'email'
|
13
|
+
r.identifier = 'email'
|
14
|
+
end
|
15
|
+
|
16
|
+
setting :body_as_markdown, default: false
|
17
|
+
setting :remove_signature, default: true
|
18
|
+
|
19
|
+
def transform
|
20
|
+
build_messaged
|
21
|
+
end
|
22
|
+
|
23
|
+
def id
|
24
|
+
message.message_id || raise(Chronicle::ETL::UntransformableRecordError, "Email doesn't have an ID")
|
25
|
+
end
|
26
|
+
|
27
|
+
def timestamp
|
28
|
+
message.date&.to_time || raise(Chronicle::ETL::UntransformableRecordError, "Email doesn't have a timestamp")
|
29
|
+
end
|
30
|
+
|
31
|
+
private
|
32
|
+
|
33
|
+
def message
|
34
|
+
@message ||= Mail.new(@extraction.data[:email])
|
35
|
+
end
|
36
|
+
|
37
|
+
def build_messaged
|
38
|
+
record = ::Chronicle::ETL::Models::Activity.new
|
39
|
+
record.verb = 'messaged'
|
40
|
+
record.provider = 'email'
|
41
|
+
record.provider_id = id
|
42
|
+
record.end_at = timestamp
|
43
|
+
|
44
|
+
record.dedupe_on << [:verb, :provider, :provider_id]
|
45
|
+
|
46
|
+
record.actor = build_actor
|
47
|
+
record.involved = build_message
|
48
|
+
record
|
49
|
+
end
|
50
|
+
|
51
|
+
def build_actor
|
52
|
+
# sometimes From: fields are malformed and we can't build an
|
53
|
+
# actor out of it.
|
54
|
+
raise(Chronicle::ETL::UntransformableRecordError, "Can't determine email sender") unless message[:from]&.addrs&.any?
|
55
|
+
|
56
|
+
record = ::Chronicle::ETL::Models::Entity.new
|
57
|
+
record.represents = 'identity'
|
58
|
+
record.provider = 'email'
|
59
|
+
record.slug = message[:from].addrs.first.address
|
60
|
+
record.title = message[:from].addrs.first.display_name
|
61
|
+
|
62
|
+
record.dedupe_on << [:represents, :provider, :slug]
|
63
|
+
|
64
|
+
record
|
65
|
+
end
|
66
|
+
|
67
|
+
def build_message
|
68
|
+
record = ::Chronicle::ETL::Models::Entity.new
|
69
|
+
record.represents = 'message'
|
70
|
+
record.title = clean_subject(message.subject)
|
71
|
+
record.body = clean_body(message)
|
72
|
+
record.provider = 'email'
|
73
|
+
record.provider_id = id
|
74
|
+
|
75
|
+
# TODO: handle consumer
|
76
|
+
# TODO: handle email references
|
77
|
+
# TODO: handle email account owner
|
78
|
+
# TODO: handle attachments
|
79
|
+
|
80
|
+
record
|
81
|
+
end
|
82
|
+
|
83
|
+
def clean_subject(subject)
|
84
|
+
subject&.encode("UTF-8", invalid: :replace, undef: :replace)
|
85
|
+
end
|
86
|
+
|
87
|
+
def clean_body message
|
88
|
+
# FIXME: this all needs to be refactored
|
89
|
+
if message.multipart?
|
90
|
+
body = message.text_part&.decoded rescue Mail::UnknownEncodingType
|
91
|
+
else
|
92
|
+
body = message.body&.decoded rescue Mail::UnknownEncodingType
|
93
|
+
body = body_to_markdown if @config.body_as_markdown
|
94
|
+
end
|
95
|
+
|
96
|
+
return if body == Mail::UnknownEncodingType
|
97
|
+
return unless body && body != ""
|
98
|
+
|
99
|
+
body = body_without_signature(body) if @config.remove_signature
|
100
|
+
|
101
|
+
# Force UTF-8 encoding
|
102
|
+
body.encode("UTF-8", invalid: :replace, undef: :replace)
|
103
|
+
end
|
104
|
+
|
105
|
+
def body_to_markdown(body)
|
106
|
+
ReverseMarkdown.convert(body)
|
107
|
+
rescue StandardError
|
108
|
+
# Fall back to unparsed body? Raise Untransformable error?
|
109
|
+
end
|
110
|
+
|
111
|
+
def body_without_signature(body)
|
112
|
+
# FIXME: regex in EmailReplyParser gem seems to get into infinite loops
|
113
|
+
# with certain long bodies that have binary data
|
114
|
+
parsed_body = Timeout::timeout(5) do
|
115
|
+
EmailReplyParser.parse_reply(body)
|
116
|
+
end
|
117
|
+
rescue Timeout::Error, StandardError => e
|
118
|
+
return body
|
119
|
+
end
|
120
|
+
end
|
121
|
+
end
|
122
|
+
end
|
@@ -0,0 +1,89 @@
|
|
1
|
+
require 'net/imap'
|
2
|
+
|
3
|
+
module Chronicle
|
4
|
+
module Email
|
5
|
+
class IMAPExtractor < Chronicle::ETL::Extractor
|
6
|
+
register_connector do |r|
|
7
|
+
r.provider = 'email'
|
8
|
+
r.description = 'imap server'
|
9
|
+
r.identifier = 'imap'
|
10
|
+
end
|
11
|
+
|
12
|
+
setting :host, required: true, default: 'imap.gmail.com'
|
13
|
+
setting :port, type: :numeric, required: true, default: 993
|
14
|
+
setting :mailbox, required: true, default: '[Gmail]/All Mail'
|
15
|
+
setting :username, required: true
|
16
|
+
setting :password, required: true
|
17
|
+
setting :search_query
|
18
|
+
|
19
|
+
def prepare
|
20
|
+
@connection = create_connection
|
21
|
+
@message_ids = fetch_message_ids
|
22
|
+
end
|
23
|
+
|
24
|
+
def results_count
|
25
|
+
@message_ids.count
|
26
|
+
end
|
27
|
+
|
28
|
+
def extract
|
29
|
+
@message_ids.each do |message_id|
|
30
|
+
message = fetch_message(message_id)
|
31
|
+
yield Chronicle::ETL::Extraction.new(data: { email: message.attr["BODY[]"]} )
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
private
|
36
|
+
|
37
|
+
def create_connection
|
38
|
+
connection = Net::IMAP.new(@config.host, @config.port, true)
|
39
|
+
connection.login(@config.username, @config.password)
|
40
|
+
connection.select(@config.mailbox)
|
41
|
+
connection
|
42
|
+
rescue Net::IMAP::NoResponseError => e
|
43
|
+
raise(Chronicle::ETL::ExtractionError, "Error connecting to IMAP server. Please check username and password")
|
44
|
+
end
|
45
|
+
|
46
|
+
def fetch_message_ids
|
47
|
+
keys = gmail_mode? ? search_keys_gmail : search_keys_default
|
48
|
+
@connection.search(keys)
|
49
|
+
rescue Net::IMAP::BadResponseError => e
|
50
|
+
raise(Chronicle::ETL::ExtractionError, "Error searching IMAP server for messages")
|
51
|
+
end
|
52
|
+
|
53
|
+
def fetch_message(message_id)
|
54
|
+
response = @connection.fetch(3100020, "BODY.PEEK[]")
|
55
|
+
raise(Chronicle::ETL::ExtractionError, "Error loading message") unless response
|
56
|
+
|
57
|
+
return response[0]
|
58
|
+
end
|
59
|
+
|
60
|
+
def search_keys_gmail
|
61
|
+
# Gmail offers an extension to IMAP that lets us use gmail queries
|
62
|
+
q = ""
|
63
|
+
|
64
|
+
# First, we ignore drafts because they break a lot of assumptions we
|
65
|
+
# make when processing emails (lack of timestamps, ids, etc)
|
66
|
+
q = "-label:draft"
|
67
|
+
|
68
|
+
# We use UNIX timestamps in gmail filters which let us do more precise
|
69
|
+
# since/until compared with date-based imap filters
|
70
|
+
q += " after:#{@config.since.to_i}" if @config.since
|
71
|
+
q += " before:#{@config.until.to_i}" if @config.until
|
72
|
+
q += " #{@config.search_query}" if @config.search_query
|
73
|
+
|
74
|
+
["X-GM-RAW", q]
|
75
|
+
end
|
76
|
+
|
77
|
+
def search_keys_default
|
78
|
+
keys = []
|
79
|
+
# TODO: test out non-gmail IMAP searching (for @config.search_query)
|
80
|
+
keys += ['SINCE', Net::IMAP.format_date(@config.since)] if @config.since
|
81
|
+
keys += ['BEFORE', Net::IMAP.format_date(@config.until)] if @config.until
|
82
|
+
end
|
83
|
+
|
84
|
+
def gmail_mode?
|
85
|
+
@config.host == 'imap.gmail.com'
|
86
|
+
end
|
87
|
+
end
|
88
|
+
end
|
89
|
+
end
|
@@ -5,35 +5,55 @@ require 'tempfile'
|
|
5
5
|
module Chronicle
|
6
6
|
module Email
|
7
7
|
class MboxExtractor < Chronicle::ETL::Extractor
|
8
|
+
register_connector do |r|
|
9
|
+
r.provider = 'email'
|
10
|
+
r.description = 'an .mbox file'
|
11
|
+
r.identifier = 'mbox'
|
12
|
+
end
|
13
|
+
|
14
|
+
setting :input, required: true
|
15
|
+
|
8
16
|
# mbox format is a bunch of emails concatenated together, separated
|
9
17
|
# by a line that starts with "From "
|
10
18
|
NEW_EMAIL_REGEX = Regexp.new('^From [^\s]+ .{24}')
|
11
19
|
|
12
20
|
def results_count
|
13
|
-
|
14
|
-
|
15
|
-
file.each do |line|
|
16
|
-
count += 1 if line =~ NEW_EMAIL_REGEX
|
21
|
+
File.foreach(@filename).sum do |line|
|
22
|
+
line.scan(NEW_EMAIL_REGEX).count
|
17
23
|
end
|
18
|
-
|
24
|
+
end
|
25
|
+
|
26
|
+
def prepare
|
27
|
+
@filename = @config.input.first
|
19
28
|
end
|
20
29
|
|
21
30
|
def extract
|
22
|
-
file = File.open(@
|
23
|
-
tmp = Tempfile.new('
|
31
|
+
file = File.open(@filename)
|
32
|
+
tmp = Tempfile.new('chronicle-mbox')
|
24
33
|
|
34
|
+
# Read the .mbox file line by line and look for a header that indicates
|
35
|
+
# the start of a new email. As we read line by line, we save to a tmp
|
36
|
+
# file and then read it back when we notice the next header.
|
37
|
+
# Doing it this way is a lot faster than saving each line to a
|
38
|
+
# variable, especially when we're reading emails with large binary
|
39
|
+
# attachments.
|
40
|
+
#
|
41
|
+
# TODO: make this thread-safe (one tmp file per email?)
|
25
42
|
file.each do |line|
|
26
43
|
if line =~ NEW_EMAIL_REGEX
|
27
44
|
if File.size(tmp) > 0
|
28
45
|
tmp.rewind
|
29
46
|
email = tmp.read
|
30
|
-
yield email
|
47
|
+
yield Chronicle::ETL::Extraction.new(data: { email: email} )
|
31
48
|
tmp.truncate(0)
|
32
49
|
tmp.rewind
|
33
50
|
end
|
34
51
|
end
|
35
52
|
tmp.write(line)
|
36
53
|
end
|
54
|
+
ensure
|
55
|
+
tmp.close
|
56
|
+
tmp.unlink
|
37
57
|
file.close
|
38
58
|
end
|
39
59
|
end
|
data/lib/chronicle/email.rb
CHANGED
@@ -1,9 +1,9 @@
|
|
1
1
|
require "chronicle/email/version"
|
2
|
-
require "chronicle/email/
|
2
|
+
require "chronicle/email/email_transformer"
|
3
3
|
require "chronicle/email/mbox_extractor"
|
4
|
+
require "chronicle/email/imap_extractor"
|
4
5
|
|
5
6
|
module Chronicle
|
6
7
|
module Email
|
7
|
-
PROVIDER_NAME = "email"
|
8
8
|
end
|
9
9
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: chronicle-email
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Louis
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-03-31 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: chronicle-etl
|
@@ -16,14 +16,14 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: 0.
|
19
|
+
version: '0.5'
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: 0.
|
26
|
+
version: '0.5'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: mail
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
@@ -53,49 +53,49 @@ dependencies:
|
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '0.5'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
|
-
name:
|
56
|
+
name: reverse_markdown
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
59
|
- - "~>"
|
60
60
|
- !ruby/object:Gem::Version
|
61
|
-
version: '2.
|
62
|
-
type: :
|
61
|
+
version: '2.0'
|
62
|
+
type: :runtime
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
|
-
version: '2.
|
68
|
+
version: '2.0'
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
|
-
name:
|
70
|
+
name: bundler
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
72
72
|
requirements:
|
73
73
|
- - "~>"
|
74
74
|
- !ruby/object:Gem::Version
|
75
|
-
version: '
|
75
|
+
version: '2.1'
|
76
76
|
type: :development
|
77
77
|
prerelease: false
|
78
78
|
version_requirements: !ruby/object:Gem::Requirement
|
79
79
|
requirements:
|
80
80
|
- - "~>"
|
81
81
|
- !ruby/object:Gem::Version
|
82
|
-
version: '
|
82
|
+
version: '2.1'
|
83
83
|
- !ruby/object:Gem::Dependency
|
84
|
-
name:
|
84
|
+
name: rake
|
85
85
|
requirement: !ruby/object:Gem::Requirement
|
86
86
|
requirements:
|
87
87
|
- - "~>"
|
88
88
|
- !ruby/object:Gem::Version
|
89
|
-
version: '
|
89
|
+
version: '13.0'
|
90
90
|
type: :development
|
91
91
|
prerelease: false
|
92
92
|
version_requirements: !ruby/object:Gem::Requirement
|
93
93
|
requirements:
|
94
94
|
- - "~>"
|
95
95
|
- !ruby/object:Gem::Version
|
96
|
-
version: '
|
96
|
+
version: '13.0'
|
97
97
|
- !ruby/object:Gem::Dependency
|
98
|
-
name:
|
98
|
+
name: rspec
|
99
99
|
requirement: !ruby/object:Gem::Requirement
|
100
100
|
requirements:
|
101
101
|
- - "~>"
|
@@ -117,11 +117,9 @@ extra_rdoc_files: []
|
|
117
117
|
files:
|
118
118
|
- ".gitignore"
|
119
119
|
- ".rspec"
|
120
|
-
- ".ruby-version"
|
121
120
|
- ".travis.yml"
|
122
121
|
- CODE_OF_CONDUCT.md
|
123
122
|
- Gemfile
|
124
|
-
- Gemfile.lock
|
125
123
|
- LICENSE.txt
|
126
124
|
- README.md
|
127
125
|
- Rakefile
|
@@ -129,7 +127,8 @@ files:
|
|
129
127
|
- bin/setup
|
130
128
|
- chronicle-email.gemspec
|
131
129
|
- lib/chronicle/email.rb
|
132
|
-
- lib/chronicle/email/
|
130
|
+
- lib/chronicle/email/email_transformer.rb
|
131
|
+
- lib/chronicle/email/imap_extractor.rb
|
133
132
|
- lib/chronicle/email/mbox_extractor.rb
|
134
133
|
- lib/chronicle/email/version.rb
|
135
134
|
homepage: https://github.com/chronicle-app/chronicle-email
|
@@ -154,7 +153,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
154
153
|
- !ruby/object:Gem::Version
|
155
154
|
version: '0'
|
156
155
|
requirements: []
|
157
|
-
rubygems_version: 3.
|
156
|
+
rubygems_version: 3.3.3
|
158
157
|
signing_key:
|
159
158
|
specification_version: 4
|
160
159
|
summary: Email importer for Chronicle
|
data/.ruby-version
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
2.7.1
|
data/Gemfile.lock
DELETED
@@ -1,85 +0,0 @@
|
|
1
|
-
PATH
|
2
|
-
remote: .
|
3
|
-
specs:
|
4
|
-
chronicle-email (0.1.1)
|
5
|
-
chronicle-etl (~> 0.2.2)
|
6
|
-
email_reply_parser (~> 0.5)
|
7
|
-
mail (~> 2.7)
|
8
|
-
|
9
|
-
GEM
|
10
|
-
remote: https://rubygems.org/
|
11
|
-
specs:
|
12
|
-
byebug (11.1.3)
|
13
|
-
chronicle-etl (0.2.2)
|
14
|
-
colorize (~> 0.8.1)
|
15
|
-
thor (~> 0.20)
|
16
|
-
tty-progressbar (~> 0.17)
|
17
|
-
tty-table (~> 0.11)
|
18
|
-
coderay (1.1.3)
|
19
|
-
colorize (0.8.1)
|
20
|
-
diff-lcs (1.4.4)
|
21
|
-
email_reply_parser (0.5.10)
|
22
|
-
equatable (0.6.1)
|
23
|
-
mail (2.7.1)
|
24
|
-
mini_mime (>= 0.1.1)
|
25
|
-
method_source (1.0.0)
|
26
|
-
mini_mime (1.0.2)
|
27
|
-
necromancer (0.6.0)
|
28
|
-
pastel (0.7.4)
|
29
|
-
equatable (~> 0.6)
|
30
|
-
tty-color (~> 0.5)
|
31
|
-
pry (0.13.1)
|
32
|
-
coderay (~> 1.1)
|
33
|
-
method_source (~> 1.0)
|
34
|
-
pry-byebug (3.9.0)
|
35
|
-
byebug (~> 11.0)
|
36
|
-
pry (~> 0.13.0)
|
37
|
-
rake (13.0.1)
|
38
|
-
rspec (3.9.0)
|
39
|
-
rspec-core (~> 3.9.0)
|
40
|
-
rspec-expectations (~> 3.9.0)
|
41
|
-
rspec-mocks (~> 3.9.0)
|
42
|
-
rspec-core (3.9.2)
|
43
|
-
rspec-support (~> 3.9.3)
|
44
|
-
rspec-expectations (3.9.2)
|
45
|
-
diff-lcs (>= 1.2.0, < 2.0)
|
46
|
-
rspec-support (~> 3.9.0)
|
47
|
-
rspec-mocks (3.9.1)
|
48
|
-
diff-lcs (>= 1.2.0, < 2.0)
|
49
|
-
rspec-support (~> 3.9.0)
|
50
|
-
rspec-support (3.9.3)
|
51
|
-
strings (0.1.8)
|
52
|
-
strings-ansi (~> 0.1)
|
53
|
-
unicode-display_width (~> 1.5)
|
54
|
-
unicode_utils (~> 1.4)
|
55
|
-
strings-ansi (0.1.0)
|
56
|
-
thor (0.20.3)
|
57
|
-
tty-color (0.5.2)
|
58
|
-
tty-cursor (0.7.1)
|
59
|
-
tty-progressbar (0.17.0)
|
60
|
-
strings-ansi (~> 0.1.0)
|
61
|
-
tty-cursor (~> 0.7)
|
62
|
-
tty-screen (~> 0.7)
|
63
|
-
unicode-display_width (~> 1.6)
|
64
|
-
tty-screen (0.8.1)
|
65
|
-
tty-table (0.11.0)
|
66
|
-
equatable (~> 0.6)
|
67
|
-
necromancer (~> 0.5)
|
68
|
-
pastel (~> 0.7.2)
|
69
|
-
strings (~> 0.1.5)
|
70
|
-
tty-screen (~> 0.7)
|
71
|
-
unicode-display_width (1.7.0)
|
72
|
-
unicode_utils (1.4.0)
|
73
|
-
|
74
|
-
PLATFORMS
|
75
|
-
ruby
|
76
|
-
|
77
|
-
DEPENDENCIES
|
78
|
-
bundler (~> 2.1)
|
79
|
-
chronicle-email!
|
80
|
-
pry-byebug (~> 3.9)
|
81
|
-
rake (~> 13.0)
|
82
|
-
rspec (~> 3.9)
|
83
|
-
|
84
|
-
BUNDLED WITH
|
85
|
-
2.1.4
|
@@ -1,150 +0,0 @@
|
|
1
|
-
require 'chronicle/etl'
|
2
|
-
require 'mail'
|
3
|
-
require 'timeout'
|
4
|
-
require 'email_reply_parser'
|
5
|
-
|
6
|
-
module Chronicle
|
7
|
-
module Email
|
8
|
-
class ChronicleTransformer < Chronicle::ETL::Transformer
|
9
|
-
def transform
|
10
|
-
message = Mail.new(@data.b)
|
11
|
-
build_messaged(message)
|
12
|
-
end
|
13
|
-
|
14
|
-
def build_messaged message
|
15
|
-
{
|
16
|
-
type: 'activities',
|
17
|
-
attributes: {
|
18
|
-
verb: 'messaged',
|
19
|
-
end_at: message.date,
|
20
|
-
provider: 'email',
|
21
|
-
provider_id: message.message_id,
|
22
|
-
},
|
23
|
-
meta: { dedupe_on: 'verb,provider,provider_id'},
|
24
|
-
relationships: {
|
25
|
-
actor: { data: build_actor(message) },
|
26
|
-
involved: { data: build_message(message) }
|
27
|
-
}
|
28
|
-
}
|
29
|
-
end
|
30
|
-
|
31
|
-
def build_actor message
|
32
|
-
# sometimes From: fields are malformed and we can't build an
|
33
|
-
# actor out of it.
|
34
|
-
return unless message[:from] && message[:from].addrs && message[:from].addrs.any?
|
35
|
-
|
36
|
-
{
|
37
|
-
type: 'entities',
|
38
|
-
attributes: {
|
39
|
-
represents: 'identity',
|
40
|
-
provider: 'email',
|
41
|
-
slug: message[:from].addrs.first.address,
|
42
|
-
title: message[:from].addrs.first.display_name
|
43
|
-
},
|
44
|
-
meta: { dedupe_on: 'represents,provider,slug'}
|
45
|
-
}
|
46
|
-
end
|
47
|
-
|
48
|
-
def build_message message
|
49
|
-
{
|
50
|
-
type: 'entities',
|
51
|
-
attributes: {
|
52
|
-
represents: 'message',
|
53
|
-
title: clean_subject(message.subject),
|
54
|
-
body: clean_body(message),
|
55
|
-
provider: 'email',
|
56
|
-
provider_id: message.message_id
|
57
|
-
},
|
58
|
-
meta: { dedupe_on: 'represents,provider,provider_id'},
|
59
|
-
relationships: {
|
60
|
-
consumers: { data: build_consumers(message) },
|
61
|
-
antecedents: { data: build_references(message) },
|
62
|
-
owners: { data: build_account(message) },
|
63
|
-
# contains: { data: build_attachments(message) }
|
64
|
-
}
|
65
|
-
}
|
66
|
-
end
|
67
|
-
|
68
|
-
def build_account message
|
69
|
-
return unless account_email = [message.header['delivered-to']].flatten[0]&.value
|
70
|
-
|
71
|
-
{
|
72
|
-
type: 'entities',
|
73
|
-
attributes: {
|
74
|
-
represents: 'identity',
|
75
|
-
provider: 'email',
|
76
|
-
slug: account_email
|
77
|
-
},
|
78
|
-
meta: { dedupe_on: 'provider,slug,represents' }
|
79
|
-
}
|
80
|
-
end
|
81
|
-
|
82
|
-
def build_consumers(message)
|
83
|
-
to = []
|
84
|
-
to += message[:to].addrs if message[:to]
|
85
|
-
to += message[:cc].addrs.flatten.compact if message[:cc]
|
86
|
-
|
87
|
-
to.collect do |consumer|
|
88
|
-
{
|
89
|
-
type: 'entities',
|
90
|
-
attributes: {
|
91
|
-
represents: 'identity',
|
92
|
-
provider: 'email',
|
93
|
-
slug: consumer.address,
|
94
|
-
title: consumer.display_name
|
95
|
-
},
|
96
|
-
meta: { dedupe_on: 'provider,slug' }
|
97
|
-
}
|
98
|
-
end
|
99
|
-
end
|
100
|
-
|
101
|
-
def build_references(message)
|
102
|
-
references = [message.references].flatten.compact
|
103
|
-
references.collect{|reference|
|
104
|
-
{
|
105
|
-
type: 'entities',
|
106
|
-
attributes: {
|
107
|
-
represents: 'message',
|
108
|
-
provider: 'email',
|
109
|
-
provider_id: reference
|
110
|
-
},
|
111
|
-
meta: { dedupe_on: 'represents,provider,provider_id' }
|
112
|
-
}
|
113
|
-
}
|
114
|
-
end
|
115
|
-
|
116
|
-
def clean_subject(subject)
|
117
|
-
subject&.encode("UTF-8", invalid: :replace, undef: :replace)
|
118
|
-
end
|
119
|
-
|
120
|
-
def clean_body message
|
121
|
-
# FIXME: this all needs to be refactored
|
122
|
-
|
123
|
-
if message.multipart?
|
124
|
-
body = message.text_part&.decoded rescue Mail::UnknownEncodingType
|
125
|
-
else
|
126
|
-
body = message.body&.decoded rescue Mail::UnknownEncodingType
|
127
|
-
end
|
128
|
-
|
129
|
-
if body && body != ""
|
130
|
-
begin
|
131
|
-
# regex in EmailReplyParse gem seems to get into infinite loops with
|
132
|
-
# certain long bodies that have binary data
|
133
|
-
parsed_body = Timeout::timeout(5) do
|
134
|
-
EmailReplyParser.parse_reply(body)
|
135
|
-
end
|
136
|
-
rescue Timeout::Error => e
|
137
|
-
return nil
|
138
|
-
rescue StandardError => e # Whackamole game with these parsing / encoding problems
|
139
|
-
return nil
|
140
|
-
end
|
141
|
-
|
142
|
-
# Force UTF-8 encoding
|
143
|
-
return parsed_body.encode("UTF-8", invalid: :replace, undef: :replace)
|
144
|
-
else
|
145
|
-
return nil
|
146
|
-
end
|
147
|
-
end
|
148
|
-
end
|
149
|
-
end
|
150
|
-
end
|