chronicle-email 0.1.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +2 -0
- data/README.md +66 -10
- data/chronicle-email.gemspec +2 -2
- data/lib/chronicle/email/email_transformer.rb +122 -0
- data/lib/chronicle/email/imap_extractor.rb +89 -0
- data/lib/chronicle/email/mbox_extractor.rb +28 -8
- data/lib/chronicle/email/version.rb +1 -1
- data/lib/chronicle/email.rb +2 -2
- metadata +18 -19
- data/.ruby-version +0 -1
- data/Gemfile.lock +0 -85
- data/lib/chronicle/email/chronicle_transformer.rb +0 -150
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e37ee5bdabd254e03a06962503f5875ab75d2aa0b271aff0b946c73f673d5cc3
|
4
|
+
data.tar.gz: c351c35f1b5545f0d0f9e6b4375076b159dab88eb454cf3e946fdab84ddc03a4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fedb95cf29c21d0dac17d6cbeca81da0a5318ab41ea8a5eaf355e8996606b03398ba70c4233e4591a78ceaef5ce5242b59c5e227e79a6310ba4474c245476e57
|
7
|
+
data.tar.gz: 7a06834ea16425d456da704b3272c96d6e0f427044e46e60a477dbce1cab7f9621056f4f18f7b90a964f83e3e85281a981fdf155155ebf4cb8b06871cd9884ab
|
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -1,16 +1,72 @@
|
|
1
1
|
# Chronicle::Email
|
2
|
+
[Gem Version](https://badge.fury.io/rb/chronicle-email)
|
2
3
|
|
3
|
-
|
4
|
-
|
5
|
-
## Available classes
|
6
|
-
- `email:mbox` - Extractor for processing .mbox files
|
7
|
-
- `email:chronicle` - Transformer that converts an email into a chronicle schema
|
4
|
+
Extract and work with your email using the command line with this plugin for [chronicle-etl](https://github.com/chronicle-app/chronicle-etl).
|
8
5
|
|
9
6
|
## Usage
|
10
7
|
|
11
|
-
```
|
12
|
-
|
13
|
-
gem install chronicle-
|
8
|
+
```sh
|
9
|
+
# Install chronicle-etl and this plugin
|
10
|
+
$ gem install chronicle-etl
|
11
|
+
$ chronicle-etl plugins:install email
|
12
|
+
```
|
13
|
+
|
14
|
+
### Extracting email from IMAP
|
15
|
+
|
16
|
+
For Gmail accounts, you can create an [app password](https://myaccount.google.com/apppasswords); your email address is your username.
|
17
|
+
|
18
|
+
```sh
|
19
|
+
# Save username and password
|
20
|
+
$ chronicle-etl secrets:set imap username foo@gmail.com
|
21
|
+
$ chronicle-etl secrets:set imap password APPPASSWORD
|
22
|
+
|
23
|
+
# Then, retrieve your email from the last five days
|
24
|
+
$ chronicle-etl --extractor email:imap --transformer email --since 5d --loader json
|
25
|
+
|
26
|
+
# If you don't want to save your credentials as a secret, you can just pass
|
27
|
+
# them to the extractor directly
|
28
|
+
$ chronicle-etl --extractor email:imap --transformer email --since 5d --loader json \
|
29
|
+
--extractor-opts username:foo@gmail.com password:APPPASSWORD
|
30
|
+
```
|
31
|
+
|
32
|
+
### Processing email from an .mbox file
|
33
|
+
The MBOX format is used to archive an email mailbox. [Google Takeout](https://takeout.google.com/settings/takeout) exports emails from Gmail in this format.
|
34
|
+
|
35
|
+
```sh
|
36
|
+
# Retrieve the subject lines of all emails in inbox.mbox
|
37
|
+
$ chronicle-etl --extractor email:mbox --input inbox.mbox --transformer email --fields subject
|
38
|
+
```
|
39
|
+
|
40
|
+
## Available Connectors
|
41
|
+
### Extractors
|
42
|
+
|
43
|
+
#### `imap`
|
44
|
+
Extractor for importing recent emails from an IMAP server.
|
45
|
+
|
46
|
+
##### Settings
|
47
|
+
|
48
|
+
- `since`: Retrieve emails since this date
|
49
|
+
- `until`: Retrieve emails until this date
|
50
|
+
- `username`
|
51
|
+
- `password`
|
52
|
+
- `host`: (default: imap.gmail.com)
|
53
|
+
- `port`: (default: 993) Use 143 for unencrypted connections
|
54
|
+
- `mailbox`: (default: "[Gmail]/All Mail")
|
55
|
+
- `search_query`: When using Gmail, you can pass in a search query (`from:foo has:attachment`) to filter messages by
|
56
|
+
|
57
|
+
For accessing Gmail, you can create a one-time [app password](https://myaccount.google.com/apppasswords). Your email address is your username.
|
58
|
+
|
59
|
+
#### `mbox`
|
60
|
+
Extractor for importing emails from an MBOX file
|
61
|
+
|
62
|
+
##### Settings
|
63
|
+
- `input`: A path to an .mbox file
|
64
|
+
|
65
|
+
### Transformers
|
66
|
+
|
67
|
+
#### `email`
|
68
|
+
Transform an email (in the form of a string) into Chronicle Schema
|
14
69
|
|
15
|
-
|
16
|
-
|
70
|
+
##### Settings
|
71
|
+
- `body_as_markdown`: (default: false) Whether to convert the email body into markdown
|
72
|
+
- `remove_signature`: (default: true) Whether to attempt to strip out the email signature (using the [`email_reply_parser`](https://github.com/github/email_reply_parser) gem)
|
data/chronicle-email.gemspec
CHANGED
@@ -36,12 +36,12 @@ Gem::Specification.new do |spec|
|
|
36
36
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
37
37
|
spec.require_paths = ["lib"]
|
38
38
|
|
39
|
-
spec.add_dependency "chronicle-etl", "~> 0.
|
39
|
+
spec.add_dependency "chronicle-etl", "~> 0.5"
|
40
40
|
spec.add_dependency "mail", "~> 2.7"
|
41
41
|
spec.add_dependency 'email_reply_parser', '~> 0.5'
|
42
|
+
spec.add_dependency 'reverse_markdown', '~> 2.0'
|
42
43
|
|
43
44
|
spec.add_development_dependency "bundler", "~> 2.1"
|
44
45
|
spec.add_development_dependency "rake", "~> 13.0"
|
45
46
|
spec.add_development_dependency "rspec", "~> 3.9"
|
46
|
-
spec.add_development_dependency "pry-byebug", "~> 3.9"
|
47
47
|
end
|
@@ -0,0 +1,122 @@
|
|
1
|
+
require 'chronicle/etl'
|
2
|
+
require 'mail'
|
3
|
+
require 'timeout'
|
4
|
+
require 'email_reply_parser'
|
5
|
+
require 'reverse_markdown'
|
6
|
+
|
7
|
+
module Chronicle
|
8
|
+
module Email
|
9
|
+
class EmailTransformer < Chronicle::ETL::Transformer
|
10
|
+
register_connector do |r|
|
11
|
+
r.description = 'an email object'
|
12
|
+
r.provider = 'email'
|
13
|
+
r.identifier = 'email'
|
14
|
+
end
|
15
|
+
|
16
|
+
setting :body_as_markdown, default: false
|
17
|
+
setting :remove_signature, default: true
|
18
|
+
|
19
|
+
def transform
|
20
|
+
build_messaged
|
21
|
+
end
|
22
|
+
|
23
|
+
def id
|
24
|
+
message.message_id || raise(Chronicle::ETL::UntransformableRecordError, "Email doesn't have an ID")
|
25
|
+
end
|
26
|
+
|
27
|
+
def timestamp
|
28
|
+
message.date&.to_time || raise(Chronicle::ETL::UntransformableRecordError, "Email doesn't have a timestamp")
|
29
|
+
end
|
30
|
+
|
31
|
+
private
|
32
|
+
|
33
|
+
def message
|
34
|
+
@message ||= Mail.new(@extraction.data[:email])
|
35
|
+
end
|
36
|
+
|
37
|
+
def build_messaged
|
38
|
+
record = ::Chronicle::ETL::Models::Activity.new
|
39
|
+
record.verb = 'messaged'
|
40
|
+
record.provider = 'email'
|
41
|
+
record.provider_id = id
|
42
|
+
record.end_at = timestamp
|
43
|
+
|
44
|
+
record.dedupe_on << [:verb, :provider, :provider_id]
|
45
|
+
|
46
|
+
record.actor = build_actor
|
47
|
+
record.involved = build_message
|
48
|
+
record
|
49
|
+
end
|
50
|
+
|
51
|
+
def build_actor
|
52
|
+
# sometimes From: fields are malformed and we can't build an
|
53
|
+
# actor out of it.
|
54
|
+
raise(Chronicle::ETL::UntransformableRecordError, "Can't determine email sender") unless message[:from]&.addrs&.any?
|
55
|
+
|
56
|
+
record = ::Chronicle::ETL::Models::Entity.new
|
57
|
+
record.represents = 'identity'
|
58
|
+
record.provider = 'email'
|
59
|
+
record.slug = message[:from].addrs.first.address
|
60
|
+
record.title = message[:from].addrs.first.display_name
|
61
|
+
|
62
|
+
record.dedupe_on << [:represents, :provider, :slug]
|
63
|
+
|
64
|
+
record
|
65
|
+
end
|
66
|
+
|
67
|
+
def build_message
|
68
|
+
record = ::Chronicle::ETL::Models::Entity.new
|
69
|
+
record.represents = 'message'
|
70
|
+
record.title = clean_subject(message.subject)
|
71
|
+
record.body = clean_body(message)
|
72
|
+
record.provider = 'email'
|
73
|
+
record.provider_id = id
|
74
|
+
|
75
|
+
# TODO: handle consumer
|
76
|
+
# TODO: handle email references
|
77
|
+
# TODO: handle email account owner
|
78
|
+
# TODO: handle attachments
|
79
|
+
|
80
|
+
record
|
81
|
+
end
|
82
|
+
|
83
|
+
def clean_subject(subject)
|
84
|
+
subject&.encode("UTF-8", invalid: :replace, undef: :replace)
|
85
|
+
end
|
86
|
+
|
87
|
+
def clean_body message
|
88
|
+
# FIXME: this all needs to be refactored
|
89
|
+
if message.multipart?
|
90
|
+
body = message.text_part&.decoded rescue Mail::UnknownEncodingType
|
91
|
+
else
|
92
|
+
body = message.body&.decoded rescue Mail::UnknownEncodingType
|
93
|
+
body = body_to_markdown if @config.body_as_markdown
|
94
|
+
end
|
95
|
+
|
96
|
+
return if body == Mail::UnknownEncodingType
|
97
|
+
return unless body && body != ""
|
98
|
+
|
99
|
+
body = body_without_signature(body) if @config.remove_signature
|
100
|
+
|
101
|
+
# Force UTF-8 encoding
|
102
|
+
body.encode("UTF-8", invalid: :replace, undef: :replace)
|
103
|
+
end
|
104
|
+
|
105
|
+
def body_to_markdown(body)
|
106
|
+
ReverseMarkdown.convert(body)
|
107
|
+
rescue StandardError
|
108
|
+
# Fall back to unparsed body? Raise Untransformable error?
|
109
|
+
end
|
110
|
+
|
111
|
+
def body_without_signature(body)
|
112
|
+
# FIXME: regex in EmailReplyParser gem seems to get into infinite loops
|
113
|
+
# with certain long bodies that have binary data
|
114
|
+
parsed_body = Timeout::timeout(5) do
|
115
|
+
EmailReplyParser.parse_reply(body)
|
116
|
+
end
|
117
|
+
rescue Timeout::Error, StandardError => e
|
118
|
+
return body
|
119
|
+
end
|
120
|
+
end
|
121
|
+
end
|
122
|
+
end
|
@@ -0,0 +1,89 @@
|
|
1
|
+
require 'net/imap'
|
2
|
+
|
3
|
+
module Chronicle
|
4
|
+
module Email
|
5
|
+
class IMAPExtractor < Chronicle::ETL::Extractor
|
6
|
+
register_connector do |r|
|
7
|
+
r.provider = 'email'
|
8
|
+
r.description = 'imap server'
|
9
|
+
r.identifier = 'imap'
|
10
|
+
end
|
11
|
+
|
12
|
+
setting :host, required: true, default: 'imap.gmail.com'
|
13
|
+
setting :port, type: :numeric, required: true, default: 993
|
14
|
+
setting :mailbox, required: true, default: '[Gmail]/All Mail'
|
15
|
+
setting :username, required: true
|
16
|
+
setting :password, required: true
|
17
|
+
setting :search_query
|
18
|
+
|
19
|
+
def prepare
|
20
|
+
@connection = create_connection
|
21
|
+
@message_ids = fetch_message_ids
|
22
|
+
end
|
23
|
+
|
24
|
+
def results_count
|
25
|
+
@message_ids.count
|
26
|
+
end
|
27
|
+
|
28
|
+
def extract
|
29
|
+
@message_ids.each do |message_id|
|
30
|
+
message = fetch_message(message_id)
|
31
|
+
yield Chronicle::ETL::Extraction.new(data: { email: message.attr["BODY[]"]} )
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
private
|
36
|
+
|
37
|
+
def create_connection
|
38
|
+
connection = Net::IMAP.new(@config.host, @config.port, true)
|
39
|
+
connection.login(@config.username, @config.password)
|
40
|
+
connection.select(@config.mailbox)
|
41
|
+
connection
|
42
|
+
rescue Net::IMAP::NoResponseError => e
|
43
|
+
raise(Chronicle::ETL::ExtractionError, "Error connecting to IMAP server. Please check username and password")
|
44
|
+
end
|
45
|
+
|
46
|
+
def fetch_message_ids
|
47
|
+
keys = gmail_mode? ? search_keys_gmail : search_keys_default
|
48
|
+
@connection.search(keys)
|
49
|
+
rescue Net::IMAP::BadResponseError => e
|
50
|
+
raise(Chronicle::ETL::ExtractionError, "Error searching IMAP server for messages")
|
51
|
+
end
|
52
|
+
|
53
|
+
def fetch_message(message_id)
|
54
|
+
response = @connection.fetch(3100020, "BODY.PEEK[]")
|
55
|
+
raise(Chronicle::ETL::ExtractionError, "Error loading message") unless response
|
56
|
+
|
57
|
+
return response[0]
|
58
|
+
end
|
59
|
+
|
60
|
+
def search_keys_gmail
|
61
|
+
# Gmail offers an extension to IMAP that lets us use gmail queries
|
62
|
+
q = ""
|
63
|
+
|
64
|
+
# First, we ignore drafts because they break a lot of assumptions we
|
65
|
+
# make when processing emails (lack of timestamps, ids, etc.)
|
66
|
+
q = "-label:draft"
|
67
|
+
|
68
|
+
# We use UNIX timestamps in gmail filters which let us do more precise
|
69
|
+
# since/until compared with date-based imap filters
|
70
|
+
q += " after:#{@config.since.to_i}" if @config.since
|
71
|
+
q += " before:#{@config.until.to_i}" if @config.until
|
72
|
+
q += " #{@config.search_query}" if @config.search_query
|
73
|
+
|
74
|
+
["X-GM-RAW", q]
|
75
|
+
end
|
76
|
+
|
77
|
+
def search_keys_default
|
78
|
+
keys = []
|
79
|
+
# TODO: test out non-gmail IMAP searching (for @config.search_query)
|
80
|
+
keys += ['SINCE', Net::IMAP.format_date(@config.since)] if @config.since
|
81
|
+
keys += ['BEFORE', Net::IMAP.format_date(@config.until)] if @config.until
|
82
|
+
end
|
83
|
+
|
84
|
+
def gmail_mode?
|
85
|
+
@config.host == 'imap.gmail.com'
|
86
|
+
end
|
87
|
+
end
|
88
|
+
end
|
89
|
+
end
|
@@ -5,35 +5,55 @@ require 'tempfile'
|
|
5
5
|
module Chronicle
|
6
6
|
module Email
|
7
7
|
class MboxExtractor < Chronicle::ETL::Extractor
|
8
|
+
register_connector do |r|
|
9
|
+
r.provider = 'email'
|
10
|
+
r.description = 'an .mbox file'
|
11
|
+
r.identifier = 'mbox'
|
12
|
+
end
|
13
|
+
|
14
|
+
setting :input, required: true
|
15
|
+
|
8
16
|
# mbox format is a bunch of emails concatenated together, separated
|
9
17
|
# by a line that starts with "From "
|
10
18
|
NEW_EMAIL_REGEX = Regexp.new('^From [^\s]+ .{24}')
|
11
19
|
|
12
20
|
def results_count
|
13
|
-
|
14
|
-
|
15
|
-
file.each do |line|
|
16
|
-
count += 1 if line =~ NEW_EMAIL_REGEX
|
21
|
+
File.foreach(@filename).sum do |line|
|
22
|
+
line.scan(NEW_EMAIL_REGEX).count
|
17
23
|
end
|
18
|
-
|
24
|
+
end
|
25
|
+
|
26
|
+
def prepare
|
27
|
+
@filename = @config.input.first
|
19
28
|
end
|
20
29
|
|
21
30
|
def extract
|
22
|
-
file = File.open(@
|
23
|
-
tmp = Tempfile.new('
|
31
|
+
file = File.open(@filename)
|
32
|
+
tmp = Tempfile.new('chronicle-mbox')
|
24
33
|
|
34
|
+
# Read the .mbox file line by line and look for a header that indicates
|
35
|
+
# the start of a new email. As we read line by line, we save to a tmp
|
36
|
+
# file and then read it back when we notice the next header.
|
37
|
+
# Doing it this way is a lot faster than saving each line to a
|
38
|
+
# variable, especially when we're reading emails with large binary
|
39
|
+
# attachments.
|
40
|
+
#
|
41
|
+
# TODO: make this thread-safe (one tmp file per email?)
|
25
42
|
file.each do |line|
|
26
43
|
if line =~ NEW_EMAIL_REGEX
|
27
44
|
if File.size(tmp) > 0
|
28
45
|
tmp.rewind
|
29
46
|
email = tmp.read
|
30
|
-
yield email
|
47
|
+
yield Chronicle::ETL::Extraction.new(data: { email: email} )
|
31
48
|
tmp.truncate(0)
|
32
49
|
tmp.rewind
|
33
50
|
end
|
34
51
|
end
|
35
52
|
tmp.write(line)
|
36
53
|
end
|
54
|
+
ensure
|
55
|
+
tmp.close
|
56
|
+
tmp.unlink
|
37
57
|
file.close
|
38
58
|
end
|
39
59
|
end
|
data/lib/chronicle/email.rb
CHANGED
@@ -1,9 +1,9 @@
|
|
1
1
|
require "chronicle/email/version"
|
2
|
-
require "chronicle/email/
|
2
|
+
require "chronicle/email/email_transformer"
|
3
3
|
require "chronicle/email/mbox_extractor"
|
4
|
+
require "chronicle/email/imap_extractor"
|
4
5
|
|
5
6
|
module Chronicle
|
6
7
|
module Email
|
7
|
-
PROVIDER_NAME = "email"
|
8
8
|
end
|
9
9
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: chronicle-email
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Louis
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-03-31 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: chronicle-etl
|
@@ -16,14 +16,14 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: 0.
|
19
|
+
version: '0.5'
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: 0.
|
26
|
+
version: '0.5'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: mail
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
@@ -53,49 +53,49 @@ dependencies:
|
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '0.5'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
|
-
name:
|
56
|
+
name: reverse_markdown
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
59
|
- - "~>"
|
60
60
|
- !ruby/object:Gem::Version
|
61
|
-
version: '2.
|
62
|
-
type: :
|
61
|
+
version: '2.0'
|
62
|
+
type: :runtime
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
|
-
version: '2.
|
68
|
+
version: '2.0'
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
|
-
name:
|
70
|
+
name: bundler
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
72
72
|
requirements:
|
73
73
|
- - "~>"
|
74
74
|
- !ruby/object:Gem::Version
|
75
|
-
version: '
|
75
|
+
version: '2.1'
|
76
76
|
type: :development
|
77
77
|
prerelease: false
|
78
78
|
version_requirements: !ruby/object:Gem::Requirement
|
79
79
|
requirements:
|
80
80
|
- - "~>"
|
81
81
|
- !ruby/object:Gem::Version
|
82
|
-
version: '
|
82
|
+
version: '2.1'
|
83
83
|
- !ruby/object:Gem::Dependency
|
84
|
-
name:
|
84
|
+
name: rake
|
85
85
|
requirement: !ruby/object:Gem::Requirement
|
86
86
|
requirements:
|
87
87
|
- - "~>"
|
88
88
|
- !ruby/object:Gem::Version
|
89
|
-
version: '
|
89
|
+
version: '13.0'
|
90
90
|
type: :development
|
91
91
|
prerelease: false
|
92
92
|
version_requirements: !ruby/object:Gem::Requirement
|
93
93
|
requirements:
|
94
94
|
- - "~>"
|
95
95
|
- !ruby/object:Gem::Version
|
96
|
-
version: '
|
96
|
+
version: '13.0'
|
97
97
|
- !ruby/object:Gem::Dependency
|
98
|
-
name:
|
98
|
+
name: rspec
|
99
99
|
requirement: !ruby/object:Gem::Requirement
|
100
100
|
requirements:
|
101
101
|
- - "~>"
|
@@ -117,11 +117,9 @@ extra_rdoc_files: []
|
|
117
117
|
files:
|
118
118
|
- ".gitignore"
|
119
119
|
- ".rspec"
|
120
|
-
- ".ruby-version"
|
121
120
|
- ".travis.yml"
|
122
121
|
- CODE_OF_CONDUCT.md
|
123
122
|
- Gemfile
|
124
|
-
- Gemfile.lock
|
125
123
|
- LICENSE.txt
|
126
124
|
- README.md
|
127
125
|
- Rakefile
|
@@ -129,7 +127,8 @@ files:
|
|
129
127
|
- bin/setup
|
130
128
|
- chronicle-email.gemspec
|
131
129
|
- lib/chronicle/email.rb
|
132
|
-
- lib/chronicle/email/
|
130
|
+
- lib/chronicle/email/email_transformer.rb
|
131
|
+
- lib/chronicle/email/imap_extractor.rb
|
133
132
|
- lib/chronicle/email/mbox_extractor.rb
|
134
133
|
- lib/chronicle/email/version.rb
|
135
134
|
homepage: https://github.com/chronicle-app/chronicle-email
|
@@ -154,7 +153,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
154
153
|
- !ruby/object:Gem::Version
|
155
154
|
version: '0'
|
156
155
|
requirements: []
|
157
|
-
rubygems_version: 3.
|
156
|
+
rubygems_version: 3.3.3
|
158
157
|
signing_key:
|
159
158
|
specification_version: 4
|
160
159
|
summary: Email importer for Chronicle
|
data/.ruby-version
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
2.7.1
|
data/Gemfile.lock
DELETED
@@ -1,85 +0,0 @@
|
|
1
|
-
PATH
|
2
|
-
remote: .
|
3
|
-
specs:
|
4
|
-
chronicle-email (0.1.1)
|
5
|
-
chronicle-etl (~> 0.2.2)
|
6
|
-
email_reply_parser (~> 0.5)
|
7
|
-
mail (~> 2.7)
|
8
|
-
|
9
|
-
GEM
|
10
|
-
remote: https://rubygems.org/
|
11
|
-
specs:
|
12
|
-
byebug (11.1.3)
|
13
|
-
chronicle-etl (0.2.2)
|
14
|
-
colorize (~> 0.8.1)
|
15
|
-
thor (~> 0.20)
|
16
|
-
tty-progressbar (~> 0.17)
|
17
|
-
tty-table (~> 0.11)
|
18
|
-
coderay (1.1.3)
|
19
|
-
colorize (0.8.1)
|
20
|
-
diff-lcs (1.4.4)
|
21
|
-
email_reply_parser (0.5.10)
|
22
|
-
equatable (0.6.1)
|
23
|
-
mail (2.7.1)
|
24
|
-
mini_mime (>= 0.1.1)
|
25
|
-
method_source (1.0.0)
|
26
|
-
mini_mime (1.0.2)
|
27
|
-
necromancer (0.6.0)
|
28
|
-
pastel (0.7.4)
|
29
|
-
equatable (~> 0.6)
|
30
|
-
tty-color (~> 0.5)
|
31
|
-
pry (0.13.1)
|
32
|
-
coderay (~> 1.1)
|
33
|
-
method_source (~> 1.0)
|
34
|
-
pry-byebug (3.9.0)
|
35
|
-
byebug (~> 11.0)
|
36
|
-
pry (~> 0.13.0)
|
37
|
-
rake (13.0.1)
|
38
|
-
rspec (3.9.0)
|
39
|
-
rspec-core (~> 3.9.0)
|
40
|
-
rspec-expectations (~> 3.9.0)
|
41
|
-
rspec-mocks (~> 3.9.0)
|
42
|
-
rspec-core (3.9.2)
|
43
|
-
rspec-support (~> 3.9.3)
|
44
|
-
rspec-expectations (3.9.2)
|
45
|
-
diff-lcs (>= 1.2.0, < 2.0)
|
46
|
-
rspec-support (~> 3.9.0)
|
47
|
-
rspec-mocks (3.9.1)
|
48
|
-
diff-lcs (>= 1.2.0, < 2.0)
|
49
|
-
rspec-support (~> 3.9.0)
|
50
|
-
rspec-support (3.9.3)
|
51
|
-
strings (0.1.8)
|
52
|
-
strings-ansi (~> 0.1)
|
53
|
-
unicode-display_width (~> 1.5)
|
54
|
-
unicode_utils (~> 1.4)
|
55
|
-
strings-ansi (0.1.0)
|
56
|
-
thor (0.20.3)
|
57
|
-
tty-color (0.5.2)
|
58
|
-
tty-cursor (0.7.1)
|
59
|
-
tty-progressbar (0.17.0)
|
60
|
-
strings-ansi (~> 0.1.0)
|
61
|
-
tty-cursor (~> 0.7)
|
62
|
-
tty-screen (~> 0.7)
|
63
|
-
unicode-display_width (~> 1.6)
|
64
|
-
tty-screen (0.8.1)
|
65
|
-
tty-table (0.11.0)
|
66
|
-
equatable (~> 0.6)
|
67
|
-
necromancer (~> 0.5)
|
68
|
-
pastel (~> 0.7.2)
|
69
|
-
strings (~> 0.1.5)
|
70
|
-
tty-screen (~> 0.7)
|
71
|
-
unicode-display_width (1.7.0)
|
72
|
-
unicode_utils (1.4.0)
|
73
|
-
|
74
|
-
PLATFORMS
|
75
|
-
ruby
|
76
|
-
|
77
|
-
DEPENDENCIES
|
78
|
-
bundler (~> 2.1)
|
79
|
-
chronicle-email!
|
80
|
-
pry-byebug (~> 3.9)
|
81
|
-
rake (~> 13.0)
|
82
|
-
rspec (~> 3.9)
|
83
|
-
|
84
|
-
BUNDLED WITH
|
85
|
-
2.1.4
|
@@ -1,150 +0,0 @@
|
|
1
|
-
require 'chronicle/etl'
|
2
|
-
require 'mail'
|
3
|
-
require 'timeout'
|
4
|
-
require 'email_reply_parser'
|
5
|
-
|
6
|
-
module Chronicle
|
7
|
-
module Email
|
8
|
-
class ChronicleTransformer < Chronicle::ETL::Transformer
|
9
|
-
def transform
|
10
|
-
message = Mail.new(@data.b)
|
11
|
-
build_messaged(message)
|
12
|
-
end
|
13
|
-
|
14
|
-
def build_messaged message
|
15
|
-
{
|
16
|
-
type: 'activities',
|
17
|
-
attributes: {
|
18
|
-
verb: 'messaged',
|
19
|
-
end_at: message.date,
|
20
|
-
provider: 'email',
|
21
|
-
provider_id: message.message_id,
|
22
|
-
},
|
23
|
-
meta: { dedupe_on: 'verb,provider,provider_id'},
|
24
|
-
relationships: {
|
25
|
-
actor: { data: build_actor(message) },
|
26
|
-
involved: { data: build_message(message) }
|
27
|
-
}
|
28
|
-
}
|
29
|
-
end
|
30
|
-
|
31
|
-
def build_actor message
|
32
|
-
# sometimes From: fields are malformed and we can't build an
|
33
|
-
# actor out of it.
|
34
|
-
return unless message[:from] && message[:from].addrs && message[:from].addrs.any?
|
35
|
-
|
36
|
-
{
|
37
|
-
type: 'entities',
|
38
|
-
attributes: {
|
39
|
-
represents: 'identity',
|
40
|
-
provider: 'email',
|
41
|
-
slug: message[:from].addrs.first.address,
|
42
|
-
title: message[:from].addrs.first.display_name
|
43
|
-
},
|
44
|
-
meta: { dedupe_on: 'represents,provider,slug'}
|
45
|
-
}
|
46
|
-
end
|
47
|
-
|
48
|
-
def build_message message
|
49
|
-
{
|
50
|
-
type: 'entities',
|
51
|
-
attributes: {
|
52
|
-
represents: 'message',
|
53
|
-
title: clean_subject(message.subject),
|
54
|
-
body: clean_body(message),
|
55
|
-
provider: 'email',
|
56
|
-
provider_id: message.message_id
|
57
|
-
},
|
58
|
-
meta: { dedupe_on: 'represents,provider,provider_id'},
|
59
|
-
relationships: {
|
60
|
-
consumers: { data: build_consumers(message) },
|
61
|
-
antecedents: { data: build_references(message) },
|
62
|
-
owners: { data: build_account(message) },
|
63
|
-
# contains: { data: build_attachments(message) }
|
64
|
-
}
|
65
|
-
}
|
66
|
-
end
|
67
|
-
|
68
|
-
def build_account message
|
69
|
-
return unless account_email = [message.header['delivered-to']].flatten[0]&.value
|
70
|
-
|
71
|
-
{
|
72
|
-
type: 'entities',
|
73
|
-
attributes: {
|
74
|
-
represents: 'identity',
|
75
|
-
provider: 'email',
|
76
|
-
slug: account_email
|
77
|
-
},
|
78
|
-
meta: { dedupe_on: 'provider,slug,represents' }
|
79
|
-
}
|
80
|
-
end
|
81
|
-
|
82
|
-
def build_consumers(message)
|
83
|
-
to = []
|
84
|
-
to += message[:to].addrs if message[:to]
|
85
|
-
to += message[:cc].addrs.flatten.compact if message[:cc]
|
86
|
-
|
87
|
-
to.collect do |consumer|
|
88
|
-
{
|
89
|
-
type: 'entities',
|
90
|
-
attributes: {
|
91
|
-
represents: 'identity',
|
92
|
-
provider: 'email',
|
93
|
-
slug: consumer.address,
|
94
|
-
title: consumer.display_name
|
95
|
-
},
|
96
|
-
meta: { dedupe_on: 'provider,slug' }
|
97
|
-
}
|
98
|
-
end
|
99
|
-
end
|
100
|
-
|
101
|
-
def build_references(message)
|
102
|
-
references = [message.references].flatten.compact
|
103
|
-
references.collect{|reference|
|
104
|
-
{
|
105
|
-
type: 'entities',
|
106
|
-
attributes: {
|
107
|
-
represents: 'message',
|
108
|
-
provider: 'email',
|
109
|
-
provider_id: reference
|
110
|
-
},
|
111
|
-
meta: { dedupe_on: 'represents,provider,provider_id' }
|
112
|
-
}
|
113
|
-
}
|
114
|
-
end
|
115
|
-
|
116
|
-
def clean_subject(subject)
|
117
|
-
subject&.encode("UTF-8", invalid: :replace, undef: :replace)
|
118
|
-
end
|
119
|
-
|
120
|
-
def clean_body message
|
121
|
-
# FIXME: this all needs to be refactored
|
122
|
-
|
123
|
-
if message.multipart?
|
124
|
-
body = message.text_part&.decoded rescue Mail::UnknownEncodingType
|
125
|
-
else
|
126
|
-
body = message.body&.decoded rescue Mail::UnknownEncodingType
|
127
|
-
end
|
128
|
-
|
129
|
-
if body && body != ""
|
130
|
-
begin
|
131
|
-
# regex in EmailReplyParse gem seems to get into infinite loops with
|
132
|
-
# certain long bodies that have binary data
|
133
|
-
parsed_body = Timeout::timeout(5) do
|
134
|
-
EmailReplyParser.parse_reply(body)
|
135
|
-
end
|
136
|
-
rescue Timeout::Error => e
|
137
|
-
return nil
|
138
|
-
rescue StandardError => e # Whackamole game with these parsing / encoding problems
|
139
|
-
return nil
|
140
|
-
end
|
141
|
-
|
142
|
-
# Force UTF-8 encoding
|
143
|
-
return parsed_body.encode("UTF-8", invalid: :replace, undef: :replace)
|
144
|
-
else
|
145
|
-
return nil
|
146
|
-
end
|
147
|
-
end
|
148
|
-
end
|
149
|
-
end
|
150
|
-
end
|