chronicle-email 0.2.1 → 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +42 -6
- data/lib/chronicle/email/email_transformer.rb +4 -4
- data/lib/chronicle/email/imap_extractor.rb +89 -0
- data/lib/chronicle/email/version.rb +1 -1
- data/lib/chronicle/email.rb +1 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e37ee5bdabd254e03a06962503f5875ab75d2aa0b271aff0b946c73f673d5cc3
|
4
|
+
data.tar.gz: c351c35f1b5545f0d0f9e6b4375076b159dab88eb454cf3e946fdab84ddc03a4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fedb95cf29c21d0dac17d6cbeca81da0a5318ab41ea8a5eaf355e8996606b03398ba70c4233e4591a78ceaef5ce5242b59c5e227e79a6310ba4474c245476e57
|
7
|
+
data.tar.gz: 7a06834ea16425d456da704b3272c96d6e0f427044e46e60a477dbce1cab7f9621056f4f18f7b90a964f83e3e85281a981fdf155155ebf4cb8b06871cd9884ab
|
data/README.md
CHANGED
@@ -9,16 +9,55 @@ Extract and work with your email using the command line with this plugin for [ch
|
|
9
9
|
# Install chronicle-etl and this plugin
|
10
10
|
$ gem install chronicle-etl
|
11
11
|
$ chronicle-etl plugins:install email
|
12
|
+
```
|
13
|
+
|
14
|
+
### Extracting email from IMAP
|
15
|
+
|
16
|
+
For Gmail accounts, you can create an [app password](https://myaccount.google.com/apppasswords); your email address is your username.
|
17
|
+
|
18
|
+
```sh
|
19
|
+
# Save username and password
|
20
|
+
$ chronicle-etl secrets:set imap username foo@gmail.com
|
21
|
+
$ chronicle-etl secrets:set imap password APPPASSWORD
|
22
|
+
|
23
|
+
# Then, retrieve your email from the last five days
|
24
|
+
$ chronicle-etl --extractor email:imap --transformer email --since 5d --loader json
|
25
|
+
|
26
|
+
# If you don't want to save your credentials as a secret, you can just pass
|
27
|
+
# them to the extractor directly
|
28
|
+
$ chronicle-etl --extractor email:imap --transformer email --since 5d --loader json \
|
29
|
+
--extractor-opts username:foo@gmail.com password:APPPASSWORD
|
30
|
+
```
|
12
31
|
|
13
|
-
|
14
|
-
|
32
|
+
### Processing email from an .mbox file
|
33
|
+
The MBOX format is used to archive an email mailbox. [Google Takeout](https://takeout.google.com/settings/takeout) exports emails from Gmail in this format.
|
34
|
+
|
35
|
+
```sh
|
36
|
+
# Retrieve the subject lines of all emails in inbox.mbox
|
37
|
+
$ chronicle-etl --extractor email:mbox --input inbox.mbox --transformer email --fields subject
|
15
38
|
```
|
16
39
|
|
17
40
|
## Available Connectors
|
18
41
|
### Extractors
|
19
42
|
|
43
|
+
#### `imap`
|
44
|
+
Extractor for importing recent emails from an IMAP server.
|
45
|
+
|
46
|
+
##### Settings
|
47
|
+
|
48
|
+
- `since`: Retrieve emails since this date
|
49
|
+
- `until`: Retrieve emails until this date
|
50
|
+
- `username`
|
51
|
+
- `password`
|
52
|
+
- `host`: (default: imap.gmail.com)
|
53
|
+
- `port`: (default: 993) Use 143 for unencrypted connections
|
54
|
+
- `mailbox`: (default: "[Gmail]/All Mail")
|
55
|
+
- `search_query`: When using Gmail, you can pass in a search query (`from:foo has:attachment`) to filter messages
|
56
|
+
|
57
|
+
For accessing Gmail, you can create a one-time [app password](https://myaccount.google.com/apppasswords). Your email address is your username.
|
58
|
+
|
20
59
|
#### `mbox`
|
21
|
-
Extractor for importing emails from an
|
60
|
+
Extractor for importing emails from an MBOX file
|
22
61
|
|
23
62
|
##### Settings
|
24
63
|
- `input`: A path to an .mbox file
|
@@ -31,6 +70,3 @@ Transform an email (in the form of a string) into Chronicle Schema
|
|
31
70
|
##### Settings
|
32
71
|
- `body_as_markdown`: (default: false) Whether to convert the email body into markdown
|
33
72
|
- `remove_signature`: (default: true) Whether to attempt to strip out the email signature (using the [`email_reply_parser`](https://github.com/github/email_reply_parser) gem)
|
34
|
-
|
35
|
-
## Roadmap
|
36
|
-
- Add an IMAP (and gmail) extractor #1
|
@@ -85,8 +85,7 @@ module Chronicle
|
|
85
85
|
end
|
86
86
|
|
87
87
|
def clean_body message
|
88
|
-
# FIXME: this all needs to be refactored
|
89
|
-
|
88
|
+
# FIXME: this all needs to be refactored
|
90
89
|
if message.multipart?
|
91
90
|
body = message.text_part&.decoded rescue Mail::UnknownEncodingType
|
92
91
|
else
|
@@ -94,6 +93,9 @@ module Chronicle
|
|
94
93
|
body = body_to_markdown if @config.body_as_markdown
|
95
94
|
end
|
96
95
|
|
96
|
+
return if body == Mail::UnknownEncodingType
|
97
|
+
return unless body && body != ""
|
98
|
+
|
97
99
|
body = body_without_signature(body) if @config.remove_signature
|
98
100
|
|
99
101
|
# Force UTF-8 encoding
|
@@ -107,8 +109,6 @@ module Chronicle
|
|
107
109
|
end
|
108
110
|
|
109
111
|
def body_without_signature(body)
|
110
|
-
return unless body && body != ""
|
111
|
-
|
112
112
|
# FIXME: regex in EmailReplyParse gem seems to get into infinite loops
|
113
113
|
# with certain long bodies that have binary data
|
114
114
|
parsed_body = Timeout::timeout(5) do
|
@@ -0,0 +1,89 @@
|
|
1
|
+
require 'net/imap'
|
2
|
+
|
3
|
+
module Chronicle
|
4
|
+
module Email
|
5
|
+
class IMAPExtractor < Chronicle::ETL::Extractor
|
6
|
+
register_connector do |r|
|
7
|
+
r.provider = 'email'
|
8
|
+
r.description = 'imap server'
|
9
|
+
r.identifier = 'imap'
|
10
|
+
end
|
11
|
+
|
12
|
+
setting :host, required: true, default: 'imap.gmail.com'
|
13
|
+
setting :port, type: :numeric, required: true, default: 993
|
14
|
+
setting :mailbox, required: true, default: '[Gmail]/All Mail'
|
15
|
+
setting :username, required: true
|
16
|
+
setting :password, required: true
|
17
|
+
setting :search_query
|
18
|
+
|
19
|
+
def prepare
|
20
|
+
@connection = create_connection
|
21
|
+
@message_ids = fetch_message_ids
|
22
|
+
end
|
23
|
+
|
24
|
+
def results_count
|
25
|
+
@message_ids.count
|
26
|
+
end
|
27
|
+
|
28
|
+
def extract
|
29
|
+
@message_ids.each do |message_id|
|
30
|
+
message = fetch_message(message_id)
|
31
|
+
yield Chronicle::ETL::Extraction.new(data: { email: message.attr["BODY[]"]} )
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
private
|
36
|
+
|
37
|
+
def create_connection
|
38
|
+
connection = Net::IMAP.new(@config.host, @config.port, true)
|
39
|
+
connection.login(@config.username, @config.password)
|
40
|
+
connection.select(@config.mailbox)
|
41
|
+
connection
|
42
|
+
rescue Net::IMAP::NoResponseError => e
|
43
|
+
raise(Chronicle::ETL::ExtractionError, "Error connecting to IMAP server. Please check username and password")
|
44
|
+
end
|
45
|
+
|
46
|
+
def fetch_message_ids
|
47
|
+
keys = gmail_mode? ? search_keys_gmail : search_keys_default
|
48
|
+
@connection.search(keys)
|
49
|
+
rescue Net::IMAP::BadResponseError => e
|
50
|
+
raise(Chronicle::ETL::ExtractionError, "Error searching IMAP server for messages")
|
51
|
+
end
|
52
|
+
|
53
|
+
def fetch_message(message_id)
|
54
|
+
response = @connection.fetch(message_id, "BODY.PEEK[]")
|
55
|
+
raise(Chronicle::ETL::ExtractionError, "Error loading message") unless response
|
56
|
+
|
57
|
+
return response[0]
|
58
|
+
end
|
59
|
+
|
60
|
+
def search_keys_gmail
|
61
|
+
# Gmail offers an extension to IMAP that lets us use gmail queries
|
62
|
+
q = ""
|
63
|
+
|
64
|
+
# First, we ignore drafts because they break a lot of assumptions we
|
65
|
+
# make when processing emails (lack of timestamps, ids, etc)
|
66
|
+
q = "-label:draft"
|
67
|
+
|
68
|
+
# We use UNIX timestamps in gmail filters which let us do more precise
|
69
|
+
# since/until compared with date-based imap filters
|
70
|
+
q += " after:#{@config.since.to_i}" if @config.since
|
71
|
+
q += " before:#{@config.until.to_i}" if @config.until
|
72
|
+
q += " #{@config.search_query}" if @config.search_query
|
73
|
+
|
74
|
+
["X-GM-RAW", q]
|
75
|
+
end
|
76
|
+
|
77
|
+
def search_keys_default
|
78
|
+
keys = []
|
79
|
+
# TODO: test out non-gmail IMAP searching (for @config.search_query)
|
80
|
+
keys += ['SINCE', Net::IMAP.format_date(@config.since)] if @config.since
|
81
|
+
keys += ['BEFORE', Net::IMAP.format_date(@config.until)] if @config.until
|
82
|
+
end
|
83
|
+
|
84
|
+
def gmail_mode?
|
85
|
+
@config.host == 'imap.gmail.com'
|
86
|
+
end
|
87
|
+
end
|
88
|
+
end
|
89
|
+
end
|
data/lib/chronicle/email.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: chronicle-email
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.1
|
4
|
+
version: 0.2.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Louis
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-03-
|
11
|
+
date: 2022-03-31 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: chronicle-etl
|
@@ -128,6 +128,7 @@ files:
|
|
128
128
|
- chronicle-email.gemspec
|
129
129
|
- lib/chronicle/email.rb
|
130
130
|
- lib/chronicle/email/email_transformer.rb
|
131
|
+
- lib/chronicle/email/imap_extractor.rb
|
131
132
|
- lib/chronicle/email/mbox_extractor.rb
|
132
133
|
- lib/chronicle/email/version.rb
|
133
134
|
homepage: https://github.com/chronicle-app/chronicle-email
|