chronicle-email 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +42 -6
- data/lib/chronicle/email/email_transformer.rb +4 -4
- data/lib/chronicle/email/imap_extractor.rb +89 -0
- data/lib/chronicle/email/version.rb +1 -1
- data/lib/chronicle/email.rb +1 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e37ee5bdabd254e03a06962503f5875ab75d2aa0b271aff0b946c73f673d5cc3
|
4
|
+
data.tar.gz: c351c35f1b5545f0d0f9e6b4375076b159dab88eb454cf3e946fdab84ddc03a4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fedb95cf29c21d0dac17d6cbeca81da0a5318ab41ea8a5eaf355e8996606b03398ba70c4233e4591a78ceaef5ce5242b59c5e227e79a6310ba4474c245476e57
|
7
|
+
data.tar.gz: 7a06834ea16425d456da704b3272c96d6e0f427044e46e60a477dbce1cab7f9621056f4f18f7b90a964f83e3e85281a981fdf155155ebf4cb8b06871cd9884ab
|
data/README.md
CHANGED
@@ -9,16 +9,55 @@ Extract and work with your email using the command line with this plugin for [ch
|
|
9
9
|
# Install chronicle-etl and this plugin
|
10
10
|
$ gem install chronicle-etl
|
11
11
|
$ chronicle-etl plugins:install email
|
12
|
+
```
|
13
|
+
|
14
|
+
### Extracting email from IMAP
|
15
|
+
|
16
|
+
For Gmail accounts, you can create an [app password](https://myaccount.google.com/apppasswords); your email address is your username.
|
17
|
+
|
18
|
+
```sh
|
19
|
+
# Save username and password
|
20
|
+
$ chronicle-etl secrets:set imap username foo@gmail.com
|
21
|
+
$ chronicle-etl secrets:set imap password APPPASSWORD
|
22
|
+
|
23
|
+
# Then, retrieve your email from the last five days
|
24
|
+
$ chronicle-etl --extractor email:imap --transformer email --since 5d --loader json
|
25
|
+
|
26
|
+
# If you don't want to save your credentials as a secret, you can just pass
|
27
|
+
# them to the extractor directly
|
28
|
+
$ chronicle-etl --extractor email:imap --transformer email --since 5d --loader json \
|
29
|
+
--extractor-opts username:foo@gmail.com password:APPPASSWORD
|
30
|
+
```
|
12
31
|
|
13
|
-
|
14
|
-
|
32
|
+
### Processing email from an .mbox file
|
33
|
+
The MBOX format is used to archive an email mailbox. [Google Takeout](https://takeout.google.com/settings/takeout) exports emails from Gmail in this format.
|
34
|
+
|
35
|
+
```sh
|
36
|
+
# Retrieve the subject lines of all emails in inbox.mbox
|
37
|
+
$ chronicle-etl --extractor email:mbox --input inbox.mbox --transformer email --fields subject
|
15
38
|
```
|
16
39
|
|
17
40
|
## Available Connectors
|
18
41
|
### Extractors
|
19
42
|
|
43
|
+
#### `imap`
|
44
|
+
Extractor for importing recent emails from an IMAP server.
|
45
|
+
|
46
|
+
##### Settings
|
47
|
+
|
48
|
+
- `since`: Retrieve emails since this date
|
49
|
+
- `until`: Retrieve emails until this date
|
50
|
+
- `username`
|
51
|
+
- `password`
|
52
|
+
- `host`: (default: imap.gmail.com)
|
53
|
+
- `port`: (default: 993) Use 143 for unencrypted connections
|
54
|
+
- `mailbox`: (default: "[Gmail]/All Mail")
|
55
|
+
- `search_query`: When using Gmail, you can pass in a search query (`from:foo has:attachment`) to filter messages
|
56
|
+
|
57
|
+
For accessing Gmail, you can create a one-time [app password](https://myaccount.google.com/apppasswords). Your email address is your username.
|
58
|
+
|
20
59
|
#### `mbox`
|
21
|
-
Extractor for importing emails from an
|
60
|
+
Extractor for importing emails from an MBOX file
|
22
61
|
|
23
62
|
##### Settings
|
24
63
|
- `input`: A path to an .mbox file
|
@@ -31,6 +70,3 @@ Transform an email (in the form of a string) into Chronicle Schema
|
|
31
70
|
##### Settings
|
32
71
|
- `body_as_markdown`: (default: false) Whether to convert the email body into markdown
|
33
72
|
- `remove_signature`: (default: true) Whether to attempt to strip out the email signature (using the [`email_reply_parser`](https://github.com/github/email_reply_parser) gem)
|
34
|
-
|
35
|
-
## Roadmap
|
36
|
-
- Add an IMAP (and gmail) extractor #1
|
@@ -85,8 +85,7 @@ module Chronicle
|
|
85
85
|
end
|
86
86
|
|
87
87
|
def clean_body message
|
88
|
-
# FIXME: this all needs to be refactored
|
89
|
-
|
88
|
+
# FIXME: this all needs to be refactored
|
90
89
|
if message.multipart?
|
91
90
|
body = message.text_part&.decoded rescue Mail::UnknownEncodingType
|
92
91
|
else
|
@@ -94,6 +93,9 @@ module Chronicle
|
|
94
93
|
body = body_to_markdown if @config.body_as_markdown
|
95
94
|
end
|
96
95
|
|
96
|
+
return if body == Mail::UnknownEncodingType
|
97
|
+
return unless body && body != ""
|
98
|
+
|
97
99
|
body = body_without_signature(body) if @config.remove_signature
|
98
100
|
|
99
101
|
# Force UTF-8 encoding
|
@@ -107,8 +109,6 @@ module Chronicle
|
|
107
109
|
end
|
108
110
|
|
109
111
|
def body_without_signature(body)
|
110
|
-
return unless body && body != ""
|
111
|
-
|
112
112
|
# FIXME: regex in EmailReplyParse gem seems to get into infinite loops
|
113
113
|
# with certain long bodies that have binary data
|
114
114
|
parsed_body = Timeout::timeout(5) do
|
@@ -0,0 +1,89 @@
|
|
1
|
+
require 'net/imap'
|
2
|
+
|
3
|
+
module Chronicle
|
4
|
+
module Email
|
5
|
+
class IMAPExtractor < Chronicle::ETL::Extractor
|
6
|
+
# Registers this class as the `imap` extractor of the `email` provider.
register_connector do |connector|
  connector.provider = 'email'
  connector.description = 'imap server'
  connector.identifier = 'imap'
end
|
11
|
+
|
12
|
+
# Connection target; the defaults point at Gmail's IMAP endpoint.
setting :host, required: true, default: 'imap.gmail.com'
setting :port, type: :numeric, required: true, default: 993
setting :mailbox, required: true, default: '[Gmail]/All Mail'

# Credentials used to log in to the server.
setting :username, required: true
setting :password, required: true

# Optional extra query; only consulted when building the Gmail search.
setting :search_query
|
18
|
+
|
19
|
+
# Opens the IMAP connection, then caches the ids of the messages to extract.
# The connection has to exist first because the id lookup searches through it.
def prepare
  @connection = create_connection
  @message_ids = fetch_message_ids
end
|
23
|
+
|
24
|
+
# @return [Integer] how many message ids were collected by #prepare
def results_count
  @message_ids.length
end
|
27
|
+
|
28
|
+
# Fetches every collected message and yields one extraction per message.
#
# @yieldparam extraction [Chronicle::ETL::Extraction] wraps the content of the
#   message's "BODY[]" attribute under the :email key
def extract
  @message_ids.each do |id|
    raw_email = fetch_message(id).attr["BODY[]"]
    yield Chronicle::ETL::Extraction.new(data: { email: raw_email })
  end
end
|
34
|
+
|
35
|
+
private
|
36
|
+
|
37
|
+
# Connects to the configured IMAP server over TLS, logs in and selects the
# configured mailbox.
#
# NOTE(review): TLS is always requested, so a plaintext port such as 143
# presumably cannot work as-is — confirm the intended behaviour.
#
# @return [Net::IMAP] a logged-in connection with the mailbox selected
# @raise [Chronicle::ETL::ExtractionError] if the server answers NO to the
#   login or to the mailbox selection
def create_connection
  # Keyword options instead of the obsolete positional `usessl` argument.
  connection = Net::IMAP.new(@config.host, port: @config.port, ssl: true)
  connection.login(@config.username, @config.password)
  connection.select(@config.mailbox)
  connection
rescue Net::IMAP::NoResponseError
  # Don't leave the socket open when login/select is rejected.
  connection&.disconnect
  raise(Chronicle::ETL::ExtractionError, "Error connecting to IMAP server. Please check username and password")
end
|
45
|
+
|
46
|
+
# Searches the selected mailbox for the messages to extract, using the
# Gmail-specific search keys when talking to Gmail and the default date
# keys otherwise.
#
# @return [Array] the result of Net::IMAP#search for the chosen keys
# @raise [Chronicle::ETL::ExtractionError] if the server rejects the search
#   with a BAD response
def fetch_message_ids
  keys = gmail_mode? ? search_keys_gmail : search_keys_default
  @connection.search(keys)
rescue Net::IMAP::BadResponseError
  raise(Chronicle::ETL::ExtractionError, "Error searching IMAP server for messages")
end
|
52
|
+
|
53
|
+
# Fetches the full content ("BODY.PEEK[]") of a single message.
#
# @param message_id an id as returned by the mailbox search
# @return the first element of the fetch response
# @raise [Chronicle::ETL::ExtractionError] if the server returns no data
#   for the requested id
def fetch_message(message_id)
  # Fetch the requested message; a fixed id would return the same message
  # (or nothing) for every call.
  response = @connection.fetch(message_id, "BODY.PEEK[]")
  raise(Chronicle::ETL::ExtractionError, "Error loading message") unless response

  response[0]
end
|
59
|
+
|
60
|
+
# Builds the search keys for Gmail, which offers an extension to IMAP
# (X-GM-RAW) that lets us use Gmail queries.
#
# @return [Array<String>] `["X-GM-RAW", query]`
def search_keys_gmail
  # First, we ignore drafts because they break a lot of assumptions we
  # make when processing emails (lack of timestamps, ids, etc)
  terms = ["-label:draft"]

  # We use UNIX timestamps in gmail filters which let us do more precise
  # since/until compared with date-based imap filters
  terms << "after:#{@config.since.to_i}" if @config.since
  terms << "before:#{@config.until.to_i}" if @config.until
  terms << @config.search_query.to_s if @config.search_query

  ["X-GM-RAW", terms.join(" ")]
end
|
76
|
+
|
77
|
+
# Builds date-based search keys for servers without Gmail's extension.
#
# @return [Array<String>] SINCE/BEFORE criteria for the configured bounds,
#   or `["ALL"]` when neither bound is configured, so that the search
#   always receives at least one key
def search_keys_default
  keys = []
  # TODO: test out non-gmail IMAP searching (for @config.search_query)
  keys += ['SINCE', Net::IMAP.format_date(@config.since)] if @config.since
  keys += ['BEFORE', Net::IMAP.format_date(@config.until)] if @config.until

  # Return the keys explicitly: a trailing modifier-`if` evaluates to nil
  # when its condition is false, which would discard the SINCE key.
  keys.empty? ? ['ALL'] : keys
end
|
83
|
+
|
84
|
+
# @return [Boolean] whether the configured host is Gmail's IMAP server
def gmail_mode?
  host = @config.host
  host == 'imap.gmail.com'
end
|
87
|
+
end
|
88
|
+
end
|
89
|
+
end
|
data/lib/chronicle/email.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: chronicle-email
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Louis
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-03-
|
11
|
+
date: 2022-03-31 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: chronicle-etl
|
@@ -128,6 +128,7 @@ files:
|
|
128
128
|
- chronicle-email.gemspec
|
129
129
|
- lib/chronicle/email.rb
|
130
130
|
- lib/chronicle/email/email_transformer.rb
|
131
|
+
- lib/chronicle/email/imap_extractor.rb
|
131
132
|
- lib/chronicle/email/mbox_extractor.rb
|
132
133
|
- lib/chronicle/email/version.rb
|
133
134
|
homepage: https://github.com/chronicle-app/chronicle-email
|