chronicle-email 0.2.0 → 0.2.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +42 -6
- data/chronicle-email.gemspec +6 -2
- data/lib/chronicle/email/email_transformer.rb +4 -4
- data/lib/chronicle/email/imap_extractor.rb +91 -0
- data/lib/chronicle/email/version.rb +1 -1
- data/lib/chronicle/email.rb +1 -0
- metadata +41 -12
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a910606f4987580ac86be55c02a6e3003b7cab21e93476dc1bf6fa4cc4512355
|
4
|
+
data.tar.gz: ef1dbfa473c66feb93c730a5bdd4e426203519a8d94da6afccf227bfa1fa084a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f1ddc8cabf1b9fbb319ad36406e9a99e585eaad8abd772aea378438f838e11be3e75a9c3fe2a1c4805ee6c84d9dfe256ba9cff3ac205406c899ee33a71d1122f
|
7
|
+
data.tar.gz: 00c745bf81b65f59c7e514e735bd3319548f747503907646ca619fbfb1634dd2bb5e199753141a9c651bf210326e803dbb3d4bdc4331bdaeba0ecb70425dd265
|
data/README.md
CHANGED
@@ -9,16 +9,55 @@ Extract and work with your email using the command line with this plugin for [ch
|
|
9
9
|
# Install chronicle-etl and this plugin
|
10
10
|
$ gem install chronicle-etl
|
11
11
|
$ chronicle-etl plugins:install email
|
12
|
+
```
|
13
|
+
|
14
|
+
### Extracting email from IMAP
|
15
|
+
|
16
|
+
For Gmail accounts, you can create an [app password](https://myaccount.google.com/apppasswords); your email address is your username.
|
17
|
+
|
18
|
+
```sh
|
19
|
+
# Save username and password
|
20
|
+
$ chronicle-etl secrets:set imap username foo@gmail.com
|
21
|
+
$ chronicle-etl secrets:set imap password APPPASSWORD
|
22
|
+
|
23
|
+
# Then, retrieve your email from the last five days
|
24
|
+
$ chronicle-etl --extractor email:imap --transformer email --since 5d --loader json
|
25
|
+
|
26
|
+
# If you don't want to save your credentials as a secret, you can just pass
|
27
|
+
# them to the extractor directly
|
28
|
+
$ chronicle-etl --extractor email:imap --transformer email --since 5d --loader json \
|
29
|
+
--extractor-opts username:foo@gmail.com password:APPPASSWORD
|
30
|
+
```
|
12
31
|
|
13
|
-
|
14
|
-
|
32
|
+
### Processing email from an .mbox file
|
33
|
+
The MBOX format is used to archive an email mailbox. [Google Takeout](https://takeout.google.com/settings/takeout) exports emails from Gmail in this format.
|
34
|
+
|
35
|
+
```sh
|
36
|
+
# Retrieve the subject lines of all emails in inbox.mbox
|
37
|
+
$ chronicle-etl --extractor email:mbox --input inbox.mbox --transformer email --fields subject
|
15
38
|
```
|
16
39
|
|
17
40
|
## Available Connectors
|
18
41
|
### Extractors
|
19
42
|
|
43
|
+
#### `imap`
|
44
|
+
Extractor for importing recent emails from an IMAP server.
|
45
|
+
|
46
|
+
##### Settings
|
47
|
+
|
48
|
+
- `since`: Retrieve emails since this date
|
49
|
+
- `until`: Retrieve emails until this date
|
50
|
+
- `username`
|
51
|
+
- `password`
|
52
|
+
- `host`: (default: imap.gmail.com)
|
53
|
+
- `port`: (default: 993) Use 143 for unencrypted connections
|
54
|
+
- `mailbox`: (default: "[Gmail]/All Mail")
|
55
|
+
- `search_query`: When using Gmail, you can pass in a search query (`from:foo has:attachment`) to filter messages by
|
56
|
+
|
57
|
+
For accessing Gmail, you can create a one-time [app password](https://myaccount.google.com/apppasswords). Your email address is your username.
|
58
|
+
|
20
59
|
#### `mbox`
|
21
|
-
Extractor for importing emails from an
|
60
|
+
Extractor for importing emails from an MBOX file
|
22
61
|
|
23
62
|
##### Settings
|
24
63
|
- `input`: A path to an .mbox file
|
@@ -31,6 +70,3 @@ Transform an email (in the form of a string) into Chronicle Schema
|
|
31
70
|
##### Settings
|
32
71
|
- `body_as_markdown`: (default: false) Whether to convert the email body into markdown
|
33
72
|
- `remove_signature`: (default: true) Whether to attempt to strip out the email signature (using the [`email_reply_parser`](https://github.com/github/email_reply_parser) gem)
|
34
|
-
|
35
|
-
## Roadmap
|
36
|
-
- Add an IMAP (and gmail) extractor #1
|
data/chronicle-email.gemspec
CHANGED
@@ -36,10 +36,14 @@ Gem::Specification.new do |spec|
|
|
36
36
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
37
37
|
spec.require_paths = ["lib"]
|
38
38
|
|
39
|
-
spec.add_dependency "chronicle-etl", "~> 0.4.4"
|
40
|
-
spec.add_dependency "mail", "~> 2.7"
|
41
39
|
spec.add_dependency 'email_reply_parser', '~> 0.5'
|
42
40
|
spec.add_dependency 'reverse_markdown', '~> 2.0'
|
41
|
+
spec.add_dependency "chronicle-etl", "~> 0.5"
|
42
|
+
spec.add_dependency "mail", "~> 2.7"
|
43
|
+
# Needed for Ruby 3.1 compatibility (https://github.com/mikel/mail/pull/1439#issuecomment-1002801221)
|
44
|
+
# TODO: check back for new version of `mail`
|
45
|
+
spec.add_dependency "net-imap"
|
46
|
+
spec.add_dependency "net-smtp"
|
43
47
|
|
44
48
|
spec.add_development_dependency "bundler", "~> 2.1"
|
45
49
|
spec.add_development_dependency "rake", "~> 13.0"
|
@@ -85,8 +85,7 @@ module Chronicle
|
|
85
85
|
end
|
86
86
|
|
87
87
|
def clean_body message
|
88
|
-
# FIXME: this all needs to be refactored
|
89
|
-
|
88
|
+
# FIXME: this all needs to be refactored
|
90
89
|
if message.multipart?
|
91
90
|
body = message.text_part&.decoded rescue Mail::UnknownEncodingType
|
92
91
|
else
|
@@ -94,6 +93,9 @@ module Chronicle
|
|
94
93
|
body = body_to_markdown if @config.body_as_markdown
|
95
94
|
end
|
96
95
|
|
96
|
+
return if body == Mail::UnknownEncodingType
|
97
|
+
return unless body && body != ""
|
98
|
+
|
97
99
|
body = body_without_signature(body) if @config.remove_signature
|
98
100
|
|
99
101
|
# Force UTF-8 encoding
|
@@ -107,8 +109,6 @@ module Chronicle
|
|
107
109
|
end
|
108
110
|
|
109
111
|
def body_without_signature(body)
|
110
|
-
return unless body && body != ""
|
111
|
-
|
112
112
|
# FIXME: regex in EmailReplyParse gem seems to get into infinite loops
|
113
113
|
# with certain long bodies that have binary data
|
114
114
|
parsed_body = Timeout::timeout(5) do
|
@@ -0,0 +1,91 @@
|
|
1
|
+
require 'net/imap'
|
2
|
+
|
3
|
+
module Chronicle
|
4
|
+
module Email
|
5
|
+
class IMAPExtractor < Chronicle::ETL::Extractor
|
6
|
+
register_connector do |r|
|
7
|
+
r.provider = 'email'
|
8
|
+
r.description = 'imap server'
|
9
|
+
r.identifier = 'imap'
|
10
|
+
end
|
11
|
+
|
12
|
+
setting :host, required: true, default: 'imap.gmail.com'
|
13
|
+
setting :port, type: :numeric, required: true, default: 993
|
14
|
+
setting :mailbox, required: true, default: '[Gmail]/All Mail'
|
15
|
+
setting :username, required: true
|
16
|
+
setting :password, required: true
|
17
|
+
setting :search_query
|
18
|
+
|
19
|
+
def prepare
|
20
|
+
@connection = create_connection
|
21
|
+
@message_ids = fetch_message_ids
|
22
|
+
end
|
23
|
+
|
24
|
+
def results_count
|
25
|
+
@message_ids.count
|
26
|
+
end
|
27
|
+
|
28
|
+
def extract
|
29
|
+
@message_ids.each do |message_id|
|
30
|
+
message = fetch_message(message_id)
|
31
|
+
yield Chronicle::ETL::Extraction.new(data: { email: message.attr["BODY[]"]} )
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
private
|
36
|
+
|
37
|
+
def create_connection
|
38
|
+
connection = Net::IMAP.new(@config.host, @config.port, true)
|
39
|
+
connection.login(@config.username, @config.password)
|
40
|
+
connection.select(@config.mailbox)
|
41
|
+
connection
|
42
|
+
rescue Net::IMAP::NoResponseError => e
|
43
|
+
raise(Chronicle::ETL::ExtractionError, "Error connecting to IMAP server. Please check username and password")
|
44
|
+
end
|
45
|
+
|
46
|
+
def fetch_message_ids
|
47
|
+
keys = gmail_mode? ? search_keys_gmail : search_keys_default
|
48
|
+
message_ids = @connection.search(keys)
|
49
|
+
message_ids = message_ids.first(@config.limit) if @config.limit
|
50
|
+
message_ids
|
51
|
+
rescue Net::IMAP::BadResponseError => e
|
52
|
+
raise(Chronicle::ETL::ExtractionError, "Error searching IMAP server for messages")
|
53
|
+
end
|
54
|
+
|
55
|
+
def fetch_message(message_id)
|
56
|
+
response = @connection.fetch(message_id, "BODY.PEEK[]")
|
57
|
+
raise(Chronicle::ETL::ExtractionError, "Error loading message") unless response
|
58
|
+
|
59
|
+
return response[0]
|
60
|
+
end
|
61
|
+
|
62
|
+
def search_keys_gmail
|
63
|
+
# Gmail offers an extension to IMAP that lets us use gmail queries
|
64
|
+
q = ""
|
65
|
+
|
66
|
+
# First, we ignore drafts because they break a lot of assumptions we
|
67
|
+
# make when processing emails (lack of timestamps, ids, etc)
|
68
|
+
q = "-label:draft"
|
69
|
+
|
70
|
+
# We use UNIX timestamps in gmail filters which let us do more precise
|
71
|
+
# since/until compared with date-based imap filters
|
72
|
+
q += " after:#{@config.since.to_i}" if @config.since
|
73
|
+
q += " before:#{@config.until.to_i}" if @config.until
|
74
|
+
q += " #{@config.search_query}" if @config.search_query
|
75
|
+
|
76
|
+
["X-GM-RAW", q]
|
77
|
+
end
|
78
|
+
|
79
|
+
def search_keys_default
|
80
|
+
keys = []
|
81
|
+
# TODO: test out non-gmail IMAP searching (for @config.search_query)
|
82
|
+
keys += ['SINCE', Net::IMAP.format_date(@config.since)] if @config.since
|
83
|
+
keys += ['BEFORE', Net::IMAP.format_date(@config.until)] if @config.until
|
84
|
+
end
|
85
|
+
|
86
|
+
def gmail_mode?
|
87
|
+
@config.host == 'imap.gmail.com'
|
88
|
+
end
|
89
|
+
end
|
90
|
+
end
|
91
|
+
end
|
data/lib/chronicle/email.rb
CHANGED
metadata
CHANGED
@@ -1,45 +1,45 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: chronicle-email
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Louis
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-03-
|
11
|
+
date: 2022-03-31 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
|
-
name:
|
14
|
+
name: email_reply_parser
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: 0.
|
19
|
+
version: '0.5'
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: 0.
|
26
|
+
version: '0.5'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
|
-
name:
|
28
|
+
name: reverse_markdown
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
31
|
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: '2.
|
33
|
+
version: '2.0'
|
34
34
|
type: :runtime
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: '2.
|
40
|
+
version: '2.0'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
42
|
+
name: chronicle-etl
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
45
|
- - "~>"
|
@@ -53,19 +53,47 @@ dependencies:
|
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '0.5'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
|
-
name:
|
56
|
+
name: mail
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
59
|
- - "~>"
|
60
60
|
- !ruby/object:Gem::Version
|
61
|
-
version: '2.
|
61
|
+
version: '2.7'
|
62
62
|
type: :runtime
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
|
-
version: '2.
|
68
|
+
version: '2.7'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: net-imap
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: net-smtp
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
type: :runtime
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
69
97
|
- !ruby/object:Gem::Dependency
|
70
98
|
name: bundler
|
71
99
|
requirement: !ruby/object:Gem::Requirement
|
@@ -128,6 +156,7 @@ files:
|
|
128
156
|
- chronicle-email.gemspec
|
129
157
|
- lib/chronicle/email.rb
|
130
158
|
- lib/chronicle/email/email_transformer.rb
|
159
|
+
- lib/chronicle/email/imap_extractor.rb
|
131
160
|
- lib/chronicle/email/mbox_extractor.rb
|
132
161
|
- lib/chronicle/email/version.rb
|
133
162
|
homepage: https://github.com/chronicle-app/chronicle-email
|