chronicle-email 0.2.1 → 0.2.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 38fae4ff87eef33cef28435958b5d2ee599ab32f318b505d63949ac4a70a066b
4
- data.tar.gz: 694d121dbc9698d435f533edb5e74ed5338d37efe381c3bfb395f1c8591ba89f
3
+ metadata.gz: e37ee5bdabd254e03a06962503f5875ab75d2aa0b271aff0b946c73f673d5cc3
4
+ data.tar.gz: c351c35f1b5545f0d0f9e6b4375076b159dab88eb454cf3e946fdab84ddc03a4
5
5
  SHA512:
6
- metadata.gz: c32fdef034cc65e3bf6139a9b311132ec0330acac4fd8c146248ad53da5f51a7cb517b130cd9337f4f76a84251e958667c269e43b6885c4f4db8df389ba35ec3
7
- data.tar.gz: 5b06bcc3f6f3ef0d9b524e3904614f2fc7cf933eb322c61ef46bdf5212b6413a354660aa29c771f03e7ed2d34c5e0ca20aab385c1501bfa20d07a8abeba0c6e2
6
+ metadata.gz: fedb95cf29c21d0dac17d6cbeca81da0a5318ab41ea8a5eaf355e8996606b03398ba70c4233e4591a78ceaef5ce5242b59c5e227e79a6310ba4474c245476e57
7
+ data.tar.gz: 7a06834ea16425d456da704b3272c96d6e0f427044e46e60a477dbce1cab7f9621056f4f18f7b90a964f83e3e85281a981fdf155155ebf4cb8b06871cd9884ab
data/README.md CHANGED
@@ -9,16 +9,55 @@ Extract and work with your email using the command line with this plugin for [ch
9
9
  # Install chronicle-etl and this plugin
10
10
  $ gem install chronicle-etl
11
11
  $ chronicle-etl plugins:install email
12
+ ```
13
+
14
+ ### Extracting email from IMAP
15
+
16
+ For Gmail accounts, you can create an [app password](https://myaccount.google.com/apppasswords); your email address is your username.
17
+
18
+ ```sh
19
+ # Save username and password
20
+ $ chronicle-etl secrets:set imap username foo@gmail.com
21
+ $ chronicle-etl secrets:set imap password APPPASSWORD
22
+
23
+ # Then, retrieve your email from the last five days
24
+ $ chronicle-etl --extractor email:imap --transformer email --since 5d --loader json
25
+
26
+ # If you don't want to save your credentials as a secret, you can just pass
27
+ # them to the extractor directly
28
+ $ chronicle-etl --extractor email:imap --transformer email --since 5d --loader json \
29
+ --extractor-opts username:foo@gmail.com password:APPPASSWORD
30
+ ```
12
31
 
13
- # Process emails from an mbox file
14
- $ chronicle-etl --extractor email:mbox -i test.mbox --transformer email --fields subject
32
+ ### Processing email from an .mbox file
33
+ The MBOX format is used to archive an email mailbox. [Google Takeout](https://takeout.google.com/settings/takeout) exports emails from Gmail in this format.
34
+
35
+ ```sh
36
+ # Retrieve the subject lines of all emails in inbox.mbox
37
+ $ chronicle-etl --extractor email:mbox --input inbox.mbox --transformer email --fields subject
15
38
  ```
16
39
 
17
40
  ## Available Connectors
18
41
  ### Extractors
19
42
 
43
+ #### `imap`
44
+ Extractor for importing recent emails from an IMAP server.
45
+
46
+ ##### Settings
47
+
48
+ - `since`: Retrieve emails since this date
49
+ - `until`: Retrieve emails until this date
50
+ - `username`
51
+ - `password`
52
+ - `host`: (default: imap.gmail.com)
53
+ - `port`: (default: 993) Use 143 for unencrypted connections
54
+ - `mailbox`: (default: "[Gmail]/All Mail")
55
+ - `search_query`: When using Gmail, you can pass in a search query (`from:foo has:attachment`) to filter messages by
56
+
57
+ For accessing Gmail, you can create a one-time [app password](https://myaccount.google.com/apppasswords). Your email address is your username.
58
+
20
59
  #### `mbox`
21
- Extractor for importing emails from an mbox file
60
+ Extractor for importing emails from an MBOX file
22
61
 
23
62
  ##### Settings
24
63
  - `input`: A path to an .mbox file
@@ -31,6 +70,3 @@ Transform an email (in the form of a string) into Chronicle Schema
31
70
  ##### Settings
32
71
  - `body_as_markdown`: (default: false) Whether to convert the email body into markdown
33
72
  - `remove_signature`: (default: true) Whether to attempt to strip out the email signature (using the [`email_reply_parser`](https://github.com/github/email_reply_parser) gem)
34
-
35
- ## Roadmap
36
- - Add an IMAP (and gmail) extractor #1
@@ -85,8 +85,7 @@ module Chronicle
85
85
  end
86
86
 
87
87
  def clean_body message
88
- # FIXME: this all needs to be refactored
89
-
88
+ # FIXME: this all needs to be refactored
90
89
  if message.multipart?
91
90
  body = message.text_part&.decoded rescue Mail::UnknownEncodingType
92
91
  else
@@ -94,6 +93,9 @@ module Chronicle
94
93
  body = body_to_markdown if @config.body_as_markdown
95
94
  end
96
95
 
96
+ return if body == Mail::UnknownEncodingType
97
+ return unless body && body != ""
98
+
97
99
  body = body_without_signature(body) if @config.remove_signature
98
100
 
99
101
  # Force UTF-8 encoding
@@ -107,8 +109,6 @@ module Chronicle
107
109
  end
108
110
 
109
111
  def body_without_signature(body)
110
- return unless body && body != ""
111
-
112
112
  # FIXME: regex in EmailReplyParse gem seems to get into infinite loops
113
113
  # with certain long bodies that have binary data
114
114
  parsed_body = Timeout::timeout(5) do
@@ -0,0 +1,89 @@
1
+ require 'net/imap'
2
+
3
module Chronicle
  module Email
    # Extractor that downloads raw messages from a mailbox on an IMAP server.
    #
    # `prepare` opens the connection and resolves the list of matching
    # messages; `extract` then fetches each message and yields it as a
    # Chronicle::ETL::Extraction whose data is `{ email: <raw message> }`.
    #
    # Messages are filtered using `@config.since` / `@config.until`
    # (NOTE(review): these settings are not declared in this class and are
    # presumably provided by the base extractor — confirm). When the host is
    # Gmail's IMAP endpoint, Gmail's X-GM-RAW search extension is used so that
    # `search_query` can be applied as well.
    class IMAPExtractor < Chronicle::ETL::Extractor
      # Host for which Gmail-specific searching is enabled.
      GMAIL_HOST = 'imap.gmail.com'

      # Port on which the connection is made without TLS, as documented in the
      # README ("Use 143 for unencrypted connections").
      PLAINTEXT_PORT = 143

      register_connector do |r|
        r.provider = 'email'
        r.description = 'imap server'
        r.identifier = 'imap'
      end

      setting :host, required: true, default: 'imap.gmail.com'
      setting :port, type: :numeric, required: true, default: 993
      setting :mailbox, required: true, default: '[Gmail]/All Mail'
      setting :username, required: true
      setting :password, required: true
      setting :search_query

      # Logs in, selects the mailbox and looks up the messages to extract.
      def prepare
        @connection = create_connection
        @message_ids = fetch_message_ids
      end

      # @return [Integer] number of messages found by the search in `prepare`
      def results_count
        @message_ids.count
      end

      # Fetches every matching message and yields one extraction per message.
      #
      # @yieldparam extraction [Chronicle::ETL::Extraction] data is
      #   `{ email: raw_message }`
      def extract
        @message_ids.each do |message_id|
          message = fetch_message(message_id)
          # The body is requested as BODY.PEEK[] (which does not mark the
          # message as seen); the server reports it under the BODY[] key.
          yield Chronicle::ETL::Extraction.new(data: { email: message.attr["BODY[]"] })
        end
      end

      private

      # Opens a connection, authenticates and selects the configured mailbox.
      #
      # TLS is used on every port except PLAINTEXT_PORT.
      #
      # @return [Net::IMAP] a logged-in connection with the mailbox selected
      # @raise [Chronicle::ETL::ExtractionError] if the server rejects the
      #   login or the mailbox selection
      def create_connection
        port = @config.port.to_i
        connection = Net::IMAP.new(@config.host, port: port, ssl: port != PLAINTEXT_PORT)
        connection.login(@config.username, @config.password)
        connection.select(@config.mailbox)
        connection
      rescue Net::IMAP::NoResponseError
        # Do not leave a half-initialised connection open behind us.
        connection&.disconnect
        raise(Chronicle::ETL::ExtractionError, "Error connecting to IMAP server. Please check username and password")
      end

      # Runs the search and returns the sequence numbers of matching messages.
      #
      # @return [Array<Integer>]
      # @raise [Chronicle::ETL::ExtractionError] if the server rejects the
      #   search command
      def fetch_message_ids
        keys = gmail_mode? ? search_keys_gmail : search_keys_default
        @connection.search(keys)
      rescue Net::IMAP::BadResponseError
        raise(Chronicle::ETL::ExtractionError, "Error searching IMAP server for messages")
      end

      # Fetches the full raw content of a single message.
      #
      # @param message_id [Integer] sequence number returned by the search
      # @return [Net::IMAP::FetchData]
      # @raise [Chronicle::ETL::ExtractionError] if the server returns nothing
      def fetch_message(message_id)
        response = @connection.fetch(message_id, "BODY.PEEK[]")
        raise(Chronicle::ETL::ExtractionError, "Error loading message") unless response

        response.first
      end

      # Builds search keys using Gmail's IMAP extension, which accepts the
      # same query syntax as the Gmail search box.
      #
      # @return [Array<String>] `["X-GM-RAW", query]`
      def search_keys_gmail
        # Drafts are ignored because they break a lot of assumptions we make
        # when processing emails (lack of timestamps, ids, etc).
        terms = ['-label:draft']

        # UNIX timestamps in Gmail filters give more precise since/until
        # bounds than date-based IMAP filters.
        terms << "after:#{@config.since.to_i}" if @config.since
        terms << "before:#{@config.until.to_i}" if @config.until
        terms << @config.search_query.to_s if @config.search_query

        ['X-GM-RAW', terms.join(' ')]
      end

      # Builds standard IMAP search keys from the since/until settings.
      #
      # Falls back to `ALL` when no bound is configured, because an empty
      # SEARCH command is not valid.
      #
      # @return [Array<String>]
      def search_keys_default
        keys = []
        # TODO: test out non-gmail IMAP searching (for @config.search_query)
        keys.push('SINCE', Net::IMAP.format_date(@config.since)) if @config.since
        keys.push('BEFORE', Net::IMAP.format_date(@config.until)) if @config.until
        keys.empty? ? ['ALL'] : keys
      end

      # @return [Boolean] whether the configured host is Gmail's IMAP server
      def gmail_mode?
        @config.host == GMAIL_HOST
      end
    end
  end
end
@@ -1,5 +1,5 @@
1
1
  module Chronicle
2
2
  module Email
3
- VERSION = "0.2.1"
3
+ VERSION = "0.2.2"
4
4
  end
5
5
  end
@@ -1,6 +1,7 @@
1
1
  require "chronicle/email/version"
2
2
  require "chronicle/email/email_transformer"
3
3
  require "chronicle/email/mbox_extractor"
4
+ require "chronicle/email/imap_extractor"
4
5
 
5
6
  module Chronicle
6
7
  module Email
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: chronicle-email
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.2.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Louis
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2022-03-25 00:00:00.000000000 Z
11
+ date: 2022-03-31 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: chronicle-etl
@@ -128,6 +128,7 @@ files:
128
128
  - chronicle-email.gemspec
129
129
  - lib/chronicle/email.rb
130
130
  - lib/chronicle/email/email_transformer.rb
131
+ - lib/chronicle/email/imap_extractor.rb
131
132
  - lib/chronicle/email/mbox_extractor.rb
132
133
  - lib/chronicle/email/version.rb
133
134
  homepage: https://github.com/chronicle-app/chronicle-email