msg_extractor 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +9 -0
- data/LICENSE.txt +21 -0
- data/README.md +88 -0
- data/exe/msg_extractor +5 -0
- data/lib/msg_extractor/appointment.rb +20 -0
- data/lib/msg_extractor/attachment.rb +57 -0
- data/lib/msg_extractor/cfbf/directory.rb +84 -0
- data/lib/msg_extractor/cfbf/fat.rb +75 -0
- data/lib/msg_extractor/cfbf/file.rb +114 -0
- data/lib/msg_extractor/cfbf/header.rb +40 -0
- data/lib/msg_extractor/cli.rb +77 -0
- data/lib/msg_extractor/contact.rb +23 -0
- data/lib/msg_extractor/errors.rb +12 -0
- data/lib/msg_extractor/headers.rb +39 -0
- data/lib/msg_extractor/mapi/decoders.rb +59 -0
- data/lib/msg_extractor/mapi/named_property_map.rb +74 -0
- data/lib/msg_extractor/mapi/property_store.rb +106 -0
- data/lib/msg_extractor/mapi/ptag.rb +55 -0
- data/lib/msg_extractor/message.rb +6 -0
- data/lib/msg_extractor/message_object.rb +100 -0
- data/lib/msg_extractor/recipient.rb +34 -0
- data/lib/msg_extractor/rtf/compressed_rtf.rb +88 -0
- data/lib/msg_extractor/rtf/decapsulator.rb +206 -0
- data/lib/msg_extractor/task.rb +25 -0
- data/lib/msg_extractor/util.rb +72 -0
- data/lib/msg_extractor/version.rb +3 -0
- data/lib/msg_extractor.rb +63 -0
- metadata +74 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: 5b63aa09723b0c4324a3d3189885557dfe8f72ea5b32b54631183c5f562fabe0
|
|
4
|
+
data.tar.gz: 2e9d0f2ac08c1d3c0df6c69f5110b7015275eca0a58927575c6969e87b36b152
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: c75dba1d0761bd47f24dc08897ee9846a50be175e5f43a92254364a8d11615bb58536a72db3cca6307ef40d8e236f9e914d093a9b2b30a4c2659f34ed7310ccd
|
|
7
|
+
data.tar.gz: 9fcfb42cff93e849fc4bd86003e714effa0de02d73c83fbead084b9553d7b4d559f3375f9bd299e7171f6a954e19b82936a3a29d20e151c5bf640d856eead773
|
data/CHANGELOG.md
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
## 0.1.0 (2026-06-12)
|
|
4
|
+
|
|
5
|
+
- Initial release: parse Outlook .msg files (emails, contacts, appointments,
|
|
6
|
+
tasks) into Ruby objects.
|
|
7
|
+
- Attachment access including embedded .msg attachments.
|
|
8
|
+
- RTF body decompression (LZFu) and RTF-encapsulated HTML extraction.
|
|
9
|
+
- Save-to-folder helpers and a msg_extractor CLI with --json output.
|
data/LICENSE.txt
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Bart Duchesne
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
data/README.md
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
# msg_extractor
|
|
2
|
+
|
|
3
|
+
Pure Ruby parser for Microsoft Outlook `.msg` files. Parses the OLE2/CFBF
|
|
4
|
+
container and MAPI properties into structured Ruby objects — no native
|
|
5
|
+
extensions, no runtime dependencies, no Python. Built for use in Ruby and
|
|
6
|
+
Rails applications.
|
|
7
|
+
|
|
8
|
+
## Installation
|
|
9
|
+
|
|
10
|
+
```ruby
|
|
11
|
+
# Gemfile
|
|
12
|
+
gem "msg_extractor"
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
Requires Ruby >= 3.1.
|
|
16
|
+
|
|
17
|
+
## Usage
|
|
18
|
+
|
|
19
|
+
```ruby
|
|
20
|
+
require "msg_extractor"
|
|
21
|
+
|
|
22
|
+
msg = MsgExtractor.open("invoice.msg") # also accepts an IO or a binary String
|
|
23
|
+
|
|
24
|
+
msg.subject # => "Invoice 2026-001"
|
|
25
|
+
msg.sender # => #<MsgExtractor::Recipient name="Bob" email="bob@example.com">
|
|
26
|
+
msg.to # => [Recipient, ...] (also: cc, bcc, recipients)
|
|
27
|
+
msg.date # => Time (UTC)
|
|
28
|
+
msg.body # => plain text body (UTF-8)
|
|
29
|
+
msg.html_body # => HTML body; extracted from the RTF body when absent
|
|
30
|
+
msg.rtf_body # => decompressed RTF (binary) or nil
|
|
31
|
+
msg.headers # => case-insensitive transport headers: msg.headers["Subject"]
|
|
32
|
+
|
|
33
|
+
msg.attachments.each do |att|
|
|
34
|
+
att.filename # => "report.pdf"
|
|
35
|
+
att.mime_type # => "application/pdf"
|
|
36
|
+
att.content_id # => for matching cid: URLs in html_body
|
|
37
|
+
att.data # => raw bytes — hand to ActiveStorage, S3, etc.
|
|
38
|
+
att.save(dir: "tmp/")
|
|
39
|
+
att.message # => parsed MsgExtractor::Message when the attachment
|
|
40
|
+
# is itself an embedded .msg (att.embedded_message?)
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
msg.save(dir: "out/") # writes message.txt, message.html and attachments
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
`MsgExtractor.open` returns a typed object based on the message class:
|
|
47
|
+
|
|
48
|
+
| Message class | Returned type | Extra readers |
|
|
49
|
+
|---|---|---|
|
|
50
|
+
| IPM.Note, REPORT.* | `Message` | — |
|
|
51
|
+
| IPM.Contact, IPM.DistList | `Contact` | `display_name`, `given_name`, `surname`, `company`, `job_title`, `business_phone`, `home_phone`, `mobile_phone`, `postal_address`, `emails` |
|
|
52
|
+
| IPM.Appointment, IPM.Schedule.Meeting.* | `Appointment` | `starts_at`, `ends_at`, `location`, `all_day?`, `organizer`, `required_attendees`, `optional_attendees` |
|
|
53
|
+
| IPM.Task | `Task` | `starts_on`, `due_on`, `status`, `percent_complete`, `complete?`, `owner` |
|
|
54
|
+
|
|
55
|
+
Other message classes raise `MsgExtractor::UnsupportedTypeError`; pass
|
|
56
|
+
`strict: false` to get a generic `MessageObject` instead.
|
|
57
|
+
|
|
58
|
+
Errors: all inherit from `MsgExtractor::Error` — `InvalidFormatError`,
|
|
59
|
+
`UnsupportedTypeError`, `CorruptFileError`.
|
|
60
|
+
|
|
61
|
+
## CLI
|
|
62
|
+
|
|
63
|
+
```
|
|
64
|
+
msg_extractor FILE... [--out DIR] [--json] [--attachments-only]
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## Development
|
|
68
|
+
|
|
69
|
+
```
|
|
70
|
+
bundle install
|
|
71
|
+
rake test
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
Test fixtures and oracle data come from the Python
|
|
75
|
+
[extract_msg](https://github.com/TeamMsgExtractor/msg-extractor) project,
|
|
76
|
+
which this gem uses as a black-box behavioral reference (no code is
|
|
77
|
+
translated from it). See `tool/generate_oracle.py`.
|
|
78
|
+
|
|
79
|
+
## Credits
|
|
80
|
+
|
|
81
|
+
This gem was developed with [Claude Code](https://claude.com/claude-code)
|
|
82
|
+
(Anthropic's Claude Fable 5 model), implementing the Microsoft open
|
|
83
|
+
specifications ([MS-CFB], [MS-OXMSG], [MS-OXRTFCP], [MS-OXRTFEX]) and
|
|
84
|
+
validated against the output of the Python extract_msg library.
|
|
85
|
+
|
|
86
|
+
## License
|
|
87
|
+
|
|
88
|
+
MIT.
|
data/exe/msg_extractor
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
module MsgExtractor
|
|
2
|
+
# IPM.Appointment and IPM.Schedule.Meeting.* items. Times come from
|
|
3
|
+
# PSETID_Appointment named properties and are returned as UTC Time.
|
|
4
|
+
class Appointment < MessageObject
  # Property-set GUID under which the appointment named properties are looked up.
  PSETID_APPOINTMENT = "00062002-0000-0000-c000-000000000046"

  # Long IDs (LIDs) of the named properties read by the accessors below.
  LID_START_WHOLE = 0x820D
  LID_END_WHOLE = 0x820E
  LID_LOCATION = 0x8208
  LID_ALL_DAY = 0x8215

  # Value of the "start whole" named property, as decoded by named_value.
  def starts_at
    named_value(PSETID_APPOINTMENT, LID_START_WHOLE)
  end

  # Value of the "end whole" named property, as decoded by named_value.
  def ends_at
    named_value(PSETID_APPOINTMENT, LID_END_WHOLE)
  end

  # Value of the location named property.
  def location
    named_value(PSETID_APPOINTMENT, LID_LOCATION)
  end

  # True only when the all-day named property is exactly +true+; a missing
  # or non-boolean value yields false.
  def all_day?
    named_value(PSETID_APPOINTMENT, LID_ALL_DAY) == true
  end

  # The organizer is reported as the message sender.
  def organizer
    sender
  end

  # Required attendees are the "to" recipients.
  def required_attendees
    to
  end

  # Optional attendees are the "cc" recipients.
  def optional_attendees
    cc
  end
end
|
|
20
|
+
end
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
module MsgExtractor
|
|
2
|
+
class Attachment
  # Attach-method values this class distinguishes.
  BY_VALUE = 1
  EMBEDDED_MSG = 5

  attr_reader :properties

  # cfbf    - the compound file the attachment lives in
  # storage - the directory entry of the attachment's storage
  # named   - optional named-property map, passed on to embedded messages
  def initialize(cfbf, storage, named: nil)
    @cfbf = cfbf
    @storage = storage
    @named = named
    @properties = Mapi::PropertyStore.new(cfbf, storage, :attachment)
  end

  # Long filename when present, otherwise the short filename property.
  def filename
    long_name = properties[Mapi::PR_ATTACH_LONG_FILENAME]
    return long_name if long_name

    properties[Mapi::PR_ATTACH_FILENAME]
  end

  def mime_type
    properties[Mapi::PR_ATTACH_MIME_TAG]
  end

  def content_id
    properties[Mapi::PR_ATTACH_CONTENT_ID]
  end

  # Declared attach method; BY_VALUE when the property is absent.
  def attach_method
    declared = properties[Mapi::PR_ATTACH_METHOD]
    declared || BY_VALUE
  end

  def embedded_message?
    attach_method == EMBEDDED_MSG
  end

  # An attachment counts as inline when it is flagged hidden or carries a
  # content id.
  def inline?
    return true if properties[Mapi::PR_ATTACHMENT_HIDDEN] == true

    !content_id.nil?
  end

  # Raw attachment bytes, or nil for an embedded message. The first read is
  # cached (even when it is nil) so size, save and data share a single read.
  def data
    return nil if embedded_message?

    @data = properties.raw(Mapi::PR_ATTACH_DATA) unless defined?(@data)
    @data
  end

  # Byte count of the data, or nil when there is no data.
  def size
    bytes = data
    bytes.nil? ? nil : bytes.bytesize
  end

  # The parsed nested message when this attachment is an embedded .msg whose
  # sub-storage exists; nil otherwise. Parsing is always lenient
  # (strict: false) so that reading this accessor does not raise for
  # unsupported message classes. The result is cached.
  def message
    return @message if defined?(@message)

    nested = embedded_message? ? @storage.children["__SUBSTG1.0_3701000D"] : nil
    @message =
      if nested && nested.storage?
        MsgExtractor.from_storage(@cfbf, nested, named: @named, kind: :embedded, strict: false)
      end
  end

  # Writes the attachment bytes into +dir+ and returns the path written.
  # Raises Error for embedded messages and for attachments without data.
  def save(dir: ".")
    raise Error, "cannot save an embedded message attachment as a file" if embedded_message?

    bytes = data
    raise Error, "attachment #{filename.inspect} has no data stream" if bytes.nil?

    safe_name = Util.sanitize_filename(filename || "attachment")
    target = Util.dedupe_path(::File.join(dir, safe_name))
    ::File.binwrite(target, bytes)
    target
  end
end
|
|
57
|
+
end
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
module MsgExtractor
|
|
2
|
+
module Cfbf
|
|
3
|
+
# One 128-byte directory entry: a storage (folder), stream (file), or root.
|
|
4
|
+
class Entry
  # Object-type byte values (record offset 66) this class recognises.
  TYPE_STORAGE = 1
  TYPE_STREAM = 2
  TYPE_ROOT = 5

  attr_reader :name, :type, :left, :right, :child, :start_sector, :size
  attr_accessor :children

  # Decodes one directory record. Field offsets read here:
  #   0..63    name bytes (UTF-16LE)
  #   64..65   name length in bytes, including the 2-byte terminator
  #   66       object type
  #   68..79   left sibling, right sibling and child indexes (3 x uint32 LE)
  #   116..119 start sector (uint32 LE)
  #   120..127 size (uint64 LE)
  def initialize(record)
    declared_len = record.byteslice(64, 2).unpack1("v")
    # A declared length larger than the 64-byte name field is clamped to it.
    @name = decode_name(record, [declared_len, 64].min)
    @type = record.getbyte(66)
    links = record.byteslice(68, 12).unpack("V3")
    @left = links[0]
    @right = links[1]
    @child = links[2]
    @start_sector = record.byteslice(116, 4).unpack1("V")
    @size = record.byteslice(120, 8).unpack1("Q<")
    @children = {}
  end

  # Storages and the root entry can both hold children.
  def storage?
    [TYPE_STORAGE, TYPE_ROOT].include?(@type)
  end

  def stream?
    @type == TYPE_STREAM
  end

  private

  # Converts the first +byte_len+ - 2 bytes of the record (the name without
  # its terminator) to UTF-8, replacing anything that cannot be converted.
  # Lengths below 2 give an empty name.
  def decode_name(record, byte_len)
    return "" if byte_len < 2

    utf16 = record.byteslice(0, byte_len - 2)
    utf16.force_encoding(Encoding::UTF_16LE)
    utf16.encode(Encoding::UTF_8, invalid: :replace, undef: :replace)
  end
end
|
|
32
|
+
|
|
33
|
+
# Parses the directory sector chain and links each storage's red-black
|
|
34
|
+
# sibling tree into a flat children hash (keyed by upcased name).
|
|
35
|
+
class Directory
  attr_reader :root

  # dir_bytes - the concatenated directory sectors. Every complete 128-byte
  # record becomes an Entry, except records whose type byte is 0, which are
  # kept as nil placeholders so entry indexes stay aligned.
  # Raises CorruptFileError when record 0 is not a root entry.
  def initialize(dir_bytes)
    record_count = dir_bytes.bytesize / 128
    @entries = Array.new(record_count) do |slot|
      raw = dir_bytes.byteslice(slot * 128, 128)
      raw.getbyte(66).to_i.zero? ? nil : Entry.new(raw)
    end
    @root = @entries[0]
    has_root = @root && @root.type == Entry::TYPE_ROOT
    raise CorruptFileError, "compound file has no root directory entry" unless has_root

    link_all_children
  end

  private

  # Breadth-first walk over every storage's sibling tree. Work items are
  # [owning_storage, entry_index] pairs, and one visited table is shared by
  # the whole walk, so reaching any index a second time (a loop inside a
  # sibling tree, or an entry referenced from two places) raises
  # CorruptFileError. Each entry is registered in its owner's children hash
  # under its upcased name.
  def link_all_children
    seen = {}
    pending = []
    pending << [@root, @root.child] if @root.child != NOSTREAM

    while (work = pending.shift)
      owner, idx = work
      next if idx == NOSTREAM
      raise CorruptFileError, "directory entry cycle at index #{idx}" if seen.key?(idx)

      seen[idx] = true
      node = @entries[idx]
      raise CorruptFileError, "directory references missing entry #{idx}" unless node

      owner.children[node.name.upcase] = node

      # Siblings belong to the same owner; a storage's child starts a new level.
      [node.left, node.right].each do |sibling|
        pending << [owner, sibling] unless sibling == NOSTREAM
      end
      pending << [node, node.child] if node.storage? && node.child != NOSTREAM
    end
  end
end
|
|
83
|
+
end
|
|
84
|
+
end
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
module MsgExtractor
|
|
2
|
+
module Cfbf
|
|
3
|
+
# The File Allocation Table: maps each sector index to the next sector in
|
|
4
|
+
# its chain. Built from the header DIFAT plus chained DIFAT sectors.
|
|
5
|
+
class Fat
  # data   - the whole compound file as a binary String
  # header - the parsed header; sector_size, difat_head and
  #          first_difat_sector are read from it
  def initialize(data, header)
    @data = data
    @header = header
    @entries = build_entries
  end

  # Bytes of one sector. Sector 0 starts right after the header, which
  # occupies one sector-sized slot, hence the +1. The returned slice may be
  # shorter than a full sector when the file ends mid-sector.
  # Raises CorruptFileError when nothing can be read at that position.
  def sector_bytes(sector)
    width = @header.sector_size
    raw = @data.byteslice((sector + 1) * width, width)
    raise CorruptFileError, "sector #{sector} beyond end of file" if raw.nil? || raw.empty?

    raw
  end

  # Sector indexes of the chain beginning at +start+, in order.
  # Raises CorruptFileError on a repeated sector or on an index that has no
  # FAT entry.
  def chain(start)
    visited = {} # insertion-ordered, so it doubles as the result list
    cursor = start
    until cursor == ENDOFCHAIN
      raise CorruptFileError, "FAT chain cycle at sector #{cursor}" if visited.key?(cursor)
      unless cursor < @entries.size
        raise CorruptFileError, "FAT chain references invalid sector #{cursor}"
      end

      visited[cursor] = true
      cursor = @entries[cursor]
    end
    visited.keys
  end

  # Concatenated bytes of every sector in the chain beginning at +start+.
  def read_chain(start)
    sector_ids = chain(start)
    sector_ids.map { |id| sector_bytes(id) }.join
  end

  private

  # Gathers the FAT sector indexes from the header DIFAT array and from the
  # chained DIFAT sectors, then decodes every FAT sector into one flat array
  # of uint32 "next sector" values.
  def build_entries
    width = @header.sector_size
    sector_limit = @data.bytesize / width
    fat_sector_ids = @header.difat_head.select { |id| id != FREESECT }

    visited = {}
    hops = 0
    cursor = @header.first_difat_sector
    until cursor == ENDOFCHAIN || cursor == FREESECT
      raise CorruptFileError, "DIFAT sector cycle at #{cursor}" if visited[cursor]

      visited[cursor] = true
      hops += 1
      raise CorruptFileError, "DIFAT chain exceeds file size" if hops > sector_limit

      raw = sector_bytes(cursor)
      raise CorruptFileError, "DIFAT sector #{cursor} is truncated" if raw.bytesize < width

      # The last uint32 of a DIFAT sector points at the next DIFAT sector;
      # everything before it lists FAT sectors.
      *listed, following = raw.unpack("V*")
      fat_sector_ids.concat(listed.select { |id| id != FREESECT })
      # More FAT sectors than the file has sectors can only come from a
      # corrupt or hostile file.
      if fat_sector_ids.size > sector_limit
        raise CorruptFileError, "FAT sector list exceeds file size"
      end

      cursor = following
    end

    fat_sector_ids.flat_map { |id| sector_bytes(id).unpack("V*") }
  end
end
|
|
74
|
+
end
|
|
75
|
+
end
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
module MsgExtractor
  module Cfbf
    # The assembled compound file: directory tree plus stream extraction.
    # Streams smaller than the header's mini-stream cutoff live in the
    # mini stream and are chained through the miniFAT; larger streams are
    # chained directly through the FAT.
    class File
      # Files with 512-byte sectors are version 3 compound files. [MS-CFB]
      # recommends that readers ignore the upper 32 bits of a directory
      # entry's stream size in such files, because some older writers left
      # those bits uninitialized.
      V3_SECTOR_SIZE = 512
      V3_SIZE_MASK = 0xFFFF_FFFF

      attr_reader :root

      # Builds a File from a filesystem path, a binary String of file
      # content, or an IO-like object responding to #read.
      # Strings beginning with the OLE signature are treated as content;
      # all other Strings are treated as filesystem paths.
      #
      # Raises InvalidFormatError when a String is neither OLE content nor a
      # usable path, and ArgumentError for unsupported source types. Other
      # system errors raised while reading a path (for example a missing
      # file) are not translated.
      def self.read(source)
        data =
          if source.is_a?(::String)
            b = source.b
            if b.byteslice(0, 8) == Header::SIGNATURE
              b
            else
              begin
                ::File.binread(source)
              rescue Errno::ENAMETOOLONG, Errno::EINVAL, ArgumentError
                # Raised for strings that cannot be a path at all (too long,
                # invalid, or containing NUL bytes).
                raise InvalidFormatError, "not an OLE2 file and not a readable path"
              end
            end
          elsif source.respond_to?(:read)
            source.read.b
          else
            raise ArgumentError, "cannot read MSG from #{source.class}"
          end
        new(data)
      end

      # data - the complete file content; re-tagged as binary when needed.
      # Raises InvalidFormatError when the content is shorter than the
      # 512-byte header, and CorruptFileError when the mini stream cannot be
      # assembled.
      def initialize(data)
        @data = data.encoding == Encoding::BINARY ? data : data.b
        raise InvalidFormatError, "file too small to be an OLE2 file" if @data.bytesize < 512

        @header = Header.new(@data.byteslice(0, 512))
        @fat = Fat.new(@data, @header)
        @directory = Directory.new(@fat.read_chain(@header.first_dir_sector))
        @root = @directory.root
        @mini_stream = load_mini_stream
        @minifat = load_minifat
      end

      # Looks up a directory entry by path. Components are separated by "/"
      # and matched case-insensitively. Returns nil when any component is
      # missing; an empty path returns the root.
      def entry(path)
        current = @root
        path.split("/").each do |part|
          current = current.children[part.upcase]
          return nil unless current
        end
        current
      end

      # Content of the stream at +path+, or nil when the path does not name
      # a stream.
      def stream(path)
        e = entry(path)
        e&.stream? ? read_stream(e) : nil
      end

      # Content of the stream described by +entry+, cut to its declared size.
      # Raises CorruptFileError when the declared size exceeds the file size
      # or the FAT chain holds fewer bytes than declared.
      def read_stream(entry)
        size = declared_size(entry)
        return "".b if size.zero?
        if size > @data.bytesize
          raise CorruptFileError, "stream entry size #{size} exceeds file size"
        end
        return read_mini_stream(entry) if size < @header.mini_stream_cutoff

        chain_bytes = @fat.read_chain(entry.start_sector)
        if chain_bytes.bytesize < size
          raise CorruptFileError, "FAT stream chain shorter than declared size"
        end
        chain_bytes.byteslice(0, size)
      end

      private

      # Stream size to trust for +entry+: the low 32 bits in version 3 files
      # (see V3_SECTOR_SIZE), the full 64-bit value otherwise.
      def declared_size(entry)
        @header.sector_size == V3_SECTOR_SIZE ? entry.size & V3_SIZE_MASK : entry.size
      end

      # The mini stream is the root entry's own stream, stored in the
      # regular FAT. Returns an empty binary string when the root declares
      # no data.
      def load_mini_stream
        size = declared_size(@root)
        return "".b if size.zero?
        if size > @data.bytesize
          raise CorruptFileError, "root ministream size #{size} exceeds file size"
        end

        chain_bytes = @fat.read_chain(@root.start_sector)
        if chain_bytes.bytesize < size
          raise CorruptFileError, "root ministream chain shorter than declared size"
        end
        chain_bytes.byteslice(0, size)
      end

      # The miniFAT as an array of uint32 "next mini sector" values; empty
      # when the header declares no miniFAT sectors.
      def load_minifat
        if @header.num_minifat_sectors.positive? && @header.first_minifat_sector != ENDOFCHAIN
          @fat.read_chain(@header.first_minifat_sector).unpack("V*")
        else
          []
        end
      end

      # Follows the miniFAT chain of +entry+ through the mini stream and
      # returns the collected bytes cut to the declared size.
      # Raises CorruptFileError on a repeated or unknown mini sector, and
      # when a mini sector lies at or beyond the end of the mini stream.
      def read_mini_stream(entry)
        out = +"".b
        mini_size = @header.mini_sector_size
        sector = entry.start_sector
        seen = {}
        while sector != ENDOFCHAIN
          if seen[sector] || sector >= @minifat.size
            raise CorruptFileError, "broken miniFAT chain at #{sector}"
          end
          seen[sector] = true
          slice = @mini_stream.byteslice(sector * mini_size, mini_size)
          # byteslice gives nil past the end but "" exactly at the end; both
          # mean the sector holds no data (Fat#sector_bytes rejects both too).
          if slice.nil? || slice.empty?
            raise CorruptFileError, "mini-stream sector #{sector} out of range"
          end
          out << slice
          sector = @minifat[sector]
        end
        out.byteslice(0, declared_size(entry))
      end
    end
  end
end
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
module MsgExtractor
|
|
2
|
+
# Reader for the OLE2 / Compound File Binary Format ([MS-CFB]) used as the
|
|
3
|
+
# container of .msg files.
|
|
4
|
+
module Cfbf
  # Reserved sector / directory-index sentinel values.
  FREESECT = 0xFFFFFFFF
  ENDOFCHAIN = 0xFFFFFFFE
  FATSECT = 0xFFFFFFFD
  DIFSECT = 0xFFFFFFFC
  NOSTREAM = 0xFFFFFFFF

  # The fixed 512-byte compound-file header.
  class Header
    # The 8 magic bytes every compound file starts with.
    SIGNATURE = "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1".b.freeze

    attr_reader :sector_size, :mini_sector_size, :num_fat_sectors,
                :first_dir_sector, :mini_stream_cutoff,
                :first_minifat_sector, :num_minifat_sectors,
                :first_difat_sector, :num_difat_sectors, :difat_head

    # bytes - at least the first 512 bytes of the file.
    # Raises InvalidFormatError when the signature is wrong, fewer than 512
    # bytes are supplied, or a sector shift has an unsupported value.
    def initialize(bytes)
      well_formed = bytes && bytes.bytesize >= 512 && bytes.byteslice(0, 8) == SIGNATURE
      raise InvalidFormatError, "not an OLE2 compound file (bad signature)" unless well_formed

      # Offsets 30 and 32 hold the sector and mini-sector shifts (uint16 LE).
      sector_shift, mini_shift = bytes.byteslice(30, 4).unpack("v2")
      unless [9, 12].include?(sector_shift)
        raise InvalidFormatError, "invalid sector shift #{sector_shift} (must be 9 or 12)"
      end
      if mini_shift != 6
        raise InvalidFormatError, "invalid mini sector shift #{mini_shift} (must be 6)"
      end

      @sector_size = 1 << sector_shift
      @mini_sector_size = 1 << mini_shift

      # Eight consecutive uint32 LE fields starting at offset 44; the third
      # one (index 2) is read past and not stored.
      words = bytes.byteslice(44, 32).unpack("V8")
      @num_fat_sectors = words[0]
      @first_dir_sector = words[1]
      @mini_stream_cutoff = words[3]
      @first_minifat_sector = words[4]
      @num_minifat_sectors = words[5]
      @first_difat_sector = words[6]
      @num_difat_sectors = words[7]

      # The first 109 DIFAT entries are stored in the header itself.
      @difat_head = bytes.byteslice(76, 436).unpack("V109")
    end
  end
end
|
|
40
|
+
end
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
require "optparse"
|
|
2
|
+
require "json"
|
|
3
|
+
require "time"
|
|
4
|
+
require "fileutils"
|
|
5
|
+
|
|
6
|
+
module MsgExtractor
|
|
7
|
+
class CLI
  # Entry point of the command-line tool.
  #
  # argv   - command-line arguments (options and file paths)
  # stdout - stream for --json and --version output
  # stderr - stream for usage and error messages
  #
  # Returns the exit status: 0 on success, 1 when at least one file failed,
  # 2 for a usage error (bad option or no files given).
  def self.run(argv, stdout: $stdout, stderr: $stderr)
    out_dir = "."
    as_json = false
    attachments_only = false

    parser = OptionParser.new do |opts|
      opts.banner = "Usage: msg_extractor FILE... [options]"
      opts.on("--out DIR", "Output directory (default: current directory)") { |dir| out_dir = dir }
      opts.on("--json", "Print one JSON object per file to stdout instead of saving") { as_json = true }
      opts.on("--attachments-only", "Save only the attachments, flat into --out") { attachments_only = true }
      opts.on("--version", "Print version") do
        stdout.puts MsgExtractor::VERSION
        # Leaves run itself: printing the version ends the command.
        return 0
      end
    end

    files =
      begin
        parser.parse(argv)
      rescue OptionParser::ParseError => e
        stderr.puts e.message
        stderr.puts parser.banner
        return 2
      end

    if files.empty?
      stderr.puts parser.banner
      return 2
    end

    exit_code = 0
    files.each do |file|
      begin
        msg = MsgExtractor.open(file)
        if as_json
          stdout.puts JSON.generate(json_for(msg, file))
        else
          FileUtils.mkdir_p(out_dir)
          if attachments_only
            # Embedded messages cannot be written as plain files, so they are skipped.
            msg.attachments.reject(&:embedded_message?).each { |a| a.save(dir: out_dir) }
          else
            msg.save(dir: out_dir)
          end
        end
      rescue MsgExtractor::Error, SystemCallError => e
        # One bad file does not stop the remaining files from being processed.
        stderr.puts "#{file}: #{e.message}"
        exit_code = 1
      end
    end
    exit_code
  end

  # Builds the Hash that --json serialises for one parsed message.
  # The date is rendered as a UTC ISO 8601 string when present.
  def self.json_for(msg, file)
    person = ->(r) { { name: r.name, email: r.email } }
    attachment_rows = lambda do
      msg.attachments.map do |a|
        { filename: a.filename, mime_type: a.mime_type, content_id: a.content_id,
          size: a.size, embedded_message: a.embedded_message? }
      end
    end

    {
      file: file,
      message_class: msg.message_class,
      subject: msg.subject,
      date: msg.date&.utc&.iso8601,
      sender: msg.sender && { name: msg.sender.name, email: msg.sender.email },
      to: msg.to.map(&person),
      cc: msg.cc.map(&person),
      bcc: msg.bcc.map(&person),
      body: msg.body,
      html_body: msg.html_body,
      attachments: attachment_rows.call
    }
  end
end
|
|
77
|
+
end
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
module MsgExtractor
|
|
2
|
+
# IPM.Contact / IPM.DistList. Email addresses live in PSETID_Address named
|
|
3
|
+
# properties (Email1/2/3 EmailAddress LIDs).
|
|
4
|
+
class Contact < MessageObject
  # Property-set GUID under which the e-mail address named properties are
  # looked up.
  PSETID_ADDRESS = "00062004-0000-0000-c000-000000000046"

  # Long IDs of the three e-mail address named properties, in slot order.
  EMAIL_LIDS = [0x8083, 0x8093, 0x80A3].freeze

  # Display name property, falling back to the subject when it is absent.
  def display_name
    properties[Mapi::PR_DISPLAY_NAME] || subject
  end

  def given_name
    properties[Mapi::PR_GIVEN_NAME]
  end

  def surname
    properties[Mapi::PR_SURNAME]
  end

  def company
    properties[Mapi::PR_COMPANY_NAME]
  end

  def job_title
    properties[Mapi::PR_JOB_TITLE]
  end

  def business_phone
    properties[Mapi::PR_BUSINESS_PHONE]
  end

  def home_phone
    properties[Mapi::PR_HOME_PHONE]
  end

  def mobile_phone
    properties[Mapi::PR_MOBILE_PHONE]
  end

  def postal_address
    properties[Mapi::PR_POSTAL_ADDRESS]
  end

  # Every e-mail address slot that holds a value, in slot order; empty
  # slots are left out.
  def emails
    EMAIL_LIDS.each_with_object([]) do |lid, found|
      address = named_value(PSETID_ADDRESS, lid)
      found << address if address
    end
  end
end
|
|
23
|
+
end
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
module MsgExtractor
  # Base class of the gem's own exceptions; the three classes below all
  # derive from it, so rescuing it covers them all.
  class Error < StandardError
  end

  # The input is not an OLE file, or is an OLE file without MSG property
  # streams.
  class InvalidFormatError < Error
  end

  # The container is a recognized MSG file but its message class is not
  # supported.
  class UnsupportedTypeError < Error
  end

  # The file is structurally broken: bad FAT chains, truncated streams or
  # bad records.
  class CorruptFileError < Error
  end
end
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
module MsgExtractor
|
|
2
|
+
# Parsed RFC 5322 transport headers with case-insensitive lookup.
|
|
3
|
+
#
|
|
4
|
+
# * +[]+ and +all+ are case-insensitive (e.g. h["subject"] == h["Subject"]).
|
|
5
|
+
# * +[]+ returns the first value for a header name.
|
|
6
|
+
# * +all+ returns every value for a header name.
|
|
7
|
+
# * +to_h+ keeps the original case of each header name and is first-value-wins
|
|
8
|
+
# when the same name appears more than once.
|
|
9
|
+
class Headers
  include Enumerable

  # Parses a raw header block into a Headers object.
  #
  # Reading stops at the first empty line. A line starting with a space or
  # tab continues the previous field and is appended to it after a single
  # space; such a line before any field is dropped. Lines that are neither
  # a continuation nor "Name: value" are ignored. nil or empty text gives an
  # empty Headers.
  def self.parse(text)
    return new([]) if text.nil? || text.empty?

    header_lines = text.each_line(chomp: true).take_while { |line| !line.empty? }
    fields = []
    header_lines.each do |line|
      if line.start_with?(" ", "\t")
        previous = fields.last
        previous[1] << " " << line.strip if previous
        next
      end

      match = /\A([!-9;-~]+):[ \t]*(.*)\z/.match(line)
      next unless match

      # Unary plus guarantees a mutable value for later continuation lines.
      fields << [match[1], +match[2]]
    end
    new(fields)
  end

  # fields - array of [name, value] pairs in their original order.
  def initialize(fields)
    @fields = fields
    # downcased name => every value for that name, in order of appearance
    @index = @fields
             .group_by { |name, _value| name.downcase }
             .transform_values { |pairs| pairs.map { |_name, value| value } }
  end

  # First value stored for +key+ (case-insensitive), or nil.
  def [](key)
    @index.fetch(key.downcase, []).first
  end

  # A fresh array with every value stored for +key+ (case-insensitive).
  def all(key)
    @index.fetch(key.downcase) { [] }.dup
  end

  # Yields each [name, value] pair in original order.
  def each(&block)
    @fields.each(&block)
  end

  # Hash keyed by the header names as written; when a name repeats, the
  # first value is kept.
  def to_h
    @fields.each_with_object({}) do |(name, value), result|
      result[name] ||= value
    end
  end

  def empty?
    @fields.empty?
  end
end
|
|
39
|
+
end
|