stevedore-uploader 1.0.4-java → 1.0.5-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/parsers/stevedore_blob.rb +1 -1
- data/lib/parsers/stevedore_email.rb +0 -1
- data/lib/split_archive.rb +57 -23
- data/lib/stevedore-uploader.rb +10 -3
- metadata +8 -8
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5ae48d657db4ddb854c7cda165ded9188ea97c5a
|
4
|
+
data.tar.gz: c527586ca91bc3538efe9ca59a6889bf750b37a3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 67e9b259a01c543863cfe8e0207b373b1aa5752d3b6c00ca322d9fa1f5ef6aab15d51c47eea8e5e4ce4d15a07db4fe31903a5980cb736299caa0edf923da9770
|
7
|
+
data.tar.gz: b0fb2255c089bc0e085e825bb523cfdbc819d09b4a03a99e54d3461ac883775d6ebbfa9253ce32017413a8df8ebe9627d7d489c0655574f76c81f4c3a6afdaad
|
@@ -20,7 +20,6 @@ module Stevedore
|
|
20
20
|
attachment_filename = CGI::unescape(raw_attachment_filename)
|
21
21
|
possible_filename = File.join(File.dirname(filepath), attachment_filename)
|
22
22
|
eml_filename = File.join(File.dirname(filepath), File.basename(filepath, '.eml') + '-' + attachment_filename)
|
23
|
-
s3_path = S3_BASEPATH + File.dirname(filepath).gsub(::FOLDER, '')
|
24
23
|
possible_s3_url = S3_BASEPATH + '/' + CGI::escape(File.basename(possible_filename))
|
25
24
|
possible_eml_s3_url = S3_BASEPATH + '/' + CGI::escape(File.basename(eml_filename))
|
26
25
|
|
data/lib/split_archive.rb
CHANGED
@@ -1,7 +1,11 @@
|
|
1
|
-
# splits zip, mbox, pst files into their constituent documents --
|
1
|
+
# splits zip, mbox, eml and pst files into their constituent documents -- messages and attachments
|
2
2
|
# and puts them into a tmp folder
|
3
3
|
# which is then parsed normally
|
4
4
|
|
5
|
+
# why .eml you ask? those aren't archives!
|
6
|
+
# you're right, but they do contain other files (i.e. attachments)
|
7
|
+
# so I figure this is the place to handle files that contain other files.
|
8
|
+
|
5
9
|
require 'tmpdir'
|
6
10
|
require 'mail'
|
7
11
|
require 'zip'
|
@@ -29,23 +33,24 @@ module Stevedore
|
|
29
33
|
elsif extension == "zip"
|
30
34
|
self.split_zip(archive_filename)
|
31
35
|
elsif extension == "eml"
|
32
|
-
self.get_attachments_from_eml(archive_filename)
|
36
|
+
self.get_attachments_from_eml(archive_filename)
|
33
37
|
end
|
34
38
|
# should yield a relative filename
|
35
39
|
# and a lambda that will write the file contents to the given filename
|
36
40
|
FileUtils.mkdir_p(File.join(tmpdir, File.basename(archive_filename)))
|
37
41
|
|
38
|
-
constituent_files.each_with_index do |
|
39
|
-
basename, contents_lambda = *
|
40
|
-
tmp_filename = File.join(tmpdir, File.basename(archive_filename), basename
|
42
|
+
constituent_files.each_with_index do |basename_contents_lambda_attachments_parent, idx|
|
43
|
+
basename, contents_lambda, attachments, parent = *basename_contents_lambda_attachments_parent
|
44
|
+
tmp_filename = File.join(tmpdir, File.basename(archive_filename), basename )
|
41
45
|
FileUtils.mkdir_p(File.dirname(tmp_filename))
|
42
46
|
begin
|
43
47
|
contents_lambda.call(tmp_filename)
|
44
48
|
rescue Errno::ENOENT
|
45
|
-
puts "#{tmp_filename} wasn't extracted from #{archive_filename}"
|
49
|
+
puts "#{tmp_filename} wasn't extracted from #{archive_filename}"
|
46
50
|
next
|
47
51
|
end
|
48
|
-
|
52
|
+
attachments ||= []
|
53
|
+
yielder.yield tmp_filename, File.join(File.basename(archive_filename), basename), attachments, parent
|
49
54
|
end
|
50
55
|
end
|
51
56
|
end
|
@@ -77,6 +82,45 @@ module Stevedore
|
|
77
82
|
end
|
78
83
|
end
|
79
84
|
|
85
|
+
|
86
|
+
def self.split_pst(archive_filename)
|
87
|
+
pstfile = Java::ComPFF::PSTFile.new(archive_filename)
|
88
|
+
idx = 0
|
89
|
+
folders = pstfile.root.sub_folders.inject({}) do |memo,f|
|
90
|
+
memo[f.name] = f
|
91
|
+
memo
|
92
|
+
end
|
93
|
+
Enumerator.new do |yielder|
|
94
|
+
folders.each do |folder_name, folder|
|
95
|
+
while mail = folder.getNextChild
|
96
|
+
|
97
|
+
eml_str = mail.get_transport_message_headers + mail.get_body
|
98
|
+
|
99
|
+
yielder << ["#{idx}.eml", lambda{|fn| open(fn, 'wb'){|fh| fh << eml_str } }]
|
100
|
+
attachment_count = mail.get_number_of_attachments
|
101
|
+
attachment_count.times do |attachment_idx|
|
102
|
+
attachment = mail.get_attachment(attachment_idx)
|
103
|
+
attachment_filename = attachment.get_filename
|
104
|
+
yielder << ["#{idx}-#{attachment_filename}", lambda {|fn| open(fn, 'wb'){ |fh| fh << attachment.get_file_input_stream.to_io.read }}]
|
105
|
+
end
|
106
|
+
idx += 1
|
107
|
+
end
|
108
|
+
end
|
109
|
+
end
|
110
|
+
end
|
111
|
+
|
112
|
+
def self.get_attachments_from_eml(email_filename)
|
113
|
+
Enumerator.new do |yielder|
|
114
|
+
mail = Mail.new open(email_filename){|f| f.read }
|
115
|
+
attachment_results = mail.attachments.map do |attachment|
|
116
|
+
[attachment.filename, lambda{|fn| open(fn, 'wb'){|fh| fh << attachment.body.decoded }}, [], File.basename(email_filename)]
|
117
|
+
end
|
118
|
+
attachment_basenames = attachment_results.map{|a| File.basename(a[0]) }
|
119
|
+
yielder << [File.basename(email_filename), lambda{|fn| open(fn, 'wb'){|fh| fh << open(email_filename){|f| f.read } } }, attachment_basenames, nil]
|
120
|
+
attachment_results.each{|res| yielder << res }
|
121
|
+
end
|
122
|
+
end
|
123
|
+
|
80
124
|
def self.split_mbox(archive_filename)
|
81
125
|
# stolen shamelessly from the Ruby Enumerable docs, actually
|
82
126
|
# split mails in mbox (slice before Unix From line after an empty line)
|
@@ -87,6 +131,7 @@ module Stevedore
|
|
87
131
|
h[:empty] = line == "\n" || line == "\r\n" || line == "\r"
|
88
132
|
previous_was_empty && line.start_with?("From ")
|
89
133
|
end.each_with_index do |mail_str, idx|
|
134
|
+
# TODO copy over stuff from get_attachments_from_eml for attachment/parents if
|
90
135
|
mail_str.pop if mail_str.last == "\n" # remove last line if present
|
91
136
|
yielder << ["#{idx}.eml", lambda{|fn| open(fn, 'wb'){|fh| fh << mail_str.join("") } }]
|
92
137
|
mail = Mail.new mail_str.join("")
|
@@ -98,26 +143,15 @@ module Stevedore
|
|
98
143
|
end
|
99
144
|
end
|
100
145
|
|
101
|
-
def self.get_attachments_from_eml(email_filename)
|
102
|
-
Enumerator.new do |yielder|
|
103
|
-
yielder << [File.basename(email_filename), lambda{|fn| open(fn, 'wb'){|fh| fh << open(email_filename){|f| f.read } } }]
|
104
|
-
mail = Mail.new open(email_filename){|f| f.read }
|
105
|
-
mail.attachments.each do |attachment|
|
106
|
-
yielder << [attachment.filename, lambda{|fn| open(fn, 'wb'){|fh| fh << attachment.body.decoded }}]
|
107
|
-
end
|
108
|
-
end
|
109
|
-
end
|
110
|
-
|
111
|
-
|
112
146
|
def self.split_zip(archive_filename)
|
113
147
|
Zip::File.open(archive_filename) do |zip_file|
|
114
148
|
Enumerator.new do |yielder|
|
115
149
|
zip_file.each do |entry|
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
150
|
+
begin
|
151
|
+
yielder << [entry.name, lambda{|fn| entry.extract(fn) }]
|
152
|
+
rescue
|
153
|
+
puts "unable to extract #{entry.name} from #{archive_filename}"
|
154
|
+
end
|
121
155
|
end
|
122
156
|
end
|
123
157
|
end
|
data/lib/stevedore-uploader.rb
CHANGED
@@ -23,6 +23,7 @@ module Stevedore
|
|
23
23
|
|
24
24
|
def initialize(es_host, es_index, s3_bucket=nil, s3_path=nil)
|
25
25
|
@errors = []
|
26
|
+
puts "es_host, #{es_host}"
|
26
27
|
@client = Elasticsearch::Client.new({
|
27
28
|
log: false,
|
28
29
|
url: es_host,
|
@@ -240,9 +241,11 @@ module Stevedore
|
|
240
241
|
# but, for now, standalone emails are treated as one document
|
241
242
|
# PDFs can (theoretically) contain documents as "attachments" -- those aren't handled here either.
|
242
243
|
if ArchiveSplitter::HANDLED_FORMATS.include?(tmp_filename.split(".")[-1])
|
243
|
-
ArchiveSplitter.split(tmp_filename).map do |constituent_file, constituent_basename|
|
244
|
-
doc = {} if doc.nil?
|
244
|
+
ArchiveSplitter.split(tmp_filename).map do |constituent_file, constituent_basename, attachment_basenames, parent_basename|
|
245
245
|
doc, content, metadata = process_document(constituent_file, download_filename)
|
246
|
+
doc["analyzed"] ||= {}
|
247
|
+
doc["analyzed"]["metadata"] ||= {}
|
248
|
+
doc["analyzed"]["metadata"]["attachments"] = (parent_basename.nil? ? [] : [Digest::SHA1.hexdigest(download_filename + parent_basename)]) + attachment_basenames.map{|attachment| Digest::SHA1.hexdigest(download_filename + attachment) } # is a list of filenames
|
246
249
|
doc["sha1"] = Digest::SHA1.hexdigest(download_filename + File.basename(constituent_basename)) # since these files all share a download URL (that of the archive, we need to come up with a custom sha1)
|
247
250
|
yield doc, obj.key, content, metadata if block_given?
|
248
251
|
FileUtils.rm(constituent_file) rescue Errno::ENOENT # try to delete, but no biggie if it doesn't work for some weird reason.
|
@@ -294,8 +297,12 @@ module Stevedore
|
|
294
297
|
# but, for now, standalone emails are treated as one document
|
295
298
|
# PDFs can (theoretically) contain documents as "attachments" -- those aren't handled here either.
|
296
299
|
if ArchiveSplitter::HANDLED_FORMATS.include?(filename.split(".")[-1])
|
297
|
-
|
300
|
+
ArchiveSplitter.split(filename).map do |constituent_file, constituent_basename, attachment_basenames, parent_basename|
|
298
301
|
doc, content, metadata = process_document(constituent_file, download_filename)
|
302
|
+
doc = {} if doc.nil?
|
303
|
+
doc["analyzed"] ||= {}
|
304
|
+
doc["analyzed"]["metadata"] ||= {}
|
305
|
+
doc["analyzed"]["metadata"]["attachments"] = (parent_basename.nil? ? [] : [Digest::SHA1.hexdigest(download_filename + parent_basename)]) + attachment_basenames.map{|attachment| Digest::SHA1.hexdigest(download_filename + attachment) } # is a list of filenames
|
299
306
|
doc["sha1"] = Digest::SHA1.hexdigest(download_filename + File.basename(constituent_basename)) # since these files all share a download URL (that of the archive, we need to come up with a custom sha1)
|
300
307
|
doc["id"] = doc["sha1"]
|
301
308
|
yield doc, filename, content, metadata if block_given?
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: stevedore-uploader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.4
|
4
|
+
version: 1.0.5
|
5
5
|
platform: java
|
6
6
|
authors:
|
7
7
|
- Jeremy B. Merrill
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-11-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|
@@ -27,17 +27,17 @@ dependencies:
|
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
requirement: !ruby/object:Gem::Requirement
|
29
29
|
requirements:
|
30
|
-
- -
|
30
|
+
- - '='
|
31
31
|
- !ruby/object:Gem::Version
|
32
|
-
version:
|
32
|
+
version: 0.6.0
|
33
33
|
name: manticore
|
34
34
|
prerelease: false
|
35
35
|
type: :runtime
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
|
-
- -
|
38
|
+
- - '='
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version:
|
40
|
+
version: 0.6.0
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
requirement: !ruby/object:Gem::Requirement
|
43
43
|
requirements:
|
@@ -71,7 +71,7 @@ dependencies:
|
|
71
71
|
requirements:
|
72
72
|
- - ">="
|
73
73
|
- !ruby/object:Gem::Version
|
74
|
-
version: 1.
|
74
|
+
version: 1.7.0
|
75
75
|
name: rika-stevedore
|
76
76
|
prerelease: false
|
77
77
|
type: :runtime
|
@@ -79,7 +79,7 @@ dependencies:
|
|
79
79
|
requirements:
|
80
80
|
- - ">="
|
81
81
|
- !ruby/object:Gem::Version
|
82
|
-
version: 1.
|
82
|
+
version: 1.7.0
|
83
83
|
- !ruby/object:Gem::Dependency
|
84
84
|
requirement: !ruby/object:Gem::Requirement
|
85
85
|
requirements:
|