stevedore-uploader 1.0.4-java → 1.0.5-java
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/parsers/stevedore_blob.rb +1 -1
- data/lib/parsers/stevedore_email.rb +0 -1
- data/lib/split_archive.rb +57 -23
- data/lib/stevedore-uploader.rb +10 -3
- metadata +8 -8
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5ae48d657db4ddb854c7cda165ded9188ea97c5a
|
4
|
+
data.tar.gz: c527586ca91bc3538efe9ca59a6889bf750b37a3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 67e9b259a01c543863cfe8e0207b373b1aa5752d3b6c00ca322d9fa1f5ef6aab15d51c47eea8e5e4ce4d15a07db4fe31903a5980cb736299caa0edf923da9770
|
7
|
+
data.tar.gz: b0fb2255c089bc0e085e825bb523cfdbc819d09b4a03a99e54d3461ac883775d6ebbfa9253ce32017413a8df8ebe9627d7d489c0655574f76c81f4c3a6afdaad
|
@@ -20,7 +20,6 @@ module Stevedore
|
|
20
20
|
attachment_filename = CGI::unescape(raw_attachment_filename)
|
21
21
|
possible_filename = File.join(File.dirname(filepath), attachment_filename)
|
22
22
|
eml_filename = File.join(File.dirname(filepath), File.basename(filepath, '.eml') + '-' + attachment_filename)
|
23
|
-
s3_path = S3_BASEPATH + File.dirname(filepath).gsub(::FOLDER, '')
|
24
23
|
possible_s3_url = S3_BASEPATH + '/' + CGI::escape(File.basename(possible_filename))
|
25
24
|
possible_eml_s3_url = S3_BASEPATH + '/' + CGI::escape(File.basename(eml_filename))
|
26
25
|
|
data/lib/split_archive.rb
CHANGED
@@ -1,7 +1,11 @@
|
|
1
|
-
# splits zip, mbox, pst files into their constituent documents --
|
1
|
+
# splits zip, mbox, eml and pst files into their constituent documents -- messages and attachments
|
2
2
|
# and puts them into a tmp folder
|
3
3
|
# which is then parsed normally
|
4
4
|
|
5
|
+
# why .eml you ask? those aren't archives!
|
6
|
+
# you're right, but they do contain other files (i.e. attachments)
|
7
|
+
# so I figure this is the place to handle files that contain other files.
|
8
|
+
|
5
9
|
require 'tmpdir'
|
6
10
|
require 'mail'
|
7
11
|
require 'zip'
|
@@ -29,23 +33,24 @@ module Stevedore
|
|
29
33
|
elsif extension == "zip"
|
30
34
|
self.split_zip(archive_filename)
|
31
35
|
elsif extension == "eml"
|
32
|
-
self.get_attachments_from_eml(archive_filename)
|
36
|
+
self.get_attachments_from_eml(archive_filename)
|
33
37
|
end
|
34
38
|
# should yield a relative filename
|
35
39
|
# and a lambda that will write the file contents to the given filename
|
36
40
|
FileUtils.mkdir_p(File.join(tmpdir, File.basename(archive_filename)))
|
37
41
|
|
38
|
-
constituent_files.each_with_index do |
|
39
|
-
basename, contents_lambda = *
|
40
|
-
tmp_filename = File.join(tmpdir, File.basename(archive_filename), basename
|
42
|
+
constituent_files.each_with_index do |basename_contents_lambda_attachments_parent, idx|
|
43
|
+
basename, contents_lambda, attachments, parent = *basename_contents_lambda_attachments_parent
|
44
|
+
tmp_filename = File.join(tmpdir, File.basename(archive_filename), basename )
|
41
45
|
FileUtils.mkdir_p(File.dirname(tmp_filename))
|
42
46
|
begin
|
43
47
|
contents_lambda.call(tmp_filename)
|
44
48
|
rescue Errno::ENOENT
|
45
|
-
puts "#{tmp_filename} wasn't extracted from #{archive_filename}"
|
49
|
+
puts "#{tmp_filename} wasn't extracted from #{archive_filename}"
|
46
50
|
next
|
47
51
|
end
|
48
|
-
|
52
|
+
attachments ||= []
|
53
|
+
yielder.yield tmp_filename, File.join(File.basename(archive_filename), basename), attachments, parent
|
49
54
|
end
|
50
55
|
end
|
51
56
|
end
|
@@ -77,6 +82,45 @@ module Stevedore
|
|
77
82
|
end
|
78
83
|
end
|
79
84
|
|
85
|
+
|
86
|
+
def self.split_pst(archive_filename)
|
87
|
+
pstfile = Java::ComPFF::PSTFile.new(archive_filename)
|
88
|
+
idx = 0
|
89
|
+
folders = pstfile.root.sub_folders.inject({}) do |memo,f|
|
90
|
+
memo[f.name] = f
|
91
|
+
memo
|
92
|
+
end
|
93
|
+
Enumerator.new do |yielder|
|
94
|
+
folders.each do |folder_name, folder|
|
95
|
+
while mail = folder.getNextChild
|
96
|
+
|
97
|
+
eml_str = mail.get_transport_message_headers + mail.get_body
|
98
|
+
|
99
|
+
yielder << ["#{idx}.eml", lambda{|fn| open(fn, 'wb'){|fh| fh << eml_str } }]
|
100
|
+
attachment_count = mail.get_number_of_attachments
|
101
|
+
attachment_count.times do |attachment_idx|
|
102
|
+
attachment = mail.get_attachment(attachment_idx)
|
103
|
+
attachment_filename = attachment.get_filename
|
104
|
+
yielder << ["#{idx}-#{attachment_filename}", lambda {|fn| open(fn, 'wb'){ |fh| fh << attachment.get_file_input_stream.to_io.read }}]
|
105
|
+
end
|
106
|
+
idx += 1
|
107
|
+
end
|
108
|
+
end
|
109
|
+
end
|
110
|
+
end
|
111
|
+
|
112
|
+
def self.get_attachments_from_eml(email_filename)
|
113
|
+
Enumerator.new do |yielder|
|
114
|
+
mail = Mail.new open(email_filename){|f| f.read }
|
115
|
+
attachment_results = mail.attachments.map do |attachment|
|
116
|
+
[attachment.filename, lambda{|fn| open(fn, 'wb'){|fh| fh << attachment.body.decoded }}, [], File.basename(email_filename)]
|
117
|
+
end
|
118
|
+
attachment_basenames = attachment_results.map{|a| File.basename(a[0]) }
|
119
|
+
yielder << [File.basename(email_filename), lambda{|fn| open(fn, 'wb'){|fh| fh << open(email_filename){|f| f.read } } }, attachment_basenames, nil]
|
120
|
+
attachment_results.each{|res| yielder << res }
|
121
|
+
end
|
122
|
+
end
|
123
|
+
|
80
124
|
def self.split_mbox(archive_filename)
|
81
125
|
# stolen shamelessly from the Ruby Enumerable docs, actually
|
82
126
|
# split mails in mbox (slice before Unix From line after an empty line)
|
@@ -87,6 +131,7 @@ module Stevedore
|
|
87
131
|
h[:empty] = line == "\n" || line == "\r\n" || line == "\r"
|
88
132
|
previous_was_empty && line.start_with?("From ")
|
89
133
|
end.each_with_index do |mail_str, idx|
|
134
|
+
# TODO copy over stuff from get_attachments_from_eml for attachment/parents if
|
90
135
|
mail_str.pop if mail_str.last == "\n" # remove last line if present
|
91
136
|
yielder << ["#{idx}.eml", lambda{|fn| open(fn, 'wb'){|fh| fh << mail_str.join("") } }]
|
92
137
|
mail = Mail.new mail_str.join("")
|
@@ -98,26 +143,15 @@ module Stevedore
|
|
98
143
|
end
|
99
144
|
end
|
100
145
|
|
101
|
-
def self.get_attachments_from_eml(email_filename)
|
102
|
-
Enumerator.new do |yielder|
|
103
|
-
yielder << [File.basename(email_filename), lambda{|fn| open(fn, 'wb'){|fh| fh << open(email_filename){|f| f.read } } }]
|
104
|
-
mail = Mail.new open(email_filename){|f| f.read }
|
105
|
-
mail.attachments.each do |attachment|
|
106
|
-
yielder << [attachment.filename, lambda{|fn| open(fn, 'wb'){|fh| fh << attachment.body.decoded }}]
|
107
|
-
end
|
108
|
-
end
|
109
|
-
end
|
110
|
-
|
111
|
-
|
112
146
|
def self.split_zip(archive_filename)
|
113
147
|
Zip::File.open(archive_filename) do |zip_file|
|
114
148
|
Enumerator.new do |yielder|
|
115
149
|
zip_file.each do |entry|
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
150
|
+
begin
|
151
|
+
yielder << [entry.name, lambda{|fn| entry.extract(fn) }]
|
152
|
+
rescue
|
153
|
+
puts "unable to extract #{entry.name} from #{archive_filename}"
|
154
|
+
end
|
121
155
|
end
|
122
156
|
end
|
123
157
|
end
|
data/lib/stevedore-uploader.rb
CHANGED
@@ -23,6 +23,7 @@ module Stevedore
|
|
23
23
|
|
24
24
|
def initialize(es_host, es_index, s3_bucket=nil, s3_path=nil)
|
25
25
|
@errors = []
|
26
|
+
puts "es_host, #{es_host}"
|
26
27
|
@client = Elasticsearch::Client.new({
|
27
28
|
log: false,
|
28
29
|
url: es_host,
|
@@ -240,9 +241,11 @@ module Stevedore
|
|
240
241
|
# but, for now, standalone emails are treated as one document
|
241
242
|
# PDFs can (theoretically) contain documents as "attachments" -- those aren't handled here either.
|
242
243
|
if ArchiveSplitter::HANDLED_FORMATS.include?(tmp_filename.split(".")[-1])
|
243
|
-
ArchiveSplitter.split(tmp_filename).map do |constituent_file, constituent_basename|
|
244
|
-
doc = {} if doc.nil?
|
244
|
+
ArchiveSplitter.split(tmp_filename).map do |constituent_file, constituent_basename, attachment_basenames, parent_basename|
|
245
245
|
doc, content, metadata = process_document(constituent_file, download_filename)
|
246
|
+
doc["analyzed"] ||= {}
|
247
|
+
doc["analyzed"]["metadata"] ||= {}
|
248
|
+
doc["analyzed"]["metadata"]["attachments"] = (parent_basename.nil? ? [] : [Digest::SHA1.hexdigest(download_filename + parent_basename)]) + attachment_basenames.map{|attachment| Digest::SHA1.hexdigest(download_filename + attachment) } # is a list of filenames
|
246
249
|
doc["sha1"] = Digest::SHA1.hexdigest(download_filename + File.basename(constituent_basename)) # since these files all share a download URL (that of the archive, we need to come up with a custom sha1)
|
247
250
|
yield doc, obj.key, content, metadata if block_given?
|
248
251
|
FileUtils.rm(constituent_file) rescue Errno::ENOENT # try to delete, but no biggie if it doesn't work for some weird reason.
|
@@ -294,8 +297,12 @@ module Stevedore
|
|
294
297
|
# but, for now, standalone emails are treated as one document
|
295
298
|
# PDFs can (theoretically) contain documents as "attachments" -- those aren't handled here either.
|
296
299
|
if ArchiveSplitter::HANDLED_FORMATS.include?(filename.split(".")[-1])
|
297
|
-
|
300
|
+
ArchiveSplitter.split(filename).map do |constituent_file, constituent_basename, attachment_basenames, parent_basename|
|
298
301
|
doc, content, metadata = process_document(constituent_file, download_filename)
|
302
|
+
doc = {} if doc.nil?
|
303
|
+
doc["analyzed"] ||= {}
|
304
|
+
doc["analyzed"]["metadata"] ||= {}
|
305
|
+
doc["analyzed"]["metadata"]["attachments"] = (parent_basename.nil? ? [] : [Digest::SHA1.hexdigest(download_filename + parent_basename)]) + attachment_basenames.map{|attachment| Digest::SHA1.hexdigest(download_filename + attachment) } # is a list of filenames
|
299
306
|
doc["sha1"] = Digest::SHA1.hexdigest(download_filename + File.basename(constituent_basename)) # since these files all share a download URL (that of the archive, we need to come up with a custom sha1)
|
300
307
|
doc["id"] = doc["sha1"]
|
301
308
|
yield doc, filename, content, metadata if block_given?
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: stevedore-uploader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.4
|
4
|
+
version: 1.0.5
|
5
5
|
platform: java
|
6
6
|
authors:
|
7
7
|
- Jeremy B. Merrill
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-11-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|
@@ -27,17 +27,17 @@ dependencies:
|
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
requirement: !ruby/object:Gem::Requirement
|
29
29
|
requirements:
|
30
|
-
- -
|
30
|
+
- - '='
|
31
31
|
- !ruby/object:Gem::Version
|
32
|
-
version:
|
32
|
+
version: 0.6.0
|
33
33
|
name: manticore
|
34
34
|
prerelease: false
|
35
35
|
type: :runtime
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
|
-
- -
|
38
|
+
- - '='
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version:
|
40
|
+
version: 0.6.0
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
requirement: !ruby/object:Gem::Requirement
|
43
43
|
requirements:
|
@@ -71,7 +71,7 @@ dependencies:
|
|
71
71
|
requirements:
|
72
72
|
- - ">="
|
73
73
|
- !ruby/object:Gem::Version
|
74
|
-
version: 1.
|
74
|
+
version: 1.7.0
|
75
75
|
name: rika-stevedore
|
76
76
|
prerelease: false
|
77
77
|
type: :runtime
|
@@ -79,7 +79,7 @@ dependencies:
|
|
79
79
|
requirements:
|
80
80
|
- - ">="
|
81
81
|
- !ruby/object:Gem::Version
|
82
|
-
version: 1.
|
82
|
+
version: 1.7.0
|
83
83
|
- !ruby/object:Gem::Dependency
|
84
84
|
requirement: !ruby/object:Gem::Requirement
|
85
85
|
requirements:
|