stevedore-uploader 1.0.4-java → 1.0.5-java

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 8caed7d7d3043a2377282f4da9038aebcc091214
4
- data.tar.gz: 16fac6b1d157a15a552270ea1f61666e2add3f53
3
+ metadata.gz: 5ae48d657db4ddb854c7cda165ded9188ea97c5a
4
+ data.tar.gz: c527586ca91bc3538efe9ca59a6889bf750b37a3
5
5
  SHA512:
6
- metadata.gz: 954380cf579eb786d91cee303bb820a1bc19123a183d60a0f649c1835ca90fc8a564a0d2c89a833a14710e43fff0fce8b0ff4b377ee2f398c4cbd52ac90ad851
7
- data.tar.gz: b85c53a5642cb4c2c8c6906179737470fb2d25ba9dc089dd3b23fc768618120a930e1851c862d032171fd4b58b08a8c6d6dfd7727c6ad952fa6ac8d806a25de3
6
+ metadata.gz: 67e9b259a01c543863cfe8e0207b373b1aa5752d3b6c00ca322d9fa1f5ef6aab15d51c47eea8e5e4ce4d15a07db4fe31903a5980cb736299caa0edf923da9770
7
+ data.tar.gz: b0fb2255c089bc0e085e825bb523cfdbc819d09b4a03a99e54d3461ac883775d6ebbfa9253ce32017413a8df8ebe9627d7d489c0655574f76c81f4c3a6afdaad
@@ -17,7 +17,7 @@ module Stevedore
17
17
  end
18
18
 
19
19
  def self.new_from_tika(content, metadata, download_url, filename)
20
- self.new(metadata["title"], content, download_url)
20
+ self.new(metadata["title"] || File.basename(filename), content, download_url)
21
21
  end
22
22
 
23
23
  def analyze!
@@ -20,7 +20,6 @@ module Stevedore
20
20
  attachment_filename = CGI::unescape(raw_attachment_filename)
21
21
  possible_filename = File.join(File.dirname(filepath), attachment_filename)
22
22
  eml_filename = File.join(File.dirname(filepath), File.basename(filepath, '.eml') + '-' + attachment_filename)
23
- s3_path = S3_BASEPATH + File.dirname(filepath).gsub(::FOLDER, '')
24
23
  possible_s3_url = S3_BASEPATH + '/' + CGI::escape(File.basename(possible_filename))
25
24
  possible_eml_s3_url = S3_BASEPATH + '/' + CGI::escape(File.basename(eml_filename))
26
25
 
data/lib/split_archive.rb CHANGED
@@ -1,7 +1,11 @@
1
- # splits zip, mbox, pst files into their constituent documents -- messages and attachments
1
+ # splits zip, mbox, eml and pst files into their constituent documents -- messages and attachments
2
2
  # and puts them into a tmp folder
3
3
  # which is then parsed normally
4
4
 
5
+ # why .eml you ask? those aren't archives!
6
+ # you're right, but they do contain other files (i.e. attachments)
7
+ # so I figure this is the place to handle files that contain other files.
8
+
5
9
  require 'tmpdir'
6
10
  require 'mail'
7
11
  require 'zip'
@@ -29,23 +33,24 @@ module Stevedore
29
33
  elsif extension == "zip"
30
34
  self.split_zip(archive_filename)
31
35
  elsif extension == "eml"
32
- self.get_attachments_from_eml(archive_filename)
36
+ self.get_attachments_from_eml(archive_filename)
33
37
  end
34
38
  # should yield a relative filename
35
39
  # and a lambda that will write the file contents to the given filename
36
40
  FileUtils.mkdir_p(File.join(tmpdir, File.basename(archive_filename)))
37
41
 
38
- constituent_files.each_with_index do |basename_contents_lambda, idx|
39
- basename, contents_lambda = *basename_contents_lambda
40
- tmp_filename = File.join(tmpdir, File.basename(archive_filename), basename.gsub("/", "") )
42
+ constituent_files.each_with_index do |basename_contents_lambda_attachments_parent, idx|
43
+ basename, contents_lambda, attachments, parent = *basename_contents_lambda_attachments_parent
44
+ tmp_filename = File.join(tmpdir, File.basename(archive_filename), basename )
41
45
  FileUtils.mkdir_p(File.dirname(tmp_filename))
42
46
  begin
43
47
  contents_lambda.call(tmp_filename)
44
48
  rescue Errno::ENOENT
45
- puts "#{tmp_filename} wasn't extracted from #{archive_filename}"
49
+ puts "#{tmp_filename} wasn't extracted from #{archive_filename}"
46
50
  next
47
51
  end
48
- yielder.yield tmp_filename, File.join(File.basename(archive_filename), basename)
52
+ attachments ||= []
53
+ yielder.yield tmp_filename, File.join(File.basename(archive_filename), basename), attachments, parent
49
54
  end
50
55
  end
51
56
  end
@@ -77,6 +82,45 @@ module Stevedore
77
82
  end
78
83
  end
79
84
 
85
+
86
+ def self.split_pst(archive_filename)
87
+ pstfile = Java::ComPFF::PSTFile.new(archive_filename)
88
+ idx = 0
89
+ folders = pstfile.root.sub_folders.inject({}) do |memo,f|
90
+ memo[f.name] = f
91
+ memo
92
+ end
93
+ Enumerator.new do |yielder|
94
+ folders.each do |folder_name, folder|
95
+ while mail = folder.getNextChild
96
+
97
+ eml_str = mail.get_transport_message_headers + mail.get_body
98
+
99
+ yielder << ["#{idx}.eml", lambda{|fn| open(fn, 'wb'){|fh| fh << eml_str } }]
100
+ attachment_count = mail.get_number_of_attachments
101
+ attachment_count.times do |attachment_idx|
102
+ attachment = mail.get_attachment(attachment_idx)
103
+ attachment_filename = attachment.get_filename
104
+ yielder << ["#{idx}-#{attachment_filename}", lambda {|fn| open(fn, 'wb'){ |fh| fh << attachment.get_file_input_stream.to_io.read }}]
105
+ end
106
+ idx += 1
107
+ end
108
+ end
109
+ end
110
+ end
111
+
112
+ def self.get_attachments_from_eml(email_filename)
113
+ Enumerator.new do |yielder|
114
+ mail = Mail.new open(email_filename){|f| f.read }
115
+ attachment_results = mail.attachments.map do |attachment|
116
+ [attachment.filename, lambda{|fn| open(fn, 'wb'){|fh| fh << attachment.body.decoded }}, [], File.basename(email_filename)]
117
+ end
118
+ attachment_basenames = attachment_results.map{|a| File.basename(a[0]) }
119
+ yielder << [File.basename(email_filename), lambda{|fn| open(fn, 'wb'){|fh| fh << open(email_filename){|f| f.read } } }, attachment_basenames, nil]
120
+ attachment_results.each{|res| yielder << res }
121
+ end
122
+ end
123
+
80
124
  def self.split_mbox(archive_filename)
81
125
  # stolen shamelessly from the Ruby Enumerable docs, actually
82
126
  # split mails in mbox (slice before Unix From line after an empty line)
@@ -87,6 +131,7 @@ module Stevedore
87
131
  h[:empty] = line == "\n" || line == "\r\n" || line == "\r"
88
132
  previous_was_empty && line.start_with?("From ")
89
133
  end.each_with_index do |mail_str, idx|
134
+ # TODO copy over stuff from get_attachments_from_eml for attachment/parents if
90
135
  mail_str.pop if mail_str.last == "\n" # remove last line if present
91
136
  yielder << ["#{idx}.eml", lambda{|fn| open(fn, 'wb'){|fh| fh << mail_str.join("") } }]
92
137
  mail = Mail.new mail_str.join("")
@@ -98,26 +143,15 @@ module Stevedore
98
143
  end
99
144
  end
100
145
 
101
- def self.get_attachments_from_eml(email_filename)
102
- Enumerator.new do |yielder|
103
- yielder << [File.basename(email_filename), lambda{|fn| open(fn, 'wb'){|fh| fh << open(email_filename){|f| f.read } } }]
104
- mail = Mail.new open(email_filename){|f| f.read }
105
- mail.attachments.each do |attachment|
106
- yielder << [attachment.filename, lambda{|fn| open(fn, 'wb'){|fh| fh << attachment.body.decoded }}]
107
- end
108
- end
109
- end
110
-
111
-
112
146
  def self.split_zip(archive_filename)
113
147
  Zip::File.open(archive_filename) do |zip_file|
114
148
  Enumerator.new do |yielder|
115
149
  zip_file.each do |entry|
116
- begin
117
- yielder << [entry.name, lambda{|fn| entry.extract(fn) }]
118
- rescue
119
- puts "unable to extract #{entry.name} from #{archive_filename}"
120
- end
150
+ begin
151
+ yielder << [entry.name, lambda{|fn| entry.extract(fn) }]
152
+ rescue
153
+ puts "unable to extract #{entry.name} from #{archive_filename}"
154
+ end
121
155
  end
122
156
  end
123
157
  end
@@ -23,6 +23,7 @@ module Stevedore
23
23
 
24
24
  def initialize(es_host, es_index, s3_bucket=nil, s3_path=nil)
25
25
  @errors = []
26
+ puts "es_host, #{es_host}"
26
27
  @client = Elasticsearch::Client.new({
27
28
  log: false,
28
29
  url: es_host,
@@ -240,9 +241,11 @@ module Stevedore
240
241
  # but, for now, standalone emails are treated as one document
241
242
  # PDFs can (theoretically) contain documents as "attachments" -- those aren't handled here either.
242
243
  if ArchiveSplitter::HANDLED_FORMATS.include?(tmp_filename.split(".")[-1])
243
- ArchiveSplitter.split(tmp_filename).map do |constituent_file, constituent_basename|
244
- doc = {} if doc.nil?
244
+ ArchiveSplitter.split(tmp_filename).map do |constituent_file, constituent_basename, attachment_basenames, parent_basename|
245
245
  doc, content, metadata = process_document(constituent_file, download_filename)
246
+ doc["analyzed"] ||= {}
247
+ doc["analyzed"]["metadata"] ||= {}
248
+ doc["analyzed"]["metadata"]["attachments"] = (parent_basename.nil? ? [] : [Digest::SHA1.hexdigest(download_filename + parent_basename)]) + attachment_basenames.map{|attachment| Digest::SHA1.hexdigest(download_filename + attachment) } # is a list of filenames
246
249
  doc["sha1"] = Digest::SHA1.hexdigest(download_filename + File.basename(constituent_basename)) # since these files all share a download URL (that of the archive), we need to come up with a custom sha1
247
250
  yield doc, obj.key, content, metadata if block_given?
248
251
  FileUtils.rm(constituent_file) rescue Errno::ENOENT # try to delete, but no biggie if it doesn't work for some weird reason.
@@ -294,8 +297,12 @@ module Stevedore
294
297
  # but, for now, standalone emails are treated as one document
295
298
  # PDFs can (theoretically) contain documents as "attachments" -- those aren't handled here either.
296
299
  if ArchiveSplitter::HANDLED_FORMATS.include?(filename.split(".")[-1])
297
- ArchiveSplitter.split(filename).map do |constituent_file, constituent_basename|
300
+ ArchiveSplitter.split(filename).map do |constituent_file, constituent_basename, attachment_basenames, parent_basename|
298
301
  doc, content, metadata = process_document(constituent_file, download_filename)
302
+ doc = {} if doc.nil?
303
+ doc["analyzed"] ||= {}
304
+ doc["analyzed"]["metadata"] ||= {}
305
+ doc["analyzed"]["metadata"]["attachments"] = (parent_basename.nil? ? [] : [Digest::SHA1.hexdigest(download_filename + parent_basename)]) + attachment_basenames.map{|attachment| Digest::SHA1.hexdigest(download_filename + attachment) } # is a list of filenames
299
306
  doc["sha1"] = Digest::SHA1.hexdigest(download_filename + File.basename(constituent_basename)) # since these files all share a download URL (that of the archive), we need to come up with a custom sha1
300
307
  doc["id"] = doc["sha1"]
301
308
  yield doc, filename, content, metadata if block_given?
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: stevedore-uploader
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.4
4
+ version: 1.0.5
5
5
  platform: java
6
6
  authors:
7
7
  - Jeremy B. Merrill
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-10-06 00:00:00.000000000 Z
11
+ date: 2016-11-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  requirement: !ruby/object:Gem::Requirement
@@ -27,17 +27,17 @@ dependencies:
27
27
  - !ruby/object:Gem::Dependency
28
28
  requirement: !ruby/object:Gem::Requirement
29
29
  requirements:
30
- - - ">="
30
+ - - '='
31
31
  - !ruby/object:Gem::Version
32
- version: '0'
32
+ version: 0.6.0
33
33
  name: manticore
34
34
  prerelease: false
35
35
  type: :runtime
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - ">="
38
+ - - '='
39
39
  - !ruby/object:Gem::Version
40
- version: '0'
40
+ version: 0.6.0
41
41
  - !ruby/object:Gem::Dependency
42
42
  requirement: !ruby/object:Gem::Requirement
43
43
  requirements:
@@ -71,7 +71,7 @@ dependencies:
71
71
  requirements:
72
72
  - - ">="
73
73
  - !ruby/object:Gem::Version
74
- version: 1.6.1
74
+ version: 1.7.0
75
75
  name: rika-stevedore
76
76
  prerelease: false
77
77
  type: :runtime
@@ -79,7 +79,7 @@ dependencies:
79
79
  requirements:
80
80
  - - ">="
81
81
  - !ruby/object:Gem::Version
82
- version: 1.6.1
82
+ version: 1.7.0
83
83
  - !ruby/object:Gem::Dependency
84
84
  requirement: !ruby/object:Gem::Requirement
85
85
  requirements: