stevedore-uploader 1.0.4-java → 1.0.5-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 8caed7d7d3043a2377282f4da9038aebcc091214
4
- data.tar.gz: 16fac6b1d157a15a552270ea1f61666e2add3f53
3
+ metadata.gz: 5ae48d657db4ddb854c7cda165ded9188ea97c5a
4
+ data.tar.gz: c527586ca91bc3538efe9ca59a6889bf750b37a3
5
5
  SHA512:
6
- metadata.gz: 954380cf579eb786d91cee303bb820a1bc19123a183d60a0f649c1835ca90fc8a564a0d2c89a833a14710e43fff0fce8b0ff4b377ee2f398c4cbd52ac90ad851
7
- data.tar.gz: b85c53a5642cb4c2c8c6906179737470fb2d25ba9dc089dd3b23fc768618120a930e1851c862d032171fd4b58b08a8c6d6dfd7727c6ad952fa6ac8d806a25de3
6
+ metadata.gz: 67e9b259a01c543863cfe8e0207b373b1aa5752d3b6c00ca322d9fa1f5ef6aab15d51c47eea8e5e4ce4d15a07db4fe31903a5980cb736299caa0edf923da9770
7
+ data.tar.gz: b0fb2255c089bc0e085e825bb523cfdbc819d09b4a03a99e54d3461ac883775d6ebbfa9253ce32017413a8df8ebe9627d7d489c0655574f76c81f4c3a6afdaad
@@ -17,7 +17,7 @@ module Stevedore
17
17
  end
18
18
 
19
19
  def self.new_from_tika(content, metadata, download_url, filename)
20
- self.new(metadata["title"], content, download_url)
20
+ self.new(metadata["title"] || File.basename(filename), content, download_url)
21
21
  end
22
22
 
23
23
  def analyze!
@@ -20,7 +20,6 @@ module Stevedore
20
20
  attachment_filename = CGI::unescape(raw_attachment_filename)
21
21
  possible_filename = File.join(File.dirname(filepath), attachment_filename)
22
22
  eml_filename = File.join(File.dirname(filepath), File.basename(filepath, '.eml') + '-' + attachment_filename)
23
- s3_path = S3_BASEPATH + File.dirname(filepath).gsub(::FOLDER, '')
24
23
  possible_s3_url = S3_BASEPATH + '/' + CGI::escape(File.basename(possible_filename))
25
24
  possible_eml_s3_url = S3_BASEPATH + '/' + CGI::escape(File.basename(eml_filename))
26
25
 
data/lib/split_archive.rb CHANGED
@@ -1,7 +1,11 @@
1
- # splits zip, mbox, pst files into their constituent documents -- messages and attachments
1
+ # splits zip, mbox, eml and pst files into their constituent documents -- messages and attachments
2
2
  # and puts them into a tmp folder
3
3
  # which is then parsed normally
4
4
 
5
+ # why .eml you ask? those aren't archives!
6
+ # you're right, but they do contain other files (i.e. attachments)
7
+ # so I figure this is the place to handle files that contain other files.
8
+
5
9
  require 'tmpdir'
6
10
  require 'mail'
7
11
  require 'zip'
@@ -29,23 +33,24 @@ module Stevedore
29
33
  elsif extension == "zip"
30
34
  self.split_zip(archive_filename)
31
35
  elsif extension == "eml"
32
- self.get_attachments_from_eml(archive_filename)
36
+ self.get_attachments_from_eml(archive_filename)
33
37
  end
34
38
  # should yield a relative filename
35
39
  # and a lambda that will write the file contents to the given filename
36
40
  FileUtils.mkdir_p(File.join(tmpdir, File.basename(archive_filename)))
37
41
 
38
- constituent_files.each_with_index do |basename_contents_lambda, idx|
39
- basename, contents_lambda = *basename_contents_lambda
40
- tmp_filename = File.join(tmpdir, File.basename(archive_filename), basename.gsub("/", "") )
42
+ constituent_files.each_with_index do |basename_contents_lambda_attachments_parent, idx|
43
+ basename, contents_lambda, attachments, parent = *basename_contents_lambda_attachments_parent
44
+ tmp_filename = File.join(tmpdir, File.basename(archive_filename), basename )
41
45
  FileUtils.mkdir_p(File.dirname(tmp_filename))
42
46
  begin
43
47
  contents_lambda.call(tmp_filename)
44
48
  rescue Errno::ENOENT
45
- puts "#{tmp_filename} wasn't extracted from #{archive_filename}"
49
+ puts "#{tmp_filename} wasn't extracted from #{archive_filename}"
46
50
  next
47
51
  end
48
- yielder.yield tmp_filename, File.join(File.basename(archive_filename), basename)
52
+ attachments ||= []
53
+ yielder.yield tmp_filename, File.join(File.basename(archive_filename), basename), attachments, parent
49
54
  end
50
55
  end
51
56
  end
@@ -77,6 +82,45 @@ module Stevedore
77
82
  end
78
83
  end
79
84
 
85
+
86
+ def self.split_pst(archive_filename)
87
+ pstfile = Java::ComPFF::PSTFile.new(archive_filename)
88
+ idx = 0
89
+ folders = pstfile.root.sub_folders.inject({}) do |memo,f|
90
+ memo[f.name] = f
91
+ memo
92
+ end
93
+ Enumerator.new do |yielder|
94
+ folders.each do |folder_name, folder|
95
+ while mail = folder.getNextChild
96
+
97
+ eml_str = mail.get_transport_message_headers + mail.get_body
98
+
99
+ yielder << ["#{idx}.eml", lambda{|fn| open(fn, 'wb'){|fh| fh << eml_str } }]
100
+ attachment_count = mail.get_number_of_attachments
101
+ attachment_count.times do |attachment_idx|
102
+ attachment = mail.get_attachment(attachment_idx)
103
+ attachment_filename = attachment.get_filename
104
+ yielder << ["#{idx}-#{attachment_filename}", lambda {|fn| open(fn, 'wb'){ |fh| fh << attachment.get_file_input_stream.to_io.read }}]
105
+ end
106
+ idx += 1
107
+ end
108
+ end
109
+ end
110
+ end
111
+
112
+ def self.get_attachments_from_eml(email_filename)
113
+ Enumerator.new do |yielder|
114
+ mail = Mail.new open(email_filename){|f| f.read }
115
+ attachment_results = mail.attachments.map do |attachment|
116
+ [attachment.filename, lambda{|fn| open(fn, 'wb'){|fh| fh << attachment.body.decoded }}, [], File.basename(email_filename)]
117
+ end
118
+ attachment_basenames = attachment_results.map{|a| File.basename(a[0]) }
119
+ yielder << [File.basename(email_filename), lambda{|fn| open(fn, 'wb'){|fh| fh << open(email_filename){|f| f.read } } }, attachment_basenames, nil]
120
+ attachment_results.each{|res| yielder << res }
121
+ end
122
+ end
123
+
80
124
  def self.split_mbox(archive_filename)
81
125
  # stolen shamelessly from the Ruby Enumerable docs, actually
82
126
  # split mails in mbox (slice before Unix From line after an empty line)
@@ -87,6 +131,7 @@ module Stevedore
87
131
  h[:empty] = line == "\n" || line == "\r\n" || line == "\r"
88
132
  previous_was_empty && line.start_with?("From ")
89
133
  end.each_with_index do |mail_str, idx|
134
+ # TODO: copy over the attachment/parent handling from get_attachments_from_eml
90
135
  mail_str.pop if mail_str.last == "\n" # remove last line if present
91
136
  yielder << ["#{idx}.eml", lambda{|fn| open(fn, 'wb'){|fh| fh << mail_str.join("") } }]
92
137
  mail = Mail.new mail_str.join("")
@@ -98,26 +143,15 @@ module Stevedore
98
143
  end
99
144
  end
100
145
 
101
- def self.get_attachments_from_eml(email_filename)
102
- Enumerator.new do |yielder|
103
- yielder << [File.basename(email_filename), lambda{|fn| open(fn, 'wb'){|fh| fh << open(email_filename){|f| f.read } } }]
104
- mail = Mail.new open(email_filename){|f| f.read }
105
- mail.attachments.each do |attachment|
106
- yielder << [attachment.filename, lambda{|fn| open(fn, 'wb'){|fh| fh << attachment.body.decoded }}]
107
- end
108
- end
109
- end
110
-
111
-
112
146
  def self.split_zip(archive_filename)
113
147
  Zip::File.open(archive_filename) do |zip_file|
114
148
  Enumerator.new do |yielder|
115
149
  zip_file.each do |entry|
116
- begin
117
- yielder << [entry.name, lambda{|fn| entry.extract(fn) }]
118
- rescue
119
- puts "unable to extract #{entry.name} from #{archive_filename}"
120
- end
150
+ begin
151
+ yielder << [entry.name, lambda{|fn| entry.extract(fn) }]
152
+ rescue
153
+ puts "unable to extract #{entry.name} from #{archive_filename}"
154
+ end
121
155
  end
122
156
  end
123
157
  end
@@ -23,6 +23,7 @@ module Stevedore
23
23
 
24
24
  def initialize(es_host, es_index, s3_bucket=nil, s3_path=nil)
25
25
  @errors = []
26
+ puts "es_host, #{es_host}"
26
27
  @client = Elasticsearch::Client.new({
27
28
  log: false,
28
29
  url: es_host,
@@ -240,9 +241,11 @@ module Stevedore
240
241
  # but, for now, standalone emails are treated as one document
241
242
  # PDFs can (theoretically) contain documents as "attachments" -- those aren't handled here either.
242
243
  if ArchiveSplitter::HANDLED_FORMATS.include?(tmp_filename.split(".")[-1])
243
- ArchiveSplitter.split(tmp_filename).map do |constituent_file, constituent_basename|
244
- doc = {} if doc.nil?
244
+ ArchiveSplitter.split(tmp_filename).map do |constituent_file, constituent_basename, attachment_basenames, parent_basename|
245
245
  doc, content, metadata = process_document(constituent_file, download_filename)
246
+ doc["analyzed"] ||= {}
247
+ doc["analyzed"]["metadata"] ||= {}
248
+ doc["analyzed"]["metadata"]["attachments"] = (parent_basename.nil? ? [] : [Digest::SHA1.hexdigest(download_filename + parent_basename)]) + attachment_basenames.map{|attachment| Digest::SHA1.hexdigest(download_filename + attachment) } # a list of SHA1 hexdigests (of download_filename + basename) for the parent, if any, plus each attachment -- not raw filenames
246
249
  doc["sha1"] = Digest::SHA1.hexdigest(download_filename + File.basename(constituent_basename)) # since these files all share a download URL (that of the archive), we need to come up with a custom sha1
247
250
  yield doc, obj.key, content, metadata if block_given?
248
251
  FileUtils.rm(constituent_file) rescue Errno::ENOENT # try to delete, but no biggie if it doesn't work for some weird reason.
@@ -294,8 +297,12 @@ module Stevedore
294
297
  # but, for now, standalone emails are treated as one document
295
298
  # PDFs can (theoretically) contain documents as "attachments" -- those aren't handled here either.
296
299
  if ArchiveSplitter::HANDLED_FORMATS.include?(filename.split(".")[-1])
297
- ArchiveSplitter.split(filename).map do |constituent_file, constituent_basename|
300
+ ArchiveSplitter.split(filename).map do |constituent_file, constituent_basename, attachment_basenames, parent_basename|
298
301
  doc, content, metadata = process_document(constituent_file, download_filename)
302
+ doc = {} if doc.nil?
303
+ doc["analyzed"] ||= {}
304
+ doc["analyzed"]["metadata"] ||= {}
305
+ doc["analyzed"]["metadata"]["attachments"] = (parent_basename.nil? ? [] : [Digest::SHA1.hexdigest(download_filename + parent_basename)]) + attachment_basenames.map{|attachment| Digest::SHA1.hexdigest(download_filename + attachment) } # a list of SHA1 hexdigests (of download_filename + basename) for the parent, if any, plus each attachment -- not raw filenames
299
306
  doc["sha1"] = Digest::SHA1.hexdigest(download_filename + File.basename(constituent_basename)) # since these files all share a download URL (that of the archive), we need to come up with a custom sha1
300
307
  doc["id"] = doc["sha1"]
301
308
  yield doc, filename, content, metadata if block_given?
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: stevedore-uploader
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.4
4
+ version: 1.0.5
5
5
  platform: java
6
6
  authors:
7
7
  - Jeremy B. Merrill
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-10-06 00:00:00.000000000 Z
11
+ date: 2016-11-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  requirement: !ruby/object:Gem::Requirement
@@ -27,17 +27,17 @@ dependencies:
27
27
  - !ruby/object:Gem::Dependency
28
28
  requirement: !ruby/object:Gem::Requirement
29
29
  requirements:
30
- - - ">="
30
+ - - '='
31
31
  - !ruby/object:Gem::Version
32
- version: '0'
32
+ version: 0.6.0
33
33
  name: manticore
34
34
  prerelease: false
35
35
  type: :runtime
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - ">="
38
+ - - '='
39
39
  - !ruby/object:Gem::Version
40
- version: '0'
40
+ version: 0.6.0
41
41
  - !ruby/object:Gem::Dependency
42
42
  requirement: !ruby/object:Gem::Requirement
43
43
  requirements:
@@ -71,7 +71,7 @@ dependencies:
71
71
  requirements:
72
72
  - - ">="
73
73
  - !ruby/object:Gem::Version
74
- version: 1.6.1
74
+ version: 1.7.0
75
75
  name: rika-stevedore
76
76
  prerelease: false
77
77
  type: :runtime
@@ -79,7 +79,7 @@ dependencies:
79
79
  requirements:
80
80
  - - ">="
81
81
  - !ruby/object:Gem::Version
82
- version: 1.6.1
82
+ version: 1.7.0
83
83
  - !ruby/object:Gem::Dependency
84
84
  requirement: !ruby/object:Gem::Requirement
85
85
  requirements: