RubyGems - stevedore-uploader - Versions diffs - 1.0.4-java → 1.0.5-java - Mend

stevedore-uploader 1.0.4-java → 1.0.5-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

checksums.yaml +4 -4
data/lib/parsers/stevedore_blob.rb +1 -1
data/lib/parsers/stevedore_email.rb +0 -1
data/lib/split_archive.rb +57 -23
data/lib/stevedore-uploader.rb +10 -3
metadata +8 -8

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 8caed7d7d3043a2377282f4da9038aebcc091214
-  data.tar.gz: 16fac6b1d157a15a552270ea1f61666e2add3f53
+  metadata.gz: 5ae48d657db4ddb854c7cda165ded9188ea97c5a
+  data.tar.gz: c527586ca91bc3538efe9ca59a6889bf750b37a3
 SHA512:
-  metadata.gz: 954380cf579eb786d91cee303bb820a1bc19123a183d60a0f649c1835ca90fc8a564a0d2c89a833a14710e43fff0fce8b0ff4b377ee2f398c4cbd52ac90ad851
-  data.tar.gz: b85c53a5642cb4c2c8c6906179737470fb2d25ba9dc089dd3b23fc768618120a930e1851c862d032171fd4b58b08a8c6d6dfd7727c6ad952fa6ac8d806a25de3
+  metadata.gz: 67e9b259a01c543863cfe8e0207b373b1aa5752d3b6c00ca322d9fa1f5ef6aab15d51c47eea8e5e4ce4d15a07db4fe31903a5980cb736299caa0edf923da9770
+  data.tar.gz: b0fb2255c089bc0e085e825bb523cfdbc819d09b4a03a99e54d3461ac883775d6ebbfa9253ce32017413a8df8ebe9627d7d489c0655574f76c81f4c3a6afdaad

data/lib/parsers/stevedore_blob.rb CHANGED Viewed

@@ -17,7 +17,7 @@ module Stevedore
     end
     def self.new_from_tika(content, metadata, download_url, filename)
-      self.new(metadata["title"], content, download_url)
+      self.new(metadata["title"] || File.basename(filename), content, download_url)
     end
     def analyze!

data/lib/parsers/stevedore_email.rb CHANGED Viewed

@@ -20,7 +20,6 @@ module Stevedore
         attachment_filename = CGI::unescape(raw_attachment_filename)
         possible_filename = File.join(File.dirname(filepath), attachment_filename)
         eml_filename = File.join(File.dirname(filepath), File.basename(filepath, '.eml') + '-' + attachment_filename)
-        s3_path = S3_BASEPATH + File.dirname(filepath).gsub(::FOLDER, '')
         possible_s3_url = S3_BASEPATH + '/' + CGI::escape(File.basename(possible_filename))
         possible_eml_s3_url = S3_BASEPATH + '/' + CGI::escape(File.basename(eml_filename))

data/lib/split_archive.rb CHANGED Viewed

@@ -1,7 +1,11 @@
-# splits zip, mbox, pst files into their constituent documents -- messages and attachments
+# splits zip, mbox, eml and pst files into their constituent documents -- messages and attachments
 # and puts them into a tmp folder
 # which is then parsed normally
+# why .eml you ask? those aren't archives!
+# you're right, but they do contain other files (i.e. attachments)
+# so I figure this is the place to handle files that contain other files.
 require 'tmpdir'
 require 'mail'
 require 'zip'
@@ -29,23 +33,24 @@ module Stevedore
                         elsif extension == "zip"
                           self.split_zip(archive_filename)
                         elsif extension == "eml"
-                          self.get_attachments_from_eml(archive_filename)
+                          self.get_attachments_from_eml(archive_filename)
                         end
           # should yield a relative filename
           # and a lambda that will write the file contents to the given filename
           FileUtils.mkdir_p(File.join(tmpdir, File.basename(archive_filename)))
-          constituent_files.each_with_index do |basename_contents_lambda, idx|
-            basename, contents_lambda = *basename_contents_lambda
-            tmp_filename = File.join(tmpdir, File.basename(archive_filename), basename.gsub("/", "") )
+          constituent_files.each_with_index do |basename_contents_lambda_attachments_parent, idx|
+            basename, contents_lambda, attachments, parent = *basename_contents_lambda_attachments_parent
+            tmp_filename = File.join(tmpdir, File.basename(archive_filename), basename )
             FileUtils.mkdir_p(File.dirname(tmp_filename))
             begin
               contents_lambda.call(tmp_filename)
             rescue Errno::ENOENT
-              puts "#{tmp_filename} wasn't extracted from #{archive_filename}"
+              puts "#{tmp_filename} wasn't extracted from #{archive_filename}"
               next
             end
-            yielder.yield tmp_filename, File.join(File.basename(archive_filename), basename)
+            attachments ||= []
+            yielder.yield tmp_filename, File.join(File.basename(archive_filename), basename), attachments, parent
           end
         end
       end
@@ -77,6 +82,45 @@ module Stevedore
       end
     end
+    def self.split_pst(archive_filename)
+      pstfile = Java::ComPFF::PSTFile.new(archive_filename)
+      idx = 0
+      folders = pstfile.root.sub_folders.inject({}) do |memo,f|
+        memo[f.name] = f
+        memo
+      end
+      Enumerator.new do |yielder|
+        folders.each do |folder_name, folder|
+          while mail = folder.getNextChild
+            eml_str = mail.get_transport_message_headers + mail.get_body
+            yielder << ["#{idx}.eml", lambda{|fn| open(fn, 'wb'){|fh| fh << eml_str } }]
+            attachment_count = mail.get_number_of_attachments
+            attachment_count.times do |attachment_idx|
+              attachment = mail.get_attachment(attachment_idx)
+              attachment_filename = attachment.get_filename
+              yielder << ["#{idx}-#{attachment_filename}", lambda {|fn| open(fn, 'wb'){ |fh| fh << attachment.get_file_input_stream.to_io.read }}]
+            end
+            idx += 1
+          end
+        end
+      end
+    end
+    def self.get_attachments_from_eml(email_filename)
+      Enumerator.new do |yielder|
+        mail = Mail.new open(email_filename){|f| f.read }
+        attachment_results = mail.attachments.map do |attachment|
+          [attachment.filename, lambda{|fn| open(fn, 'wb'){|fh| fh << attachment.body.decoded }}, [], File.basename(email_filename)]
+        end
+        attachment_basenames = attachment_results.map{|a| File.basename(a[0]) }
+        yielder << [File.basename(email_filename), lambda{|fn| open(fn, 'wb'){|fh| fh << open(email_filename){|f| f.read } } }, attachment_basenames, nil]
+        attachment_results.each{|res| yielder << res }
+      end
+    end
     def self.split_mbox(archive_filename)
       # stolen shamelessly from the Ruby Enumerable docs, actually
       # split mails in mbox (slice before Unix From line after an empty line)
@@ -87,6 +131,7 @@ module Stevedore
             h[:empty] = line == "\n" || line == "\r\n" || line == "\r"
             previous_was_empty && line.start_with?("From ")
           end.each_with_index do |mail_str, idx|
+            # TODO: copy over the attachment/parent handling from get_attachments_from_eml
             mail_str.pop if mail_str.last == "\n" # remove last line if present
             yielder << ["#{idx}.eml", lambda{|fn| open(fn, 'wb'){|fh| fh << mail_str.join("") } }]
             mail = Mail.new mail_str.join("")
@@ -98,26 +143,15 @@ module Stevedore
       end
     end
-    def self.get_attachments_from_eml(email_filename)
-      Enumerator.new do |yielder|
-        yielder << [File.basename(email_filename), lambda{|fn| open(fn, 'wb'){|fh| fh << open(email_filename){|f| f.read } } }]
-        mail = Mail.new open(email_filename){|f| f.read }
-        mail.attachments.each do |attachment|
-          yielder << [attachment.filename, lambda{|fn| open(fn, 'wb'){|fh| fh << attachment.body.decoded }}]
-        end
-      end
-    end
     def self.split_zip(archive_filename)
       Zip::File.open(archive_filename) do |zip_file|
         Enumerator.new do |yielder|
           zip_file.each do |entry|
-           begin
-             yielder << [entry.name, lambda{|fn| entry.extract(fn) }]
-           rescue
-             puts "unable to extract #{entry.name} from #{archive_filename}"
-           end
+            begin
+              yielder << [entry.name, lambda{|fn| entry.extract(fn) }]
+            rescue
+              puts "unable to extract #{entry.name} from #{archive_filename}"
+            end
           end
         end
       end

data/lib/stevedore-uploader.rb CHANGED Viewed

@@ -23,6 +23,7 @@ module Stevedore
     def initialize(es_host, es_index, s3_bucket=nil, s3_path=nil)
       @errors = []
+      puts "es_host, #{es_host}"
       @client = Elasticsearch::Client.new({
           log: false,
           url: es_host,
@@ -240,9 +241,11 @@ module Stevedore
               # but, for now, standalone emails are treated as one document
               # PDFs can (theoretically) contain documents as "attachments" -- those aren't handled here either.
               if ArchiveSplitter::HANDLED_FORMATS.include?(tmp_filename.split(".")[-1])
-                ArchiveSplitter.split(tmp_filename).map do |constituent_file, constituent_basename|
-                  doc = {} if doc.nil?
+                ArchiveSplitter.split(tmp_filename).map do |constituent_file, constituent_basename, attachment_basenames, parent_basename|
                   doc, content, metadata = process_document(constituent_file, download_filename)
+                  doc["analyzed"] ||= {}
+                  doc["analyzed"]["metadata"] ||= {}
+                  doc["analyzed"]["metadata"]["attachments"] = (parent_basename.nil? ? [] : [Digest::SHA1.hexdigest(download_filename + parent_basename)]) + attachment_basenames.map{|attachment| Digest::SHA1.hexdigest(download_filename + attachment) } # is a list of filenames
                   doc["sha1"] = Digest::SHA1.hexdigest(download_filename + File.basename(constituent_basename)) # since these files all share a download URL (that of the archive, we need to come up with a custom sha1)
                   yield doc, obj.key, content, metadata if block_given?
                   FileUtils.rm(constituent_file) rescue Errno::ENOENT # try to delete, but no biggie if it doesn't work for some weird reason.
@@ -294,8 +297,12 @@ module Stevedore
             # but, for now, standalone emails are treated as one document
             # PDFs can (theoretically) contain documents as "attachments" -- those aren't handled here either.
             if ArchiveSplitter::HANDLED_FORMATS.include?(filename.split(".")[-1])
-              ArchiveSplitter.split(filename).map do |constituent_file, constituent_basename|
+                ArchiveSplitter.split(filename).map do |constituent_file, constituent_basename, attachment_basenames, parent_basename|
                 doc, content, metadata = process_document(constituent_file, download_filename)
+                doc = {} if doc.nil?
+                doc["analyzed"] ||= {}
+                doc["analyzed"]["metadata"] ||= {}
+                doc["analyzed"]["metadata"]["attachments"] = (parent_basename.nil? ? [] : [Digest::SHA1.hexdigest(download_filename + parent_basename)]) + attachment_basenames.map{|attachment| Digest::SHA1.hexdigest(download_filename + attachment) } # is a list of filenames
                 doc["sha1"] = Digest::SHA1.hexdigest(download_filename + File.basename(constituent_basename)) # since these files all share a download URL (that of the archive, we need to come up with a custom sha1)
                 doc["id"] = doc["sha1"]
                 yield doc, filename, content, metadata if block_given?

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: stevedore-uploader
 version: !ruby/object:Gem::Version
-  version: 1.0.4
+  version: 1.0.5
 platform: java
 authors:
 - Jeremy B. Merrill
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-10-06 00:00:00.000000000 Z
+date: 2016-11-02 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   requirement: !ruby/object:Gem::Requirement
@@ -27,17 +27,17 @@ dependencies:
 - !ruby/object:Gem::Dependency
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ">="
+    - - '='
       - !ruby/object:Gem::Version
-        version: '0'
+        version: 0.6.0
   name: manticore
   prerelease: false
   type: :runtime
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ">="
+    - - '='
       - !ruby/object:Gem::Version
-        version: '0'
+        version: 0.6.0
 - !ruby/object:Gem::Dependency
   requirement: !ruby/object:Gem::Requirement
     requirements:
@@ -71,7 +71,7 @@ dependencies:
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        version: 1.6.1
+        version: 1.7.0
   name: rika-stevedore
   prerelease: false
   type: :runtime
@@ -79,7 +79,7 @@ dependencies:
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        version: 1.6.1
+        version: 1.7.0
 - !ruby/object:Gem::Dependency
   requirement: !ruby/object:Gem::Requirement
     requirements: