stevedore-uploader 1.0.3-java → 1.0.4-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +20 -5
- data/bin/upload_to_elasticsearch.rb +10 -3
- data/lib/split_archive.rb +29 -6
- data/lib/stevedore-uploader.rb +58 -39
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 8caed7d7d3043a2377282f4da9038aebcc091214
+  data.tar.gz: 16fac6b1d157a15a552270ea1f61666e2add3f53
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 954380cf579eb786d91cee303bb820a1bc19123a183d60a0f649c1835ca90fc8a564a0d2c89a833a14710e43fff0fce8b0ff4b377ee2f398c4cbd52ac90ad851
+  data.tar.gz: b85c53a5642cb4c2c8c6906179737470fb2d25ba9dc089dd3b23fc768618120a930e1851c862d032171fd4b58b08a8c6d6dfd7727c6ad952fa6ac8d806a25de3

data/README.md
CHANGED
@@ -19,8 +19,23 @@ This project is in JRuby, so we can leverage the transformative enterprise stabi
 2. be sure you're running Java 8. (java 7 is deprecated, c'mon c'mon)
 3. `bundle install`
 
-
-
+Command-Line Options
+--------------------
+````
+Usage: upload_to_elasticsearch [options] target_(dir_or_csv)
+    -h, --host=SERVER:PORT        The location of the ElasticSearch server
+    -i, --index=NAME              A name to use for the ES index (defaults to using the directory name)
+    -s, --s3path=PATH             The path under your bucket where these files have been uploaded. (defaults to ES index)
+    -b, --s3bucket=PATH           The s3 bucket where these files have already been be uploaded (or will be later).
+        --title_column=COLNAME    If target file is a CSV, which column contains the title of the row. Integer index or string column name.
+        --text_column=COLNAME     If target file is a CSV, which column contains the main, searchable of the row. Integer index or string column name.
+    -o, --[no-]ocr                don't attempt to OCR any PDFs, even if they contain no text
+    -?, --help                    Display this screen
+````
+
+
+Advanced Usage
+--------------
 
 **This is a piece of a larger upload workflow, [described here](https://github.com/newsdev/stevedore/blob/master/README.md). You should read that first, then come back here.**
 
@@ -37,12 +52,12 @@ if host isn't specified, we assume `localhost:9200`.
 
 e.g.
 ```
-bundle exec ruby bin/upload_to_elasticsearch.rb --index=jrubytest --host=https://stevedore.
+bundle exec ruby bin/upload_to_elasticsearch.rb --index=jrubytest --host=https://stevedore.elasticsearch.yourdomain.net/es/ ~/code/marco-rubios-emails/emls/
 ```
 
-you may also specify an
+you may also specify an s3:// location of documents to parse, instead of a local directory, e.g.
 ```
-bundle exec ruby bin/upload_to_elasticsearch.rb --index=jrubytest --host=https://stevedore.
+bundle exec ruby bin/upload_to_elasticsearch.rb --index=jrubytest --host=https://stevedore.elasticsearch.yourdomain.net/es/ s3://int-data-dumps/marco-rubio-fire-drill
 ```
 if you choose to process documents from S3, you should upload those documents using your choice of tool -- but `awscli` is a good choice. *Stevedore-Uploader does NOT upload documents to S3 on your behalf.

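The new README section documents the CSV flags but stops short of showing them in action. A hypothetical invocation in the same style as the examples above (the index name, column names, and file path are placeholders; `--host` falls back to `localhost:9200` when omitted):

```
bundle exec ruby bin/upload_to_elasticsearch.rb --index=csvtest --title_column=subject --text_column=body ~/data/messages.csv
```
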
data/bin/upload_to_elasticsearch.rb
CHANGED
@@ -34,16 +34,21 @@ if __FILE__ == $0
     options.s3bucket = s3bucket
   end
 
-  opts.on("--
+  opts.on("--title-column=COLNAME",
     "If target file is a CSV, which column contains the title of the row. Integer index or string column name."
   ) do |title_column|
     options.title_column = title_column
   end
-  opts.on("--
+  opts.on("--text-column=COLNAME",
     "If target file is a CSV, which column contains the main, searchable of the row. Integer index or string column name."
   ) do |text_column|
     options.text_column = text_column
   end
+  opts.on("--slice-size=SLICE",
+    "Process documents in batches of SLICE. Default is 100. Lower this if you get timeouts. Raise it to go faster."
+  ) do |slice_size|
+    options.slice_size = slice_size.to_i
+  end
 
   opts.on("-o", "--[no-]ocr", "don't attempt to OCR any PDFs, even if they contain no text") do |v|
     options.ocr = v
@@ -95,7 +100,9 @@ raise ArgumentError, "specify the elasticsearch host" unless ES_HOST
 if __FILE__ == $0
   f = Stevedore::ESUploader.new(ES_HOST, ES_INDEX, S3_BUCKET, S3_BASEPATH)
   f.should_ocr = options.ocr
-  puts "Will not OCR, per --no-ocr option" unless f.should_ocr
+  puts "Will not OCR, per --no-ocr option" unless f.should_ocr
+  f.slice_size = options.slice_size if options.slice_size
+  puts "Slice size set to #{f.slice_size}" if options.slice_size
 
   if FOLDER.match(/\.[ct]sv$/)
     f.do_csv!(FOLDER, File.join(f.s3_basepath, File.basename(FOLDER)), options.title_column, options.text_column)

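The new `--slice-size` flag feeds the `slice_size` accessor on `ESUploader`, lowering or raising the upload batch size from its default of 100. A hypothetical invocation in the style of the README examples, shrinking batches to ride out timeouts (index name and path are placeholders):

```
bundle exec ruby bin/upload_to_elasticsearch.rb --index=jrubytest --slice-size=50 ~/code/marco-rubios-emails/emls/
```
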
data/lib/split_archive.rb
CHANGED
@@ -1,4 +1,4 @@
-# splits zip, mbox
+# splits zip, mbox, pst files into their constituent documents -- messages and attachments
 # and puts them into a tmp folder
 # which is then parsed normally
 
@@ -11,7 +11,7 @@ require 'pst' # for PST files
 # splits PST and Mbox formats
 module Stevedore
   class ArchiveSplitter
-    HANDLED_FORMATS = ["zip", "mbox", "pst"]
+    HANDLED_FORMATS = ["zip", "mbox", "pst", "eml"]
 
     def self.split(archive_filename)
       # if it's a PST use split_pst
@@ -28,6 +28,8 @@ module Stevedore
           self.split_pst(archive_filename)
         elsif extension == "zip"
           self.split_zip(archive_filename)
+        elsif extension == "eml"
+          self.get_attachments_from_eml(archive_filename)
         end
       # should yield a relative filename
       # and a lambda that will write the file contents to the given filename
@@ -36,8 +38,14 @@ module Stevedore
       constituent_files.each_with_index do |basename_contents_lambda, idx|
         basename, contents_lambda = *basename_contents_lambda
         tmp_filename = File.join(tmpdir, File.basename(archive_filename), basename.gsub("/", "") )
-
-
+        FileUtils.mkdir_p(File.dirname(tmp_filename))
+        begin
+          contents_lambda.call(tmp_filename)
+        rescue Errno::ENOENT
+          puts "#{tmp_filename} wasn't extracted from #{archive_filename}"
+          next
+        end
+        yielder.yield tmp_filename, File.join(File.basename(archive_filename), basename)
       end
     end
   end
@@ -83,18 +91,33 @@ module Stevedore
           yielder << ["#{idx}.eml", lambda{|fn| open(fn, 'wb'){|fh| fh << mail_str.join("") } }]
           mail = Mail.new mail_str.join("")
           mail.attachments.each do |attachment|
-            yielder << [attachment.filename, lambda{|fn| open(fn, 'wb'){|fh| attachment.
+            yielder << [attachment.filename, lambda{|fn| open(fn, 'wb'){|fh| fh << attachment.body.decoded }}]
           end
         end
       end
     end
   end
 
+    def self.get_attachments_from_eml(email_filename)
+      Enumerator.new do |yielder|
+        yielder << [File.basename(email_filename), lambda{|fn| open(fn, 'wb'){|fh| fh << open(email_filename){|f| f.read } } }]
+        mail = Mail.new open(email_filename){|f| f.read }
+        mail.attachments.each do |attachment|
+          yielder << [attachment.filename, lambda{|fn| open(fn, 'wb'){|fh| fh << attachment.body.decoded }}]
+        end
+      end
+    end
+
+
     def self.split_zip(archive_filename)
       Zip::File.open(archive_filename) do |zip_file|
         Enumerator.new do |yielder|
           zip_file.each do |entry|
-
+            begin
+              yielder << [entry.name, lambda{|fn| entry.extract(fn) }]
+            rescue
+              puts "unable to extract #{entry.name} from #{archive_filename}"
+            end
           end
         end
       end

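With `eml` added to `HANDLED_FORMATS`, `ArchiveSplitter.split` now breaks a single `.eml` file into the message itself plus one entry per attachment, just as it does for zip, mbox, and pst archives. A minimal sketch of calling it directly, assuming a local `message.eml` exists (the filename is a placeholder):

```ruby
require 'stevedore-uploader' # pulls in lib/split_archive.rb via the Dir[...] requires

# split returns an enumerable of [tmp_filename, relative_name] pairs:
# the extracted message, then each attachment written to a temp directory.
Stevedore::ArchiveSplitter.split("message.eml").each do |tmp_filename, relative_name|
  puts "#{relative_name} extracted to #{tmp_filename}"
end
```
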
data/lib/stevedore-uploader.rb
CHANGED
@@ -1,8 +1,6 @@
-Dir["#{File.expand_path(File.dirname(__FILE__))}/../lib/*.rb"].each {|f| require f}
-Dir["#{File.expand_path(File.dirname(__FILE__))}/../lib/parsers/*.rb"].each {|f| require f}
 
 require 'rika'
-
+require 'jruby-openssl'
 require 'net/https'
 require 'elasticsearch'
 require 'elasticsearch/transport/transport/http/manticore'
@@ -12,8 +10,9 @@ require 'manticore'
 require 'fileutils'
 require 'csv'
 
-
 require 'aws-sdk'
+Dir["#{File.expand_path(File.dirname(__FILE__))}/../lib/*.rb"].each {|f| require f}
+Dir["#{File.expand_path(File.dirname(__FILE__))}/../lib/parsers/*.rb"].each {|f| require f}
 
 
 module Stevedore
@@ -33,15 +32,16 @@ module Stevedore
       },
     )
     @es_index = es_index
-    @s3_bucket = s3_bucket
-    @s3_basepath = "https://#{s3_bucket}.s3.amazonaws.com/#{s3_path || es_index}"
+    @s3_bucket = s3_bucket
+    @s3_basepath = "https://#{s3_bucket}.s3.amazonaws.com/#{s3_path || es_index}/"
+    @use_s3 = !s3_bucket.nil?
 
     @slice_size = 100
 
     @should_ocr = false
 
     self.create_index!
-    self.add_mapping(:doc, MAPPING)
+    self.add_mapping(:doc, Stevedore.const_defined?("MAPPING") ? MAPPING : DEFAULT_MAPPING)
   end
 
   def create_index!
@@ -92,25 +92,30 @@ module Stevedore
     }) # was "rescue nil" but that obscured meaningful errors
   end
 
-  def bulk_upload_to_es!(data, type
-    return nil if data.empty?
-
-    resp = @client.
-
-
-
-
-
-
-
+  def bulk_upload_to_es!(data, type=:doc)
+    return nil if data.compact.empty?
+    if data.size == 1
+      resp = @client.index index: @es_index, type: type, id: data.first["_id"], body: data.first
+    else
+      begin
+        resp = @client.bulk body: data.map{|datum| {index: {_index: @es_index, _type: type || 'doc', data: datum }} }
+        puts resp if resp[:errors]
+      rescue JSON::GeneratorError, Elasticsearch::Transport::Transport::Errors::InternalServerError
+        data.each do |datum|
+          begin
+            @client.bulk body: [datum].map{|datum| {index: {_index: @es_index, _type: type || 'doc', data: datum }} } unless datum.nil?
+          rescue JSON::GeneratorError, Elasticsearch::Transport::Transport::Errors::InternalServerError
+            next
+          end
        end
+        resp = nil
      end
      resp = nil
    end
    resp
  end
 
-  def process_document(filename,
+  def process_document(filename, download_url)
    begin
      puts "begin to process #{filename}"
      # puts "size: #{File.size(filename)}"
@@ -121,7 +126,7 @@ module Stevedore
        metadata = "couldn't be parsed"
      end
      puts "parsed: #{content.size}"
-      if content.size >
+      if content.size > 3 * (10 ** 7)
        @errors << filename
        puts "skipping #{filename} for being too big"
        return nil
@@ -133,9 +138,9 @@ module Stevedore
      # document types on its own
      ret = case # .eml # .msg
      when metadata["Content-Type"] == "message/rfc822" || metadata["Content-Type"] == "application/vnd.ms-outlook"
-        ::Stevedore::StevedoreEmail.new_from_tika(content, metadata,
+        ::Stevedore::StevedoreEmail.new_from_tika(content, metadata, download_url, filename).to_hash
      when metadata["Content-Type"] && ["application/html", "application/xhtml+xml"].include?(metadata["Content-Type"].split(";").first)
-        ::Stevedore::StevedoreHTML.new_from_tika(content, metadata,
+        ::Stevedore::StevedoreHTML.new_from_tika(content, metadata, download_url, filename).to_hash
      when @should_ocr && metadata["Content-Type"] == "application/pdf" && (content.match(/\A\s*\z/) || content.size < 50 * metadata["xmpTPg:NPages"].to_i )
        # this is a scanned PDF.
        puts "scanned PDF #{File.basename(filename)} detected; OCRing"
@@ -146,7 +151,7 @@ module Stevedore
          File.delete(png)
          # no need to use a system call when we could use the stdlib!
          # system("rm", "-f", png) rescue nil
-          File.delete("#{png}.txt")
+          File.delete("#{png}.txt") rescue nil
        end.join("\n\n")
        # e.g. Analysis-Corporation-2.png.pdf or Torture.pdf
        files = Dir["#{pdf_basename}.png.pdf"] + (Dir["#{pdf_basename}-*.png.pdf"].sort_by{|pdf| Regexp.new("#{pdf_basename}-([0-9]+).png.pdf").match(pdf)[1].to_i })
@@ -154,9 +159,9 @@ module Stevedore
        system('pdftk', *files, "cat", "output", "#{pdf_basename}.ocr.pdf")
        content, _ = Rika.parse_content_and_metadata("#{pdf_basename}.ocr.pdf")
        puts "OCRed content (#{File.basename(filename)}) length: #{content.length}"
-        ::Stevedore::StevedoreBlob.new_from_tika(content, metadata,
+        ::Stevedore::StevedoreBlob.new_from_tika(content, metadata, download_url, filename).to_hash
      else
-        ::Stevedore::StevedoreBlob.new_from_tika(content, metadata,
+        ::Stevedore::StevedoreBlob.new_from_tika(content, metadata, download_url, filename).to_hash
      end
      [ret, content, metadata]
    rescue StandardError, java.lang.NoClassDefFoundError, org.apache.tika.exception.TikaException => e
@@ -169,7 +174,7 @@ module Stevedore
    end
  end
 
-  def do_csv!(file, download_url, title_column=0, text_column=nil)
+  def do_csv!(file, download_url, title_column=0, text_column=nil, type=nil)
    docs_so_far = 0
    CSV.open(file, headers: (!title_column.is_a? Fixnum ) ).each_slice(@slice_size).each_with_index do |slice, slice_index|
      slice_of_rows = slice.map.each_with_index do |row, i|
@@ -185,7 +190,7 @@ module Stevedore
        doc
      end
      begin
-        resp = bulk_upload_to_es!(slice_of_rows.compact)
+        resp = bulk_upload_to_es!(slice_of_rows.compact.reject(&:empty?), type)
        docs_so_far += @slice_size
      rescue Manticore::Timeout, Manticore::SocketException
        STDERR.puts("retrying at #{Time.now}")
@@ -200,7 +205,6 @@ module Stevedore
      output_stream.puts "Processing documents from #{target_path}"
 
      docs_so_far = 0
-      # use_s3 = false # option to set this (an option to set document URLs to be relative to the search engine root) is TK
      @s3_bucket = target_path.gsub(/s3:\/\//i, '').split("/", 2).first if @s3_bucket.nil? && target_path.downcase.include?('s3://')
 
      if target_path.downcase.include?("s3://")
@@ -237,6 +241,7 @@ module Stevedore
            # PDFs can (theoretically) contain documents as "attachments" -- those aren't handled here either.x
            if ArchiveSplitter::HANDLED_FORMATS.include?(tmp_filename.split(".")[-1])
              ArchiveSplitter.split(tmp_filename).map do |constituent_file, constituent_basename|
+                doc = {} if doc.nil?
                doc, content, metadata = process_document(constituent_file, download_filename)
                doc["sha1"] = Digest::SHA1.hexdigest(download_filename + File.basename(constituent_basename)) # since these files all share a download URL (that of the archive, we need to come up with a custom sha1)
                yield doc, obj.key, content, metadata if block_given?
@@ -250,31 +255,38 @@ module Stevedore
              [doc]
            end
          end
+          retry_count = 0
          begin
-            resp = bulk_upload_to_es!(slice_of_objs.compact.flatten(1)) # flatten, in case there's an archive
+            resp = bulk_upload_to_es!(slice_of_objs.compact.flatten(1).reject(&:empty?)) # flatten, in case there's an archive
            puts resp.inspect if resp && resp["errors"]
          rescue Manticore::Timeout, Manticore::SocketException
            output_stream.puts("retrying at #{Time.now}")
-
+            if retry_count < 10
+              retry_count += 1
+              retry
+            else
+              @errors << filename
+            end
          end
          output_stream.puts "uploaded #{slice_of_objs.size} files to #{@es_index}; #{docs_so_far} uploaded so far"
          output_stream.puts "Errors in bulk upload: #{resp.inspect}" if resp && resp["errors"]
        end
      end
    else
-      list_of_files = File.file?(target_path) ? [target_path] : Dir[
+      list_of_files = File.file?(target_path) ? [target_path] : Dir[target_path.include?('*') ? target_path : File.join(target_path, '**/*')]
      list_of_files.each_slice(@slice_size) do |slice_of_files|
        output_stream.puts "starting a set of #{@slice_size}"
        docs_so_far += slice_of_files.size
 
        slice_of_files.map! do |filename|
          next unless File.file?(filename)
-          filename_basepath = filename.gsub(target_path, '')
-
-
-
-
-
+          filename_basepath = filename.gsub(target_path.split("*").first, '')
+
+          if @use_s3 # turning this on TK
+            download_filename = @s3_basepath + ((filename_basepath[0] == '/' || @s3_basepath[-1] == '/') ? '' : '/') + filename_basepath
+          else
+            download_filename = "/files/#{@es_index}/#{filename_basepath}"
+          end
 
          # is this file an archive that contains a bunch of documents we should index separately?
          # obviously, there is not a strict definition here.
@@ -285,6 +297,7 @@ module Stevedore
          ArchiveSplitter.split(filename).map do |constituent_file, constituent_basename|
            doc, content, metadata = process_document(constituent_file, download_filename)
            doc["sha1"] = Digest::SHA1.hexdigest(download_filename + File.basename(constituent_basename)) # since these files all share a download URL (that of the archive, we need to come up with a custom sha1)
+            doc["id"] = doc["sha1"]
            yield doc, filename, content, metadata if block_given?
            # FileUtils.rm(constituent_file) rescue Errno::ENOENT # try to delete, but no biggie if it doesn't work for some weird reason.
            doc
@@ -295,6 +308,7 @@ module Stevedore
            [doc]
          end
        end
+        retry_count = 0
        begin
          resp = bulk_upload_to_es!(slice_of_files.compact.flatten(1)) # flatten, in case there's an archive
          puts resp.inspect if resp && resp["errors"]
@@ -303,7 +317,12 @@ module Stevedore
          output_stream.puts "Upload error: #{e} #{e.message}."
          output_stream.puts e.backtrace.join("\n") + "\n\n\n"
          output_stream.puts("retrying at #{Time.now}")
-
+          if retry_count < 10
+            retry_count += 1
+            retry
+          else
+            @errors << filename
+          end
        end
        output_stream.puts "uploaded #{slice_of_files.size} files to #{@es_index}; #{docs_so_far} uploaded so far"
        output_stream.puts "Errors in bulk upload: #{resp.inspect}" if resp && resp["errors"]
@@ -311,7 +330,7 @@ module Stevedore
      end
    end
  end
-
+  DEFAULT_MAPPING = {
    sha1: {type: :string, index: :not_analyzed},
    title: { type: :string, analyzer: :keyword },
    source_url: {type: :string, index: :not_analyzed},

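Taken together, the new accessors and URL logic can be driven without the bin script. A minimal sketch mirroring what `bin/upload_to_elasticsearch.rb` does after option parsing (the host, index, and CSV path are hypothetical; passing `nil` for the bucket leaves `@use_s3` off, so download URLs take the `/files/<index>/...` form):

```ruby
require 'stevedore-uploader'

# host, index, s3_bucket, s3_path -- same argument order as the bin script uses
uploader = Stevedore::ESUploader.new("localhost:9200", "jrubytest", nil, nil)
uploader.should_ocr = false # same effect as --no-ocr
uploader.slice_size = 50    # the accessor behind the new --slice-size flag

# For a CSV, title_column 0 names each row and text_column 1 is the searchable body.
uploader.do_csv!("messages.csv", "/files/jrubytest/messages.csv", 0, 1)
```
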
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: stevedore-uploader
 version: !ruby/object:Gem::Version
-  version: 1.0.3
+  version: 1.0.4
 platform: java
 authors:
 - Jeremy B. Merrill
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-
+date: 2016-10-06 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   requirement: !ruby/object:Gem::Requirement
@@ -43,7 +43,7 @@ dependencies:
     requirements:
     - - "~>"
      - !ruby/object:Gem::Version
-        version:
+        version: 0.9.17
   name: jruby-openssl
   prerelease: false
   type: :runtime
@@ -51,7 +51,7 @@ dependencies:
     requirements:
     - - "~>"
      - !ruby/object:Gem::Version
-        version:
+        version: 0.9.17
 - !ruby/object:Gem::Dependency
   requirement: !ruby/object:Gem::Requirement
     requirements:
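To pull in the updated release, a Gemfile entry along these lines should work (a sketch: `platform: :jruby` reflects the gem's `platform: java`, and the pinned jruby-openssl `~> 0.9.17` arrives automatically as a runtime dependency):

```ruby
# Gemfile -- stevedore-uploader only runs on JRuby
gem 'stevedore-uploader', '1.0.4', platform: :jruby
```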