stevedore-uploader 1.0.3-java → 1.0.4-java
- checksums.yaml +4 -4
- data/README.md +20 -5
- data/bin/upload_to_elasticsearch.rb +10 -3
- data/lib/split_archive.rb +29 -6
- data/lib/stevedore-uploader.rb +58 -39
- metadata +4 -4
checksums.yaml
CHANGED
````diff
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 8caed7d7d3043a2377282f4da9038aebcc091214
+  data.tar.gz: 16fac6b1d157a15a552270ea1f61666e2add3f53
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 954380cf579eb786d91cee303bb820a1bc19123a183d60a0f649c1835ca90fc8a564a0d2c89a833a14710e43fff0fce8b0ff4b377ee2f398c4cbd52ac90ad851
+  data.tar.gz: b85c53a5642cb4c2c8c6906179737470fb2d25ba9dc089dd3b23fc768618120a930e1851c862d032171fd4b58b08a8c6d6dfd7727c6ad952fa6ac8d806a25de3
````
data/README.md
CHANGED
````diff
@@ -19,8 +19,23 @@ This project is in JRuby, so we can leverage the transformative enterprise stabi
 2. be sure you're running Java 8. (java 7 is deprecated, c'mon c'mon)
 3. `bundle install`
 
-
-
+Command-Line Options
+--------------------
+````
+Usage: upload_to_elasticsearch [options] target_(dir_or_csv)
+    -h, --host=SERVER:PORT         The location of the ElasticSearch server
+    -i, --index=NAME               A name to use for the ES index (defaults to using the directory name)
+    -s, --s3path=PATH              The path under your bucket where these files have been uploaded. (defaults to ES index)
+    -b, --s3bucket=PATH            The s3 bucket where these files have already been be uploaded (or will be later).
+        --title_column=COLNAME     If target file is a CSV, which column contains the title of the row. Integer index or string column name.
+        --text_column=COLNAME      If target file is a CSV, which column contains the main, searchable of the row. Integer index or string column name.
+    -o, --[no-]ocr                 don't attempt to OCR any PDFs, even if they contain no text
+    -?, --help                     Display this screen
+````
+
+
+Advanced Usage
+--------------
 
 **This is a piece of a larger upload workflow, [described here](https://github.com/newsdev/stevedore/blob/master/README.md). You should read that first, then come back here.**
 
@@ -37,12 +52,12 @@ if host isn't specified, we assume `localhost:9200`.
 
 e.g.
 ```
-bundle exec ruby bin/upload_to_elasticsearch.rb --index=jrubytest --host=https://stevedore.
+bundle exec ruby bin/upload_to_elasticsearch.rb --index=jrubytest --host=https://stevedore.elasticsearch.yourdomain.net/es/ ~/code/marco-rubios-emails/emls/
 ```
 
-you may also specify an
+you may also specify an s3:// location of documents to parse, instead of a local directory, e.g.
 ```
-bundle exec ruby bin/upload_to_elasticsearch.rb --index=jrubytest --host=https://stevedore.
+bundle exec ruby bin/upload_to_elasticsearch.rb --index=jrubytest --host=https://stevedore.elasticsearch.yourdomain.net/es/ s3://int-data-dumps/marco-rubio-fire-drill
 ```
 if you choose to process documents from S3, you should upload those documents using your choice of tool -- but `awscli` is a good choice. *Stevedore-Uploader does NOT upload documents to S3 on your behalf.
 
````
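The README's new examples cover directories and S3 paths but not the CSV mode that the `--title_column`/`--text_column` rows describe. A hypothetical invocation (invented CSV path and column names) might look like the following; note that the help text above prints underscored flag names while the option parser in this release (see the next file) registers hyphenated `--title-column`/`--text-column`, so the hyphenated spelling is the safer bet:

```
bundle exec ruby bin/upload_to_elasticsearch.rb --index=jrubytest --title-column=title --text-column=body ~/data/documents.csv
```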
|
@@ -34,16 +34,21 @@ if __FILE__ == $0
|
|
34
34
|
options.s3bucket = s3bucket
|
35
35
|
end
|
36
36
|
|
37
|
-
opts.on("--
|
37
|
+
opts.on("--title-column=COLNAME",
|
38
38
|
"If target file is a CSV, which column contains the title of the row. Integer index or string column name."
|
39
39
|
) do |title_column|
|
40
40
|
options.title_column = title_column
|
41
41
|
end
|
42
|
-
opts.on("--
|
42
|
+
opts.on("--text-column=COLNAME",
|
43
43
|
"If target file is a CSV, which column contains the main, searchable of the row. Integer index or string column name."
|
44
44
|
) do |text_column|
|
45
45
|
options.text_column = text_column
|
46
46
|
end
|
47
|
+
opts.on("--slice-size=SLICE",
|
48
|
+
"Process documents in batches of SLICE. Default is 100. Lower this if you get timeouts. Raise it to go faster."
|
49
|
+
) do |slice_size|
|
50
|
+
options.slice_size = slice_size.to_i
|
51
|
+
end
|
47
52
|
|
48
53
|
opts.on("-o", "--[no-]ocr", "don't attempt to OCR any PDFs, even if they contain no text") do |v|
|
49
54
|
options.ocr = v
|
@@ -95,7 +100,9 @@ raise ArgumentError, "specify the elasticsearch host" unless ES_HOST
|
|
95
100
|
if __FILE__ == $0
|
96
101
|
f = Stevedore::ESUploader.new(ES_HOST, ES_INDEX, S3_BUCKET, S3_BASEPATH)
|
97
102
|
f.should_ocr = options.ocr
|
98
|
-
puts "Will not OCR, per --no-ocr option" unless f.should_ocr
|
103
|
+
puts "Will not OCR, per --no-ocr option" unless f.should_ocr
|
104
|
+
f.slice_size = options.slice_size if options.slice_size
|
105
|
+
puts "Slice size set to #{f.slice_size}" if options.slice_size
|
99
106
|
|
100
107
|
if FOLDER.match(/\.[ct]sv$/)
|
101
108
|
f.do_csv!(FOLDER, File.join(f.s3_basepath, File.basename(FOLDER)), options.title_column, options.text_column)
|
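For reference, a hypothetical run using the new `--slice-size` flag added above (index, host, and path borrowed from the README's examples):

```
bundle exec ruby bin/upload_to_elasticsearch.rb --index=jrubytest --host=localhost:9200 --slice-size=50 ~/code/marco-rubios-emails/emls/
```

Lowering the slice trades throughput for fewer Elasticsearch timeouts, per the option's own description.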
data/lib/split_archive.rb
CHANGED
````diff
@@ -1,4 +1,4 @@
-# splits zip, mbox
+# splits zip, mbox, pst files into their constituent documents -- messages and attachments
 # and puts them into a tmp folder
 # which is then parsed normally
 
@@ -11,7 +11,7 @@ require 'pst' # for PST files
 # splits PST and Mbox formats
 module Stevedore
   class ArchiveSplitter
-    HANDLED_FORMATS = ["zip", "mbox", "pst"]
+    HANDLED_FORMATS = ["zip", "mbox", "pst", "eml"]
 
     def self.split(archive_filename)
       # if it's a PST use split_pst
@@ -28,6 +28,8 @@ module Stevedore
         self.split_pst(archive_filename)
       elsif extension == "zip"
         self.split_zip(archive_filename)
+      elsif extension == "eml"
+        self.get_attachments_from_eml(archive_filename)
       end
       # should yield a relative filename
       # and a lambda that will write the file contents to the given filename
@@ -36,8 +38,14 @@ module Stevedore
       constituent_files.each_with_index do |basename_contents_lambda, idx|
         basename, contents_lambda = *basename_contents_lambda
         tmp_filename = File.join(tmpdir, File.basename(archive_filename), basename.gsub("/", "") )
-
-
+        FileUtils.mkdir_p(File.dirname(tmp_filename))
+        begin
+          contents_lambda.call(tmp_filename)
+        rescue Errno::ENOENT
+          puts "#{tmp_filename} wasn't extracted from #{archive_filename}"
+          next
+        end
         yielder.yield tmp_filename, File.join(File.basename(archive_filename), basename)
       end
     end
@@ -83,18 +91,33 @@ module Stevedore
             yielder << ["#{idx}.eml", lambda{|fn| open(fn, 'wb'){|fh| fh << mail_str.join("") } }]
             mail = Mail.new mail_str.join("")
             mail.attachments.each do |attachment|
-              yielder << [attachment.filename, lambda{|fn| open(fn, 'wb'){|fh| attachment.
+              yielder << [attachment.filename, lambda{|fn| open(fn, 'wb'){|fh| fh << attachment.body.decoded }}]
             end
           end
         end
       end
     end
 
+    def self.get_attachments_from_eml(email_filename)
+      Enumerator.new do |yielder|
+        yielder << [File.basename(email_filename), lambda{|fn| open(fn, 'wb'){|fh| fh << open(email_filename){|f| f.read } } }]
+        mail = Mail.new open(email_filename){|f| f.read }
+        mail.attachments.each do |attachment|
+          yielder << [attachment.filename, lambda{|fn| open(fn, 'wb'){|fh| fh << attachment.body.decoded }}]
+        end
+      end
+    end
+
+
     def self.split_zip(archive_filename)
       Zip::File.open(archive_filename) do |zip_file|
         Enumerator.new do |yielder|
           zip_file.each do |entry|
-
+            begin
+              yielder << [entry.name, lambda{|fn| entry.extract(fn) }]
+            rescue
+              puts "unable to extract #{entry.name} from #{archive_filename}"
+            end
           end
         end
      end
````
data/lib/stevedore-uploader.rb
CHANGED
````diff
@@ -1,8 +1,6 @@
-Dir["#{File.expand_path(File.dirname(__FILE__))}/../lib/*.rb"].each {|f| require f}
-Dir["#{File.expand_path(File.dirname(__FILE__))}/../lib/parsers/*.rb"].each {|f| require f}
 
 require 'rika'
-
+require 'jruby-openssl'
 require 'net/https'
 require 'elasticsearch'
 require 'elasticsearch/transport/transport/http/manticore'
@@ -12,8 +10,9 @@ require 'manticore'
 require 'fileutils'
 require 'csv'
 
-
 require 'aws-sdk'
+Dir["#{File.expand_path(File.dirname(__FILE__))}/../lib/*.rb"].each {|f| require f}
+Dir["#{File.expand_path(File.dirname(__FILE__))}/../lib/parsers/*.rb"].each {|f| require f}
 
 
 module Stevedore
@@ -33,15 +32,16 @@ module Stevedore
        },
      )
      @es_index = es_index
-     @s3_bucket = s3_bucket
-     @s3_basepath = "https://#{s3_bucket}.s3.amazonaws.com/#{s3_path || es_index}"
+     @s3_bucket = s3_bucket
+     @s3_basepath = "https://#{s3_bucket}.s3.amazonaws.com/#{s3_path || es_index}/"
+     @use_s3 = !s3_bucket.nil?
 
      @slice_size = 100
 
      @should_ocr = false
 
      self.create_index!
-     self.add_mapping(:doc, MAPPING)
+     self.add_mapping(:doc, Stevedore.const_defined?("MAPPING") ? MAPPING : DEFAULT_MAPPING)
    end
 
    def create_index!
@@ -92,25 +92,30 @@ module Stevedore
      }) # was "rescue nil" but that obscured meaningful errors
    end
 
-    def bulk_upload_to_es!(data, type
-      return nil if data.empty?
-
-      resp = @client.
-
-
-
-
-
-
+    def bulk_upload_to_es!(data, type=:doc)
+      return nil if data.compact.empty?
+      if data.size == 1
+        resp = @client.index index: @es_index, type: type, id: data.first["_id"], body: data.first
+      else
+        begin
+          resp = @client.bulk body: data.map{|datum| {index: {_index: @es_index, _type: type || 'doc', data: datum }} }
+          puts resp if resp[:errors]
+        rescue JSON::GeneratorError, Elasticsearch::Transport::Transport::Errors::InternalServerError
+          data.each do |datum|
+            begin
+              @client.bulk body: [datum].map{|datum| {index: {_index: @es_index, _type: type || 'doc', data: datum }} } unless datum.nil?
+            rescue JSON::GeneratorError, Elasticsearch::Transport::Transport::Errors::InternalServerError
+              next
+            end
          end
+          resp = nil
        end
        resp = nil
      end
      resp
    end
 
-    def process_document(filename,
+    def process_document(filename, download_url)
      begin
        puts "begin to process #{filename}"
        # puts "size: #{File.size(filename)}"
````
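The rewritten `bulk_upload_to_es!` above tries one bulk request per slice and, only when serialization or the server chokes, falls back to indexing the slice one document at a time, skipping the stragglers. Schematically (a sketch, not the gem's exact code; `client` is any `Elasticsearch::Client`, and `resilient_bulk` is an invented name):

```ruby
require 'json'
require 'elasticsearch' # provides Elasticsearch::Transport::Transport::Errors

# Try the whole slice in one bulk call; on JSON::GeneratorError or a 500,
# retry per-document so one bad record can't sink the other 99.
def resilient_bulk(client, index, docs)
  client.bulk body: docs.map { |d| { index: { _index: index, _type: 'doc', data: d } } }
rescue JSON::GeneratorError, Elasticsearch::Transport::Transport::Errors::InternalServerError
  docs.each do |d|
    begin
      client.bulk body: [{ index: { _index: index, _type: 'doc', data: d } }]
    rescue JSON::GeneratorError, Elasticsearch::Transport::Transport::Errors::InternalServerError
      next # give up on just this document
    end
  end
  nil # mirror the gem's behavior: no usable response after a fallback
end
```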
````diff
@@ -121,7 +126,7 @@ module Stevedore
        metadata = "couldn't be parsed"
      end
      puts "parsed: #{content.size}"
-      if content.size >
+      if content.size > 3 * (10 ** 7)
        @errors << filename
        puts "skipping #{filename} for being too big"
        return nil
@@ -133,9 +138,9 @@ module Stevedore
      # document types on its own
      ret = case # .eml # .msg
      when metadata["Content-Type"] == "message/rfc822" || metadata["Content-Type"] == "application/vnd.ms-outlook"
-        ::Stevedore::StevedoreEmail.new_from_tika(content, metadata,
+        ::Stevedore::StevedoreEmail.new_from_tika(content, metadata, download_url, filename).to_hash
      when metadata["Content-Type"] && ["application/html", "application/xhtml+xml"].include?(metadata["Content-Type"].split(";").first)
-        ::Stevedore::StevedoreHTML.new_from_tika(content, metadata,
+        ::Stevedore::StevedoreHTML.new_from_tika(content, metadata, download_url, filename).to_hash
      when @should_ocr && metadata["Content-Type"] == "application/pdf" && (content.match(/\A\s*\z/) || content.size < 50 * metadata["xmpTPg:NPages"].to_i )
        # this is a scanned PDF.
        puts "scanned PDF #{File.basename(filename)} detected; OCRing"
@@ -146,7 +151,7 @@ module Stevedore
          File.delete(png)
          # no need to use a system call when we could use the stdlib!
          # system("rm", "-f", png) rescue nil
-          File.delete("#{png}.txt")
+          File.delete("#{png}.txt") rescue nil
        end.join("\n\n")
        # e.g. Analysis-Corporation-2.png.pdf or Torture.pdf
        files = Dir["#{pdf_basename}.png.pdf"] + (Dir["#{pdf_basename}-*.png.pdf"].sort_by{|pdf| Regexp.new("#{pdf_basename}-([0-9]+).png.pdf").match(pdf)[1].to_i })
@@ -154,9 +159,9 @@ module Stevedore
        system('pdftk', *files, "cat", "output", "#{pdf_basename}.ocr.pdf")
        content, _ = Rika.parse_content_and_metadata("#{pdf_basename}.ocr.pdf")
        puts "OCRed content (#{File.basename(filename)}) length: #{content.length}"
-        ::Stevedore::StevedoreBlob.new_from_tika(content, metadata,
+        ::Stevedore::StevedoreBlob.new_from_tika(content, metadata, download_url, filename).to_hash
      else
-        ::Stevedore::StevedoreBlob.new_from_tika(content, metadata,
+        ::Stevedore::StevedoreBlob.new_from_tika(content, metadata, download_url, filename).to_hash
      end
      [ret, content, metadata]
    rescue StandardError, java.lang.NoClassDefFoundError, org.apache.tika.exception.TikaException => e
@@ -169,7 +174,7 @@ module Stevedore
      end
    end
 
-    def do_csv!(file, download_url, title_column=0, text_column=nil)
+    def do_csv!(file, download_url, title_column=0, text_column=nil, type=nil)
      docs_so_far = 0
      CSV.open(file, headers: (!title_column.is_a? Fixnum ) ).each_slice(@slice_size).each_with_index do |slice, slice_index|
        slice_of_rows = slice.map.each_with_index do |row, i|
@@ -185,7 +190,7 @@ module Stevedore
          doc
        end
        begin
-          resp = bulk_upload_to_es!(slice_of_rows.compact)
+          resp = bulk_upload_to_es!(slice_of_rows.compact.reject(&:empty?), type)
          docs_so_far += @slice_size
        rescue Manticore::Timeout, Manticore::SocketException
          STDERR.puts("retrying at #{Time.now}")
@@ -200,7 +205,6 @@ module Stevedore
      output_stream.puts "Processing documents from #{target_path}"
 
      docs_so_far = 0
-      # use_s3 = false # option to set this (an option to set document URLs to be relative to the search engine root) is TK
      @s3_bucket = target_path.gsub(/s3:\/\//i, '').split("/", 2).first if @s3_bucket.nil? && target_path.downcase.include?('s3://')
 
      if target_path.downcase.include?("s3://")
@@ -237,6 +241,7 @@ module Stevedore
          # PDFs can (theoretically) contain documents as "attachments" -- those aren't handled here either.x
          if ArchiveSplitter::HANDLED_FORMATS.include?(tmp_filename.split(".")[-1])
            ArchiveSplitter.split(tmp_filename).map do |constituent_file, constituent_basename|
+              doc = {} if doc.nil?
              doc, content, metadata = process_document(constituent_file, download_filename)
              doc["sha1"] = Digest::SHA1.hexdigest(download_filename + File.basename(constituent_basename)) # since these files all share a download URL (that of the archive, we need to come up with a custom sha1)
              yield doc, obj.key, content, metadata if block_given?
@@ -250,31 +255,38 @@ module Stevedore
            [doc]
          end
        end
+        retry_count = 0
        begin
-          resp = bulk_upload_to_es!(slice_of_objs.compact.flatten(1)) # flatten, in case there's an archive
+          resp = bulk_upload_to_es!(slice_of_objs.compact.flatten(1).reject(&:empty?)) # flatten, in case there's an archive
          puts resp.inspect if resp && resp["errors"]
        rescue Manticore::Timeout, Manticore::SocketException
          output_stream.puts("retrying at #{Time.now}")
-
+          if retry_count < 10
+            retry_count += 1
+            retry
+          else
+            @errors << filename
+          end
        end
        output_stream.puts "uploaded #{slice_of_objs.size} files to #{@es_index}; #{docs_so_far} uploaded so far"
        output_stream.puts "Errors in bulk upload: #{resp.inspect}" if resp && resp["errors"]
      end
    end
    else
-      list_of_files = File.file?(target_path) ? [target_path] : Dir[
+      list_of_files = File.file?(target_path) ? [target_path] : Dir[target_path.include?('*') ? target_path : File.join(target_path, '**/*')]
      list_of_files.each_slice(@slice_size) do |slice_of_files|
        output_stream.puts "starting a set of #{@slice_size}"
        docs_so_far += slice_of_files.size
 
        slice_of_files.map! do |filename|
          next unless File.file?(filename)
-          filename_basepath = filename.gsub(target_path, '')
-
-
-
-
-
+          filename_basepath = filename.gsub(target_path.split("*").first, '')
+
+          if @use_s3 # turning this on TK
+            download_filename = @s3_basepath + ((filename_basepath[0] == '/' || @s3_basepath[-1] == '/') ? '' : '/') + filename_basepath
+          else
+            download_filename = "/files/#{@es_index}/#{filename_basepath}"
+          end
 
          # is this file an archive that contains a bunch of documents we should index separately?
          # obviously, there is not a strict definition here.
@@ -285,6 +297,7 @@ module Stevedore
          ArchiveSplitter.split(filename).map do |constituent_file, constituent_basename|
            doc, content, metadata = process_document(constituent_file, download_filename)
            doc["sha1"] = Digest::SHA1.hexdigest(download_filename + File.basename(constituent_basename)) # since these files all share a download URL (that of the archive, we need to come up with a custom sha1)
+            doc["id"] = doc["sha1"]
            yield doc, filename, content, metadata if block_given?
            # FileUtils.rm(constituent_file) rescue Errno::ENOENT # try to delete, but no biggie if it doesn't work for some weird reason.
            doc
@@ -295,6 +308,7 @@ module Stevedore
            [doc]
          end
        end
+        retry_count = 0
        begin
          resp = bulk_upload_to_es!(slice_of_files.compact.flatten(1)) # flatten, in case there's an archive
          puts resp.inspect if resp && resp["errors"]
@@ -303,7 +317,12 @@ module Stevedore
          output_stream.puts "Upload error: #{e} #{e.message}."
          output_stream.puts e.backtrace.join("\n") + "\n\n\n"
          output_stream.puts("retrying at #{Time.now}")
-
+          if retry_count < 10
+            retry_count += 1
+            retry
+          else
+            @errors << filename
+          end
        end
        output_stream.puts "uploaded #{slice_of_files.size} files to #{@es_index}; #{docs_so_far} uploaded so far"
        output_stream.puts "Errors in bulk upload: #{resp.inspect}" if resp && resp["errors"]
@@ -311,7 +330,7 @@ module Stevedore
      end
    end
  end
-  MAPPING = {
+  DEFAULT_MAPPING = {
    sha1: {type: :string, index: :not_analyzed},
    title: { type: :string, analyzer: :keyword },
    source_url: {type: :string, index: :not_analyzed},
````
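Both upload loops now share the `retry_count`/`retry` idiom added above: up to ten retries on Manticore timeouts, after which the slice is recorded in `@errors` and the loop moves on. In isolation, with a simulated flaky request standing in for `bulk_upload_to_es!`, the control flow looks like this sketch:

```ruby
require 'manticore' # the JRuby HTTP client whose errors the gem rescues

errors = []
attempts = 0
retry_count = 0
begin
  attempts += 1
  raise Manticore::Timeout if attempts < 3 # simulate two flaky requests
  puts "uploaded on attempt #{attempts}"
rescue Manticore::Timeout, Manticore::SocketException
  if retry_count < 10
    retry_count += 1
    retry # re-runs the begin body; retry_count survives because it lives outside it
  else
    errors << "this-slice" # after ten failures, log it and move on
  end
end
```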
metadata
CHANGED
````diff
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: stevedore-uploader
 version: !ruby/object:Gem::Version
-  version: 1.0.3
+  version: 1.0.4
 platform: java
 authors:
 - Jeremy B. Merrill
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-
+date: 2016-10-06 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   requirement: !ruby/object:Gem::Requirement
@@ -43,7 +43,7 @@ dependencies:
   requirements:
   - - "~>"
     - !ruby/object:Gem::Version
-      version:
+      version: 0.9.17
   name: jruby-openssl
   prerelease: false
   type: :runtime
@@ -51,7 +51,7 @@ dependencies:
   requirements:
   - - "~>"
     - !ruby/object:Gem::Version
-      version:
+      version: 0.9.17
 - !ruby/object:Gem::Dependency
   requirement: !ruby/object:Gem::Requirement
   requirements:
````