RubyGems - docsplit - Versions diffs - 0.7.2 → 0.7.3 - Mend

docsplit 0.7.2 → 0.7.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

checksums.yaml +7 -0
data/LICENSE +2 -1
data/docsplit.gemspec +3 -2
data/lib/docsplit.rb +1 -1
data/lib/docsplit/command_line.rb +1 -0
data/lib/docsplit/info_extractor.rb +1 -1
data/lib/docsplit/pdf_extractor.rb +9 -2
data/lib/docsplit/text_cleaner.rb +7 -2
metadata +31 -29

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA512:
+  data.tar.gz: e50c360ba40c1aa51cd563d9a08d0c5a444b50c4e97635133aec13f17fcc4e541e20d2bb9867ca316486df9a501852ed482801bed26986e88dbc472bd2609d65
+  metadata.gz: 5da217d234a3b390963f2c6259ab859ecf805d71c60138c335eebdd9ae4c166cce63e825cee2e10c7e3029c47e4530e53b2d84dbdc91ab222c0a98c6f342d32a
+SHA1:
+  data.tar.gz: 03c2ec2439773679080a055463d35e7004d1decf
+  metadata.gz: ddd8d4475b4a34fcfd2abb013d2824295718aa8c

data/LICENSE CHANGED Viewed

@@ -1,6 +1,7 @@
 JODConverter is licensed under the LGPL: gnu.org/licenses/lgpl.html
-Copyright (c) 2009 Jeremy Ashkenas, DocumentCloud
+Copyright (c) 2009-2011 Jeremy Ashkenas, DocumentCloud
+Copyright (c) 2011-2013 Jeremy Ashkenas, DocumentCloud and Investigative Reporters & Editors
 Permission is hereby granted, free of charge, to any person
 obtaining a copy of this software and associated documentation

data/docsplit.gemspec CHANGED Viewed

@@ -1,7 +1,7 @@
 Gem::Specification.new do |s|
   s.name      = 'docsplit'
-  s.version   = '0.7.2'         # Keep version in sync with docsplit.rb
-  s.date      = '2013-02-21'
+  s.version   = '0.7.3'         # Keep version in sync with docsplit.rb
+  s.date      = '2014-02-16'
   s.homepage    = "http://documentcloud.github.com/docsplit/"
   s.summary     = "Break Apart Documents into Images, Text, Pages and PDFs"
@@ -15,6 +15,7 @@ Gem::Specification.new do |s|
   s.authors           = ['Jeremy Ashkenas', 'Samuel Clay', 'Ted Han']
   s.email             = 'opensource@documentcloud.org'
   s.rubyforge_project = 'docsplit'
+  s.license           = 'MIT'
   s.require_paths     = ['lib']
   s.executables       = ['docsplit']

data/lib/docsplit.rb CHANGED Viewed

@@ -5,7 +5,7 @@ require 'shellwords'
 # The Docsplit module delegates to the Java PDF extractors.
 module Docsplit
-  VERSION       = '0.7.2' # Keep in sync with gemspec.
+  VERSION       = '0.7.3' # Keep in sync with gemspec.
   ESCAPE        = lambda {|x| Shellwords.shellescape(x) }

data/lib/docsplit/command_line.rb CHANGED Viewed

@@ -96,6 +96,7 @@ Options:
         end
         opts.on('-l', '--language [LANGUAGE]', 'set the language (ISO 639-2/T code) for text extraction') do |l|
           @options[:language] = l
+          @options[:clean] = false
         end
         opts.on('-r', '--rolling', 'generate images from each previous image') do |r|
           @options[:rolling] = true

data/lib/docsplit/info_extractor.rb CHANGED Viewed

@@ -27,7 +27,7 @@ module Docsplit
       raise ExtractionFailed, result if $? != 0
       # ruby  1.8 (iconv) and 1.9 (String#encode) :
       if String.method_defined?(:encode)
-        result.encode!('UTF-8', 'UTF-8', :invalid => :replace)
+        result.encode!('UTF-8', 'binary', :invalid => :replace, :undef => :replace, :replace => "") unless result.valid_encoding?
       else
         require 'iconv' unless defined?(Iconv)
         ic = Iconv.new('UTF-8//IGNORE','UTF-8')

data/lib/docsplit/pdf_extractor.rb CHANGED Viewed

@@ -19,7 +19,12 @@ module Docsplit
     # The first line of the help output holds the name and version number
     # of the office software to be used for extraction.
     def version_string
-      @@help ||= `#{office_executable} -h 2>&1`.split("\n").first
+      versionstr =  `#{office_executable} -h 2>&1`.split("\n").first
+        if !!versionstr.match(/[0-9]*/)
+                versionstr =  `#{office_executable} --version`.split("\n").first
+        end
+        @@help ||= versionstr
     end
     def libre_office?
       !!version_string.match(/^LibreOffice/)
@@ -37,7 +42,7 @@ module Docsplit
       if windows?
         office_names       = ["LibreOffice 3", "LibreOffice 4", "OpenOffice.org 3"]
         program_files_path = ENV["CommonProgramFiles"]
-        search_paths       = office_name.map{ |program| File.join(program_files_path, program) }
+        search_paths       = office_names.map{ |program| File.join(program_files_path, program) }
       elsif osx?
         search_paths = %w(
           /Applications/LibreOffice.app/Contents
@@ -46,8 +51,10 @@ module Docsplit
       else # probably linux/unix
         search_paths = %w(
           /usr/lib/libreoffice
+          /usr/lib64/libreoffice
           /opt/libreoffice
           /usr/lib/openoffice
+          /usr/lib64/openoffice
           /opt/openoffice.org3
         )
       end

data/lib/docsplit/text_cleaner.rb CHANGED Viewed

@@ -35,8 +35,13 @@ module Docsplit
     # For the time being, `clean` uses the regular StringScanner, and not the
     # multibyte-aware version, coercing to ASCII first.
     def clean(text)
-      require 'iconv' unless defined?(Iconv)
-      text    = Iconv.iconv('ascii//translit//ignore', 'utf-8', text).first
+      if String.method_defined?(:encode)
+        text.encode!('ascii', :invalid => :replace, :undef => :replace, :replace => '?')
+      else
+        require 'iconv' unless defined?(Iconv)
+        text = Iconv.iconv('ascii//translit//ignore', 'utf-8', text).first
+      end
       scanner = StringScanner.new(text)
       cleaned = []
       spaced  = false

metadata CHANGED Viewed

@@ -1,28 +1,28 @@
---- !ruby/object:Gem::Specification
+--- !ruby/object:Gem::Specification
 name: docsplit
-version: !ruby/object:Gem::Version
-  version: 0.7.2
-  prerelease:
+version: !ruby/object:Gem::Version
+  version: 0.7.3
 platform: ruby
-authors:
+authors:
 - Jeremy Ashkenas
 - Samuel Clay
 - Ted Han
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-02-21 00:00:00.000000000 Z
+date: 2014-02-16 00:00:00 Z
 dependencies: []
-description: ! "    Docsplit is a command-line utility and Ruby library for splitting
-  apart\n    documents into their component parts: searchable UTF-8 plain text, page\n
-  \   images or thumbnails in any format, PDFs, single pages, and document\n    metadata
-  (title, author, number of pages...)\n"
+description: "    Docsplit is a command-line utility and Ruby library for splitting apart\n    documents into their component parts: searchable UTF-8 plain text, page\n    images or thumbnails in any format, PDFs, single pages, and document\n    metadata (title, author, number of pages...)\n"
 email: opensource@documentcloud.org
-executables:
+executables:
 - docsplit
 extensions: []
 extra_rdoc_files: []
-files:
+files:
 - lib/docsplit/command_line.rb
 - lib/docsplit/image_extractor.rb
 - lib/docsplit/info_extractor.rb
@@ -47,28 +47,30 @@ files:
 - LICENSE
 - README
 homepage: http://documentcloud.github.com/docsplit/
-licenses: []
+licenses:
+- MIT
+metadata: {}
 post_install_message:
 rdoc_options: []
-require_paths:
+require_paths:
 - lib
-required_ruby_version: !ruby/object:Gem::Requirement
-  none: false
-  requirements:
-  - - ! '>='
-    - !ruby/object:Gem::Version
-      version: '0'
-required_rubygems_version: !ruby/object:Gem::Requirement
-  none: false
-  requirements:
-  - - ! '>='
-    - !ruby/object:Gem::Version
-      version: '0'
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - &id001
+    - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - *id001
 requirements: []
 rubyforge_project: docsplit
-rubygems_version: 1.8.24
+rubygems_version: 2.0.13
 signing_key:
-specification_version: 3
+specification_version: 4
 summary: Break Apart Documents into Images, Text, Pages and PDFs
 test_files: []
-has_rdoc: