docsplit 0.3.1 → 0.3.3
This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the respective public registry.
- data/docsplit.gemspec +2 -2
- data/lib/docsplit.rb +1 -1
- data/lib/docsplit/image_extractor.rb +4 -2
- data/lib/docsplit/text_extractor.rb +6 -6
- data/lib/docsplit/transparent_pdfs.rb +3 -3
- metadata +9 -4
    
        data/docsplit.gemspec
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            Gem::Specification.new do |s|
         | 
| 2 2 | 
             
              s.name      = 'docsplit'
         | 
| 3 | 
            -
              s.version   = '0.3.1'
         | 
| 4 | 
            -
              s.date      = '2010-8- | 
| 3 | 
            +
              s.version   = '0.3.3'         # Keep version in sync with docsplit.rb
         | 
| 4 | 
            +
              s.date      = '2010-8-17'
         | 
| 5 5 |  | 
| 6 6 | 
             
              s.homepage    = "http://documentcloud.github.com/docsplit/"
         | 
| 7 7 | 
             
              s.summary     = "Break Apart Documents into Images, Text, Pages and PDFs"
         | 
    
        data/lib/docsplit.rb
    CHANGED
    
    
| @@ -24,6 +24,7 @@ module Docsplit | |
| 24 24 |  | 
| 25 25 | 
             
                # Convert a single PDF into page images at the specified size and format.
         | 
| 26 26 | 
             
                def convert(pdf, size, format, previous=nil)
         | 
| 27 | 
            +
                  tempdir   = Dir.mktmpdir
         | 
| 27 28 | 
             
                  basename  = File.basename(pdf, File.extname(pdf))
         | 
| 28 29 | 
             
                  directory = directory_for(size)
         | 
| 29 30 | 
             
                  FileUtils.mkdir_p(directory) unless File.exists?(directory)
         | 
| @@ -31,13 +32,14 @@ module Docsplit | |
| 31 32 | 
             
                  common    = "#{MEMORY_ARGS} #{DENSITY_ARG} #{resize_arg(size)} #{quality_arg(format)}"
         | 
| 32 33 | 
             
                  if previous
         | 
| 33 34 | 
             
                    FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
         | 
| 34 | 
            -
                    cmd = "OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1"
         | 
| 35 | 
            +
                    cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1"
         | 
| 35 36 | 
             
                  else
         | 
| 36 | 
            -
                    cmd = "OMP_NUM_THREADS=2 gm convert +adjoin #{common} \"#{pdf}#{pages_arg}\" \"#{out_file}\" 2>&1"
         | 
| 37 | 
            +
                    cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{common} \"#{pdf}#{pages_arg}\" \"#{out_file}\" 2>&1"
         | 
| 37 38 | 
             
                  end
         | 
| 38 39 | 
             
                  result = `#{cmd}`.chomp
         | 
| 39 40 | 
             
                  raise ExtractionFailed, result if $? != 0
         | 
| 40 41 | 
             
                  renumber_images(out_file, format)
         | 
| 42 | 
            +
                  FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
         | 
| 41 43 | 
             
                end
         | 
| 42 44 |  | 
| 43 45 |  | 
| @@ -42,7 +42,6 @@ module Docsplit | |
| 42 42 | 
             
                      end
         | 
| 43 43 | 
             
                    end
         | 
| 44 44 | 
             
                  end
         | 
| 45 | 
            -
                  FileUtils.remove_entry_secure @tempdir if @tempdir
         | 
| 46 45 | 
             
                end
         | 
| 47 46 |  | 
| 48 47 | 
             
                # Does a PDF have any text embedded?
         | 
| @@ -59,19 +58,20 @@ module Docsplit | |
| 59 58 |  | 
| 60 59 | 
             
                # Extract a page range worth of text from a PDF via OCR.
         | 
| 61 60 | 
             
                def extract_from_ocr(pdf, pages)
         | 
| 62 | 
            -
                   | 
| 61 | 
            +
                  tempdir = Dir.mktmpdir
         | 
| 63 62 | 
             
                  base_path = File.join(@output, @pdf_name)
         | 
| 64 63 | 
             
                  if pages
         | 
| 65 | 
            -
                    run "OMP_NUM_THREADS=2 gm convert +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{ | 
| 64 | 
            +
                    run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{tempdir}/#{@pdf_name}_%d.tif 2>&1" unless @tiffs_generated
         | 
| 66 65 | 
             
                    @tiffs_generated = true
         | 
| 67 66 | 
             
                    pages.each do |page|
         | 
| 68 | 
            -
                      run "tesseract #{ | 
| 67 | 
            +
                      run "tesseract #{tempdir}/#{@pdf_name}_#{page - 1}.tif #{base_path}_#{page} 2>&1"
         | 
| 69 68 | 
             
                    end
         | 
| 70 69 | 
             
                  else
         | 
| 71 | 
            -
                    tiff = "#{ | 
| 72 | 
            -
                    run "OMP_NUM_THREADS=2 gm convert #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{tiff} 2>&1"
         | 
| 70 | 
            +
                    tiff = "#{tempdir}/#{@pdf_name}.tif"
         | 
| 71 | 
            +
                    run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{tiff} 2>&1"
         | 
| 73 72 | 
             
                    run "tesseract #{tiff} #{base_path} -l eng 2>&1"
         | 
| 74 73 | 
             
                  end
         | 
| 74 | 
            +
                  FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
         | 
| 75 75 | 
             
                end
         | 
| 76 76 |  | 
| 77 77 |  | 
| @@ -12,9 +12,9 @@ module Docsplit | |
| 12 12 | 
             
                    if ext.downcase == '.pdf'
         | 
| 13 13 | 
             
                      doc
         | 
| 14 14 | 
             
                    else
         | 
| 15 | 
            -
                       | 
| 16 | 
            -
                      extract_pdf([doc], {:output =>  | 
| 17 | 
            -
                      File.join( | 
| 15 | 
            +
                      tempdir = File.join(Dir.tmpdir, 'docsplit')
         | 
| 16 | 
            +
                      extract_pdf([doc], {:output => tempdir})
         | 
| 17 | 
            +
                      File.join(tempdir, File.basename(doc, ext) + '.pdf')
         | 
| 18 18 | 
             
                    end
         | 
| 19 19 | 
             
                  end
         | 
| 20 20 | 
             
                end
         | 
    
        metadata
    CHANGED
    
    | @@ -1,12 +1,13 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification 
         | 
| 2 2 | 
             
            name: docsplit
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version 
         | 
| 4 | 
            +
              hash: 21
         | 
| 4 5 | 
             
              prerelease: false
         | 
| 5 6 | 
             
              segments: 
         | 
| 6 7 | 
             
              - 0
         | 
| 7 8 | 
             
              - 3
         | 
| 8 | 
            -
              - 1
         | 
| 9 | 
            -
              version: 0.3.1
         | 
| 9 | 
            +
              - 3
         | 
| 10 | 
            +
              version: 0.3.3
         | 
| 10 11 | 
             
            platform: ruby
         | 
| 11 12 | 
             
            authors: 
         | 
| 12 13 | 
             
            - Jeremy Ashkenas
         | 
| @@ -15,7 +16,7 @@ autorequire: | |
| 15 16 | 
             
            bindir: bin
         | 
| 16 17 | 
             
            cert_chain: []
         | 
| 17 18 |  | 
| 18 | 
            -
            date: 2010-08- | 
| 19 | 
            +
            date: 2010-08-17 00:00:00 -04:00
         | 
| 19 20 | 
             
            default_executable: 
         | 
| 20 21 | 
             
            dependencies: []
         | 
| 21 22 |  | 
| @@ -60,23 +61,27 @@ rdoc_options: [] | |
| 60 61 | 
             
            require_paths: 
         | 
| 61 62 | 
             
            - lib
         | 
| 62 63 | 
             
            required_ruby_version: !ruby/object:Gem::Requirement 
         | 
| 64 | 
            +
              none: false
         | 
| 63 65 | 
             
              requirements: 
         | 
| 64 66 | 
             
              - - ">="
         | 
| 65 67 | 
             
                - !ruby/object:Gem::Version 
         | 
| 68 | 
            +
                  hash: 3
         | 
| 66 69 | 
             
                  segments: 
         | 
| 67 70 | 
             
                  - 0
         | 
| 68 71 | 
             
                  version: "0"
         | 
| 69 72 | 
             
            required_rubygems_version: !ruby/object:Gem::Requirement 
         | 
| 73 | 
            +
              none: false
         | 
| 70 74 | 
             
              requirements: 
         | 
| 71 75 | 
             
              - - ">="
         | 
| 72 76 | 
             
                - !ruby/object:Gem::Version 
         | 
| 77 | 
            +
                  hash: 3
         | 
| 73 78 | 
             
                  segments: 
         | 
| 74 79 | 
             
                  - 0
         | 
| 75 80 | 
             
                  version: "0"
         | 
| 76 81 | 
             
            requirements: []
         | 
| 77 82 |  | 
| 78 83 | 
             
            rubyforge_project: docsplit
         | 
| 79 | 
            -
            rubygems_version: 1.3. | 
| 84 | 
            +
            rubygems_version: 1.3.7
         | 
| 80 85 | 
             
            signing_key: 
         | 
| 81 86 | 
             
            specification_version: 3
         | 
| 82 87 | 
             
            summary: Break Apart Documents into Images, Text, Pages and PDFs
         |