RubyGems - docsplit - Versions diffs - 0.2.0 → 0.3.0 - Mend

docsplit 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25)

data/LICENSE +0 -1
data/docsplit.gemspec +2 -2
data/lib/docsplit.rb +30 -7
data/lib/docsplit/command_line.rb +9 -3
data/lib/docsplit/image_extractor.rb +27 -8
data/lib/docsplit/info_extractor.rb +32 -0
data/lib/docsplit/page_extractor.rb +31 -0
data/lib/docsplit/text_extractor.rb +93 -35
metadata +7 -26
data/build/org/documentcloud/ExtractInfo$1.class +0 -0
data/build/org/documentcloud/ExtractInfo$Keys.class +0 -0
data/build/org/documentcloud/ExtractInfo.class +0 -0
data/build/org/documentcloud/ExtractPages.class +0 -0
data/build/org/documentcloud/ExtractText.class +0 -0
data/build/org/documentcloud/Extractor.class +0 -0
data/lib/docsplit/ExtractInfo.java +0 -63
data/lib/docsplit/ExtractPages.java +0 -54
data/lib/docsplit/ExtractText.java +0 -80
data/lib/docsplit/Extractor.java +0 -91
data/lib/docsplit/argument_parser.rb +0 -31
data/vendor/bcmail.jar +0 -0
data/vendor/bcprov.jar +0 -0
data/vendor/commons-logging.jar +0 -0
data/vendor/fontbox.jar +0 -0
data/vendor/pdfbox.jar +0 -0

data/LICENSE CHANGED Viewed

@@ -1,5 +1,4 @@
 JODConverter ius licensed under the LGPL: gnu.org/licenses/lgpl.html
-PDFBox is licensed under the Apache 2 License: apache.org/licenses/LICENSE-2.0
 Copyright (c) 2009 Jeremy Ashkenas, DocumentCloud

data/docsplit.gemspec CHANGED Viewed

@@ -1,7 +1,7 @@
 Gem::Specification.new do |s|
   s.name      = 'docsplit'
-  s.version   = '0.2.0'         # Keep version in sync with docsplit.rb
-  s.date      = '2010-7-29'
+  s.version   = '0.3.0'         # Keep version in sync with docsplit.rb
+  s.date      = '2010-8-5'
   s.homepage    = "http://documentcloud.github.com/docsplit/"
   s.summary     = "Break Apart Documents into Images, Text, Pages and PDFs"

data/lib/docsplit.rb CHANGED Viewed

@@ -1,7 +1,7 @@
 # The Docsplit module delegates to the Java PDF extractors.
 module Docsplit
-  VERSION       = '0.2.0' # Keep in sync with gemspec.
+  VERSION       = '0.3.0' # Keep in sync with gemspec.
   ROOT          = File.expand_path(File.dirname(__FILE__) + '/..')
@@ -13,6 +13,20 @@ module Docsplit
   METADATA_KEYS = [:author, :date, :creator, :keywords, :producer, :subject, :title, :length]
+  DEPENDENCIES  = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :tesseract => false}
+  # Check for all dependencies, and warn of their absence.
+  dirs = ENV['PATH'].split(File::PATH_SEPARATOR)
+  DEPENDENCIES.each_key do |dep|
+    dirs.each do |dir|
+      if File.executable?(File.join(dir, dep.to_s))
+        DEPENDENCIES[dep] = true
+        break
+      end
+    end
+    warn "Warning: Docsplit dependency #{dep} not found." if !DEPENDENCIES[dep]
+  end
   # Raise an ExtractionFailed exception when the PDF is encrypted, or otherwise
   # broke.
   class ExtractionFailed < StandardError; end
@@ -20,7 +34,7 @@ module Docsplit
   # Use the ExtractPages Java class to burst a PDF into single pages.
   def self.extract_pages(pdfs, opts={})
     pdfs = ensure_pdfs(pdfs)
-    run "org.documentcloud.ExtractPages", pdfs, opts
+    PageExtractor.new.extract(pdfs, opts)
   end
   # Use the ExtractText Java class to write out all embedded text.
@@ -50,8 +64,7 @@ module Docsplit
     instance_eval <<-EOS
       def self.extract_#{key}(pdfs, opts={})
         pdfs = ensure_pdfs(pdfs)
-        result = run "org.documentcloud.ExtractInfo #{key}", pdfs, opts, true
-        :#{key} == :length ? result.to_i : result
+        InfoExtractor.new.extract(:#{key}, pdfs, opts)
       end
     EOS
   end
@@ -62,18 +75,28 @@ module Docsplit
   # Runs a Java command, with quieted logging, and the classpath set properly.
   def self.run(command, pdfs, opts, return_output=false)
     pdfs    = [pdfs].flatten.map{|pdf| "\"#{pdf}\""}.join(' ')
-    args    = parse_options(opts)
-    cmd     = "java #{HEADLESS} #{LOGGING} -cp #{CLASSPATH} #{command} #{args} #{pdfs} 2>&1"
+    cmd     = "java #{HEADLESS} #{LOGGING} -cp #{CLASSPATH} #{command} #{pdfs} 2>&1"
     result  = `#{cmd}`.chomp
     raise ExtractionFailed, result if $? != 0
     return return_output ? (result.empty? ? nil : result) : true
   end
+  # Normalize a value in an options hash for the command line.
+  # Ranges look like: 1-10, Arrays like: 1,2,3.
+  def self.normalize_value(value)
+    case value
+    when Range then normalize_range(value)
+    when Array then value.map! {|v| v.is_a?(Range) ? normalize_range(v) : v }.join(',')
+    else            value.to_s
+    end
+  end
 end
 require 'tmpdir'
 require 'fileutils'
 require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
-require "#{Docsplit::ROOT}/lib/docsplit/argument_parser"
 require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
 require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"
+require "#{Docsplit::ROOT}/lib/docsplit/page_extractor"
+require "#{Docsplit::ROOT}/lib/docsplit/info_extractor"

data/lib/docsplit/command_line.rb CHANGED Viewed

@@ -8,7 +8,7 @@ module Docsplit
     BANNER = <<-EOS
 docsplit breaks apart documents into images, text, or individual pages.
-It wraps PDFBox, GraphicsMagick, and JODConverter.
+It wraps GraphicsMagick, Poppler, PDFTK, and JODConverter.
 Usage:
   docsplit COMMAND [OPTIONS] path/to/doc.pdf
@@ -71,7 +71,7 @@ Options:
     # Use the OptionParser library to parse out all supported options. Return
     # options formatted for the Ruby API.
     def parse_options
-      @options = {}
+      @options = {:ocr => :default}
       @option_parser = OptionParser.new do |opts|
         opts.on('-o', '--output [DIR]', 'set the directory for all output') do |d|
           @options[:output] = d
@@ -85,8 +85,14 @@ Options:
         opts.on('-f', '--format [FORMAT]', 'set image format (pdf, jpg, gif...)') do |t|
           @options[:format] = t.split(',')
         end
+        opts.on('--[no-]ocr', 'force OCR to be used, or disable OCR') do |o|
+          @options[:ocr] = o
+        end
+        opts.on('-r', '--rolling', 'generate images from each previous image') do |r|
+          @options[:rolling] = true
+        end
         opts.on_tail('-v', '--version', 'display docsplit version') do
-          puts "docsplit version #{Docsplit::VERSION}"
+          puts "Docsplit version #{Docsplit::VERSION}"
           exit
         end
         opts.on_tail('-h', '--help', 'display this help message') do

data/lib/docsplit/image_extractor.rb CHANGED Viewed

@@ -4,26 +4,37 @@ module Docsplit
   # nicely sized images.
   class ImageExtractor
-    DENSITY_ARG = "-density 150"
-    MEMORY_ARGS = "-limit memory 128MiB -limit map 256MiB"
-    DEFAULT_FORMAT = :png
+    DENSITY_ARG     = "-density 150"
+    MEMORY_ARGS     = "-limit memory 128MiB -limit map 256MiB"
+    DEFAULT_FORMAT  = :png
     # Extract a list of PDFs as rasterized page images, according to the
     # configuration in options.
     def extract(pdfs, options)
       @pdfs = [pdfs].flatten
       extract_options(options)
-      @pdfs.each {|p| @sizes.each {|s| @formats.each {|f| convert(p, s, f) }}}
+      @pdfs.each do |pdf|
+        previous = nil
+        @sizes.each_with_index do |size, i|
+          @formats.each {|format| convert(pdf, size, format, previous) }
+          previous = size if @rolling
+        end
+      end
     end
     # Convert a single PDF into page images at the specified size and format.
-    def convert(pdf, size, format)
+    def convert(pdf, size, format, previous=nil)
       basename  = File.basename(pdf, File.extname(pdf))
-      subfolder = @sizes.length > 1 ? size.to_s : ''
-      directory = File.join(@output, subfolder)
+      directory = directory_for(size)
       FileUtils.mkdir_p(directory) unless File.exists?(directory)
       out_file  = File.join(directory, "#{basename}_%05d.#{format}")
-      cmd = "gm convert +adjoin #{MEMORY_ARGS} #{DENSITY_ARG} #{resize_arg(size)} #{quality_arg(format)} \"#{pdf}#{pages_arg}\" \"#{out_file}\" 2>&1"
+      common    = "#{MEMORY_ARGS} #{DENSITY_ARG} #{resize_arg(size)} #{quality_arg(format)}"
+      if previous
+        FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
+        cmd = "OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1"
+      else
+        cmd = "OMP_NUM_THREADS=2 gm convert +adjoin #{common} \"#{pdf}#{pages_arg}\" \"#{out_file}\" 2>&1"
+      end
       result = `#{cmd}`.chomp
       raise ExtractionFailed, result if $? != 0
       renumber_images(out_file, format)
@@ -39,6 +50,14 @@ module Docsplit
       @formats = [options[:format] || DEFAULT_FORMAT].flatten
       @sizes   = [options[:size]].flatten.compact
       @sizes   = [nil] if @sizes.empty?
+      @rolling = !!options[:rolling]
+    end
+    # If there's only one size requested, generate the images directly into
+    # the output directory. Multiple sizes each get a directory of their own.
+    def directory_for(size)
+      path = @sizes.length == 1 ? @output : File.join(@output, size)
+      File.expand_path(path)
     end
     # Generate the resize argument.

data/lib/docsplit/info_extractor.rb ADDED Viewed

@@ -0,0 +1,32 @@
+module Docsplit
+  # Delegates to **pdfinfo** in order to extract information about a PDF file.
+  class InfoExtractor
+    # Regex matchers for different bits of information.
+    MATCHERS = {
+      :author   => /^Author:\s+([^\n]+)/,
+      :date     => /^CreationDate:\s+([^\n]+)/,
+      :creator  => /^Creator:\s+([^\n]+)/,
+      :keywords => /^Keywords:\s+([^\n]+)/,
+      :producer => /^Producer:\s+([^\n]+)/,
+      :subject  => /^Subject:\s+([^\n]+)/,
+      :title    => /^Title:\s+([^\n]+)/,
+      :length   => /^Pages:\s+([^\n]+)/,
+    }
+    # Pull out a single datum from a pdf.
+    def extract(key, pdfs, opts)
+      pdf = [pdfs].flatten.first
+      cmd = "pdfinfo #{pdf} 2>&1"
+      result = `#{cmd}`.chomp
+      raise ExtractionFailed, result if $? != 0
+      match = result.match(MATCHERS[key])
+      answer = match && match[1]
+      answer = answer.to_i if answer && key == :length
+      answer
+    end
+  end
+end

data/lib/docsplit/page_extractor.rb ADDED Viewed

@@ -0,0 +1,31 @@
+module Docsplit
+  # Delegates to **pdftk** in order to create bursted single pages from
+  # a PDF document.
+  class PageExtractor
+    # Burst a list of pdfs into single pages, as `pdfname_pagenumber.pdf`.
+    def extract(pdfs, opts)
+      extract_options opts
+      [pdfs].flatten.each do |pdf|
+        pdf_name = File.basename(pdf, File.extname(pdf))
+        page_path = File.join(@output, "#{pdf_name}_%d.pdf")
+        FileUtils.mkdir_p @output unless File.exists?(@output)
+        cmd = "pdftk #{pdf} burst output #{page_path} 2>&1"
+        result = `#{cmd}`.chomp
+        FileUtils.rm('doc_data.txt') if File.exists?('doc_data.txt')
+        raise ExtractionFailed, result if $? != 0
+        result
+      end
+    end
+    private
+    def extract_options(options)
+      @output = options[:output] || '.'
+    end
+  end
+end

data/lib/docsplit/text_extractor.rb CHANGED Viewed

@@ -1,53 +1,111 @@
 module Docsplit
+  # Delegates to **pdftotext** and **tesseract** in order to extract text from
+  # PDF documents. The `--ocr` and `--no-ocr` flags can be used to force or
+  # forbid OCR extraction, but by default the heuristic works like this:
+  #
+  #  * Check for the presence of fonts in the PDF. If no fonts are detected,
+  #    OCR is used automatically.
+  #  * Extract the text of each page with **pdftotext**, if the page has less
+  #    than 100 bytes of text (a scanned image page, or a page that just
+  #    contains a filename and a page number), then add it to the list of
+  #    `@pages_to_ocr`.
+  #  * Re-OCR each page in the `@pages_to_ocr` list at the end.
+  #
   class TextExtractor
-    PAGE_COUNT_MATCHER = /Pages:\s+(\d+?)\n/
+    NO_TEXT_DETECTED = /---------\n\Z/
+    OCR_FLAGS = '-density 200x200 -colorspace GRAY'
+    MIN_TEXT_PER_PAGE = 100 # in bytes
+    def initialize
+      @tiffs_generated = false
+      @pages_to_ocr    = []
+    end
+    # Extract text from a list of PDFs.
     def extract(pdfs, opts)
       extract_options opts
-      pdfs = [pdfs].flatten
-      pdfs.each do |pdf|
-        pdf_name = File.basename(pdf, File.extname(pdf))
-        text_path = File.join(@output, "#{pdf_name}.txt")
-        FileUtils.mkdir_p @output
-        if @pages
-          pages = (@pages == 'all') ? 1..get_pages(pdf) : @pages
-          pages.each do |page|
-            extract_page pdf, page, pdf_name
-          end
+      FileUtils.mkdir_p @output unless File.exists?(@output)
+      [pdfs].flatten.each do |pdf|
+        @pdf_name = File.basename(pdf, File.extname(pdf))
+        pages = (@pages == 'all') ? 1..Docsplit.extract_length(pdf) : @pages
+        if @force_ocr || (!@forbid_ocr && !contains_text?(pdf))
+          extract_from_ocr(pdf, pages)
         else
-          cmd = "pdftotext -enc UTF-8 #{pdf} #{text_path}"
-          result = `#{cmd}`.chomp
-          raise ExtractionFailed, result if $? != 0
+          extract_from_pdf(pdf, pages)
+          if !@forbid_ocr && DEPENDENCIES[:tesseract] && !@pages_to_ocr.empty?
+            extract_from_ocr(pdf, @pages_to_ocr)
+          end
+        end
+      end
+      FileUtils.remove_entry_secure @tempdir if @tempdir
+    end
+    # Does a PDF have any text embedded?
+    def contains_text?(pdf)
+      fonts = `pdffonts #{pdf} 2>&1`
+      !fonts.match(NO_TEXT_DETECTED)
+    end
+    # Extract a page range worth of text from a PDF, directly.
+    def extract_from_pdf(pdf, pages)
+      return extract_full(pdf) unless pages
+      pages.each {|page| extract_page(pdf, page) }
+    end
+    # Extract a page range worth of text from a PDF via OCR.
+    def extract_from_ocr(pdf, pages)
+      @tempdir  ||= Dir.mktmpdir
+      base_path = File.join(@output, @pdf_name)
+      if pages
+        run "gm convert +adjoin #{OCR_FLAGS} #{pdf} #{@tempdir}/#{@pdf_name}_%d.tif 2>&1" unless @tiffs_generated
+        @tiffs_generated = true
+        pages.each do |page|
+          run "tesseract #{@tempdir}/#{@pdf_name}_#{page - 1}.tif #{base_path}_#{page} 2>&1"
         end
+      else
+        tiff = "#{@tempdir}/#{@pdf_name}.tif"
+        run "gm convert #{OCR_FLAGS} #{pdf} #{tiff} 2>&1"
+        run "tesseract #{tiff} #{base_path} -l eng 2>&1"
       end
     end
-    def extract_page(pdf, page, pdf_name)
-      text_path = File.join(@output, "#{pdf_name}_#{page}.txt")
-      cmd = "pdftotext -enc UTF-8 -f #{page} -l #{page} #{pdf} #{text_path}"
-      result = `#{cmd}`.chomp
+    private
+    # Run an external process and raise an exception if it fails.
+    def run(command)
+      result = `#{command}`
       raise ExtractionFailed, result if $? != 0
       result
     end
-    def get_pages(pdf_path)
-      info = `pdfinfo #{pdf_path}`
-      raise ExtractionFailed, result if $? != 0
-      match = info.match(PAGE_COUNT_MATCHER)
-      raise ExtractionFailed if match.nil?
-      match[1].to_i
+    # Extract the full contents of a pdf as a single file, directly.
+    def extract_full(pdf)
+      text_path = File.join(@output, "#{@pdf_name}.txt")
+      run "pdftotext -enc UTF-8 #{pdf} #{text_path} 2>&1"
     end
-    private
+    # Extract the contents of a single page of text, directly, adding it to
+    # the `@pages_to_ocr` list if the text length is inadequate.
+    def extract_page(pdf, page)
+      text_path = File.join(@output, "#{@pdf_name}_#{page}.txt")
+      run "pdftotext -enc UTF-8 -f #{page} -l #{page} #{pdf} #{text_path} 2>&1"
+      unless @forbid_ocr
+        @pages_to_ocr.push(page) if File.read(text_path).length < MIN_TEXT_PER_PAGE
+      end
+    end
     def extract_options(options)
-      @output  = options[:output] || '.'
-      @pages   = options[:pages]
+      @output     = options[:output] || '.'
+      @pages      = options[:pages]
+      @force_ocr  = options[:ocr] == true
+      @forbid_ocr = options[:ocr] == false
     end
   end
 end

metadata CHANGED Viewed

@@ -1,13 +1,12 @@
 --- !ruby/object:Gem::Specification
 name: docsplit
 version: !ruby/object:Gem::Version
-  hash: 23
   prerelease: false
   segments:
   - 0
-  - 2
+  - 3
   - 0
-  version: 0.2.0
+  version: 0.3.0
 platform: ruby
 authors:
 - Jeremy Ashkenas
@@ -16,7 +15,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2010-07-29 00:00:00 -04:00
+date: 2010-08-05 00:00:00 -04:00
 default_executable:
 dependencies: []
@@ -29,27 +28,14 @@ extensions: []
 extra_rdoc_files: []
 files:
-- build/org/documentcloud/ExtractInfo$1.class
-- build/org/documentcloud/ExtractInfo$Keys.class
-- build/org/documentcloud/ExtractInfo.class
-- build/org/documentcloud/Extractor.class
-- build/org/documentcloud/ExtractPages.class
-- build/org/documentcloud/ExtractText.class
-- lib/docsplit/argument_parser.rb
 - lib/docsplit/command_line.rb
-- lib/docsplit/ExtractInfo.java
-- lib/docsplit/Extractor.java
-- lib/docsplit/ExtractPages.java
-- lib/docsplit/ExtractText.java
 - lib/docsplit/image_extractor.rb
+- lib/docsplit/info_extractor.rb
+- lib/docsplit/page_extractor.rb
 - lib/docsplit/text_extractor.rb
 - lib/docsplit/transparent_pdfs.rb
 - lib/docsplit.rb
 - bin/docsplit
-- vendor/bcmail.jar
-- vendor/bcprov.jar
-- vendor/commons-logging.jar
-- vendor/fontbox.jar
 - vendor/jodconverter/commons-cli-1.2.jar
 - vendor/jodconverter/commons-io-1.4.jar
 - vendor/jodconverter/jodconverter-2.2.2.jar
@@ -61,11 +47,10 @@ files:
 - vendor/jodconverter/slf4j-jdk14-1.5.6.jar
 - vendor/jodconverter/unoil-3.0.1.jar
 - vendor/logging.properties
-- vendor/pdfbox.jar
 - docsplit.gemspec
 - LICENSE
 - README
-has_rdoc: true
+has_rdoc: false
 homepage: http://documentcloud.github.com/docsplit/
 licenses: []
@@ -75,27 +60,23 @@ rdoc_options: []
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
-  none: false
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      hash: 3
       segments:
       - 0
       version: "0"
 required_rubygems_version: !ruby/object:Gem::Requirement
-  none: false
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      hash: 3
       segments:
       - 0
       version: "0"
 requirements: []
 rubyforge_project: docsplit
-rubygems_version: 1.3.7
+rubygems_version: 1.3.6
 signing_key:
 specification_version: 3
 summary: Break Apart Documents into Images, Text, Pages and PDFs

data/build/org/documentcloud/ExtractInfo$1.class DELETED Viewed

Binary file

data/build/org/documentcloud/ExtractInfo$Keys.class DELETED Viewed

Binary file

data/build/org/documentcloud/ExtractInfo.class DELETED Viewed

Binary file

data/build/org/documentcloud/ExtractPages.class DELETED Viewed

Binary file

data/build/org/documentcloud/ExtractText.class DELETED Viewed

Binary file

data/build/org/documentcloud/Extractor.class DELETED Viewed

Binary file

data/lib/docsplit/ExtractInfo.java DELETED Viewed

@@ -1,63 +0,0 @@
-package org.documentcloud;
-import java.util.List;
-import java.io.IOException;
-import java.text.SimpleDateFormat;
-import org.apache.pdfbox.pdmodel.PDDocument;
-import org.apache.pdfbox.pdmodel.PDDocumentInformation;
-// Extracts metadata from a PDF file.
-public class ExtractInfo extends Extractor {
-  private PDDocument doc;
-  private PDDocumentInformation info;
-  private String key;
-  // The list of metadata keys we know how to extract.
-  private enum Keys {
-    AUTHOR, DATE, CREATOR, KEYWORDS, PRODUCER, SUBJECT, TITLE, LENGTH
-  }
-  // The mainline.
-  public static void main(String[] args) {
-    (new ExtractInfo()).run(args);
-  }
-  // The first argument is always the name of the metadata key.
-  protected void parseArguments(List<String> args) {
-    super.parseArguments(args);
-    key = args.remove(0).toUpperCase();
-  }
-  // Extract the configured bit of metadata from a PDF, decrypting if necessary.
-  public void extract(String pdfPath) {
-    try {
-      doc = PDDocument.load(pdfPath, false);
-      decrypt(doc);
-      info = doc.getDocumentInformation();
-      String val = extractInfo();
-      if (val != null) System.out.println(val);
-      doc.close();
-    } catch(IOException e) {
-      System.out.println(e.getMessage());
-      System.exit(1);
-    }
-  }
-  // Use the PDDocumentInformation object to fetch metadata values as strings.
-  public String extractInfo() throws IOException {
-    switch(Keys.valueOf(key)) {
-      case AUTHOR:    return info.getAuthor();
-      case DATE:      return new SimpleDateFormat("yyyy-MM-dd").format(info.getCreationDate().getTime());
-      case CREATOR:   return info.getCreator();
-      case KEYWORDS:  return info.getKeywords();
-      case PRODUCER:  return info.getProducer();
-      case SUBJECT:   return info.getSubject();
-      case TITLE:     return info.getTitle();
-      case LENGTH:    return String.valueOf(doc.getNumberOfPages());
-      default:        return null;
-    }
-  }
-}

data/lib/docsplit/ExtractPages.java DELETED Viewed

@@ -1,54 +0,0 @@
-package org.documentcloud;
-import java.util.List;
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import org.apache.pdfbox.pdmodel.PDDocument;
-import org.apache.pdfbox.util.Splitter;
-import org.apache.pdfbox.pdfwriter.COSWriter;
-import org.apache.pdfbox.exceptions.COSVisitorException;
-// Use PDFBox's Splitter to break apart a large PDF into individual pages.
-public class ExtractPages extends Extractor {
-  private PDDocument doc;
-  private String basename;
-  // The mainline.
-  public static void main(String[] args) {
-    (new ExtractPages()).run(args);
-  }
-  // Extract each page of the given PDF.
-  public void extract(String pdfPath) {
-    try {
-      basename  = getBasename(pdfPath);
-      doc = PDDocument.load(pdfPath);
-      decrypt(doc);
-      List pages = (new Splitter()).split(doc);
-      if (pageNumbers != null) {
-        for (Integer num : pageNumbers) writePage((PDDocument) pages.get(num.intValue()- 1), num.intValue());
-      } else {
-        for (int i=0; i<pages.size(); i++) writePage((PDDocument) pages.get(i), i + 1);
-      }
-      doc.close();
-    } catch(Exception e) {
-      System.out.println(e.getMessage());
-      System.exit(1);
-    }
-  }
-  // Writes out a page as a single-page PDF.
-  private void writePage(PDDocument page, int pageNumber) throws IOException, COSVisitorException {
-    String pageName       = basename + "_" + String.valueOf(pageNumber) + ".pdf";
-    FileOutputStream out  = new FileOutputStream(outputFile(pageName));
-    COSWriter writer      = new COSWriter(out);
-    writer.write(page);
-    out.close();
-    writer.close();
-    page.close();
-  }
-}

data/lib/docsplit/ExtractText.java DELETED Viewed

@@ -1,80 +0,0 @@
-package org.documentcloud;
-import java.util.List;
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.OutputStreamWriter;
-import org.apache.pdfbox.pdmodel.PDDocument;
-import org.apache.pdfbox.util.PDFTextStripper;
-// Uses PDFBox's PDFTextStripper to extract the full, plain, UTF-8 text of a
-// PDF document. Pass --pages to write out the plain text for each individual
-// page; --pages-only to omit the text for the entire document.
-public class ExtractText extends Extractor {
-  private PDDocument doc;
-  private String basename;
-  // The mainline.
-  public static void main(String[] args) {
-    (new ExtractText()).run(args);
-  }
-  // Extract the plain text for a PDF, and write it into the requested output
-  // sizes.
-  public void extract(String pdfPath) {
-    try {
-      basename = getBasename(pdfPath);
-      doc = PDDocument.load(pdfPath, false);
-      decrypt(doc);
-      if (allPages || (pageNumbers != null)) {
-        writePageText();
-      } else {
-        writeFullText();
-      }
-      doc.close();
-    } catch(IOException e) {
-      System.out.println(e.getMessage());
-      System.exit(1);
-    }
-  }
-  // Write out the extracted full text for the entire PDF.
-  public void writeFullText() throws IOException {
-    OutputStreamWriter output = new OutputStreamWriter(new FileOutputStream(outputFile(basename + ".txt")), "UTF-8");
-    extractTextForPageRange(output, 1, Integer.MAX_VALUE);
-    output.close();
-  }
-  // Write out the full text for each specified page.
-  public void writePageText() throws IOException {
-    if (pageNumbers != null) {
-      for (Integer num : pageNumbers) writePageText(num.intValue());
-    } else {
-      int pages = doc.getNumberOfPages();
-      for (int i=1; i<=pages; i++) writePageText(i);
-    }
-  }
-  // Write out the full text for a single page.
-  public void writePageText(int pageNumber) throws IOException {
-    File outfile = outputFile(basename + "_" + String.valueOf(pageNumber) + ".txt");
-    OutputStreamWriter output = new OutputStreamWriter(new FileOutputStream(outfile), "UTF-8");
-    extractTextForPageRange(output, pageNumber, pageNumber);
-    output.close();
-  }
-  // Internal method to writes out text from the PDF for a given page range
-  // to a provided output stream.
-  private void extractTextForPageRange(OutputStreamWriter output, int startPage, int endPage) throws IOException {
-    PDFTextStripper stripper = new PDFTextStripper("UTF-8");
-    stripper.setSortByPosition(false);
-    stripper.setShouldSeparateByBeads(true);
-    stripper.setStartPage(startPage);
-    stripper.setEndPage(endPage);
-    stripper.writeText(doc, output);
-  }
-}

data/lib/docsplit/Extractor.java DELETED Viewed

@@ -1,91 +0,0 @@
-package org.documentcloud;
-import java.io.File;
-import java.util.List;
-import java.util.Arrays;
-import java.util.ArrayList;
-import java.util.Iterator;
-import org.apache.pdfbox.pdmodel.PDDocument;
-// The base Extractor class contains the common functionality needed to run
-// command-line extractors.
-public abstract class Extractor {
-  protected File output;
-  protected boolean allPages = false;
-  protected ArrayList<Integer> pageNumbers;
-  // Running an extractor consists of converting the arguments array into a
-  // more manageable List, parsing arguments, and extracting pdfs.
-  public void run(String[] arguments) {
-    List<String> args = new ArrayList<String>(Arrays.asList(arguments));
-    parseArguments(args);
-    Iterator<String> iter = args.iterator();
-    while(iter.hasNext()) extract(iter.next());
-  }
-  // Subclasses must override "extract" to perform their specific extraction.
-  public abstract void extract(String pdfPath);
-  // The default "parseArguments" method handles common arguments.
-  protected void parseArguments(List<String> args) {
-    int dirLoc = args.indexOf("--output");
-    if (dirLoc >= 0) {
-      output = new File(args.remove(dirLoc + 1));
-      args.remove(dirLoc);
-    }
-    int pagesLoc = args.indexOf("--pages");
-    if (pagesLoc >= 0) {
-      parsePages(args.remove(pagesLoc + 1));
-      args.remove(pagesLoc);
-    }
-  }
-  // Utility function to get the basename of a file path.
-  // After File.basename in Ruby.
-  public String getBasename(String pdfPath) {
-    String basename = new File(pdfPath).getName();
-    return basename.substring(0, basename.lastIndexOf('.'));
-  }
-  // Get a reference to an output file, placed inside any configured directories,
-  // while ensuring that parent directories exist.
-  public File outputFile(String path) {
-    File file = output != null ? new File(output, path) : new File(path);
-    File parent = file.getParentFile();
-    if (parent != null) parent.mkdirs();
-    return file;
-  }
-  // Decrypt a non-passworded but still encrypted document.
-  public void decrypt(PDDocument doc) {
-    if (!doc.isEncrypted()) return;
-    try {
-      doc.decrypt("");
-    } catch (Exception e) {
-      System.out.println("Error decrypting document, details: " + e.getMessage());
-      System.exit(1);
-    }
-  }
-  private void parsePages(String pageList) {
-    if (pageList.equals("all")) {
-      allPages = true;
-      return;
-    }
-    pageNumbers = new ArrayList<Integer>();
-    String[] groups = pageList.split(",");
-    for (String group : groups) {
-      if (group.contains("-")) {
-        String[] range = group.split("-");
-        int start = Integer.parseInt(range[0]);
-        int end = Integer.parseInt(range[1]);
-        for (int i=start; i<=end; i++) pageNumbers.add(new Integer(i));
-      } else {
-        pageNumbers.add(new Integer(Integer.parseInt(group)));
-      }
-    }
-  }
-}

data/lib/docsplit/argument_parser.rb DELETED Viewed

@@ -1,31 +0,0 @@
-module Docsplit
-  module ArgumentParser
-    # Flatten an options hash into an arguments string suitable for the command
-    # line.
-    def parse_options(opts)
-      opts.map {|k, v| ["--#{k}", normalize_value(v)] }.flatten.join(' ')
-    end
-    # Normalize a value in an options hash for the command line.
-    # Ranges look like: 1-10, Arrays like: 1,2,3.
-    def normalize_value(value)
-      case value
-      when Range then normalize_range(value)
-      when Array then value.map! {|v| v.is_a?(Range) ? normalize_range(v) : v }.join(',')
-      else            value.to_s
-      end
-    end
-    # Serialize a Ruby range into it's command-line equivalent.
-    def normalize_range(range)
-      arr = range.to_a
-      arr.empty? ? range.first.to_s : "#{range.first}-#{arr.last}"
-    end
-  end
-  extend ArgumentParser
-end

data/vendor/bcmail.jar DELETED Viewed

Binary file

data/vendor/bcprov.jar DELETED Viewed

Binary file

data/vendor/commons-logging.jar DELETED Viewed

Binary file

data/vendor/fontbox.jar DELETED Viewed

Binary file

data/vendor/pdfbox.jar DELETED Viewed

Binary file