docsplit 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. data/LICENSE +22 -0
  2. data/README +22 -0
  3. data/bin/docsplit +5 -0
  4. data/build/org/documentcloud/ExtractInfo$1.class +0 -0
  5. data/build/org/documentcloud/ExtractInfo$Keys.class +0 -0
  6. data/build/org/documentcloud/ExtractInfo.class +0 -0
  7. data/build/org/documentcloud/ExtractPages.class +0 -0
  8. data/build/org/documentcloud/ExtractText.class +0 -0
  9. data/build/org/documentcloud/Extractor.class +0 -0
  10. data/docsplit.gemspec +25 -0
  11. data/lib/docsplit/ExtractInfo.java +63 -0
  12. data/lib/docsplit/ExtractPages.java +54 -0
  13. data/lib/docsplit/ExtractText.java +80 -0
  14. data/lib/docsplit/Extractor.java +91 -0
  15. data/lib/docsplit/argument_parser.rb +31 -0
  16. data/lib/docsplit/command_line.rb +107 -0
  17. data/lib/docsplit/image_extractor.rb +94 -0
  18. data/lib/docsplit/transparent_pdfs.rb +26 -0
  19. data/lib/docsplit.rb +78 -0
  20. data/vendor/bcmail.jar +0 -0
  21. data/vendor/bcprov.jar +0 -0
  22. data/vendor/commons-logging.jar +0 -0
  23. data/vendor/fontbox.jar +0 -0
  24. data/vendor/jodconverter/commons-cli-1.2.jar +0 -0
  25. data/vendor/jodconverter/commons-io-1.4.jar +0 -0
  26. data/vendor/jodconverter/jodconverter-2.2.2.jar +0 -0
  27. data/vendor/jodconverter/jodconverter-cli-2.2.2.jar +0 -0
  28. data/vendor/jodconverter/juh-3.0.1.jar +0 -0
  29. data/vendor/jodconverter/jurt-3.0.1.jar +0 -0
  30. data/vendor/jodconverter/ridl-3.0.1.jar +0 -0
  31. data/vendor/jodconverter/slf4j-api-1.5.6.jar +0 -0
  32. data/vendor/jodconverter/slf4j-jdk14-1.5.6.jar +0 -0
  33. data/vendor/jodconverter/unoil-3.0.1.jar +0 -0
  34. data/vendor/logging.properties +1 -0
  35. data/vendor/pdfbox.jar +0 -0
  36. metadata +89 -0
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2009 Jeremy Ashkenas, DocumentCloud
2
+
3
+ Permission is hereby granted, free of charge, to any person
4
+ obtaining a copy of this software and associated documentation
5
+ files (the "Software"), to deal in the Software without
6
+ restriction, including without limitation the rights to use,
7
+ copy, modify, merge, publish, distribute, sublicense, and/or sell
8
+ copies of the Software, and to permit persons to whom the
9
+ Software is furnished to do so, subject to the following
10
+ conditions:
11
+
12
+ The above copyright notice and this permission notice shall be
13
+ included in all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
17
+ OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
19
+ HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
20
+ WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22
+ OTHER DEALINGS IN THE SOFTWARE.
data/README ADDED
@@ -0,0 +1,22 @@
1
+ ==
2
+ __ ___ __
3
+ ____/ /___ ______________ / (_) /_
4
+ / __ / __ \/ ___/ ___/ __ \/ / / __/
5
+ / /_/ / /_/ / /__(__ ) /_/ / / / /_
6
+ \____/\____/\___/____/ .___/_/_/\__/
7
+ /_/
8
+
9
+ Docsplit is a command-line utility and Ruby library for splitting apart
10
+ documents into their component parts: searchable UTF-8 plain text, page
11
+ images or thumbnails in any format, PDFs, single pages, and document
12
+ metadata (title, author, number of pages...)
13
+
14
+ Installation:
15
+ gem install docsplit
16
+
17
+ For documentation, usage, and examples, see:
18
+ http://documentcloud.github.com/docsplit/
19
+
20
+ To suggest a feature or report a bug:
21
+ http://github.com/documentcloud/docsplit/issues/
22
+
data/bin/docsplit ADDED
@@ -0,0 +1,5 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "#{File.dirname(__FILE__)}/../lib/docsplit/command_line.rb"
4
+
5
+ Docsplit::CommandLine.new
data/docsplit.gemspec ADDED
@@ -0,0 +1,25 @@
1
+ Gem::Specification.new do |s|
2
+ s.name = 'docsplit'
3
+ s.version = '0.1.0' # Keep version in sync with docsplit.rb
4
+ s.date = '2009-12-07'
5
+
6
+ s.homepage = "http://documentcloud.github.com/docsplit/"
7
+ s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
8
+ s.description = <<-EOS
9
+ Docsplit is a command-line utility and Ruby library for splitting apart
10
+ documents into their component parts: searchable UTF-8 plain text, page
11
+ images or thumbnails in any format, PDFs, single pages, and document
12
+ metadata (title, author, number of pages...)
13
+ EOS
14
+
15
+ s.authors = ['Jeremy Ashkenas']
16
+ s.email = 'jeremy@documentcloud.org'
17
+ s.rubyforge_project = 'docsplit'
18
+ s.has_rdoc = false
19
+
20
+ s.require_paths = ['lib']
21
+ s.executables = ['docsplit']
22
+
23
+ s.files = Dir['build/**/*', 'lib/**/*', 'bin/*', 'vendor/**/*',
24
+ 'docsplit.gemspec', 'LICENSE', 'README']
25
+ end
@@ -0,0 +1,63 @@
1
+ package org.documentcloud;
2
+
3
+ import java.util.List;
4
+ import java.io.IOException;
5
+ import java.text.SimpleDateFormat;
6
+
7
+ import org.apache.pdfbox.pdmodel.PDDocument;
8
+ import org.apache.pdfbox.pdmodel.PDDocumentInformation;
9
+
10
+ // Extracts metadata from a PDF file.
11
+ public class ExtractInfo extends Extractor {
12
+
13
+ private PDDocument doc;
14
+ private PDDocumentInformation info;
15
+ private String key;
16
+
17
+ // The list of metadata keys we know how to extract.
18
+ private enum Keys {
19
+ AUTHOR, DATE, CREATOR, KEYWORDS, PRODUCER, SUBJECT, TITLE, LENGTH
20
+ }
21
+
22
+ // The mainline.
23
+ public static void main(String[] args) {
24
+ (new ExtractInfo()).run(args);
25
+ }
26
+
27
+ // The first argument is always the name of the metadata key.
28
+ protected void parseArguments(List<String> args) {
29
+ super.parseArguments(args);
30
+ key = args.remove(0).toUpperCase();
31
+ }
32
+
33
+ // Extract the configured bit of metadata from a PDF, decrypting if necessary.
34
+ public void extract(String pdfPath) {
35
+ try {
36
+ doc = PDDocument.load(pdfPath, false);
37
+ decrypt(doc);
38
+ info = doc.getDocumentInformation();
39
+ String val = extractInfo();
40
+ if (val != null) System.out.println(val);
41
+ doc.close();
42
+ } catch(IOException e) {
43
+ System.out.println(e.getMessage());
44
+ System.exit(1);
45
+ }
46
+ }
47
+
48
+ // Use the PDDocumentInformation object to fetch metadata values as strings.
49
+ public String extractInfo() throws IOException {
50
+ switch(Keys.valueOf(key)) {
51
+ case AUTHOR: return info.getAuthor();
52
+ case DATE: return new SimpleDateFormat("yyyy-MM-dd").format(info.getCreationDate().getTime());
53
+ case CREATOR: return info.getCreator();
54
+ case KEYWORDS: return info.getKeywords();
55
+ case PRODUCER: return info.getProducer();
56
+ case SUBJECT: return info.getSubject();
57
+ case TITLE: return info.getTitle();
58
+ case LENGTH: return String.valueOf(doc.getNumberOfPages());
59
+ default: return null;
60
+ }
61
+ }
62
+
63
+ }
@@ -0,0 +1,54 @@
1
+ package org.documentcloud;
2
+
3
+ import java.util.List;
4
+ import java.io.File;
5
+ import java.io.FileOutputStream;
6
+ import java.io.IOException;
7
+
8
+ import org.apache.pdfbox.pdmodel.PDDocument;
9
+ import org.apache.pdfbox.util.Splitter;
10
+ import org.apache.pdfbox.pdfwriter.COSWriter;
11
+ import org.apache.pdfbox.exceptions.COSVisitorException;
12
+
13
+ // Use PDFBox's Splitter to break apart a large PDF into individual pages.
14
+ public class ExtractPages extends Extractor {
15
+
16
+ private PDDocument doc;
17
+ private String basename;
18
+
19
+ // The mainline.
20
+ public static void main(String[] args) {
21
+ (new ExtractPages()).run(args);
22
+ }
23
+
24
+ // Extract each page of the given PDF.
25
+ public void extract(String pdfPath) {
26
+ try {
27
+ basename = getBasename(pdfPath);
28
+ doc = PDDocument.load(pdfPath);
29
+ decrypt(doc);
30
+ List pages = (new Splitter()).split(doc);
31
+ if (pageNumbers != null) {
32
+ for (Integer num : pageNumbers) writePage((PDDocument) pages.get(num.intValue()- 1), num.intValue());
33
+ } else {
34
+ for (int i=0; i<pages.size(); i++) writePage((PDDocument) pages.get(i), i + 1);
35
+ }
36
+ doc.close();
37
+ } catch(Exception e) {
38
+ System.out.println(e.getMessage());
39
+ System.exit(1);
40
+ }
41
+ }
42
+
43
+ // Writes out a page as a single-page PDF.
44
+ private void writePage(PDDocument page, int pageNumber) throws IOException, COSVisitorException {
45
+ String pageName = basename + "_" + String.valueOf(pageNumber) + ".pdf";
46
+ FileOutputStream out = new FileOutputStream(outputFile(pageName));
47
+ COSWriter writer = new COSWriter(out);
48
+ writer.write(page);
49
+ out.close();
50
+ writer.close();
51
+ page.close();
52
+ }
53
+
54
+ }
@@ -0,0 +1,80 @@
1
+ package org.documentcloud;
2
+
3
+ import java.util.List;
4
+ import java.io.File;
5
+ import java.io.FileOutputStream;
6
+ import java.io.IOException;
7
+ import java.io.OutputStreamWriter;
8
+
9
+ import org.apache.pdfbox.pdmodel.PDDocument;
10
+ import org.apache.pdfbox.util.PDFTextStripper;
11
+
12
+ // Uses PDFBox's PDFTextStripper to extract the full, plain, UTF-8 text of a
13
+ // PDF document. Pass --pages to write out the plain text for each individual
14
+ // page; --pages-only to omit the text for the entire document.
15
+ public class ExtractText extends Extractor {
16
+
17
+ private PDDocument doc;
18
+ private String basename;
19
+
20
+ // The mainline.
21
+ public static void main(String[] args) {
22
+ (new ExtractText()).run(args);
23
+ }
24
+
25
+ // Extract the plain text for a PDF, and write it into the requested output
26
+ // sizes.
27
+ public void extract(String pdfPath) {
28
+ try {
29
+ basename = getBasename(pdfPath);
30
+ doc = PDDocument.load(pdfPath, false);
31
+ decrypt(doc);
32
+ if (allPages || (pageNumbers != null)) {
33
+ writePageText();
34
+ } else {
35
+ writeFullText();
36
+ }
37
+ doc.close();
38
+ } catch(IOException e) {
39
+ System.out.println(e.getMessage());
40
+ System.exit(1);
41
+ }
42
+ }
43
+
44
+ // Write out the extracted full text for the entire PDF.
45
+ public void writeFullText() throws IOException {
46
+ OutputStreamWriter output = new OutputStreamWriter(new FileOutputStream(outputFile(basename + ".txt")), "UTF-8");
47
+ extractTextForPageRange(output, 1, Integer.MAX_VALUE);
48
+ output.close();
49
+ }
50
+
51
+ // Write out the full text for each specified page.
52
+ public void writePageText() throws IOException {
53
+ if (pageNumbers != null) {
54
+ for (Integer num : pageNumbers) writePageText(num.intValue());
55
+ } else {
56
+ int pages = doc.getNumberOfPages();
57
+ for (int i=1; i<=pages; i++) writePageText(i);
58
+ }
59
+ }
60
+
61
+ // Write out the full text for a single page.
62
+ public void writePageText(int pageNumber) throws IOException {
63
+ File outfile = outputFile(basename + "_" + String.valueOf(pageNumber) + ".txt");
64
+ OutputStreamWriter output = new OutputStreamWriter(new FileOutputStream(outfile), "UTF-8");
65
+ extractTextForPageRange(output, pageNumber, pageNumber);
66
+ output.close();
67
+ }
68
+
69
+ // Internal method that writes out text from the PDF for a given page range
70
+ // to a provided output stream.
71
+ private void extractTextForPageRange(OutputStreamWriter output, int startPage, int endPage) throws IOException {
72
+ PDFTextStripper stripper = new PDFTextStripper("UTF-8");
73
+ stripper.setSortByPosition(false);
74
+ stripper.setShouldSeparateByBeads(true);
75
+ stripper.setStartPage(startPage);
76
+ stripper.setEndPage(endPage);
77
+ stripper.writeText(doc, output);
78
+ }
79
+
80
+ }
@@ -0,0 +1,91 @@
1
+ package org.documentcloud;
2
+
3
+ import java.io.File;
4
+ import java.util.List;
5
+ import java.util.Arrays;
6
+ import java.util.ArrayList;
7
+ import java.util.Iterator;
8
+
9
+ import org.apache.pdfbox.pdmodel.PDDocument;
10
+
11
+ // The base Extractor class contains the common functionality needed to run
12
+ // command-line extractors.
13
+ public abstract class Extractor {
14
+
15
+ protected File output;
16
+ protected boolean allPages = false;
17
+ protected ArrayList<Integer> pageNumbers;
18
+
19
+ // Running an extractor consists of converting the arguments array into a
20
+ // more manageable List, parsing arguments, and extracting pdfs.
21
+ public void run(String[] arguments) {
22
+ List<String> args = new ArrayList<String>(Arrays.asList(arguments));
23
+ parseArguments(args);
24
+ Iterator<String> iter = args.iterator();
25
+ while(iter.hasNext()) extract(iter.next());
26
+ }
27
+
28
+ // Subclasses must override "extract" to perform their specific extraction.
29
+ public abstract void extract(String pdfPath);
30
+
31
+ // The default "parseArguments" method handles common arguments.
32
+ protected void parseArguments(List<String> args) {
33
+ int dirLoc = args.indexOf("--output");
34
+ if (dirLoc >= 0) {
35
+ output = new File(args.remove(dirLoc + 1));
36
+ args.remove(dirLoc);
37
+ }
38
+ int pagesLoc = args.indexOf("--pages");
39
+ if (pagesLoc >= 0) {
40
+ parsePages(args.remove(pagesLoc + 1));
41
+ args.remove(pagesLoc);
42
+ }
43
+ }
44
+
45
+ // Utility function to get the basename of a file path.
46
+ // After File.basename in Ruby.
47
+ public String getBasename(String pdfPath) {
48
+ String basename = new File(pdfPath).getName();
49
+ return basename.substring(0, basename.lastIndexOf('.'));
50
+ }
51
+
52
+ // Get a reference to an output file, placed inside any configured directories,
53
+ // while ensuring that parent directories exist.
54
+ public File outputFile(String path) {
55
+ File file = output != null ? new File(output, path) : new File(path);
56
+ File parent = file.getParentFile();
57
+ if (parent != null) parent.mkdirs();
58
+ return file;
59
+ }
60
+
61
+ // Decrypt a non-passworded but still encrypted document.
62
+ public void decrypt(PDDocument doc) {
63
+ if (!doc.isEncrypted()) return;
64
+ try {
65
+ doc.decrypt("");
66
+ } catch (Exception e) {
67
+ System.out.println("Error decrypting document, details: " + e.getMessage());
68
+ System.exit(1);
69
+ }
70
+ }
71
+
72
+ private void parsePages(String pageList) {
73
+ if (pageList.equals("all")) {
74
+ allPages = true;
75
+ return;
76
+ }
77
+ pageNumbers = new ArrayList<Integer>();
78
+ String[] groups = pageList.split(",");
79
+ for (String group : groups) {
80
+ if (group.contains("-")) {
81
+ String[] range = group.split("-");
82
+ int start = Integer.parseInt(range[0]);
83
+ int end = Integer.parseInt(range[1]);
84
+ for (int i=start; i<=end; i++) pageNumbers.add(new Integer(i));
85
+ } else {
86
+ pageNumbers.add(new Integer(Integer.parseInt(group)));
87
+ }
88
+ }
89
+ }
90
+
91
+ }
@@ -0,0 +1,31 @@
1
+ module Docsplit
2
+
3
+ module ArgumentParser
4
+
5
+ # Flatten an options hash into an arguments string suitable for the command
6
+ # line.
7
+ def parse_options(opts)
8
+ opts.map {|k, v| ["--#{k}", normalize_value(v)] }.flatten.join(' ')
9
+ end
10
+
11
+ # Normalize a value in an options hash for the command line.
12
+ # Ranges look like: 1-10, Arrays like: 1,2,3.
13
+ def normalize_value(value)
14
+ case value
15
+ when Range then normalize_range(value)
16
+ when Array then value.map! {|v| v.is_a?(Range) ? normalize_range(v) : v }.join(',')
17
+ else value.to_s
18
+ end
19
+ end
20
+
21
+ # Serialize a Ruby range into its command-line equivalent.
22
+ def normalize_range(range)
23
+ arr = range.to_a
24
+ arr.empty? ? range.first.to_s : "#{range.first}-#{arr.last}"
25
+ end
26
+
27
+ end
28
+
29
+ extend ArgumentParser
30
+
31
+ end
@@ -0,0 +1,107 @@
1
+ require 'optparse'
2
+ require File.expand_path(File.dirname(__FILE__) + '/../docsplit')
3
+
4
+ module Docsplit
5
+
6
+ # A single command-line utility to separate a PDF into all its component parts.
7
+ class CommandLine
8
+
9
+ BANNER = <<-EOS
10
+ docsplit breaks apart documents into images, text, or individual pages.
11
+ It wraps PDFBox, GraphicsMagick, and JODConverter.
12
+
13
+ Usage:
14
+ docsplit COMMAND [OPTIONS] path/to/doc.pdf
15
+ Main commands:
16
+ pages, images, text, pdf.
17
+ Metadata commands:
18
+ author, date, creator, keywords, producer, subject, title, length.
19
+
20
+ Example:
21
+ docsplit images --size 700x --format jpg document.pdf
22
+
23
+ Dependencies:
24
+ Ruby, Java, A working GraphicsMagick (gm) command,
25
+ and a headless OpenOffice server for non-PDF documents.
26
+
27
+ Options:
28
+ (size, pages and format can take comma-separated values)
29
+
30
+ EOS
31
+
32
+ # Creating a CommandLine runs off of the contents of ARGV.
33
+ def initialize
34
+ parse_options
35
+ cmd = ARGV.shift
36
+ @command = cmd && cmd.to_sym
37
+ run
38
+ end
39
+
40
+ # Delegate to the Docsplit Ruby API to perform all extractions.
41
+ def run
42
+ begin
43
+ case @command
44
+ when :images then Docsplit.extract_images(ARGV, @options)
45
+ when :pages then Docsplit.extract_pages(ARGV, @options)
46
+ when :text then Docsplit.extract_text(ARGV, @options)
47
+ when :pdf then Docsplit.extract_pdf(ARGV, @options)
48
+ else
49
+ if METADATA_KEYS.include?(@command)
50
+ value = Docsplit.send("extract_#{@command}", ARGV, @options)
51
+ puts value unless value.nil?
52
+ else
53
+ usage
54
+ end
55
+ end
56
+ rescue ExtractionFailed => e
57
+ puts e.message.chomp
58
+ exit(1)
59
+ end
60
+ end
61
+
62
+ # Print out the usage help message.
63
+ def usage
64
+ puts "\n#{@option_parser}\n"
65
+ exit
66
+ end
67
+
68
+
69
+ private
70
+
71
+ # Use the OptionParser library to parse out all supported options. Return
72
+ # options formatted for the Ruby API.
73
+ def parse_options
74
+ @options = {}
75
+ @option_parser = OptionParser.new do |opts|
76
+ opts.on('-o', '--output [DIR]', 'set the directory for all output') do |d|
77
+ @options[:output] = d
78
+ end
79
+ opts.on('-p', '--pages [PAGES]', "extract specific pages (eg: 5-10)") do |p|
80
+ @options[:pages] = p
81
+ end
82
+ opts.on('-s', '--size [SIZE]', 'set a fixed size (eg: 50x75)') do |s|
83
+ @options[:size] = s.split(',')
84
+ end
85
+ opts.on('-f', '--format [FORMAT]', 'set image format (pdf, jpg, gif...)') do |t|
86
+ @options[:format] = t.split(',')
87
+ end
88
+ opts.on_tail('-v', '--version', 'display docsplit version') do
89
+ puts "docsplit version #{Docsplit::VERSION}"
90
+ exit
91
+ end
92
+ opts.on_tail('-h', '--help', 'display this help message') do
93
+ usage
94
+ end
95
+ end
96
+ @option_parser.banner = BANNER
97
+ begin
98
+ @option_parser.parse!(ARGV)
99
+ rescue OptionParser::InvalidOption => e
100
+ puts e.message
101
+ exit(1)
102
+ end
103
+ end
104
+
105
+ end
106
+
107
+ end
@@ -0,0 +1,94 @@
1
+ module Docsplit
2
+
3
+ # Delegates to GraphicsMagick in order to convert PDF documents into
4
+ # nicely sized images.
5
+ class ImageExtractor
6
+
7
+ DENSITY_ARG = "-density 150"
8
+ DEFAULT_FORMAT = :png
9
+
10
+ # Extract a list of PDFs as rasterized page images, according to the
11
+ # configuration in options.
12
+ def extract(pdfs, options)
13
+ @pdfs = [pdfs].flatten
14
+ extract_options(options)
15
+ @pdfs.each {|p| @sizes.each {|s| @formats.each {|f| convert(p, s, f) }}}
16
+ end
17
+
18
+ # Convert a single PDF into page images at the specified size and format.
19
+ def convert(pdf, size, format)
20
+ basename = File.basename(pdf, File.extname(pdf))
21
+ subfolder = @sizes.length > 1 ? size.to_s : ''
22
+ directory = File.join(@output, subfolder)
23
+ FileUtils.mkdir_p(directory) unless File.exists?(directory)
24
+ out_file = File.join(directory, "#{basename}_%05d.#{format}")
25
+ cmd = "gm convert #{DENSITY_ARG} #{resize_arg(size)} #{quality_arg(format)} \"#{pdf}#{pages_arg}\" \"#{out_file}\" 2>&1"
26
+ result = `#{cmd}`.chomp
27
+ raise ExtractionFailed, result if $? != 0
28
+ renumber_images(out_file, format)
29
+ end
30
+
31
+
32
+ private
33
+
34
+ # Extract the relevant GraphicsMagick options from the options hash.
35
+ def extract_options(options)
36
+ @output = options[:output] || '.'
37
+ @pages = options[:pages]
38
+ @formats = [options[:format] || DEFAULT_FORMAT].flatten
39
+ @sizes = [options[:size]].flatten.compact
40
+ @sizes = [nil] if @sizes.empty?
41
+ end
42
+
43
+ # Generate the resize argument.
44
+ def resize_arg(size)
45
+ size.nil? ? '' : "-resize #{size}"
46
+ end
47
+
48
+ # Generate the appropriate quality argument for the image format.
49
+ def quality_arg(format)
50
+ case format.to_s
51
+ when /jpe?g/ then "-quality 85"
52
+ when /png/ then "-quality 100"
53
+ else ""
54
+ end
55
+ end
56
+
57
+ # Generate the requested page index into the document.
58
+ def pages_arg
59
+ return '' if @pages.nil?
60
+ pages = @pages.gsub(/\d+/) {|digits| (digits.to_i - 1).to_s }
61
+ "[#{pages}]"
62
+ end
63
+
64
+ # Generate the expanded list of requested page numbers.
65
+ def page_list
66
+ @pages.split(',').map { |range|
67
+ if range.include?('-')
68
+ range = range.split('-')
69
+ Range.new(range.first, range.last).to_a.map {|n| n.to_i }
70
+ else
71
+ range.to_i
72
+ end
73
+ }.flatten.sort
74
+ end
75
+
76
+ # When GraphicsMagick is through, it will have generated a number of
77
+ # incrementing page images, starting at 0. Renumber them with their correct
78
+ # page numbers.
79
+ def renumber_images(template, format)
80
+ suffixer = /_0+(\d+)\.#{format}\Z/
81
+ images = Dir[template.sub('%05d', '0*')].map do |path|
82
+ index = path[suffixer, 1].to_i
83
+ {:path => path, :index => index, :page_number => index + 1}
84
+ end
85
+ numbers = @pages ? page_list.reverse : nil
86
+ images.sort_by {|i| -i[:page_number] }.each_with_index do |image, i|
87
+ number = numbers ? numbers[i] : image[:page_number]
88
+ FileUtils.mv(image[:path], image[:path].sub(suffixer, "_#{number}.#{format}"))
89
+ end
90
+ end
91
+
92
+ end
93
+
94
+ end
@@ -0,0 +1,26 @@
1
+ module Docsplit
2
+
3
+ # Include a method to transparently convert non-PDF arguments to temporary
4
+ # PDFs. Allows us to pretend to natively support docs, rtf, ppt, and so on.
5
+ module TransparentPDFs
6
+
7
+ # Temporarily convert any non-PDF documents to PDFs before running them
8
+ # through further extraction.
9
+ def ensure_pdfs(docs)
10
+ [docs].flatten.map do |doc|
11
+ ext = File.extname(doc)
12
+ if ext == '.pdf'
13
+ doc
14
+ else
15
+ @tempdir ||= File.join(Dir.tmpdir, 'docsplit')
16
+ extract_pdf([doc], {:output => @tempdir})
17
+ File.join(@tempdir, File.basename(doc, ext) + '.pdf')
18
+ end
19
+ end
20
+ end
21
+
22
+ end
23
+
24
+ extend TransparentPDFs
25
+
26
+ end
data/lib/docsplit.rb ADDED
@@ -0,0 +1,78 @@
1
+ # The Docsplit module delegates to the Java PDF extractors.
2
+ module Docsplit
3
+
4
+ VERSION = '0.1.0' # Keep in sync with gemspec.
5
+
6
+ ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
7
+
8
+ CLASSPATH = "#{ROOT}/build#{File::PATH_SEPARATOR}#{ROOT}/vendor/'*'"
9
+
10
+ LOGGING = "-Djava.util.logging.config.file=#{ROOT}/vendor/logging.properties"
11
+
12
+ HEADLESS = "-Djava.awt.headless=true"
13
+
14
+ METADATA_KEYS = [:author, :date, :creator, :keywords, :producer, :subject, :title, :length]
15
+
16
+ # Raise an ExtractionFailed exception when the PDF is encrypted, or otherwise
17
+ # broken.
18
+ class ExtractionFailed < StandardError; end
19
+
20
+ # Use the ExtractPages Java class to burst a PDF into single pages.
21
+ def self.extract_pages(pdfs, opts={})
22
+ pdfs = ensure_pdfs(pdfs)
23
+ run "org.documentcloud.ExtractPages", pdfs, opts
24
+ end
25
+
26
+ # Use the ExtractText Java class to write out all embedded text.
27
+ def self.extract_text(pdfs, opts={})
28
+ pdfs = ensure_pdfs(pdfs)
29
+ run "org.documentcloud.ExtractText", pdfs, opts
30
+ end
31
+
32
+ # Use the ImageExtractor class (GraphicsMagick) to rasterize a PDF into each page's image.
33
+ def self.extract_images(pdfs, opts={})
34
+ pdfs = ensure_pdfs(pdfs)
35
+ opts[:pages] = normalize_value(opts[:pages]) if opts[:pages]
36
+ ImageExtractor.new.extract(pdfs, opts)
37
+ end
38
+
39
+ # Use JODConverter to extract the documents as PDFs.
40
+ def self.extract_pdf(docs, opts={})
41
+ [docs].flatten.each do |doc|
42
+ basename = File.basename(doc, File.extname(doc))
43
+ run "-jar #{ROOT}/vendor/jodconverter/jodconverter-cli-2.2.2.jar #{doc} #{opts[:output] || '.'}/#{basename}.pdf", [], {}
44
+ end
45
+ end
46
+
47
+ # Define custom methods for each of the metadata keys that we support.
48
+ # Use the ExtractInfo Java class to print out a single bit of metadata.
49
+ METADATA_KEYS.each do |key|
50
+ instance_eval <<-EOS
51
+ def self.extract_#{key}(pdfs, opts={})
52
+ pdfs = ensure_pdfs(pdfs)
53
+ result = run "org.documentcloud.ExtractInfo #{key}", pdfs, opts, true
54
+ :#{key} == :length ? result.to_i : result
55
+ end
56
+ EOS
57
+ end
58
+
59
+
60
+ private
61
+
62
+ # Runs a Java command, with quieted logging, and the classpath set properly.
63
+ def self.run(command, pdfs, opts, return_output=false)
64
+ pdfs = [pdfs].flatten.map{|pdf| "\"#{pdf}\""}.join(' ')
65
+ args = parse_options(opts)
66
+ cmd = "java #{HEADLESS} #{LOGGING} -cp #{CLASSPATH} #{command} #{args} #{pdfs} 2>&1"
67
+ result = `#{cmd}`.chomp
68
+ raise ExtractionFailed, result if $? != 0
69
+ return return_output ? (result.empty? ? nil : result) : true
70
+ end
71
+
72
+ end
73
+
74
+ require 'tmpdir'
75
+ require 'fileutils'
76
+ require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
77
+ require "#{Docsplit::ROOT}/lib/docsplit/argument_parser"
78
+ require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
data/vendor/bcmail.jar ADDED
Binary file
data/vendor/bcprov.jar ADDED
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
@@ -0,0 +1 @@
1
+ .level=WARNING
data/vendor/pdfbox.jar ADDED
Binary file
metadata ADDED
@@ -0,0 +1,89 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: docsplit
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Jeremy Ashkenas
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-12-07 00:00:00 -05:00
13
+ default_executable:
14
+ dependencies: []
15
+
16
+ description: " Docsplit is a command-line utility and Ruby library for splitting apart\n documents into their component parts: searchable UTF-8 plain text, page\n images or thumbnails in any format, PDFs, single pages, and document\n metadata (title, author, number of pages...)\n"
17
+ email: jeremy@documentcloud.org
18
+ executables:
19
+ - docsplit
20
+ extensions: []
21
+
22
+ extra_rdoc_files: []
23
+
24
+ files:
25
+ - build/org/documentcloud/ExtractInfo$1.class
26
+ - build/org/documentcloud/ExtractInfo$Keys.class
27
+ - build/org/documentcloud/ExtractInfo.class
28
+ - build/org/documentcloud/Extractor.class
29
+ - build/org/documentcloud/ExtractPages.class
30
+ - build/org/documentcloud/ExtractText.class
31
+ - lib/docsplit/argument_parser.rb
32
+ - lib/docsplit/command_line.rb
33
+ - lib/docsplit/ExtractInfo.java
34
+ - lib/docsplit/Extractor.java
35
+ - lib/docsplit/ExtractPages.java
36
+ - lib/docsplit/ExtractText.java
37
+ - lib/docsplit/image_extractor.rb
38
+ - lib/docsplit/transparent_pdfs.rb
39
+ - lib/docsplit.rb
40
+ - bin/docsplit
41
+ - vendor/bcmail.jar
42
+ - vendor/bcprov.jar
43
+ - vendor/commons-logging.jar
44
+ - vendor/fontbox.jar
45
+ - vendor/jodconverter/commons-cli-1.2.jar
46
+ - vendor/jodconverter/commons-io-1.4.jar
47
+ - vendor/jodconverter/jodconverter-2.2.2.jar
48
+ - vendor/jodconverter/jodconverter-cli-2.2.2.jar
49
+ - vendor/jodconverter/juh-3.0.1.jar
50
+ - vendor/jodconverter/jurt-3.0.1.jar
51
+ - vendor/jodconverter/ridl-3.0.1.jar
52
+ - vendor/jodconverter/slf4j-api-1.5.6.jar
53
+ - vendor/jodconverter/slf4j-jdk14-1.5.6.jar
54
+ - vendor/jodconverter/unoil-3.0.1.jar
55
+ - vendor/logging.properties
56
+ - vendor/pdfbox.jar
57
+ - docsplit.gemspec
58
+ - LICENSE
59
+ - README
60
+ has_rdoc: false
61
+ homepage: http://documentcloud.github.com/docsplit/
62
+ licenses: []
63
+
64
+ post_install_message:
65
+ rdoc_options: []
66
+
67
+ require_paths:
68
+ - lib
69
+ required_ruby_version: !ruby/object:Gem::Requirement
70
+ requirements:
71
+ - - ">="
72
+ - !ruby/object:Gem::Version
73
+ version: "0"
74
+ version:
75
+ required_rubygems_version: !ruby/object:Gem::Requirement
76
+ requirements:
77
+ - - ">="
78
+ - !ruby/object:Gem::Version
79
+ version: "0"
80
+ version:
81
+ requirements: []
82
+
83
+ rubyforge_project: docsplit
84
+ rubygems_version: 1.3.5
85
+ signing_key:
86
+ specification_version: 3
87
+ summary: Break Apart Documents into Images, Text, Pages and PDFs
88
+ test_files: []
89
+