docsplit 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (36) hide show
  1. data/LICENSE +22 -0
  2. data/README +22 -0
  3. data/bin/docsplit +5 -0
  4. data/build/org/documentcloud/ExtractInfo$1.class +0 -0
  5. data/build/org/documentcloud/ExtractInfo$Keys.class +0 -0
  6. data/build/org/documentcloud/ExtractInfo.class +0 -0
  7. data/build/org/documentcloud/ExtractPages.class +0 -0
  8. data/build/org/documentcloud/ExtractText.class +0 -0
  9. data/build/org/documentcloud/Extractor.class +0 -0
  10. data/docsplit.gemspec +25 -0
  11. data/lib/docsplit/ExtractInfo.java +63 -0
  12. data/lib/docsplit/ExtractPages.java +54 -0
  13. data/lib/docsplit/ExtractText.java +80 -0
  14. data/lib/docsplit/Extractor.java +91 -0
  15. data/lib/docsplit/argument_parser.rb +31 -0
  16. data/lib/docsplit/command_line.rb +107 -0
  17. data/lib/docsplit/image_extractor.rb +94 -0
  18. data/lib/docsplit/transparent_pdfs.rb +26 -0
  19. data/lib/docsplit.rb +78 -0
  20. data/vendor/bcmail.jar +0 -0
  21. data/vendor/bcprov.jar +0 -0
  22. data/vendor/commons-logging.jar +0 -0
  23. data/vendor/fontbox.jar +0 -0
  24. data/vendor/jodconverter/commons-cli-1.2.jar +0 -0
  25. data/vendor/jodconverter/commons-io-1.4.jar +0 -0
  26. data/vendor/jodconverter/jodconverter-2.2.2.jar +0 -0
  27. data/vendor/jodconverter/jodconverter-cli-2.2.2.jar +0 -0
  28. data/vendor/jodconverter/juh-3.0.1.jar +0 -0
  29. data/vendor/jodconverter/jurt-3.0.1.jar +0 -0
  30. data/vendor/jodconverter/ridl-3.0.1.jar +0 -0
  31. data/vendor/jodconverter/slf4j-api-1.5.6.jar +0 -0
  32. data/vendor/jodconverter/slf4j-jdk14-1.5.6.jar +0 -0
  33. data/vendor/jodconverter/unoil-3.0.1.jar +0 -0
  34. data/vendor/logging.properties +1 -0
  35. data/vendor/pdfbox.jar +0 -0
  36. metadata +89 -0
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2009 Jeremy Ashkenas, DocumentCloud
2
+
3
+ Permission is hereby granted, free of charge, to any person
4
+ obtaining a copy of this software and associated documentation
5
+ files (the "Software"), to deal in the Software without
6
+ restriction, including without limitation the rights to use,
7
+ copy, modify, merge, publish, distribute, sublicense, and/or sell
8
+ copies of the Software, and to permit persons to whom the
9
+ Software is furnished to do so, subject to the following
10
+ conditions:
11
+
12
+ The above copyright notice and this permission notice shall be
13
+ included in all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
17
+ OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
19
+ HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
20
+ WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22
+ OTHER DEALINGS IN THE SOFTWARE.
data/README ADDED
@@ -0,0 +1,22 @@
1
+ ==
2
+ __ ___ __
3
+ ____/ /___ ______________ / (_) /_
4
+ / __ / __ \/ ___/ ___/ __ \/ / / __/
5
+ / /_/ / /_/ / /__(__ ) /_/ / / / /_
6
+ \____/\____/\___/____/ .___/_/_/\__/
7
+ /_/
8
+
9
+ Docsplit is a command-line utility and Ruby library for splitting apart
10
+ documents into their component parts: searchable UTF-8 plain text, page
11
+ images or thumbnails in any format, PDFs, single pages, and document
12
+ metadata (title, author, number of pages...)
13
+
14
+ Installation:
15
+ gem install docsplit
16
+
17
+ For documentation, usage, and examples, see:
18
+ http://documentcloud.github.com/docsplit/
19
+
20
+ To suggest a feature or report a bug:
21
+ http://github.com/documentcloud/docsplit/issues/
22
+
data/bin/docsplit ADDED
@@ -0,0 +1,5 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "#{File.dirname(__FILE__)}/../lib/docsplit/command_line.rb"
4
+
5
+ Docsplit::CommandLine.new
data/docsplit.gemspec ADDED
@@ -0,0 +1,25 @@
1
+ Gem::Specification.new do |s|
2
+ s.name = 'docsplit'
3
+ s.version = '0.1.0' # Keep version in sync with lib/docsplit.rb
4
+ s.date = '2009-12-07'
5
+
6
+ s.homepage = "http://documentcloud.github.com/docsplit/"
7
+ s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
8
+ s.description = <<-EOS
9
+ Docsplit is a command-line utility and Ruby library for splitting apart
10
+ documents into their component parts: searchable UTF-8 plain text, page
11
+ images or thumbnails in any format, PDFs, single pages, and document
12
+ metadata (title, author, number of pages...)
13
+ EOS
14
+
15
+ s.authors = ['Jeremy Ashkenas']
16
+ s.email = 'jeremy@documentcloud.org'
17
+ s.rubyforge_project = 'docsplit'
18
+ s.has_rdoc = false
19
+
20
+ s.require_paths = ['lib']
21
+ s.executables = ['docsplit']
22
+
23
+ s.files = Dir['build/**/*', 'lib/**/*', 'bin/*', 'vendor/**/*',
24
+ 'docsplit.gemspec', 'LICENSE', 'README']
25
+ end
@@ -0,0 +1,63 @@
1
+ package org.documentcloud;
2
+
3
+ import java.util.List;
4
+ import java.io.IOException;
5
+ import java.text.SimpleDateFormat;
6
+
7
+ import org.apache.pdfbox.pdmodel.PDDocument;
8
+ import org.apache.pdfbox.pdmodel.PDDocumentInformation;
9
+
10
+ // Extracts metadata from a PDF file.
11
+ public class ExtractInfo extends Extractor {
12
+
13
+ private PDDocument doc;
14
+ private PDDocumentInformation info;
15
+ private String key;
16
+
17
+ // The list of metadata keys we know how to extract.
18
+ private enum Keys {
19
+ AUTHOR, DATE, CREATOR, KEYWORDS, PRODUCER, SUBJECT, TITLE, LENGTH
20
+ }
21
+
22
+ // The mainline.
23
+ public static void main(String[] args) {
24
+ (new ExtractInfo()).run(args);
25
+ }
26
+
27
+ // The first argument is always the name of the metadata key.
28
+ protected void parseArguments(List<String> args) {
29
+ super.parseArguments(args);
30
+ key = args.remove(0).toUpperCase();
31
+ }
32
+
33
+ // Extract the configured bit of metadata from a PDF, decrypting if necessary.
34
+ public void extract(String pdfPath) {
35
+ try {
36
+ doc = PDDocument.load(pdfPath, false);
37
+ decrypt(doc);
38
+ info = doc.getDocumentInformation();
39
+ String val = extractInfo();
40
+ if (val != null) System.out.println(val);
41
+ doc.close();
42
+ } catch(IOException e) {
43
+ System.out.println(e.getMessage());
44
+ System.exit(1);
45
+ }
46
+ }
47
+
48
+ // Use the PDDocumentInformation object to fetch metadata values as strings.
49
+ public String extractInfo() throws IOException {
50
+ switch(Keys.valueOf(key)) {
51
+ case AUTHOR: return info.getAuthor();
52
+ case DATE: return new SimpleDateFormat("yyyy-MM-dd").format(info.getCreationDate().getTime());
53
+ case CREATOR: return info.getCreator();
54
+ case KEYWORDS: return info.getKeywords();
55
+ case PRODUCER: return info.getProducer();
56
+ case SUBJECT: return info.getSubject();
57
+ case TITLE: return info.getTitle();
58
+ case LENGTH: return String.valueOf(doc.getNumberOfPages());
59
+ default: return null;
60
+ }
61
+ }
62
+
63
+ }
@@ -0,0 +1,54 @@
1
+ package org.documentcloud;
2
+
3
+ import java.util.List;
4
+ import java.io.File;
5
+ import java.io.FileOutputStream;
6
+ import java.io.IOException;
7
+
8
+ import org.apache.pdfbox.pdmodel.PDDocument;
9
+ import org.apache.pdfbox.util.Splitter;
10
+ import org.apache.pdfbox.pdfwriter.COSWriter;
11
+ import org.apache.pdfbox.exceptions.COSVisitorException;
12
+
13
+ // Use PDFBox's Splitter to break apart a large PDF into individual pages.
14
+ public class ExtractPages extends Extractor {
15
+
16
+ private PDDocument doc;
17
+ private String basename;
18
+
19
+ // The mainline.
20
+ public static void main(String[] args) {
21
+ (new ExtractPages()).run(args);
22
+ }
23
+
24
+ // Extract each page of the given PDF.
25
+ public void extract(String pdfPath) {
26
+ try {
27
+ basename = getBasename(pdfPath);
28
+ doc = PDDocument.load(pdfPath);
29
+ decrypt(doc);
30
+ List pages = (new Splitter()).split(doc);
31
+ if (pageNumbers != null) {
32
+ for (Integer num : pageNumbers) writePage((PDDocument) pages.get(num.intValue()- 1), num.intValue());
33
+ } else {
34
+ for (int i=0; i<pages.size(); i++) writePage((PDDocument) pages.get(i), i + 1);
35
+ }
36
+ doc.close();
37
+ } catch(Exception e) {
38
+ System.out.println(e.getMessage());
39
+ System.exit(1);
40
+ }
41
+ }
42
+
43
+ // Writes out a page as a single-page PDF.
44
+ private void writePage(PDDocument page, int pageNumber) throws IOException, COSVisitorException {
45
+ String pageName = basename + "_" + String.valueOf(pageNumber) + ".pdf";
46
+ FileOutputStream out = new FileOutputStream(outputFile(pageName));
47
+ COSWriter writer = new COSWriter(out);
48
+ writer.write(page);
49
+ out.close();
50
+ writer.close();
51
+ page.close();
52
+ }
53
+
54
+ }
@@ -0,0 +1,80 @@
1
+ package org.documentcloud;
2
+
3
+ import java.util.List;
4
+ import java.io.File;
5
+ import java.io.FileOutputStream;
6
+ import java.io.IOException;
7
+ import java.io.OutputStreamWriter;
8
+
9
+ import org.apache.pdfbox.pdmodel.PDDocument;
10
+ import org.apache.pdfbox.util.PDFTextStripper;
11
+
12
+ // Uses PDFBox's PDFTextStripper to extract the full, plain, UTF-8 text of a
13
+ // PDF document. Pass --pages to write out the plain text for each individual
14
+ // page; when --pages is given, the text for the entire document is omitted.
15
+ public class ExtractText extends Extractor {
16
+
17
+ private PDDocument doc;
18
+ private String basename;
19
+
20
+ // The mainline.
21
+ public static void main(String[] args) {
22
+ (new ExtractText()).run(args);
23
+ }
24
+
25
+ // Extract the plain text for a PDF, and write it into the requested output
26
+ // sizes.
27
+ public void extract(String pdfPath) {
28
+ try {
29
+ basename = getBasename(pdfPath);
30
+ doc = PDDocument.load(pdfPath, false);
31
+ decrypt(doc);
32
+ if (allPages || (pageNumbers != null)) {
33
+ writePageText();
34
+ } else {
35
+ writeFullText();
36
+ }
37
+ doc.close();
38
+ } catch(IOException e) {
39
+ System.out.println(e.getMessage());
40
+ System.exit(1);
41
+ }
42
+ }
43
+
44
+ // Write out the extracted full text for the entire PDF.
45
+ public void writeFullText() throws IOException {
46
+ OutputStreamWriter output = new OutputStreamWriter(new FileOutputStream(outputFile(basename + ".txt")), "UTF-8");
47
+ extractTextForPageRange(output, 1, Integer.MAX_VALUE);
48
+ output.close();
49
+ }
50
+
51
+ // Write out the full text for each specified page.
52
+ public void writePageText() throws IOException {
53
+ if (pageNumbers != null) {
54
+ for (Integer num : pageNumbers) writePageText(num.intValue());
55
+ } else {
56
+ int pages = doc.getNumberOfPages();
57
+ for (int i=1; i<=pages; i++) writePageText(i);
58
+ }
59
+ }
60
+
61
+ // Write out the full text for a single page.
62
+ public void writePageText(int pageNumber) throws IOException {
63
+ File outfile = outputFile(basename + "_" + String.valueOf(pageNumber) + ".txt");
64
+ OutputStreamWriter output = new OutputStreamWriter(new FileOutputStream(outfile), "UTF-8");
65
+ extractTextForPageRange(output, pageNumber, pageNumber);
66
+ output.close();
67
+ }
68
+
69
+ // Internal method to write out text from the PDF for a given page range
70
+ // to a provided output stream.
71
+ private void extractTextForPageRange(OutputStreamWriter output, int startPage, int endPage) throws IOException {
72
+ PDFTextStripper stripper = new PDFTextStripper("UTF-8");
73
+ stripper.setSortByPosition(false);
74
+ stripper.setShouldSeparateByBeads(true);
75
+ stripper.setStartPage(startPage);
76
+ stripper.setEndPage(endPage);
77
+ stripper.writeText(doc, output);
78
+ }
79
+
80
+ }
@@ -0,0 +1,91 @@
1
+ package org.documentcloud;
2
+
3
+ import java.io.File;
4
+ import java.util.List;
5
+ import java.util.Arrays;
6
+ import java.util.ArrayList;
7
+ import java.util.Iterator;
8
+
9
+ import org.apache.pdfbox.pdmodel.PDDocument;
10
+
11
+ // The base Extractor class contains the common functionality needed to run
12
+ // command-line extractors.
13
+ public abstract class Extractor {
14
+
15
+ protected File output;
16
+ protected boolean allPages = false;
17
+ protected ArrayList<Integer> pageNumbers;
18
+
19
+ // Running an extractor consists of converting the arguments array into a
20
+ // more manageable List, parsing arguments, and extracting pdfs.
21
+ public void run(String[] arguments) {
22
+ List<String> args = new ArrayList<String>(Arrays.asList(arguments));
23
+ parseArguments(args);
24
+ Iterator<String> iter = args.iterator();
25
+ while(iter.hasNext()) extract(iter.next());
26
+ }
27
+
28
+ // Subclasses must override "extract" to perform their specific extraction.
29
+ public abstract void extract(String pdfPath);
30
+
31
+ // The default "parseArguments" method handles common arguments.
32
+ protected void parseArguments(List<String> args) {
33
+ int dirLoc = args.indexOf("--output");
34
+ if (dirLoc >= 0) {
35
+ output = new File(args.remove(dirLoc + 1));
36
+ args.remove(dirLoc);
37
+ }
38
+ int pagesLoc = args.indexOf("--pages");
39
+ if (pagesLoc >= 0) {
40
+ parsePages(args.remove(pagesLoc + 1));
41
+ args.remove(pagesLoc);
42
+ }
43
+ }
44
+
45
+ // Utility function to get the basename of a file path.
46
+ // After File.basename in Ruby.
47
+ public String getBasename(String pdfPath) {
48
+ String basename = new File(pdfPath).getName();
49
+ return basename.substring(0, basename.lastIndexOf('.'));
50
+ }
51
+
52
+ // Get a reference to an output file, placed inside any configured directories,
53
+ // while ensuring that parent directories exist.
54
+ public File outputFile(String path) {
55
+ File file = output != null ? new File(output, path) : new File(path);
56
+ File parent = file.getParentFile();
57
+ if (parent != null) parent.mkdirs();
58
+ return file;
59
+ }
60
+
61
+ // Decrypt a non-passworded but still encrypted document.
62
+ public void decrypt(PDDocument doc) {
63
+ if (!doc.isEncrypted()) return;
64
+ try {
65
+ doc.decrypt("");
66
+ } catch (Exception e) {
67
+ System.out.println("Error decrypting document, details: " + e.getMessage());
68
+ System.exit(1);
69
+ }
70
+ }
71
+
72
+ private void parsePages(String pageList) {
73
+ if (pageList.equals("all")) {
74
+ allPages = true;
75
+ return;
76
+ }
77
+ pageNumbers = new ArrayList<Integer>();
78
+ String[] groups = pageList.split(",");
79
+ for (String group : groups) {
80
+ if (group.contains("-")) {
81
+ String[] range = group.split("-");
82
+ int start = Integer.parseInt(range[0]);
83
+ int end = Integer.parseInt(range[1]);
84
+ for (int i=start; i<=end; i++) pageNumbers.add(new Integer(i));
85
+ } else {
86
+ pageNumbers.add(new Integer(Integer.parseInt(group)));
87
+ }
88
+ }
89
+ }
90
+
91
+ }
@@ -0,0 +1,31 @@
1
+ module Docsplit
2
+
3
+ module ArgumentParser
4
+
5
+ # Flatten an options hash into an arguments string suitable for the command
6
+ # line.
7
+ def parse_options(opts)
8
+ opts.map {|k, v| ["--#{k}", normalize_value(v)] }.flatten.join(' ')
9
+ end
10
+
11
+ # Normalize a value in an options hash for the command line.
12
+ # Ranges look like: 1-10, Arrays like: 1,2,3.
13
+ def normalize_value(value)
14
+ case value
15
+ when Range then normalize_range(value)
16
+ when Array then value.map! {|v| v.is_a?(Range) ? normalize_range(v) : v }.join(',')
17
+ else value.to_s
18
+ end
19
+ end
20
+
21
+ # Serialize a Ruby range into its command-line equivalent.
22
+ def normalize_range(range)
23
+ arr = range.to_a
24
+ arr.empty? ? range.first.to_s : "#{range.first}-#{arr.last}"
25
+ end
26
+
27
+ end
28
+
29
+ extend ArgumentParser
30
+
31
+ end
@@ -0,0 +1,107 @@
1
+ require 'optparse'
2
+ require File.expand_path(File.dirname(__FILE__) + '/../docsplit')
3
+
4
+ module Docsplit
5
+
6
+ # A single command-line utility to separate a PDF into all its component parts.
7
+ class CommandLine
8
+
9
+ BANNER = <<-EOS
10
+ docsplit breaks apart documents into images, text, or individual pages.
11
+ It wraps PDFBox, GraphicsMagick, and JODConverter.
12
+
13
+ Usage:
14
+ docsplit COMMAND [OPTIONS] path/to/doc.pdf
15
+ Main commands:
16
+ pages, images, text, pdf.
17
+ Metadata commands:
18
+ author, date, creator, keywords, producer, subject, title, length.
19
+
20
+ Example:
21
+ docsplit images --size 700x --format jpg document.pdf
22
+
23
+ Dependencies:
24
+ Ruby, Java, A working GraphicsMagick (gm) command,
25
+ and a headless OpenOffice server for non-PDF documents.
26
+
27
+ Options:
28
+ (size, pages and format can take comma-separated values)
29
+
30
+ EOS
31
+
32
+ # Creating a CommandLine runs off of the contents of ARGV.
33
+ def initialize
34
+ parse_options
35
+ cmd = ARGV.shift
36
+ @command = cmd && cmd.to_sym
37
+ run
38
+ end
39
+
40
+ # Delegate to the Docsplit Ruby API to perform all extractions.
41
+ def run
42
+ begin
43
+ case @command
44
+ when :images then Docsplit.extract_images(ARGV, @options)
45
+ when :pages then Docsplit.extract_pages(ARGV, @options)
46
+ when :text then Docsplit.extract_text(ARGV, @options)
47
+ when :pdf then Docsplit.extract_pdf(ARGV, @options)
48
+ else
49
+ if METADATA_KEYS.include?(@command)
50
+ value = Docsplit.send("extract_#{@command}", ARGV, @options)
51
+ puts value unless value.nil?
52
+ else
53
+ usage
54
+ end
55
+ end
56
+ rescue ExtractionFailed => e
57
+ puts e.message.chomp
58
+ exit(1)
59
+ end
60
+ end
61
+
62
+ # Print out the usage help message.
63
+ def usage
64
+ puts "\n#{@option_parser}\n"
65
+ exit
66
+ end
67
+
68
+
69
+ private
70
+
71
+ # Use the OptionParser library to parse out all supported options. Return
72
+ # options formatted for the Ruby API.
73
+ def parse_options
74
+ @options = {}
75
+ @option_parser = OptionParser.new do |opts|
76
+ opts.on('-o', '--output [DIR]', 'set the directory for all output') do |d|
77
+ @options[:output] = d
78
+ end
79
+ opts.on('-p', '--pages [PAGES]', "extract specific pages (eg: 5-10)") do |p|
80
+ @options[:pages] = p
81
+ end
82
+ opts.on('-s', '--size [SIZE]', 'set a fixed size (eg: 50x75)') do |s|
83
+ @options[:size] = s.split(',')
84
+ end
85
+ opts.on('-f', '--format [FORMAT]', 'set image format (pdf, jpg, gif...)') do |t|
86
+ @options[:format] = t.split(',')
87
+ end
88
+ opts.on_tail('-v', '--version', 'display docsplit version') do
89
+ puts "docsplit version #{Docsplit::VERSION}"
90
+ exit
91
+ end
92
+ opts.on_tail('-h', '--help', 'display this help message') do
93
+ usage
94
+ end
95
+ end
96
+ @option_parser.banner = BANNER
97
+ begin
98
+ @option_parser.parse!(ARGV)
99
+ rescue OptionParser::InvalidOption => e
100
+ puts e.message
101
+ exit(1)
102
+ end
103
+ end
104
+
105
+ end
106
+
107
+ end
@@ -0,0 +1,94 @@
1
+ module Docsplit
2
+
3
+ # Delegates to GraphicsMagick in order to convert PDF documents into
4
+ # nicely sized images.
5
+ class ImageExtractor
6
+
7
+ DENSITY_ARG = "-density 150"
8
+ DEFAULT_FORMAT = :png
9
+
10
+ # Extract a list of PDFs as rasterized page images, according to the
11
+ # configuration in options.
12
+ def extract(pdfs, options)
13
+ @pdfs = [pdfs].flatten
14
+ extract_options(options)
15
+ @pdfs.each {|p| @sizes.each {|s| @formats.each {|f| convert(p, s, f) }}}
16
+ end
17
+
18
+ # Convert a single PDF into page images at the specified size and format.
19
+ def convert(pdf, size, format)
20
+ basename = File.basename(pdf, File.extname(pdf))
21
+ subfolder = @sizes.length > 1 ? size.to_s : ''
22
+ directory = File.join(@output, subfolder)
23
+ FileUtils.mkdir_p(directory) unless File.exists?(directory)
24
+ out_file = File.join(directory, "#{basename}_%05d.#{format}")
25
+ cmd = "gm convert #{DENSITY_ARG} #{resize_arg(size)} #{quality_arg(format)} \"#{pdf}#{pages_arg}\" \"#{out_file}\" 2>&1"
26
+ result = `#{cmd}`.chomp
27
+ raise ExtractionFailed, result if $? != 0
28
+ renumber_images(out_file, format)
29
+ end
30
+
31
+
32
+ private
33
+
34
+ # Extract the relevant GraphicsMagick options from the options hash.
35
+ def extract_options(options)
36
+ @output = options[:output] || '.'
37
+ @pages = options[:pages]
38
+ @formats = [options[:format] || DEFAULT_FORMAT].flatten
39
+ @sizes = [options[:size]].flatten.compact
40
+ @sizes = [nil] if @sizes.empty?
41
+ end
42
+
43
+ # Generate the resize argument.
44
+ def resize_arg(size)
45
+ size.nil? ? '' : "-resize #{size}"
46
+ end
47
+
48
+ # Generate the appropriate quality argument for the image format.
49
+ def quality_arg(format)
50
+ case format.to_s
51
+ when /jpe?g/ then "-quality 85"
52
+ when /png/ then "-quality 100"
53
+ else ""
54
+ end
55
+ end
56
+
57
+ # Generate the requested page index into the document.
58
+ def pages_arg
59
+ return '' if @pages.nil?
60
+ pages = @pages.gsub(/\d+/) {|digits| (digits.to_i - 1).to_s }
61
+ "[#{pages}]"
62
+ end
63
+
64
+ # Generate the expanded list of requested page numbers.
65
+ def page_list
66
+ @pages.split(',').map { |range|
67
+ if range.include?('-')
68
+ range = range.split('-')
69
+ Range.new(range.first, range.last).to_a.map {|n| n.to_i }
70
+ else
71
+ range.to_i
72
+ end
73
+ }.flatten.sort
74
+ end
75
+
76
+ # When GraphicsMagick is through, it will have generated a number of
77
+ # incrementing page images, starting at 0. Renumber them with their correct
78
+ # page numbers.
79
+ def renumber_images(template, format)
80
+ suffixer = /_0+(\d+)\.#{format}\Z/
81
+ images = Dir[template.sub('%05d', '0*')].map do |path|
82
+ index = path[suffixer, 1].to_i
83
+ {:path => path, :index => index, :page_number => index + 1}
84
+ end
85
+ numbers = @pages ? page_list.reverse : nil
86
+ images.sort_by {|i| -i[:page_number] }.each_with_index do |image, i|
87
+ number = numbers ? numbers[i] : image[:page_number]
88
+ FileUtils.mv(image[:path], image[:path].sub(suffixer, "_#{number}.#{format}"))
89
+ end
90
+ end
91
+
92
+ end
93
+
94
+ end
@@ -0,0 +1,26 @@
1
+ module Docsplit
2
+
3
+ # Include a method to transparently convert non-PDF arguments to temporary
4
+ # PDFs. Allows us to pretend to natively support docs, rtf, ppt, and so on.
5
+ module TransparentPDFs
6
+
7
+ # Temporarily convert any non-PDF documents to PDFs before running them
8
+ # through further extraction.
9
+ def ensure_pdfs(docs)
10
+ [docs].flatten.map do |doc|
11
+ ext = File.extname(doc)
12
+ if ext == '.pdf'
13
+ doc
14
+ else
15
+ @tempdir ||= File.join(Dir.tmpdir, 'docsplit')
16
+ extract_pdf([doc], {:output => @tempdir})
17
+ File.join(@tempdir, File.basename(doc, ext) + '.pdf')
18
+ end
19
+ end
20
+ end
21
+
22
+ end
23
+
24
+ extend TransparentPDFs
25
+
26
+ end
data/lib/docsplit.rb ADDED
@@ -0,0 +1,78 @@
1
+ # The Docsplit module delegates to the Java PDF extractors.
2
+ module Docsplit
3
+
4
+ VERSION = '0.1.0' # Keep in sync with gemspec.
5
+
6
+ ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
7
+
8
+ CLASSPATH = "#{ROOT}/build#{File::PATH_SEPARATOR}#{ROOT}/vendor/'*'"
9
+
10
+ LOGGING = "-Djava.util.logging.config.file=#{ROOT}/vendor/logging.properties"
11
+
12
+ HEADLESS = "-Djava.awt.headless=true"
13
+
14
+ METADATA_KEYS = [:author, :date, :creator, :keywords, :producer, :subject, :title, :length]
15
+
16
+ # Raise an ExtractionFailed exception when the PDF is encrypted, or otherwise
17
+ # broken.
18
+ class ExtractionFailed < StandardError; end
19
+
20
+ # Use the ExtractPages Java class to burst a PDF into single pages.
21
+ def self.extract_pages(pdfs, opts={})
22
+ pdfs = ensure_pdfs(pdfs)
23
+ run "org.documentcloud.ExtractPages", pdfs, opts
24
+ end
25
+
26
+ # Use the ExtractText Java class to write out all embedded text.
27
+ def self.extract_text(pdfs, opts={})
28
+ pdfs = ensure_pdfs(pdfs)
29
+ run "org.documentcloud.ExtractText", pdfs, opts
30
+ end
31
+
32
+ # Use the ImageExtractor class (GraphicsMagick) to rasterize a PDF into each page's image.
33
+ def self.extract_images(pdfs, opts={})
34
+ pdfs = ensure_pdfs(pdfs)
35
+ opts[:pages] = normalize_value(opts[:pages]) if opts[:pages]
36
+ ImageExtractor.new.extract(pdfs, opts)
37
+ end
38
+
39
+ # Use JODConverter to extract the documents as PDFs.
40
+ def self.extract_pdf(docs, opts={})
41
+ [docs].flatten.each do |doc|
42
+ basename = File.basename(doc, File.extname(doc))
43
+ run "-jar #{ROOT}/vendor/jodconverter/jodconverter-cli-2.2.2.jar #{doc} #{opts[:output] || '.'}/#{basename}.pdf", [], {}
44
+ end
45
+ end
46
+
47
+ # Define custom methods for each of the metadata keys that we support.
48
+ # Use the ExtractInfo Java class to print out a single bit of metadata.
49
+ METADATA_KEYS.each do |key|
50
+ instance_eval <<-EOS
51
+ def self.extract_#{key}(pdfs, opts={})
52
+ pdfs = ensure_pdfs(pdfs)
53
+ result = run "org.documentcloud.ExtractInfo #{key}", pdfs, opts, true
54
+ :#{key} == :length ? result.to_i : result
55
+ end
56
+ EOS
57
+ end
58
+
59
+
60
+ private
61
+
62
+ # Runs a Java command, with quieted logging, and the classpath set properly.
63
+ def self.run(command, pdfs, opts, return_output=false)
64
+ pdfs = [pdfs].flatten.map{|pdf| "\"#{pdf}\""}.join(' ')
65
+ args = parse_options(opts)
66
+ cmd = "java #{HEADLESS} #{LOGGING} -cp #{CLASSPATH} #{command} #{args} #{pdfs} 2>&1"
67
+ result = `#{cmd}`.chomp
68
+ raise ExtractionFailed, result if $? != 0
69
+ return return_output ? (result.empty? ? nil : result) : true
70
+ end
71
+
72
+ end
73
+
74
+ require 'tmpdir'
75
+ require 'fileutils'
76
+ require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
77
+ require "#{Docsplit::ROOT}/lib/docsplit/argument_parser"
78
+ require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
data/vendor/bcmail.jar ADDED
Binary file
data/vendor/bcprov.jar ADDED
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
@@ -0,0 +1 @@
1
+ .level=WARNING
data/vendor/pdfbox.jar ADDED
Binary file
metadata ADDED
@@ -0,0 +1,89 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: docsplit
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Jeremy Ashkenas
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-12-07 00:00:00 -05:00
13
+ default_executable:
14
+ dependencies: []
15
+
16
+ description: " Docsplit is a command-line utility and Ruby library for splitting apart\n documents into their component parts: searchable UTF-8 plain text, page\n images or thumbnails in any format, PDFs, single pages, and document\n metadata (title, author, number of pages...)\n"
17
+ email: jeremy@documentcloud.org
18
+ executables:
19
+ - docsplit
20
+ extensions: []
21
+
22
+ extra_rdoc_files: []
23
+
24
+ files:
25
+ - build/org/documentcloud/ExtractInfo$1.class
26
+ - build/org/documentcloud/ExtractInfo$Keys.class
27
+ - build/org/documentcloud/ExtractInfo.class
28
+ - build/org/documentcloud/Extractor.class
29
+ - build/org/documentcloud/ExtractPages.class
30
+ - build/org/documentcloud/ExtractText.class
31
+ - lib/docsplit/argument_parser.rb
32
+ - lib/docsplit/command_line.rb
33
+ - lib/docsplit/ExtractInfo.java
34
+ - lib/docsplit/Extractor.java
35
+ - lib/docsplit/ExtractPages.java
36
+ - lib/docsplit/ExtractText.java
37
+ - lib/docsplit/image_extractor.rb
38
+ - lib/docsplit/transparent_pdfs.rb
39
+ - lib/docsplit.rb
40
+ - bin/docsplit
41
+ - vendor/bcmail.jar
42
+ - vendor/bcprov.jar
43
+ - vendor/commons-logging.jar
44
+ - vendor/fontbox.jar
45
+ - vendor/jodconverter/commons-cli-1.2.jar
46
+ - vendor/jodconverter/commons-io-1.4.jar
47
+ - vendor/jodconverter/jodconverter-2.2.2.jar
48
+ - vendor/jodconverter/jodconverter-cli-2.2.2.jar
49
+ - vendor/jodconverter/juh-3.0.1.jar
50
+ - vendor/jodconverter/jurt-3.0.1.jar
51
+ - vendor/jodconverter/ridl-3.0.1.jar
52
+ - vendor/jodconverter/slf4j-api-1.5.6.jar
53
+ - vendor/jodconverter/slf4j-jdk14-1.5.6.jar
54
+ - vendor/jodconverter/unoil-3.0.1.jar
55
+ - vendor/logging.properties
56
+ - vendor/pdfbox.jar
57
+ - docsplit.gemspec
58
+ - LICENSE
59
+ - README
60
+ has_rdoc: false
61
+ homepage: http://documentcloud.github.com/docsplit/
62
+ licenses: []
63
+
64
+ post_install_message:
65
+ rdoc_options: []
66
+
67
+ require_paths:
68
+ - lib
69
+ required_ruby_version: !ruby/object:Gem::Requirement
70
+ requirements:
71
+ - - ">="
72
+ - !ruby/object:Gem::Version
73
+ version: "0"
74
+ version:
75
+ required_rubygems_version: !ruby/object:Gem::Requirement
76
+ requirements:
77
+ - - ">="
78
+ - !ruby/object:Gem::Version
79
+ version: "0"
80
+ version:
81
+ requirements: []
82
+
83
+ rubyforge_project: docsplit
84
+ rubygems_version: 1.3.5
85
+ signing_key:
86
+ specification_version: 3
87
+ summary: Break Apart Documents into Images, Text, Pages and PDFs
88
+ test_files: []
89
+