docsplit 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +22 -0
- data/README +22 -0
- data/bin/docsplit +5 -0
- data/build/org/documentcloud/ExtractInfo$1.class +0 -0
- data/build/org/documentcloud/ExtractInfo$Keys.class +0 -0
- data/build/org/documentcloud/ExtractInfo.class +0 -0
- data/build/org/documentcloud/ExtractPages.class +0 -0
- data/build/org/documentcloud/ExtractText.class +0 -0
- data/build/org/documentcloud/Extractor.class +0 -0
- data/docsplit.gemspec +25 -0
- data/lib/docsplit/ExtractInfo.java +63 -0
- data/lib/docsplit/ExtractPages.java +54 -0
- data/lib/docsplit/ExtractText.java +80 -0
- data/lib/docsplit/Extractor.java +91 -0
- data/lib/docsplit/argument_parser.rb +31 -0
- data/lib/docsplit/command_line.rb +107 -0
- data/lib/docsplit/image_extractor.rb +94 -0
- data/lib/docsplit/transparent_pdfs.rb +26 -0
- data/lib/docsplit.rb +78 -0
- data/vendor/bcmail.jar +0 -0
- data/vendor/bcprov.jar +0 -0
- data/vendor/commons-logging.jar +0 -0
- data/vendor/fontbox.jar +0 -0
- data/vendor/jodconverter/commons-cli-1.2.jar +0 -0
- data/vendor/jodconverter/commons-io-1.4.jar +0 -0
- data/vendor/jodconverter/jodconverter-2.2.2.jar +0 -0
- data/vendor/jodconverter/jodconverter-cli-2.2.2.jar +0 -0
- data/vendor/jodconverter/juh-3.0.1.jar +0 -0
- data/vendor/jodconverter/jurt-3.0.1.jar +0 -0
- data/vendor/jodconverter/ridl-3.0.1.jar +0 -0
- data/vendor/jodconverter/slf4j-api-1.5.6.jar +0 -0
- data/vendor/jodconverter/slf4j-jdk14-1.5.6.jar +0 -0
- data/vendor/jodconverter/unoil-3.0.1.jar +0 -0
- data/vendor/logging.properties +1 -0
- data/vendor/pdfbox.jar +0 -0
- metadata +89 -0
data/LICENSE
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2009 Jeremy Ashkenas, DocumentCloud
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person
|
4
|
+
obtaining a copy of this software and associated documentation
|
5
|
+
files (the "Software"), to deal in the Software without
|
6
|
+
restriction, including without limitation the rights to use,
|
7
|
+
copy, modify, merge, publish, distribute, sublicense, and/or sell
|
8
|
+
copies of the Software, and to permit persons to whom the
|
9
|
+
Software is furnished to do so, subject to the following
|
10
|
+
conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be
|
13
|
+
included in all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
16
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
17
|
+
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
18
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
19
|
+
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
20
|
+
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
21
|
+
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
22
|
+
OTHER DEALINGS IN THE SOFTWARE.
|
data/README
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
==
|
2
|
+
__ ___ __
|
3
|
+
____/ /___ ______________ / (_) /_
|
4
|
+
/ __ / __ \/ ___/ ___/ __ \/ / / __/
|
5
|
+
/ /_/ / /_/ / /__(__ ) /_/ / / / /_
|
6
|
+
\____/\____/\___/____/ .___/_/_/\__/
|
7
|
+
/_/
|
8
|
+
|
9
|
+
Docsplit is a command-line utility and Ruby library for splitting apart
|
10
|
+
documents into their component parts: searchable UTF-8 plain text, page
|
11
|
+
images or thumbnails in any format, PDFs, single pages, and document
|
12
|
+
metadata (title, author, number of pages...)
|
13
|
+
|
14
|
+
Installation:
|
15
|
+
gem install docsplit
|
16
|
+
|
17
|
+
For documentation, usage, and examples, see:
|
18
|
+
http://documentcloud.github.com/docsplit/
|
19
|
+
|
20
|
+
To suggest a feature or report a bug:
|
21
|
+
http://github.com/documentcloud/docsplit/issues/
|
22
|
+
|
data/bin/docsplit
ADDED
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
data/docsplit.gemspec
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
Gem::Specification.new do |s|
|
2
|
+
s.name = 'docsplit'
|
3
|
+
s.version = '0.1.0' # Keep version in sync with lib/docsplit.rb
|
4
|
+
s.date = '2009-12-07'
|
5
|
+
|
6
|
+
s.homepage = "http://documentcloud.github.com/docsplit/"
|
7
|
+
s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
|
8
|
+
s.description = <<-EOS
|
9
|
+
Docsplit is a command-line utility and Ruby library for splitting apart
|
10
|
+
documents into their component parts: searchable UTF-8 plain text, page
|
11
|
+
images or thumbnails in any format, PDFs, single pages, and document
|
12
|
+
metadata (title, author, number of pages...)
|
13
|
+
EOS
|
14
|
+
|
15
|
+
s.authors = ['Jeremy Ashkenas']
|
16
|
+
s.email = 'jeremy@documentcloud.org'
|
17
|
+
s.rubyforge_project = 'docsplit'
|
18
|
+
s.has_rdoc = false
|
19
|
+
|
20
|
+
s.require_paths = ['lib']
|
21
|
+
s.executables = ['docsplit']
|
22
|
+
|
23
|
+
s.files = Dir['build/**/*', 'lib/**/*', 'bin/*', 'vendor/**/*',
|
24
|
+
'docsplit.gemspec', 'LICENSE', 'README']
|
25
|
+
end
|
@@ -0,0 +1,63 @@
|
|
1
|
+
package org.documentcloud;
|
2
|
+
|
3
|
+
import java.util.List;
|
4
|
+
import java.io.IOException;
|
5
|
+
import java.text.SimpleDateFormat;
|
6
|
+
|
7
|
+
import org.apache.pdfbox.pdmodel.PDDocument;
|
8
|
+
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
|
9
|
+
|
10
|
+
// Extracts metadata from a PDF file.
|
11
|
+
public class ExtractInfo extends Extractor {
|
12
|
+
|
13
|
+
private PDDocument doc;
|
14
|
+
private PDDocumentInformation info;
|
15
|
+
private String key;
|
16
|
+
|
17
|
+
// The list of metadata keys we know how to extract.
|
18
|
+
private enum Keys {
|
19
|
+
AUTHOR, DATE, CREATOR, KEYWORDS, PRODUCER, SUBJECT, TITLE, LENGTH
|
20
|
+
}
|
21
|
+
|
22
|
+
// The mainline.
|
23
|
+
public static void main(String[] args) {
|
24
|
+
(new ExtractInfo()).run(args);
|
25
|
+
}
|
26
|
+
|
27
|
+
// The first argument is always the name of the metadata key.
|
28
|
+
protected void parseArguments(List<String> args) {
|
29
|
+
super.parseArguments(args);
|
30
|
+
key = args.remove(0).toUpperCase();
|
31
|
+
}
|
32
|
+
|
33
|
+
// Extract the configured bit of metadata from a PDF, decrypting if necessary.
|
34
|
+
public void extract(String pdfPath) {
|
35
|
+
try {
|
36
|
+
doc = PDDocument.load(pdfPath, false);
|
37
|
+
decrypt(doc);
|
38
|
+
info = doc.getDocumentInformation();
|
39
|
+
String val = extractInfo();
|
40
|
+
if (val != null) System.out.println(val);
|
41
|
+
doc.close();
|
42
|
+
} catch(IOException e) {
|
43
|
+
System.out.println(e.getMessage());
|
44
|
+
System.exit(1);
|
45
|
+
}
|
46
|
+
}
|
47
|
+
|
48
|
+
// Use the PDDocumentInformation object to fetch metadata values as strings.
|
49
|
+
public String extractInfo() throws IOException {
|
50
|
+
switch(Keys.valueOf(key)) {
|
51
|
+
case AUTHOR: return info.getAuthor();
|
52
|
+
case DATE: return new SimpleDateFormat("yyyy-MM-dd").format(info.getCreationDate().getTime());
|
53
|
+
case CREATOR: return info.getCreator();
|
54
|
+
case KEYWORDS: return info.getKeywords();
|
55
|
+
case PRODUCER: return info.getProducer();
|
56
|
+
case SUBJECT: return info.getSubject();
|
57
|
+
case TITLE: return info.getTitle();
|
58
|
+
case LENGTH: return String.valueOf(doc.getNumberOfPages());
|
59
|
+
default: return null;
|
60
|
+
}
|
61
|
+
}
|
62
|
+
|
63
|
+
}
|
@@ -0,0 +1,54 @@
|
|
1
|
+
package org.documentcloud;
|
2
|
+
|
3
|
+
import java.util.List;
|
4
|
+
import java.io.File;
|
5
|
+
import java.io.FileOutputStream;
|
6
|
+
import java.io.IOException;
|
7
|
+
|
8
|
+
import org.apache.pdfbox.pdmodel.PDDocument;
|
9
|
+
import org.apache.pdfbox.util.Splitter;
|
10
|
+
import org.apache.pdfbox.pdfwriter.COSWriter;
|
11
|
+
import org.apache.pdfbox.exceptions.COSVisitorException;
|
12
|
+
|
13
|
+
// Use PDFBox's Splitter to break apart a large PDF into individual pages.
|
14
|
+
public class ExtractPages extends Extractor {
|
15
|
+
|
16
|
+
private PDDocument doc;
|
17
|
+
private String basename;
|
18
|
+
|
19
|
+
// The mainline.
|
20
|
+
public static void main(String[] args) {
|
21
|
+
(new ExtractPages()).run(args);
|
22
|
+
}
|
23
|
+
|
24
|
+
// Extract each page of the given PDF.
|
25
|
+
public void extract(String pdfPath) {
|
26
|
+
try {
|
27
|
+
basename = getBasename(pdfPath);
|
28
|
+
doc = PDDocument.load(pdfPath);
|
29
|
+
decrypt(doc);
|
30
|
+
List pages = (new Splitter()).split(doc);
|
31
|
+
if (pageNumbers != null) {
|
32
|
+
for (Integer num : pageNumbers) writePage((PDDocument) pages.get(num.intValue()- 1), num.intValue());
|
33
|
+
} else {
|
34
|
+
for (int i=0; i<pages.size(); i++) writePage((PDDocument) pages.get(i), i + 1);
|
35
|
+
}
|
36
|
+
doc.close();
|
37
|
+
} catch(Exception e) {
|
38
|
+
System.out.println(e.getMessage());
|
39
|
+
System.exit(1);
|
40
|
+
}
|
41
|
+
}
|
42
|
+
|
43
|
+
// Writes out a page as a single-page PDF.
|
44
|
+
private void writePage(PDDocument page, int pageNumber) throws IOException, COSVisitorException {
|
45
|
+
String pageName = basename + "_" + String.valueOf(pageNumber) + ".pdf";
|
46
|
+
FileOutputStream out = new FileOutputStream(outputFile(pageName));
|
47
|
+
COSWriter writer = new COSWriter(out);
|
48
|
+
writer.write(page);
|
49
|
+
out.close();
|
50
|
+
writer.close();
|
51
|
+
page.close();
|
52
|
+
}
|
53
|
+
|
54
|
+
}
|
@@ -0,0 +1,80 @@
|
|
1
|
+
package org.documentcloud;
|
2
|
+
|
3
|
+
import java.util.List;
|
4
|
+
import java.io.File;
|
5
|
+
import java.io.FileOutputStream;
|
6
|
+
import java.io.IOException;
|
7
|
+
import java.io.OutputStreamWriter;
|
8
|
+
|
9
|
+
import org.apache.pdfbox.pdmodel.PDDocument;
|
10
|
+
import org.apache.pdfbox.util.PDFTextStripper;
|
11
|
+
|
12
|
+
// Uses PDFBox's PDFTextStripper to extract the full, plain, UTF-8 text of a
|
13
|
+
// PDF document. Pass --pages to write out the plain text for each individual
|
14
|
+
// page; --pages-only to omit the text for the entire document.
|
15
|
+
public class ExtractText extends Extractor {
|
16
|
+
|
17
|
+
private PDDocument doc;
|
18
|
+
private String basename;
|
19
|
+
|
20
|
+
// The mainline.
|
21
|
+
public static void main(String[] args) {
|
22
|
+
(new ExtractText()).run(args);
|
23
|
+
}
|
24
|
+
|
25
|
+
// Extract the plain text for a PDF, and write it into the requested output
|
26
|
+
// sizes.
|
27
|
+
public void extract(String pdfPath) {
|
28
|
+
try {
|
29
|
+
basename = getBasename(pdfPath);
|
30
|
+
doc = PDDocument.load(pdfPath, false);
|
31
|
+
decrypt(doc);
|
32
|
+
if (allPages || (pageNumbers != null)) {
|
33
|
+
writePageText();
|
34
|
+
} else {
|
35
|
+
writeFullText();
|
36
|
+
}
|
37
|
+
doc.close();
|
38
|
+
} catch(IOException e) {
|
39
|
+
System.out.println(e.getMessage());
|
40
|
+
System.exit(1);
|
41
|
+
}
|
42
|
+
}
|
43
|
+
|
44
|
+
// Write out the extracted full text for the entire PDF.
|
45
|
+
public void writeFullText() throws IOException {
|
46
|
+
OutputStreamWriter output = new OutputStreamWriter(new FileOutputStream(outputFile(basename + ".txt")), "UTF-8");
|
47
|
+
extractTextForPageRange(output, 1, Integer.MAX_VALUE);
|
48
|
+
output.close();
|
49
|
+
}
|
50
|
+
|
51
|
+
// Write out the full text for each specified page.
|
52
|
+
public void writePageText() throws IOException {
|
53
|
+
if (pageNumbers != null) {
|
54
|
+
for (Integer num : pageNumbers) writePageText(num.intValue());
|
55
|
+
} else {
|
56
|
+
int pages = doc.getNumberOfPages();
|
57
|
+
for (int i=1; i<=pages; i++) writePageText(i);
|
58
|
+
}
|
59
|
+
}
|
60
|
+
|
61
|
+
// Write out the full text for a single page.
|
62
|
+
public void writePageText(int pageNumber) throws IOException {
|
63
|
+
File outfile = outputFile(basename + "_" + String.valueOf(pageNumber) + ".txt");
|
64
|
+
OutputStreamWriter output = new OutputStreamWriter(new FileOutputStream(outfile), "UTF-8");
|
65
|
+
extractTextForPageRange(output, pageNumber, pageNumber);
|
66
|
+
output.close();
|
67
|
+
}
|
68
|
+
|
69
|
+
// Internal method to writes out text from the PDF for a given page range
|
70
|
+
// to a provided output stream.
|
71
|
+
private void extractTextForPageRange(OutputStreamWriter output, int startPage, int endPage) throws IOException {
|
72
|
+
PDFTextStripper stripper = new PDFTextStripper("UTF-8");
|
73
|
+
stripper.setSortByPosition(false);
|
74
|
+
stripper.setShouldSeparateByBeads(true);
|
75
|
+
stripper.setStartPage(startPage);
|
76
|
+
stripper.setEndPage(endPage);
|
77
|
+
stripper.writeText(doc, output);
|
78
|
+
}
|
79
|
+
|
80
|
+
}
|
@@ -0,0 +1,91 @@
|
|
1
|
+
package org.documentcloud;
|
2
|
+
|
3
|
+
import java.io.File;
|
4
|
+
import java.util.List;
|
5
|
+
import java.util.Arrays;
|
6
|
+
import java.util.ArrayList;
|
7
|
+
import java.util.Iterator;
|
8
|
+
|
9
|
+
import org.apache.pdfbox.pdmodel.PDDocument;
|
10
|
+
|
11
|
+
// The base Extractor class contains the common functionality needed to run
|
12
|
+
// command-line extractors.
|
13
|
+
public abstract class Extractor {
|
14
|
+
|
15
|
+
protected File output;
|
16
|
+
protected boolean allPages = false;
|
17
|
+
protected ArrayList<Integer> pageNumbers;
|
18
|
+
|
19
|
+
// Running an extractor consists of converting the arguments array into a
|
20
|
+
// more manageable List, parsing arguments, and extracting pdfs.
|
21
|
+
public void run(String[] arguments) {
|
22
|
+
List<String> args = new ArrayList<String>(Arrays.asList(arguments));
|
23
|
+
parseArguments(args);
|
24
|
+
Iterator<String> iter = args.iterator();
|
25
|
+
while(iter.hasNext()) extract(iter.next());
|
26
|
+
}
|
27
|
+
|
28
|
+
// Subclasses must override "extract" to perform their specific extraction.
|
29
|
+
public abstract void extract(String pdfPath);
|
30
|
+
|
31
|
+
// The default "parseArguments" method handles common arguments.
|
32
|
+
protected void parseArguments(List<String> args) {
|
33
|
+
int dirLoc = args.indexOf("--output");
|
34
|
+
if (dirLoc >= 0) {
|
35
|
+
output = new File(args.remove(dirLoc + 1));
|
36
|
+
args.remove(dirLoc);
|
37
|
+
}
|
38
|
+
int pagesLoc = args.indexOf("--pages");
|
39
|
+
if (pagesLoc >= 0) {
|
40
|
+
parsePages(args.remove(pagesLoc + 1));
|
41
|
+
args.remove(pagesLoc);
|
42
|
+
}
|
43
|
+
}
|
44
|
+
|
45
|
+
// Utility function to get the basename of a file path.
|
46
|
+
// After File.basename in Ruby.
|
47
|
+
public String getBasename(String pdfPath) {
|
48
|
+
String basename = new File(pdfPath).getName();
|
49
|
+
return basename.substring(0, basename.lastIndexOf('.'));
|
50
|
+
}
|
51
|
+
|
52
|
+
// Get a reference to an output file, placed inside any configured directories,
|
53
|
+
// while ensuring that parent directories exist.
|
54
|
+
public File outputFile(String path) {
|
55
|
+
File file = output != null ? new File(output, path) : new File(path);
|
56
|
+
File parent = file.getParentFile();
|
57
|
+
if (parent != null) parent.mkdirs();
|
58
|
+
return file;
|
59
|
+
}
|
60
|
+
|
61
|
+
// Decrypt a non-passworded but still encrypted document.
|
62
|
+
public void decrypt(PDDocument doc) {
|
63
|
+
if (!doc.isEncrypted()) return;
|
64
|
+
try {
|
65
|
+
doc.decrypt("");
|
66
|
+
} catch (Exception e) {
|
67
|
+
System.out.println("Error decrypting document, details: " + e.getMessage());
|
68
|
+
System.exit(1);
|
69
|
+
}
|
70
|
+
}
|
71
|
+
|
72
|
+
private void parsePages(String pageList) {
|
73
|
+
if (pageList.equals("all")) {
|
74
|
+
allPages = true;
|
75
|
+
return;
|
76
|
+
}
|
77
|
+
pageNumbers = new ArrayList<Integer>();
|
78
|
+
String[] groups = pageList.split(",");
|
79
|
+
for (String group : groups) {
|
80
|
+
if (group.contains("-")) {
|
81
|
+
String[] range = group.split("-");
|
82
|
+
int start = Integer.parseInt(range[0]);
|
83
|
+
int end = Integer.parseInt(range[1]);
|
84
|
+
for (int i=start; i<=end; i++) pageNumbers.add(new Integer(i));
|
85
|
+
} else {
|
86
|
+
pageNumbers.add(new Integer(Integer.parseInt(group)));
|
87
|
+
}
|
88
|
+
}
|
89
|
+
}
|
90
|
+
|
91
|
+
}
|
@@ -0,0 +1,31 @@
|
|
1
|
+
module Docsplit

  # Helpers for turning a Ruby options hash into command-line arguments.
  module ArgumentParser

    # Flatten an options hash into an arguments string suitable for the command
    # line. Every pair becomes "--key value"; values are normalized with
    # normalize_value but are not shell-escaped.
    def parse_options(opts)
      opts.map {|k, v| ["--#{k}", normalize_value(v)] }.flatten.join(' ')
    end

    # Normalize a value in an options hash for the command line.
    # Ranges look like: 1-10, Arrays like: 1,2,3.
    # The value passed in is never modified.
    def normalize_value(value)
      case value
      when Range then normalize_range(value)
      # Non-destructive map: map! would rewrite the caller's array in place.
      when Array then value.map {|v| v.is_a?(Range) ? normalize_range(v) : v }.join(',')
      else            value.to_s
      end
    end

    # Serialize a Ruby range into its command-line equivalent. The last
    # element actually covered is used, so 1...4 becomes "1-3"; an empty range
    # collapses to its starting value.
    def normalize_range(range)
      arr = range.to_a
      arr.empty? ? range.first.to_s : "#{range.first}-#{arr.last}"
    end

  end

  extend ArgumentParser

end
|
@@ -0,0 +1,107 @@
|
|
1
|
+
require 'optparse'
|
2
|
+
require File.expand_path(File.dirname(__FILE__) + '/../docsplit')
|
3
|
+
|
4
|
+
module Docsplit
|
5
|
+
|
6
|
+
# A single command-line utility to separate a PDF into all its component parts.
|
7
|
+
class CommandLine
|
8
|
+
|
9
|
+
BANNER = <<-EOS
|
10
|
+
docsplit breaks apart documents into images, text, or individual pages.
|
11
|
+
It wraps PDFBox, GraphicsMagick, and JODConverter.
|
12
|
+
|
13
|
+
Usage:
|
14
|
+
docsplit COMMAND [OPTIONS] path/to/doc.pdf
|
15
|
+
Main commands:
|
16
|
+
pages, images, text, pdf.
|
17
|
+
Metadata commands:
|
18
|
+
author, date, creator, keywords, producer, subject, title, length.
|
19
|
+
|
20
|
+
Example:
|
21
|
+
docsplit images --size 700x --format jpg document.pdf
|
22
|
+
|
23
|
+
Dependencies:
|
24
|
+
Ruby, Java, A working GraphicsMagick (gm) command,
|
25
|
+
and a headless OpenOffice server for non-PDF documents.
|
26
|
+
|
27
|
+
Options:
|
28
|
+
(size, pages and format can take comma-separated values)
|
29
|
+
|
30
|
+
EOS
|
31
|
+
|
32
|
+
# Creating a CommandLine runs off of the contents of ARGV.
|
33
|
+
def initialize
|
34
|
+
parse_options
|
35
|
+
cmd = ARGV.shift
|
36
|
+
@command = cmd && cmd.to_sym
|
37
|
+
run
|
38
|
+
end
|
39
|
+
|
40
|
+
# Hand the parsed command over to the Docsplit Ruby API. Metadata commands
# print their value (when there is one); anything unrecognized shows the
# usage message. An ExtractionFailed error is printed and turned into exit
# status 1.
def run
  case @command
  when :images then Docsplit.extract_images(ARGV, @options)
  when :pages  then Docsplit.extract_pages(ARGV, @options)
  when :text   then Docsplit.extract_text(ARGV, @options)
  when :pdf    then Docsplit.extract_pdf(ARGV, @options)
  when *METADATA_KEYS
    value = Docsplit.send("extract_#{@command}", ARGV, @options)
    puts value unless value.nil?
  else
    usage
  end
rescue ExtractionFailed => e
  puts e.message.chomp
  exit(1)
end
|
61
|
+
|
62
|
+
# Print out the usage help message.
|
63
|
+
def usage
|
64
|
+
puts "\n#{@option_parser}\n"
|
65
|
+
exit
|
66
|
+
end
|
67
|
+
|
68
|
+
|
69
|
+
private
|
70
|
+
|
71
|
+
# Use the OptionParser library to parse all supported options out of ARGV
# (destructively), leaving them in @options formatted for the Ruby API.
# Every option declares its argument as optional ("[SIZE]"), so a flag given
# without a value yields nil; the list-valued options are then left unset so
# that the defaults apply instead of failing on nil. Any option-parsing error
# is printed and turned into exit status 1.
def parse_options
  @options = {}
  @option_parser = OptionParser.new do |opts|
    opts.on('-o', '--output [DIR]', 'set the directory for all output') do |dir|
      @options[:output] = dir
    end
    opts.on('-p', '--pages [PAGES]', "extract specific pages (eg: 5-10)") do |pages|
      @options[:pages] = pages
    end
    opts.on('-s', '--size [SIZE]', 'set a fixed size (eg: 50x75)') do |sizes|
      @options[:size] = sizes.split(',') if sizes
    end
    opts.on('-f', '--format [FORMAT]', 'set image format (pdf, jpg, gif...)') do |formats|
      @options[:format] = formats.split(',') if formats
    end
    opts.on_tail('-v', '--version', 'display docsplit version') do
      puts "docsplit version #{Docsplit::VERSION}"
      exit
    end
    opts.on_tail('-h', '--help', 'display this help message') do
      usage
    end
  end
  @option_parser.banner = BANNER
  begin
    @option_parser.parse!(ARGV)
  # ParseError is the common parent of InvalidOption, InvalidArgument,
  # AmbiguousOption and friends, none of which should produce a backtrace.
  rescue OptionParser::ParseError => e
    puts e.message
    exit(1)
  end
end
|
104
|
+
|
105
|
+
end
|
106
|
+
|
107
|
+
end
|
@@ -0,0 +1,94 @@
|
|
1
|
+
module Docsplit

  # Delegates to GraphicsMagick in order to convert PDF documents into
  # nicely sized images.
  class ImageExtractor

    DENSITY_ARG     = "-density 150"
    DEFAULT_FORMAT  = :png

    # Extract a list of PDFs as rasterized page images, according to the
    # configuration in options (:output, :pages, :format, :size). Every PDF is
    # converted once per size/format combination.
    def extract(pdfs, options)
      @pdfs = [pdfs].flatten
      extract_options(options)
      @pdfs.each do |pdf|
        @sizes.each do |size|
          @formats.each {|format| convert(pdf, size, format) }
        end
      end
    end

    # Convert a single PDF into page images at the specified size and format.
    # When more than one size was requested, each size gets its own
    # subdirectory of the output directory. Raises ExtractionFailed carrying
    # the combined stdout/stderr of "gm" when the command does not succeed.
    def convert(pdf, size, format)
      basename  = File.basename(pdf, File.extname(pdf))
      subfolder = @sizes.length > 1 ? size.to_s : ''
      directory = File.join(@output, subfolder)
      # File.exists? was removed in Ruby 3.2; File.exist? works everywhere.
      FileUtils.mkdir_p(directory) unless File.exist?(directory)
      out_file  = File.join(directory, "#{basename}_%05d.#{format}")
      cmd = "gm convert #{DENSITY_ARG} #{resize_arg(size)} #{quality_arg(format)} \"#{pdf}#{pages_arg}\" \"#{out_file}\" 2>&1"
      result = `#{cmd}`.chomp
      raise ExtractionFailed, result unless $?.success?
      renumber_images(out_file, format)
    end


    private

    # Extract the relevant GraphicsMagick options from the options hash.
    # A missing size is represented as [nil], meaning "do not resize".
    def extract_options(options)
      @output  = options[:output] || '.'
      @pages   = options[:pages]
      @formats = [options[:format] || DEFAULT_FORMAT].flatten
      @sizes   = [options[:size]].flatten.compact
      @sizes   = [nil] if @sizes.empty?
    end

    # Generate the resize argument.
    def resize_arg(size)
      size.nil? ? '' : "-resize #{size}"
    end

    # Generate the appropriate quality argument for the image format.
    def quality_arg(format)
      case format.to_s
      when /jpe?g/ then "-quality 85"
      when /png/   then "-quality 100"
      else ""
      end
    end

    # Generate the requested page index into the document: the page
    # specification with every number decremented by one, wrapped in square
    # brackets so it can be appended to the PDF path.
    def pages_arg
      return '' if @pages.nil?
      pages = @pages.gsub(/\d+/) {|digits| (digits.to_i - 1).to_s }
      "[#{pages}]"
    end

    # Generate the expanded, sorted list of requested page numbers. Ranges are
    # expanded over integers rather than strings, so the result does not
    # depend on how the running Ruby iterates a range of numeric strings.
    def page_list
      @pages.split(',').map { |range|
        if range.include?('-')
          bounds = range.split('-')
          (bounds.first.to_i..bounds.last.to_i).to_a
        else
          range.to_i
        end
      }.flatten.sort
    end

    # When GraphicsMagick is through, it will have generated a number of
    # incrementing page images, starting at 0. Renumber them with their correct
    # page numbers. Files are renamed from the highest index down.
    def renumber_images(template, format)
      # Escape the format so it is matched literally inside the pattern.
      suffixer = /_0+(\d+)\.#{Regexp.escape(format.to_s)}\Z/
      images = Dir[template.sub('%05d', '0*')].map do |path|
        index = path[suffixer, 1].to_i
        {:path => path, :index => index, :page_number => index + 1}
      end
      numbers = @pages ? page_list.reverse : nil
      images.sort_by {|i| -i[:page_number] }.each_with_index do |image, i|
        number = numbers ? numbers[i] : image[:page_number]
        FileUtils.mv(image[:path], image[:path].sub(suffixer, "_#{number}.#{format}"))
      end
    end

  end

end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
module Docsplit

  # Include a method to transparently convert non-PDF arguments to temporary
  # PDFs. Allows us to pretend to natively support docs, rtf, ppt, and so on.
  module TransparentPDFs

    # Temporarily convert any non-PDF documents to PDFs before running them
    # through further extraction. Accepts one path or a list of paths and
    # returns a list in which every non-PDF path has been replaced by the path
    # of its converted copy inside the "docsplit" temporary directory.
    def ensure_pdfs(docs)
      [docs].flatten.map do |doc|
        ext = File.extname(doc)
        # Compare case-insensitively so "REPORT.PDF" is recognized as a PDF
        # instead of being sent through the converter.
        if ext.downcase == '.pdf'
          doc
        else
          @tempdir ||= File.join(Dir.tmpdir, 'docsplit')
          extract_pdf([doc], {:output => @tempdir})
          File.join(@tempdir, File.basename(doc, ext) + '.pdf')
        end
      end
    end

  end

  extend TransparentPDFs

end
|
data/lib/docsplit.rb
ADDED
@@ -0,0 +1,78 @@
|
|
1
|
+
# The Docsplit module delegates to the Java PDF extractors, to GraphicsMagick
# (through ImageExtractor) and to JODConverter.
module Docsplit

  VERSION       = '0.1.0'  # Keep in sync with gemspec.

  ROOT          = File.expand_path(File.dirname(__FILE__) + '/..')

  # The wildcard is single-quoted so that the shell passes it to Java verbatim.
  CLASSPATH     = "#{ROOT}/build#{File::PATH_SEPARATOR}#{ROOT}/vendor/'*'"

  LOGGING       = "-Djava.util.logging.config.file=#{ROOT}/vendor/logging.properties"

  HEADLESS      = "-Djava.awt.headless=true"

  METADATA_KEYS = [:author, :date, :creator, :keywords, :producer, :subject, :title, :length]

  # Raise an ExtractionFailed exception when the PDF is encrypted, or otherwise
  # broke.
  class ExtractionFailed < StandardError; end

  # Use the ExtractPages Java class to burst a PDF into single pages.
  def self.extract_pages(pdfs, opts={})
    pdfs = ensure_pdfs(pdfs)
    run "org.documentcloud.ExtractPages", pdfs, opts
  end

  # Use the ExtractText Java class to write out all embedded text.
  def self.extract_text(pdfs, opts={})
    pdfs = ensure_pdfs(pdfs)
    run "org.documentcloud.ExtractText", pdfs, opts
  end

  # Use the ImageExtractor class to rasterize a PDF into each page's image.
  def self.extract_images(pdfs, opts={})
    pdfs = ensure_pdfs(pdfs)
    # Normalize into a copy so the caller's options hash is left untouched.
    opts = opts.merge(:pages => normalize_value(opts[:pages])) if opts[:pages]
    ImageExtractor.new.extract(pdfs, opts)
  end

  # Use JODConverter to extract the documents as PDFs, written to
  # opts[:output] (default: the current directory) as "<basename>.pdf".
  def self.extract_pdf(docs, opts={})
    [docs].flatten.each do |doc|
      basename = File.basename(doc, File.extname(doc))
      target   = "#{opts[:output] || '.'}/#{basename}.pdf"
      # Quote both paths, as "run" does for PDFs, so names with spaces survive.
      run "-jar #{ROOT}/vendor/jodconverter/jodconverter-cli-2.2.2.jar \"#{doc}\" \"#{target}\"", [], {}
    end
  end

  # Define custom methods for each of the metadata keys that we support.
  # Use the ExtractInfo Java class to print out a single bit of metadata.
  # Each method returns the value as a string (nil when there is no output),
  # except for length, which is converted to an integer.
  METADATA_KEYS.each do |key|
    instance_eval <<-EOS, __FILE__, __LINE__ + 1
      def self.extract_#{key}(pdfs, opts={})
        pdfs = ensure_pdfs(pdfs)
        result = run "org.documentcloud.ExtractInfo #{key}", pdfs, opts, true
        :#{key} == :length ? result.to_i : result
      end
    EOS
  end


  # Runs a Java command, with quieted logging, and the classpath set properly.
  # Raises ExtractionFailed with the command's combined output when it does
  # not succeed. Returns true, or, when return_output is set, the output
  # (nil if empty).
  #
  # NOTE: intended for internal use. A bare "private" has no effect on
  # "def self." methods, so this method has always been publicly callable and
  # is left that way for backward compatibility.
  def self.run(command, pdfs, opts, return_output=false)
    pdfs    = [pdfs].flatten.map{|pdf| "\"#{pdf}\""}.join(' ')
    args    = parse_options(opts)
    cmd     = "java #{HEADLESS} #{LOGGING} -cp #{CLASSPATH} #{command} #{args} #{pdfs} 2>&1"
    result  = `#{cmd}`.chomp
    raise ExtractionFailed, result unless $?.success?
    return return_output ? (result.empty? ? nil : result) : true
  end

end
|
73
|
+
|
74
|
+
require 'tmpdir'
|
75
|
+
require 'fileutils'
|
76
|
+
require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
|
77
|
+
require "#{Docsplit::ROOT}/lib/docsplit/argument_parser"
|
78
|
+
require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
|
data/vendor/bcmail.jar
ADDED
Binary file
|
data/vendor/bcprov.jar
ADDED
Binary file
|
Binary file
|
data/vendor/fontbox.jar
ADDED
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
@@ -0,0 +1 @@
|
|
1
|
+
.level=WARNING
|
data/vendor/pdfbox.jar
ADDED
Binary file
|
metadata
ADDED
@@ -0,0 +1,89 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: docsplit
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Jeremy Ashkenas
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2009-12-07 00:00:00 -05:00
|
13
|
+
default_executable:
|
14
|
+
dependencies: []
|
15
|
+
|
16
|
+
description: " Docsplit is a command-line utility and Ruby library for splitting apart\n documents into their component parts: searchable UTF-8 plain text, page\n images or thumbnails in any format, PDFs, single pages, and document\n metadata (title, author, number of pages...)\n"
|
17
|
+
email: jeremy@documentcloud.org
|
18
|
+
executables:
|
19
|
+
- docsplit
|
20
|
+
extensions: []
|
21
|
+
|
22
|
+
extra_rdoc_files: []
|
23
|
+
|
24
|
+
files:
|
25
|
+
- build/org/documentcloud/ExtractInfo$1.class
|
26
|
+
- build/org/documentcloud/ExtractInfo$Keys.class
|
27
|
+
- build/org/documentcloud/ExtractInfo.class
|
28
|
+
- build/org/documentcloud/Extractor.class
|
29
|
+
- build/org/documentcloud/ExtractPages.class
|
30
|
+
- build/org/documentcloud/ExtractText.class
|
31
|
+
- lib/docsplit/argument_parser.rb
|
32
|
+
- lib/docsplit/command_line.rb
|
33
|
+
- lib/docsplit/ExtractInfo.java
|
34
|
+
- lib/docsplit/Extractor.java
|
35
|
+
- lib/docsplit/ExtractPages.java
|
36
|
+
- lib/docsplit/ExtractText.java
|
37
|
+
- lib/docsplit/image_extractor.rb
|
38
|
+
- lib/docsplit/transparent_pdfs.rb
|
39
|
+
- lib/docsplit.rb
|
40
|
+
- bin/docsplit
|
41
|
+
- vendor/bcmail.jar
|
42
|
+
- vendor/bcprov.jar
|
43
|
+
- vendor/commons-logging.jar
|
44
|
+
- vendor/fontbox.jar
|
45
|
+
- vendor/jodconverter/commons-cli-1.2.jar
|
46
|
+
- vendor/jodconverter/commons-io-1.4.jar
|
47
|
+
- vendor/jodconverter/jodconverter-2.2.2.jar
|
48
|
+
- vendor/jodconverter/jodconverter-cli-2.2.2.jar
|
49
|
+
- vendor/jodconverter/juh-3.0.1.jar
|
50
|
+
- vendor/jodconverter/jurt-3.0.1.jar
|
51
|
+
- vendor/jodconverter/ridl-3.0.1.jar
|
52
|
+
- vendor/jodconverter/slf4j-api-1.5.6.jar
|
53
|
+
- vendor/jodconverter/slf4j-jdk14-1.5.6.jar
|
54
|
+
- vendor/jodconverter/unoil-3.0.1.jar
|
55
|
+
- vendor/logging.properties
|
56
|
+
- vendor/pdfbox.jar
|
57
|
+
- docsplit.gemspec
|
58
|
+
- LICENSE
|
59
|
+
- README
|
60
|
+
has_rdoc: false
|
61
|
+
homepage: http://documentcloud.github.com/docsplit/
|
62
|
+
licenses: []
|
63
|
+
|
64
|
+
post_install_message:
|
65
|
+
rdoc_options: []
|
66
|
+
|
67
|
+
require_paths:
|
68
|
+
- lib
|
69
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
70
|
+
requirements:
|
71
|
+
- - ">="
|
72
|
+
- !ruby/object:Gem::Version
|
73
|
+
version: "0"
|
74
|
+
version:
|
75
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
76
|
+
requirements:
|
77
|
+
- - ">="
|
78
|
+
- !ruby/object:Gem::Version
|
79
|
+
version: "0"
|
80
|
+
version:
|
81
|
+
requirements: []
|
82
|
+
|
83
|
+
rubyforge_project: docsplit
|
84
|
+
rubygems_version: 1.3.5
|
85
|
+
signing_key:
|
86
|
+
specification_version: 3
|
87
|
+
summary: Break Apart Documents into Images, Text, Pages and PDFs
|
88
|
+
test_files: []
|
89
|
+
|