docsplit 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +22 -0
- data/README +22 -0
- data/bin/docsplit +5 -0
- data/build/org/documentcloud/ExtractInfo$1.class +0 -0
- data/build/org/documentcloud/ExtractInfo$Keys.class +0 -0
- data/build/org/documentcloud/ExtractInfo.class +0 -0
- data/build/org/documentcloud/ExtractPages.class +0 -0
- data/build/org/documentcloud/ExtractText.class +0 -0
- data/build/org/documentcloud/Extractor.class +0 -0
- data/docsplit.gemspec +25 -0
- data/lib/docsplit/ExtractInfo.java +63 -0
- data/lib/docsplit/ExtractPages.java +54 -0
- data/lib/docsplit/ExtractText.java +80 -0
- data/lib/docsplit/Extractor.java +91 -0
- data/lib/docsplit/argument_parser.rb +31 -0
- data/lib/docsplit/command_line.rb +107 -0
- data/lib/docsplit/image_extractor.rb +94 -0
- data/lib/docsplit/transparent_pdfs.rb +26 -0
- data/lib/docsplit.rb +78 -0
- data/vendor/bcmail.jar +0 -0
- data/vendor/bcprov.jar +0 -0
- data/vendor/commons-logging.jar +0 -0
- data/vendor/fontbox.jar +0 -0
- data/vendor/jodconverter/commons-cli-1.2.jar +0 -0
- data/vendor/jodconverter/commons-io-1.4.jar +0 -0
- data/vendor/jodconverter/jodconverter-2.2.2.jar +0 -0
- data/vendor/jodconverter/jodconverter-cli-2.2.2.jar +0 -0
- data/vendor/jodconverter/juh-3.0.1.jar +0 -0
- data/vendor/jodconverter/jurt-3.0.1.jar +0 -0
- data/vendor/jodconverter/ridl-3.0.1.jar +0 -0
- data/vendor/jodconverter/slf4j-api-1.5.6.jar +0 -0
- data/vendor/jodconverter/slf4j-jdk14-1.5.6.jar +0 -0
- data/vendor/jodconverter/unoil-3.0.1.jar +0 -0
- data/vendor/logging.properties +1 -0
- data/vendor/pdfbox.jar +0 -0
- metadata +89 -0
data/LICENSE
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2009 Jeremy Ashkenas, DocumentCloud
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person
|
4
|
+
obtaining a copy of this software and associated documentation
|
5
|
+
files (the "Software"), to deal in the Software without
|
6
|
+
restriction, including without limitation the rights to use,
|
7
|
+
copy, modify, merge, publish, distribute, sublicense, and/or sell
|
8
|
+
copies of the Software, and to permit persons to whom the
|
9
|
+
Software is furnished to do so, subject to the following
|
10
|
+
conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be
|
13
|
+
included in all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
16
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
17
|
+
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
18
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
19
|
+
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
20
|
+
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
21
|
+
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
22
|
+
OTHER DEALINGS IN THE SOFTWARE.
|
data/README
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
==
|
2
|
+
__ ___ __
|
3
|
+
____/ /___ ______________ / (_) /_
|
4
|
+
/ __ / __ \/ ___/ ___/ __ \/ / / __/
|
5
|
+
/ /_/ / /_/ / /__(__ ) /_/ / / / /_
|
6
|
+
\____/\____/\___/____/ .___/_/_/\__/
|
7
|
+
/_/
|
8
|
+
|
9
|
+
Docsplit is a command-line utility and Ruby library for splitting apart
|
10
|
+
documents into their component parts: searchable UTF-8 plain text, page
|
11
|
+
images or thumbnails in any format, PDFs, single pages, and document
|
12
|
+
metadata (title, author, number of pages...)
|
13
|
+
|
14
|
+
Installation:
|
15
|
+
gem install docsplit
|
16
|
+
|
17
|
+
For documentation, usage, and examples, see:
|
18
|
+
http://documentcloud.github.com/docsplit/
|
19
|
+
|
20
|
+
To suggest a feature or report a bug:
|
21
|
+
http://github.com/documentcloud/docsplit/issues/
|
22
|
+
|
data/bin/docsplit
ADDED
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
data/docsplit.gemspec
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
# Gem specification for docsplit. Declares the gem's identity, the files that
# are packaged (compiled Java classes, Ruby sources, vendored jars) and the
# `docsplit` executable.
Gem::Specification.new do |s|
  s.name      = 'docsplit'
  s.version   = '0.1.0'         # Keep in sync with Docsplit::VERSION in lib/docsplit.rb
  s.date      = '2009-12-07'

  s.homepage    = "http://documentcloud.github.com/docsplit/"
  s.summary     = "Break Apart Documents into Images, Text, Pages and PDFs"
  s.description = <<-EOS
    Docsplit is a command-line utility and Ruby library for splitting apart
    documents into their component parts: searchable UTF-8 plain text, page
    images or thumbnails in any format, PDFs, single pages, and document
    metadata (title, author, number of pages...)
  EOS

  s.authors           = ['Jeremy Ashkenas']
  s.email             = 'jeremy@documentcloud.org'
  s.rubyforge_project = 'docsplit'
  s.has_rdoc          = false

  # The bundled LICENSE file is the MIT license text; declare it so that it
  # shows up in the gem metadata.
  s.licenses          = ['MIT']

  s.require_paths     = ['lib']
  s.executables       = ['docsplit']

  # Globs are resolved relative to the directory the gem is built from.
  s.files = Dir['build/**/*', 'lib/**/*', 'bin/*', 'vendor/**/*',
                'docsplit.gemspec', 'LICENSE', 'README']
end
|
@@ -0,0 +1,63 @@
|
|
1
|
+
package org.documentcloud;
|
2
|
+
|
3
|
+
import java.util.List;
|
4
|
+
import java.io.IOException;
|
5
|
+
import java.text.SimpleDateFormat;
|
6
|
+
|
7
|
+
import org.apache.pdfbox.pdmodel.PDDocument;
|
8
|
+
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
|
9
|
+
|
10
|
+
// Extracts metadata from a PDF file.
|
11
|
+
public class ExtractInfo extends Extractor {
|
12
|
+
|
13
|
+
private PDDocument doc;
|
14
|
+
private PDDocumentInformation info;
|
15
|
+
private String key;
|
16
|
+
|
17
|
+
// The list of metadata keys we know how to extract.
|
18
|
+
private enum Keys {
|
19
|
+
AUTHOR, DATE, CREATOR, KEYWORDS, PRODUCER, SUBJECT, TITLE, LENGTH
|
20
|
+
}
|
21
|
+
|
22
|
+
// The mainline.
|
23
|
+
public static void main(String[] args) {
|
24
|
+
(new ExtractInfo()).run(args);
|
25
|
+
}
|
26
|
+
|
27
|
+
// The first argument is always the name of the metadata key.
|
28
|
+
protected void parseArguments(List<String> args) {
|
29
|
+
super.parseArguments(args);
|
30
|
+
key = args.remove(0).toUpperCase();
|
31
|
+
}
|
32
|
+
|
33
|
+
// Extract the configured bit of metadata from a PDF, decrypting if necessary.
|
34
|
+
public void extract(String pdfPath) {
|
35
|
+
try {
|
36
|
+
doc = PDDocument.load(pdfPath, false);
|
37
|
+
decrypt(doc);
|
38
|
+
info = doc.getDocumentInformation();
|
39
|
+
String val = extractInfo();
|
40
|
+
if (val != null) System.out.println(val);
|
41
|
+
doc.close();
|
42
|
+
} catch(IOException e) {
|
43
|
+
System.out.println(e.getMessage());
|
44
|
+
System.exit(1);
|
45
|
+
}
|
46
|
+
}
|
47
|
+
|
48
|
+
// Use the PDDocumentInformation object to fetch metadata values as strings.
|
49
|
+
public String extractInfo() throws IOException {
|
50
|
+
switch(Keys.valueOf(key)) {
|
51
|
+
case AUTHOR: return info.getAuthor();
|
52
|
+
case DATE: return new SimpleDateFormat("yyyy-MM-dd").format(info.getCreationDate().getTime());
|
53
|
+
case CREATOR: return info.getCreator();
|
54
|
+
case KEYWORDS: return info.getKeywords();
|
55
|
+
case PRODUCER: return info.getProducer();
|
56
|
+
case SUBJECT: return info.getSubject();
|
57
|
+
case TITLE: return info.getTitle();
|
58
|
+
case LENGTH: return String.valueOf(doc.getNumberOfPages());
|
59
|
+
default: return null;
|
60
|
+
}
|
61
|
+
}
|
62
|
+
|
63
|
+
}
|
@@ -0,0 +1,54 @@
|
|
1
|
+
package org.documentcloud;
|
2
|
+
|
3
|
+
import java.util.List;
|
4
|
+
import java.io.File;
|
5
|
+
import java.io.FileOutputStream;
|
6
|
+
import java.io.IOException;
|
7
|
+
|
8
|
+
import org.apache.pdfbox.pdmodel.PDDocument;
|
9
|
+
import org.apache.pdfbox.util.Splitter;
|
10
|
+
import org.apache.pdfbox.pdfwriter.COSWriter;
|
11
|
+
import org.apache.pdfbox.exceptions.COSVisitorException;
|
12
|
+
|
13
|
+
// Use PDFBox's Splitter to break apart a large PDF into individual pages.
|
14
|
+
public class ExtractPages extends Extractor {
|
15
|
+
|
16
|
+
private PDDocument doc;
|
17
|
+
private String basename;
|
18
|
+
|
19
|
+
// The mainline.
|
20
|
+
public static void main(String[] args) {
|
21
|
+
(new ExtractPages()).run(args);
|
22
|
+
}
|
23
|
+
|
24
|
+
// Extract each page of the given PDF.
|
25
|
+
public void extract(String pdfPath) {
|
26
|
+
try {
|
27
|
+
basename = getBasename(pdfPath);
|
28
|
+
doc = PDDocument.load(pdfPath);
|
29
|
+
decrypt(doc);
|
30
|
+
List pages = (new Splitter()).split(doc);
|
31
|
+
if (pageNumbers != null) {
|
32
|
+
for (Integer num : pageNumbers) writePage((PDDocument) pages.get(num.intValue()- 1), num.intValue());
|
33
|
+
} else {
|
34
|
+
for (int i=0; i<pages.size(); i++) writePage((PDDocument) pages.get(i), i + 1);
|
35
|
+
}
|
36
|
+
doc.close();
|
37
|
+
} catch(Exception e) {
|
38
|
+
System.out.println(e.getMessage());
|
39
|
+
System.exit(1);
|
40
|
+
}
|
41
|
+
}
|
42
|
+
|
43
|
+
// Writes out a page as a single-page PDF.
|
44
|
+
private void writePage(PDDocument page, int pageNumber) throws IOException, COSVisitorException {
|
45
|
+
String pageName = basename + "_" + String.valueOf(pageNumber) + ".pdf";
|
46
|
+
FileOutputStream out = new FileOutputStream(outputFile(pageName));
|
47
|
+
COSWriter writer = new COSWriter(out);
|
48
|
+
writer.write(page);
|
49
|
+
out.close();
|
50
|
+
writer.close();
|
51
|
+
page.close();
|
52
|
+
}
|
53
|
+
|
54
|
+
}
|
@@ -0,0 +1,80 @@
|
|
1
|
+
package org.documentcloud;
|
2
|
+
|
3
|
+
import java.util.List;
|
4
|
+
import java.io.File;
|
5
|
+
import java.io.FileOutputStream;
|
6
|
+
import java.io.IOException;
|
7
|
+
import java.io.OutputStreamWriter;
|
8
|
+
|
9
|
+
import org.apache.pdfbox.pdmodel.PDDocument;
|
10
|
+
import org.apache.pdfbox.util.PDFTextStripper;
|
11
|
+
|
12
|
+
// Uses PDFBox's PDFTextStripper to extract the full, plain, UTF-8 text of a
|
13
|
+
// PDF document. Pass --pages to write out the plain text for each individual
|
14
|
+
// page; --pages-only to omit the text for the entire document.
|
15
|
+
public class ExtractText extends Extractor {
|
16
|
+
|
17
|
+
private PDDocument doc;
|
18
|
+
private String basename;
|
19
|
+
|
20
|
+
// The mainline.
|
21
|
+
public static void main(String[] args) {
|
22
|
+
(new ExtractText()).run(args);
|
23
|
+
}
|
24
|
+
|
25
|
+
// Extract the plain text for a PDF, and write it into the requested output
|
26
|
+
// sizes.
|
27
|
+
public void extract(String pdfPath) {
|
28
|
+
try {
|
29
|
+
basename = getBasename(pdfPath);
|
30
|
+
doc = PDDocument.load(pdfPath, false);
|
31
|
+
decrypt(doc);
|
32
|
+
if (allPages || (pageNumbers != null)) {
|
33
|
+
writePageText();
|
34
|
+
} else {
|
35
|
+
writeFullText();
|
36
|
+
}
|
37
|
+
doc.close();
|
38
|
+
} catch(IOException e) {
|
39
|
+
System.out.println(e.getMessage());
|
40
|
+
System.exit(1);
|
41
|
+
}
|
42
|
+
}
|
43
|
+
|
44
|
+
// Write out the extracted full text for the entire PDF.
|
45
|
+
public void writeFullText() throws IOException {
|
46
|
+
OutputStreamWriter output = new OutputStreamWriter(new FileOutputStream(outputFile(basename + ".txt")), "UTF-8");
|
47
|
+
extractTextForPageRange(output, 1, Integer.MAX_VALUE);
|
48
|
+
output.close();
|
49
|
+
}
|
50
|
+
|
51
|
+
// Write out the full text for each specified page.
|
52
|
+
public void writePageText() throws IOException {
|
53
|
+
if (pageNumbers != null) {
|
54
|
+
for (Integer num : pageNumbers) writePageText(num.intValue());
|
55
|
+
} else {
|
56
|
+
int pages = doc.getNumberOfPages();
|
57
|
+
for (int i=1; i<=pages; i++) writePageText(i);
|
58
|
+
}
|
59
|
+
}
|
60
|
+
|
61
|
+
// Write out the full text for a single page.
|
62
|
+
public void writePageText(int pageNumber) throws IOException {
|
63
|
+
File outfile = outputFile(basename + "_" + String.valueOf(pageNumber) + ".txt");
|
64
|
+
OutputStreamWriter output = new OutputStreamWriter(new FileOutputStream(outfile), "UTF-8");
|
65
|
+
extractTextForPageRange(output, pageNumber, pageNumber);
|
66
|
+
output.close();
|
67
|
+
}
|
68
|
+
|
69
|
+
// Internal method to writes out text from the PDF for a given page range
|
70
|
+
// to a provided output stream.
|
71
|
+
private void extractTextForPageRange(OutputStreamWriter output, int startPage, int endPage) throws IOException {
|
72
|
+
PDFTextStripper stripper = new PDFTextStripper("UTF-8");
|
73
|
+
stripper.setSortByPosition(false);
|
74
|
+
stripper.setShouldSeparateByBeads(true);
|
75
|
+
stripper.setStartPage(startPage);
|
76
|
+
stripper.setEndPage(endPage);
|
77
|
+
stripper.writeText(doc, output);
|
78
|
+
}
|
79
|
+
|
80
|
+
}
|
@@ -0,0 +1,91 @@
|
|
1
|
+
package org.documentcloud;
|
2
|
+
|
3
|
+
import java.io.File;
|
4
|
+
import java.util.List;
|
5
|
+
import java.util.Arrays;
|
6
|
+
import java.util.ArrayList;
|
7
|
+
import java.util.Iterator;
|
8
|
+
|
9
|
+
import org.apache.pdfbox.pdmodel.PDDocument;
|
10
|
+
|
11
|
+
// The base Extractor class contains the common functionality needed to run
|
12
|
+
// command-line extractors.
|
13
|
+
public abstract class Extractor {
|
14
|
+
|
15
|
+
protected File output;
|
16
|
+
protected boolean allPages = false;
|
17
|
+
protected ArrayList<Integer> pageNumbers;
|
18
|
+
|
19
|
+
// Running an extractor consists of converting the arguments array into a
|
20
|
+
// more manageable List, parsing arguments, and extracting pdfs.
|
21
|
+
public void run(String[] arguments) {
|
22
|
+
List<String> args = new ArrayList<String>(Arrays.asList(arguments));
|
23
|
+
parseArguments(args);
|
24
|
+
Iterator<String> iter = args.iterator();
|
25
|
+
while(iter.hasNext()) extract(iter.next());
|
26
|
+
}
|
27
|
+
|
28
|
+
// Subclasses must override "extract" to perform their specific extraction.
|
29
|
+
public abstract void extract(String pdfPath);
|
30
|
+
|
31
|
+
// The default "parseArguments" method handles common arguments.
|
32
|
+
protected void parseArguments(List<String> args) {
|
33
|
+
int dirLoc = args.indexOf("--output");
|
34
|
+
if (dirLoc >= 0) {
|
35
|
+
output = new File(args.remove(dirLoc + 1));
|
36
|
+
args.remove(dirLoc);
|
37
|
+
}
|
38
|
+
int pagesLoc = args.indexOf("--pages");
|
39
|
+
if (pagesLoc >= 0) {
|
40
|
+
parsePages(args.remove(pagesLoc + 1));
|
41
|
+
args.remove(pagesLoc);
|
42
|
+
}
|
43
|
+
}
|
44
|
+
|
45
|
+
// Utility function to get the basename of a file path.
|
46
|
+
// After File.basename in Ruby.
|
47
|
+
public String getBasename(String pdfPath) {
|
48
|
+
String basename = new File(pdfPath).getName();
|
49
|
+
return basename.substring(0, basename.lastIndexOf('.'));
|
50
|
+
}
|
51
|
+
|
52
|
+
// Get a reference to an output file, placed inside any configured directories,
|
53
|
+
// while ensuring that parent directories exist.
|
54
|
+
public File outputFile(String path) {
|
55
|
+
File file = output != null ? new File(output, path) : new File(path);
|
56
|
+
File parent = file.getParentFile();
|
57
|
+
if (parent != null) parent.mkdirs();
|
58
|
+
return file;
|
59
|
+
}
|
60
|
+
|
61
|
+
// Decrypt a non-passworded but still encrypted document.
|
62
|
+
public void decrypt(PDDocument doc) {
|
63
|
+
if (!doc.isEncrypted()) return;
|
64
|
+
try {
|
65
|
+
doc.decrypt("");
|
66
|
+
} catch (Exception e) {
|
67
|
+
System.out.println("Error decrypting document, details: " + e.getMessage());
|
68
|
+
System.exit(1);
|
69
|
+
}
|
70
|
+
}
|
71
|
+
|
72
|
+
private void parsePages(String pageList) {
|
73
|
+
if (pageList.equals("all")) {
|
74
|
+
allPages = true;
|
75
|
+
return;
|
76
|
+
}
|
77
|
+
pageNumbers = new ArrayList<Integer>();
|
78
|
+
String[] groups = pageList.split(",");
|
79
|
+
for (String group : groups) {
|
80
|
+
if (group.contains("-")) {
|
81
|
+
String[] range = group.split("-");
|
82
|
+
int start = Integer.parseInt(range[0]);
|
83
|
+
int end = Integer.parseInt(range[1]);
|
84
|
+
for (int i=start; i<=end; i++) pageNumbers.add(new Integer(i));
|
85
|
+
} else {
|
86
|
+
pageNumbers.add(new Integer(Integer.parseInt(group)));
|
87
|
+
}
|
88
|
+
}
|
89
|
+
}
|
90
|
+
|
91
|
+
}
|
@@ -0,0 +1,31 @@
|
|
1
|
+
require 'shellwords'

module Docsplit

  # Helpers for turning Ruby option values into command-line arguments.
  module ArgumentParser

    # Flatten an options hash into an arguments string suitable for the command
    # line, e.g. {:pages => 1..3} becomes "--pages 1-3". Every value is
    # shell-escaped, so values containing spaces or shell metacharacters
    # (an output directory, for instance) arrive as a single argument.
    def parse_options(opts)
      opts.map {|k, v| ["--#{k}", Shellwords.escape(normalize_value(v))] }.flatten.join(' ')
    end

    # Normalize a value in an options hash for the command line.
    # Ranges look like: 1-10, Arrays like: 1,2,3.
    # The value passed in is never modified.
    def normalize_value(value)
      case value
      when Range then normalize_range(value)
      when Array then value.map {|v| v.is_a?(Range) ? normalize_range(v) : v }.join(',')
      else            value.to_s
      end
    end

    # Serialize a Ruby range into its command-line equivalent. Going through
    # to_a makes exclusive ranges (1...4) come out as "1-3"; an empty range
    # collapses to its first element.
    def normalize_range(range)
      arr = range.to_a
      arr.empty? ? range.first.to_s : "#{range.first}-#{arr.last}"
    end

  end

  extend ArgumentParser

end
|
@@ -0,0 +1,107 @@
|
|
1
|
+
require 'optparse'
|
2
|
+
require File.expand_path(File.dirname(__FILE__) + '/../docsplit')
|
3
|
+
|
4
|
+
module Docsplit

  # A single command-line utility to separate a PDF into all its component parts.
  # Instantiating the class parses ARGV and immediately runs the requested
  # command.
  class CommandLine

    BANNER = <<-EOS
docsplit breaks apart documents into images, text, or individual pages.
It wraps PDFBox, GraphicsMagick, and JODConverter.

Usage:
  docsplit COMMAND [OPTIONS] path/to/doc.pdf
  Main commands:
    pages, images, text, pdf.
  Metadata commands:
    author, date, creator, keywords, producer, subject, title, length.

Example:
  docsplit images --size 700x --format jpg document.pdf

Dependencies:
  Ruby, Java, A working GraphicsMagick (gm) command,
  and a headless OpenOffice server for non-PDF documents.

Options:
  (size, pages and format can take comma-separated values)

    EOS

    # Creating a CommandLine runs off of the contents of ARGV: options are
    # parsed (and removed) first, the next argument is the command, and the
    # rest are document paths.
    def initialize
      parse_options
      cmd       = ARGV.shift
      @command  = cmd && cmd.to_sym
      run
    end

    # Delegate to the Docsplit Ruby API to perform all extractions. Metadata
    # commands print their value; an unknown or missing command prints the
    # usage message. Extraction failures are printed and exit with status 1.
    def run
      begin
        case @command
        when :images  then Docsplit.extract_images(ARGV, @options)
        when :pages   then Docsplit.extract_pages(ARGV, @options)
        when :text    then Docsplit.extract_text(ARGV, @options)
        when :pdf     then Docsplit.extract_pdf(ARGV, @options)
        else
          if METADATA_KEYS.include?(@command)
            value = Docsplit.send("extract_#{@command}", ARGV, @options)
            puts value unless value.nil?
          else
            usage
          end
        end
      rescue ExtractionFailed => e
        puts e.message.chomp
        exit(1)
      end
    end

    # Print out the usage help message, then exit.
    def usage
      puts "\n#{@option_parser}\n"
      exit
    end


    private

    # Use the OptionParser library to parse out all supported options. Return
    # options formatted for the Ruby API.
    #
    # Every option takes a mandatory value: a flag without one is meaningless,
    # and with an optional value OptionParser would yield nil, which crashed
    # in the size/format handlers.
    def parse_options
      @options = {}
      @option_parser = OptionParser.new do |opts|
        opts.on('-o', '--output DIR', 'set the directory for all output') do |d|
          @options[:output] = d
        end
        opts.on('-p', '--pages PAGES', "extract specific pages (eg: 5-10)") do |p|
          @options[:pages] = p
        end
        opts.on('-s', '--size SIZE', 'set a fixed size (eg: 50x75)') do |s|
          @options[:size] = s.split(',')
        end
        opts.on('-f', '--format FORMAT', 'set image format (pdf, jpg, gif...)') do |t|
          @options[:format] = t.split(',')
        end
        opts.on_tail('-v', '--version', 'display docsplit version') do
          puts "docsplit version #{Docsplit::VERSION}"
          exit
        end
        opts.on_tail('-h', '--help', 'display this help message') do
          usage
        end
      end
      @option_parser.banner = BANNER
      begin
        @option_parser.parse!(ARGV)
      rescue OptionParser::ParseError => e
        # Covers unknown options as well as options missing their value.
        puts e.message
        exit(1)
      end
    end

  end

end
|
@@ -0,0 +1,94 @@
|
|
1
|
+
require 'shellwords'

module Docsplit

  # Delegates to GraphicsMagick in order to convert PDF documents into
  # nicely sized images.
  class ImageExtractor

    DENSITY_ARG     = "-density 150"
    DEFAULT_FORMAT  = :png

    # Extract a list of PDFs as rasterized page images, according to the
    # configuration in options (:output, :pages, :format, :size). Every
    # PDF is converted once per requested size and format.
    def extract(pdfs, options)
      @pdfs = [pdfs].flatten
      extract_options(options)
      @pdfs.each {|p| @sizes.each {|s| @formats.each {|f| convert(p, s, f) }}}
    end

    # Convert a single PDF into page images at the specified size and format.
    # When more than one size was requested, each size gets its own
    # subdirectory of the output directory. Raises ExtractionFailed with
    # GraphicsMagick's output if the command fails.
    def convert(pdf, size, format)
      basename  = File.basename(pdf, File.extname(pdf))
      subfolder = @sizes.length > 1 ? size.to_s : ''
      directory = File.join(@output, subfolder)
      FileUtils.mkdir_p(directory) # no-op when the directory already exists
      out_file  = File.join(directory, "#{basename}_%05d.#{format}")
      # Escape both paths so that spaces, quotes and other shell
      # metacharacters in file names cannot break (or inject into) the command.
      source    = Shellwords.escape("#{pdf}#{pages_arg}")
      target    = Shellwords.escape(out_file)
      cmd = "gm convert #{DENSITY_ARG} #{resize_arg(size)} #{quality_arg(format)} #{source} #{target} 2>&1"
      result = `#{cmd}`.chomp
      raise ExtractionFailed, result unless $?.success?
      renumber_images(out_file, format)
    end


    private

    # Extract the relevant GraphicsMagick options from the options hash.
    # No size means a single conversion at the native size.
    def extract_options(options)
      @output  = options[:output] || '.'
      @pages   = options[:pages]
      @formats = [options[:format] || DEFAULT_FORMAT].flatten
      @sizes   = [options[:size]].flatten.compact
      @sizes   = [nil] if @sizes.empty?
    end

    # Generate the resize argument.
    def resize_arg(size)
      size.nil? ? '' : "-resize #{size}"
    end

    # Generate the appropriate quality argument for the image format.
    def quality_arg(format)
      case format.to_s
      when /jpe?g/ then "-quality 85"
      when /png/   then "-quality 100"
      else ""
      end
    end

    # Generate the requested page index into the document. The page list is
    # 1-based ("1-3,5"), GraphicsMagick's index is 0-based ("[0-2,4]").
    def pages_arg
      return '' if @pages.nil?
      pages = @pages.gsub(/\d+/) {|digits| (digits.to_i - 1).to_s }
      "[#{pages}]"
    end

    # Generate the expanded, sorted list of requested page numbers.
    def page_list
      @pages.split(',').map { |range|
        if range.include?('-')
          bounds = range.split('-')
          (bounds.first.to_i..bounds.last.to_i).to_a
        else
          range.to_i
        end
      }.flatten.sort
    end

    # When GraphicsMagick is through, it will have generated a number of
    # incrementing page images, starting at 0. Renumber them with their correct
    # page numbers. Files are renamed from the highest index down, so that a
    # rename never lands on a file that has yet to be processed.
    def renumber_images(template, format)
      suffixer = /_0+(\d+)\.#{Regexp.escape(format.to_s)}\Z/
      images = Dir[template.sub('%05d', '0*')].map do |path|
        index = path[suffixer, 1].to_i
        {:path => path, :index => index, :page_number => index + 1}
      end
      numbers = @pages ? page_list.reverse : nil
      images.sort_by {|i| -i[:page_number] }.each_with_index do |image, i|
        number = numbers ? numbers[i] : image[:page_number]
        FileUtils.mv(image[:path], image[:path].sub(suffixer, "_#{number}.#{format}"))
      end
    end

  end

end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
module Docsplit

  # Include a method to transparently convert non-PDF arguments to temporary
  # PDFs. Allows us to pretend to natively support docs, rtf, ppt, and so on.
  module TransparentPDFs

    # Temporarily convert any non-PDF documents to PDFs before running them
    # through further extraction. Accepts a single path or a list of paths and
    # returns a list of PDF paths: PDFs are passed through untouched, anything
    # else is converted with extract_pdf into a "docsplit" directory under the
    # system temp directory.
    def ensure_pdfs(docs)
      [docs].flatten.map do |doc|
        ext = File.extname(doc)
        # Compare case-insensitively so that "REPORT.PDF" is not sent off for
        # a pointless conversion.
        if ext.downcase == '.pdf'
          doc
        else
          @tempdir ||= File.join(Dir.tmpdir, 'docsplit')
          extract_pdf([doc], {:output => @tempdir})
          File.join(@tempdir, File.basename(doc, ext) + '.pdf')
        end
      end
    end

  end

  extend TransparentPDFs

end
|
data/lib/docsplit.rb
ADDED
@@ -0,0 +1,78 @@
|
|
1
|
+
require 'shellwords'

# The Docsplit module delegates to the Java PDF extractors (pages, text,
# metadata), to JODConverter (conversion to PDF) and to the ImageExtractor
# class (page images).
module Docsplit

  VERSION       = '0.1.0'  # Keep in sync with gemspec.

  ROOT          = File.expand_path(File.dirname(__FILE__) + '/..')

  # The '*' is quoted so that the shell hands the wildcard to Java untouched.
  CLASSPATH     = "#{ROOT}/build#{File::PATH_SEPARATOR}#{ROOT}/vendor/'*'"

  LOGGING       = "-Djava.util.logging.config.file=#{ROOT}/vendor/logging.properties"

  HEADLESS      = "-Djava.awt.headless=true"

  METADATA_KEYS = [:author, :date, :creator, :keywords, :producer, :subject, :title, :length]

  # Raise an ExtractionFailed exception when the PDF is encrypted, or otherwise
  # broke. The message is the output of the failed command.
  class ExtractionFailed < StandardError; end

  # Use the ExtractPages Java class to burst a PDF into single pages.
  def self.extract_pages(pdfs, opts={})
    pdfs = ensure_pdfs(pdfs)
    run "org.documentcloud.ExtractPages", pdfs, opts
  end

  # Use the ExtractText Java class to write out all embedded text.
  def self.extract_text(pdfs, opts={})
    pdfs = ensure_pdfs(pdfs)
    run "org.documentcloud.ExtractText", pdfs, opts
  end

  # Use the ImageExtractor class to rasterize a PDF into each page's image.
  # The caller's options hash is left untouched; the normalized page list is
  # passed along in a copy.
  def self.extract_images(pdfs, opts={})
    pdfs = ensure_pdfs(pdfs)
    opts = opts.merge(:pages => normalize_value(opts[:pages])) if opts[:pages]
    ImageExtractor.new.extract(pdfs, opts)
  end

  # Use JODConverter to extract the documents as PDFs, written to
  # opts[:output] (default: the current directory) under the document's
  # basename. Paths are shell-escaped so that names with spaces work.
  def self.extract_pdf(docs, opts={})
    out_dir = opts[:output] || '.'
    jar     = "#{ROOT}/vendor/jodconverter/jodconverter-cli-2.2.2.jar"
    [docs].flatten.each do |doc|
      basename = File.basename(doc, File.extname(doc))
      target   = "#{out_dir}/#{basename}.pdf"
      run "-jar #{Shellwords.escape(jar)} #{Shellwords.escape(doc.to_s)} #{Shellwords.escape(target)}", [], {}
    end
  end

  # Define custom methods for each of the metadata keys that we support.
  # Use the ExtractInfo Java class to print out a single bit of metadata.
  # Each method returns the value as a string (nil when there is none), except
  # extract_length, which returns an integer.
  METADATA_KEYS.each do |key|
    instance_eval <<-EOS
      def self.extract_#{key}(pdfs, opts={})
        pdfs = ensure_pdfs(pdfs)
        result = run "org.documentcloud.ExtractInfo #{key}", pdfs, opts, true
        :#{key} == :length ? result.to_i : result
      end
    EOS
  end


  # NOTE: `private` has no effect on methods defined with `def self.`, so
  # `run` is callable from outside the module, as it always has been.
  private

  # Runs a Java command, with quieted logging, and the classpath set properly.
  # Returns the command's output (nil if empty) when return_output is set,
  # true otherwise. Raises ExtractionFailed with the combined stdout/stderr
  # output when the command exits with a non-zero status.
  def self.run(command, pdfs, opts, return_output=false)
    pdfs    = [pdfs].flatten.map {|pdf| Shellwords.escape(pdf.to_s) }.join(' ')
    args    = parse_options(opts)
    cmd     = "java #{HEADLESS} #{LOGGING} -cp #{CLASSPATH} #{command} #{args} #{pdfs} 2>&1"
    result  = `#{cmd}`.chomp
    raise ExtractionFailed, result unless $?.success?
    return return_output ? (result.empty? ? nil : result) : true
  end

end
|
73
|
+
|
74
|
+
require 'tmpdir'
|
75
|
+
require 'fileutils'
|
76
|
+
require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
|
77
|
+
require "#{Docsplit::ROOT}/lib/docsplit/argument_parser"
|
78
|
+
require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
|
data/vendor/bcmail.jar
ADDED
Binary file
|
data/vendor/bcprov.jar
ADDED
Binary file
|
Binary file
|
data/vendor/fontbox.jar
ADDED
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
@@ -0,0 +1 @@
|
|
1
|
+
.level=WARNING
|
data/vendor/pdfbox.jar
ADDED
Binary file
|
metadata
ADDED
@@ -0,0 +1,89 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: docsplit
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Jeremy Ashkenas
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2009-12-07 00:00:00 -05:00
|
13
|
+
default_executable:
|
14
|
+
dependencies: []
|
15
|
+
|
16
|
+
description: " Docsplit is a command-line utility and Ruby library for splitting apart\n documents into their component parts: searchable UTF-8 plain text, page\n images or thumbnails in any format, PDFs, single pages, and document\n metadata (title, author, number of pages...)\n"
|
17
|
+
email: jeremy@documentcloud.org
|
18
|
+
executables:
|
19
|
+
- docsplit
|
20
|
+
extensions: []
|
21
|
+
|
22
|
+
extra_rdoc_files: []
|
23
|
+
|
24
|
+
files:
|
25
|
+
- build/org/documentcloud/ExtractInfo$1.class
|
26
|
+
- build/org/documentcloud/ExtractInfo$Keys.class
|
27
|
+
- build/org/documentcloud/ExtractInfo.class
|
28
|
+
- build/org/documentcloud/Extractor.class
|
29
|
+
- build/org/documentcloud/ExtractPages.class
|
30
|
+
- build/org/documentcloud/ExtractText.class
|
31
|
+
- lib/docsplit/argument_parser.rb
|
32
|
+
- lib/docsplit/command_line.rb
|
33
|
+
- lib/docsplit/ExtractInfo.java
|
34
|
+
- lib/docsplit/Extractor.java
|
35
|
+
- lib/docsplit/ExtractPages.java
|
36
|
+
- lib/docsplit/ExtractText.java
|
37
|
+
- lib/docsplit/image_extractor.rb
|
38
|
+
- lib/docsplit/transparent_pdfs.rb
|
39
|
+
- lib/docsplit.rb
|
40
|
+
- bin/docsplit
|
41
|
+
- vendor/bcmail.jar
|
42
|
+
- vendor/bcprov.jar
|
43
|
+
- vendor/commons-logging.jar
|
44
|
+
- vendor/fontbox.jar
|
45
|
+
- vendor/jodconverter/commons-cli-1.2.jar
|
46
|
+
- vendor/jodconverter/commons-io-1.4.jar
|
47
|
+
- vendor/jodconverter/jodconverter-2.2.2.jar
|
48
|
+
- vendor/jodconverter/jodconverter-cli-2.2.2.jar
|
49
|
+
- vendor/jodconverter/juh-3.0.1.jar
|
50
|
+
- vendor/jodconverter/jurt-3.0.1.jar
|
51
|
+
- vendor/jodconverter/ridl-3.0.1.jar
|
52
|
+
- vendor/jodconverter/slf4j-api-1.5.6.jar
|
53
|
+
- vendor/jodconverter/slf4j-jdk14-1.5.6.jar
|
54
|
+
- vendor/jodconverter/unoil-3.0.1.jar
|
55
|
+
- vendor/logging.properties
|
56
|
+
- vendor/pdfbox.jar
|
57
|
+
- docsplit.gemspec
|
58
|
+
- LICENSE
|
59
|
+
- README
|
60
|
+
has_rdoc: false
|
61
|
+
homepage: http://documentcloud.github.com/docsplit/
|
62
|
+
licenses: []
|
63
|
+
|
64
|
+
post_install_message:
|
65
|
+
rdoc_options: []
|
66
|
+
|
67
|
+
require_paths:
|
68
|
+
- lib
|
69
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
70
|
+
requirements:
|
71
|
+
- - ">="
|
72
|
+
- !ruby/object:Gem::Version
|
73
|
+
version: "0"
|
74
|
+
version:
|
75
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
76
|
+
requirements:
|
77
|
+
- - ">="
|
78
|
+
- !ruby/object:Gem::Version
|
79
|
+
version: "0"
|
80
|
+
version:
|
81
|
+
requirements: []
|
82
|
+
|
83
|
+
rubyforge_project: docsplit
|
84
|
+
rubygems_version: 1.3.5
|
85
|
+
signing_key:
|
86
|
+
specification_version: 3
|
87
|
+
summary: Break Apart Documents into Images, Text, Pages and PDFs
|
88
|
+
test_files: []
|
89
|
+
|