docsplit 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/LICENSE CHANGED
@@ -1,5 +1,4 @@
1
1
  JODConverter ius licensed under the LGPL: gnu.org/licenses/lgpl.html
2
- PDFBox is licensed under the Apache 2 License: apache.org/licenses/LICENSE-2.0
3
2
 
4
3
  Copyright (c) 2009 Jeremy Ashkenas, DocumentCloud
5
4
 
data/docsplit.gemspec CHANGED
@@ -1,7 +1,7 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = 'docsplit'
3
- s.version = '0.2.0' # Keep version in sync with docsplit.rb
4
- s.date = '2010-7-29'
3
+ s.version = '0.3.0' # Keep version in sync with docsplit.rb
4
+ s.date = '2010-8-5'
5
5
 
6
6
  s.homepage = "http://documentcloud.github.com/docsplit/"
7
7
  s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
data/lib/docsplit.rb CHANGED
@@ -1,7 +1,7 @@
1
1
  # The Docsplit module delegates to the Java PDF extractors.
2
2
  module Docsplit
3
3
 
4
- VERSION = '0.2.0' # Keep in sync with gemspec.
4
+ VERSION = '0.3.0' # Keep in sync with gemspec.
5
5
 
6
6
  ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
7
7
 
@@ -13,6 +13,20 @@ module Docsplit
13
13
 
14
14
  METADATA_KEYS = [:author, :date, :creator, :keywords, :producer, :subject, :title, :length]
15
15
 
16
+ DEPENDENCIES = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :tesseract => false}
17
+
18
+ # Check for all dependencies, and warn of their absence.
19
+ dirs = ENV['PATH'].split(File::PATH_SEPARATOR)
20
+ DEPENDENCIES.each_key do |dep|
21
+ dirs.each do |dir|
22
+ if File.executable?(File.join(dir, dep.to_s))
23
+ DEPENDENCIES[dep] = true
24
+ break
25
+ end
26
+ end
27
+ warn "Warning: Docsplit dependency #{dep} not found." if !DEPENDENCIES[dep]
28
+ end
29
+
16
30
  # Raise an ExtractionFailed exception when the PDF is encrypted, or otherwise
17
31
  # broke.
18
32
  class ExtractionFailed < StandardError; end
@@ -20,7 +34,7 @@ module Docsplit
20
34
  # Use the ExtractPages Java class to burst a PDF into single pages.
21
35
  def self.extract_pages(pdfs, opts={})
22
36
  pdfs = ensure_pdfs(pdfs)
23
- run "org.documentcloud.ExtractPages", pdfs, opts
37
+ PageExtractor.new.extract(pdfs, opts)
24
38
  end
25
39
 
26
40
  # Use the ExtractText Java class to write out all embedded text.
@@ -50,8 +64,7 @@ module Docsplit
50
64
  instance_eval <<-EOS
51
65
  def self.extract_#{key}(pdfs, opts={})
52
66
  pdfs = ensure_pdfs(pdfs)
53
- result = run "org.documentcloud.ExtractInfo #{key}", pdfs, opts, true
54
- :#{key} == :length ? result.to_i : result
67
+ InfoExtractor.new.extract(:#{key}, pdfs, opts)
55
68
  end
56
69
  EOS
57
70
  end
@@ -62,18 +75,28 @@ module Docsplit
62
75
  # Runs a Java command, with quieted logging, and the classpath set properly.
63
76
  def self.run(command, pdfs, opts, return_output=false)
64
77
  pdfs = [pdfs].flatten.map{|pdf| "\"#{pdf}\""}.join(' ')
65
- args = parse_options(opts)
66
- cmd = "java #{HEADLESS} #{LOGGING} -cp #{CLASSPATH} #{command} #{args} #{pdfs} 2>&1"
78
+ cmd = "java #{HEADLESS} #{LOGGING} -cp #{CLASSPATH} #{command} #{pdfs} 2>&1"
67
79
  result = `#{cmd}`.chomp
68
80
  raise ExtractionFailed, result if $? != 0
69
81
  return return_output ? (result.empty? ? nil : result) : true
70
82
  end
71
83
 
84
+ # Normalize a value in an options hash for the command line.
85
+ # Ranges look like: 1-10, Arrays like: 1,2,3.
86
+ def self.normalize_value(value)
87
+ case value
88
+ when Range then normalize_range(value)
89
+ when Array then value.map! {|v| v.is_a?(Range) ? normalize_range(v) : v }.join(',')
90
+ else value.to_s
91
+ end
92
+ end
93
+
72
94
  end
73
95
 
74
96
  require 'tmpdir'
75
97
  require 'fileutils'
76
98
  require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
77
- require "#{Docsplit::ROOT}/lib/docsplit/argument_parser"
78
99
  require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
79
100
  require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"
101
+ require "#{Docsplit::ROOT}/lib/docsplit/page_extractor"
102
+ require "#{Docsplit::ROOT}/lib/docsplit/info_extractor"
@@ -8,7 +8,7 @@ module Docsplit
8
8
 
9
9
  BANNER = <<-EOS
10
10
  docsplit breaks apart documents into images, text, or individual pages.
11
- It wraps PDFBox, GraphicsMagick, and JODConverter.
11
+ It wraps GraphicsMagick, Poppler, PDFTK, and JODConverter.
12
12
 
13
13
  Usage:
14
14
  docsplit COMMAND [OPTIONS] path/to/doc.pdf
@@ -71,7 +71,7 @@ Options:
71
71
  # Use the OptionParser library to parse out all supported options. Return
72
72
  # options formatted for the Ruby API.
73
73
  def parse_options
74
- @options = {}
74
+ @options = {:ocr => :default}
75
75
  @option_parser = OptionParser.new do |opts|
76
76
  opts.on('-o', '--output [DIR]', 'set the directory for all output') do |d|
77
77
  @options[:output] = d
@@ -85,8 +85,14 @@ Options:
85
85
  opts.on('-f', '--format [FORMAT]', 'set image format (pdf, jpg, gif...)') do |t|
86
86
  @options[:format] = t.split(',')
87
87
  end
88
+ opts.on('--[no-]ocr', 'force OCR to be used, or disable OCR') do |o|
89
+ @options[:ocr] = o
90
+ end
91
+ opts.on('-r', '--rolling', 'generate images from each previous image') do |r|
92
+ @options[:rolling] = true
93
+ end
88
94
  opts.on_tail('-v', '--version', 'display docsplit version') do
89
- puts "docsplit version #{Docsplit::VERSION}"
95
+ puts "Docsplit version #{Docsplit::VERSION}"
90
96
  exit
91
97
  end
92
98
  opts.on_tail('-h', '--help', 'display this help message') do
@@ -4,26 +4,37 @@ module Docsplit
4
4
  # nicely sized images.
5
5
  class ImageExtractor
6
6
 
7
- DENSITY_ARG = "-density 150"
8
- MEMORY_ARGS = "-limit memory 128MiB -limit map 256MiB"
9
- DEFAULT_FORMAT = :png
7
+ DENSITY_ARG = "-density 150"
8
+ MEMORY_ARGS = "-limit memory 128MiB -limit map 256MiB"
9
+ DEFAULT_FORMAT = :png
10
10
 
11
11
  # Extract a list of PDFs as rasterized page images, according to the
12
12
  # configuration in options.
13
13
  def extract(pdfs, options)
14
14
  @pdfs = [pdfs].flatten
15
15
  extract_options(options)
16
- @pdfs.each {|p| @sizes.each {|s| @formats.each {|f| convert(p, s, f) }}}
16
+ @pdfs.each do |pdf|
17
+ previous = nil
18
+ @sizes.each_with_index do |size, i|
19
+ @formats.each {|format| convert(pdf, size, format, previous) }
20
+ previous = size if @rolling
21
+ end
22
+ end
17
23
  end
18
24
 
19
25
  # Convert a single PDF into page images at the specified size and format.
20
- def convert(pdf, size, format)
26
+ def convert(pdf, size, format, previous=nil)
21
27
  basename = File.basename(pdf, File.extname(pdf))
22
- subfolder = @sizes.length > 1 ? size.to_s : ''
23
- directory = File.join(@output, subfolder)
28
+ directory = directory_for(size)
24
29
  FileUtils.mkdir_p(directory) unless File.exists?(directory)
25
30
  out_file = File.join(directory, "#{basename}_%05d.#{format}")
26
- cmd = "gm convert +adjoin #{MEMORY_ARGS} #{DENSITY_ARG} #{resize_arg(size)} #{quality_arg(format)} \"#{pdf}#{pages_arg}\" \"#{out_file}\" 2>&1"
31
+ common = "#{MEMORY_ARGS} #{DENSITY_ARG} #{resize_arg(size)} #{quality_arg(format)}"
32
+ if previous
33
+ FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
34
+ cmd = "OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1"
35
+ else
36
+ cmd = "OMP_NUM_THREADS=2 gm convert +adjoin #{common} \"#{pdf}#{pages_arg}\" \"#{out_file}\" 2>&1"
37
+ end
27
38
  result = `#{cmd}`.chomp
28
39
  raise ExtractionFailed, result if $? != 0
29
40
  renumber_images(out_file, format)
@@ -39,6 +50,14 @@ module Docsplit
39
50
  @formats = [options[:format] || DEFAULT_FORMAT].flatten
40
51
  @sizes = [options[:size]].flatten.compact
41
52
  @sizes = [nil] if @sizes.empty?
53
+ @rolling = !!options[:rolling]
54
+ end
55
+
56
+ # If there's only one size requested, generate the images directly into
57
+ # the output directory. Multiple sizes each get a directory of their own.
58
+ def directory_for(size)
59
+ path = @sizes.length == 1 ? @output : File.join(@output, size)
60
+ File.expand_path(path)
42
61
  end
43
62
 
44
63
  # Generate the resize argument.
@@ -0,0 +1,32 @@
1
+ module Docsplit
2
+
3
+ # Delegates to **pdfinfo** in order to extract information about a PDF file.
4
+ class InfoExtractor
5
+
6
+ # Regex matchers for different bits of information.
7
+ MATCHERS = {
8
+ :author => /^Author:\s+([^\n]+)/,
9
+ :date => /^CreationDate:\s+([^\n]+)/,
10
+ :creator => /^Creator:\s+([^\n]+)/,
11
+ :keywords => /^Keywords:\s+([^\n]+)/,
12
+ :producer => /^Producer:\s+([^\n]+)/,
13
+ :subject => /^Subject:\s+([^\n]+)/,
14
+ :title => /^Title:\s+([^\n]+)/,
15
+ :length => /^Pages:\s+([^\n]+)/,
16
+ }
17
+
18
+ # Pull out a single datum from a pdf.
19
+ def extract(key, pdfs, opts)
20
+ pdf = [pdfs].flatten.first
21
+ cmd = "pdfinfo #{pdf} 2>&1"
22
+ result = `#{cmd}`.chomp
23
+ raise ExtractionFailed, result if $? != 0
24
+ match = result.match(MATCHERS[key])
25
+ answer = match && match[1]
26
+ answer = answer.to_i if answer && key == :length
27
+ answer
28
+ end
29
+
30
+ end
31
+
32
+ end
@@ -0,0 +1,31 @@
1
+ module Docsplit
2
+
3
+ # Delegates to **pdftk** in order to create bursted single pages from
4
+ # a PDF document.
5
+ class PageExtractor
6
+
7
+ # Burst a list of pdfs into single pages, as `pdfname_pagenumber.pdf`.
8
+ def extract(pdfs, opts)
9
+ extract_options opts
10
+ [pdfs].flatten.each do |pdf|
11
+ pdf_name = File.basename(pdf, File.extname(pdf))
12
+ page_path = File.join(@output, "#{pdf_name}_%d.pdf")
13
+ FileUtils.mkdir_p @output unless File.exists?(@output)
14
+ cmd = "pdftk #{pdf} burst output #{page_path} 2>&1"
15
+ result = `#{cmd}`.chomp
16
+ FileUtils.rm('doc_data.txt') if File.exists?('doc_data.txt')
17
+ raise ExtractionFailed, result if $? != 0
18
+ result
19
+ end
20
+ end
21
+
22
+
23
+ private
24
+
25
+ def extract_options(options)
26
+ @output = options[:output] || '.'
27
+ end
28
+
29
+ end
30
+
31
+ end
@@ -1,53 +1,111 @@
1
1
  module Docsplit
2
-
2
+
3
+ # Delegates to **pdftotext** and **tesseract** in order to extract text from
4
+ # PDF documents. The `--ocr` and `--no-ocr` flags can be used to force or
5
+ # forbid OCR extraction, but by default the heuristic works like this:
6
+ #
7
+ # * Check for the presence of fonts in the PDF. If no fonts are detected,
8
+ # OCR is used automatically.
9
+ # * Extract the text of each page with **pdftotext**, if the page has less
10
+ # than 100 bytes of text (a scanned image page, or a page that just
11
+ # contains a filename and a page number), then add it to the list of
12
+ # `@pages_to_ocr`.
13
+ # * Re-OCR each page in the `@pages_to_ocr` list at the end.
14
+ #
3
15
  class TextExtractor
4
-
5
- PAGE_COUNT_MATCHER = /Pages:\s+(\d+?)\n/
6
-
16
+
17
+ NO_TEXT_DETECTED = /---------\n\Z/
18
+
19
+ OCR_FLAGS = '-density 200x200 -colorspace GRAY'
20
+
21
+ MIN_TEXT_PER_PAGE = 100 # in bytes
22
+
23
+ def initialize
24
+ @tiffs_generated = false
25
+ @pages_to_ocr = []
26
+ end
27
+
28
+ # Extract text from a list of PDFs.
7
29
  def extract(pdfs, opts)
8
30
  extract_options opts
9
- pdfs = [pdfs].flatten
10
- pdfs.each do |pdf|
11
- pdf_name = File.basename(pdf, File.extname(pdf))
12
- text_path = File.join(@output, "#{pdf_name}.txt")
13
- FileUtils.mkdir_p @output
14
-
15
- if @pages
16
- pages = (@pages == 'all') ? 1..get_pages(pdf) : @pages
17
- pages.each do |page|
18
- extract_page pdf, page, pdf_name
19
- end
31
+ FileUtils.mkdir_p @output unless File.exists?(@output)
32
+ [pdfs].flatten.each do |pdf|
33
+ @pdf_name = File.basename(pdf, File.extname(pdf))
34
+ pages = (@pages == 'all') ? 1..Docsplit.extract_length(pdf) : @pages
35
+ if @force_ocr || (!@forbid_ocr && !contains_text?(pdf))
36
+ extract_from_ocr(pdf, pages)
20
37
  else
21
- cmd = "pdftotext -enc UTF-8 #{pdf} #{text_path}"
22
- result = `#{cmd}`.chomp
23
- raise ExtractionFailed, result if $? != 0
38
+ extract_from_pdf(pdf, pages)
39
+ if !@forbid_ocr && DEPENDENCIES[:tesseract] && !@pages_to_ocr.empty?
40
+ extract_from_ocr(pdf, @pages_to_ocr)
41
+ end
42
+ end
43
+ end
44
+ FileUtils.remove_entry_secure @tempdir if @tempdir
45
+ end
46
+
47
+ # Does a PDF have any text embedded?
48
+ def contains_text?(pdf)
49
+ fonts = `pdffonts #{pdf} 2>&1`
50
+ !fonts.match(NO_TEXT_DETECTED)
51
+ end
52
+
53
+ # Extract a page range worth of text from a PDF, directly.
54
+ def extract_from_pdf(pdf, pages)
55
+ return extract_full(pdf) unless pages
56
+ pages.each {|page| extract_page(pdf, page) }
57
+ end
58
+
59
+ # Extract a page range worth of text from a PDF via OCR.
60
+ def extract_from_ocr(pdf, pages)
61
+ @tempdir ||= Dir.mktmpdir
62
+ base_path = File.join(@output, @pdf_name)
63
+ if pages
64
+ run "gm convert +adjoin #{OCR_FLAGS} #{pdf} #{@tempdir}/#{@pdf_name}_%d.tif 2>&1" unless @tiffs_generated
65
+ @tiffs_generated = true
66
+ pages.each do |page|
67
+ run "tesseract #{@tempdir}/#{@pdf_name}_#{page - 1}.tif #{base_path}_#{page} 2>&1"
24
68
  end
69
+ else
70
+ tiff = "#{@tempdir}/#{@pdf_name}.tif"
71
+ run "gm convert #{OCR_FLAGS} #{pdf} #{tiff} 2>&1"
72
+ run "tesseract #{tiff} #{base_path} -l eng 2>&1"
25
73
  end
26
74
  end
27
-
28
- def extract_page(pdf, page, pdf_name)
29
- text_path = File.join(@output, "#{pdf_name}_#{page}.txt")
30
- cmd = "pdftotext -enc UTF-8 -f #{page} -l #{page} #{pdf} #{text_path}"
31
- result = `#{cmd}`.chomp
75
+
76
+
77
+ private
78
+
79
+ # Run an external process and raise an exception if it fails.
80
+ def run(command)
81
+ result = `#{command}`
32
82
  raise ExtractionFailed, result if $? != 0
33
83
  result
34
84
  end
35
85
 
36
- def get_pages(pdf_path)
37
- info = `pdfinfo #{pdf_path}`
38
- raise ExtractionFailed, result if $? != 0
39
- match = info.match(PAGE_COUNT_MATCHER)
40
- raise ExtractionFailed if match.nil?
41
- match[1].to_i
86
+ # Extract the full contents of a pdf as a single file, directly.
87
+ def extract_full(pdf)
88
+ text_path = File.join(@output, "#{@pdf_name}.txt")
89
+ run "pdftotext -enc UTF-8 #{pdf} #{text_path} 2>&1"
42
90
  end
43
-
44
- private
45
-
91
+
92
+ # Extract the contents of a single page of text, directly, adding it to
93
+ # the `@pages_to_ocr` list if the text length is inadequate.
94
+ def extract_page(pdf, page)
95
+ text_path = File.join(@output, "#{@pdf_name}_#{page}.txt")
96
+ run "pdftotext -enc UTF-8 -f #{page} -l #{page} #{pdf} #{text_path} 2>&1"
97
+ unless @forbid_ocr
98
+ @pages_to_ocr.push(page) if File.read(text_path).length < MIN_TEXT_PER_PAGE
99
+ end
100
+ end
101
+
46
102
  def extract_options(options)
47
- @output = options[:output] || '.'
48
- @pages = options[:pages]
103
+ @output = options[:output] || '.'
104
+ @pages = options[:pages]
105
+ @force_ocr = options[:ocr] == true
106
+ @forbid_ocr = options[:ocr] == false
49
107
  end
50
108
 
51
109
  end
52
-
110
+
53
111
  end
metadata CHANGED
@@ -1,13 +1,12 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: docsplit
3
3
  version: !ruby/object:Gem::Version
4
- hash: 23
5
4
  prerelease: false
6
5
  segments:
7
6
  - 0
8
- - 2
7
+ - 3
9
8
  - 0
10
- version: 0.2.0
9
+ version: 0.3.0
11
10
  platform: ruby
12
11
  authors:
13
12
  - Jeremy Ashkenas
@@ -16,7 +15,7 @@ autorequire:
16
15
  bindir: bin
17
16
  cert_chain: []
18
17
 
19
- date: 2010-07-29 00:00:00 -04:00
18
+ date: 2010-08-05 00:00:00 -04:00
20
19
  default_executable:
21
20
  dependencies: []
22
21
 
@@ -29,27 +28,14 @@ extensions: []
29
28
  extra_rdoc_files: []
30
29
 
31
30
  files:
32
- - build/org/documentcloud/ExtractInfo$1.class
33
- - build/org/documentcloud/ExtractInfo$Keys.class
34
- - build/org/documentcloud/ExtractInfo.class
35
- - build/org/documentcloud/Extractor.class
36
- - build/org/documentcloud/ExtractPages.class
37
- - build/org/documentcloud/ExtractText.class
38
- - lib/docsplit/argument_parser.rb
39
31
  - lib/docsplit/command_line.rb
40
- - lib/docsplit/ExtractInfo.java
41
- - lib/docsplit/Extractor.java
42
- - lib/docsplit/ExtractPages.java
43
- - lib/docsplit/ExtractText.java
44
32
  - lib/docsplit/image_extractor.rb
33
+ - lib/docsplit/info_extractor.rb
34
+ - lib/docsplit/page_extractor.rb
45
35
  - lib/docsplit/text_extractor.rb
46
36
  - lib/docsplit/transparent_pdfs.rb
47
37
  - lib/docsplit.rb
48
38
  - bin/docsplit
49
- - vendor/bcmail.jar
50
- - vendor/bcprov.jar
51
- - vendor/commons-logging.jar
52
- - vendor/fontbox.jar
53
39
  - vendor/jodconverter/commons-cli-1.2.jar
54
40
  - vendor/jodconverter/commons-io-1.4.jar
55
41
  - vendor/jodconverter/jodconverter-2.2.2.jar
@@ -61,11 +47,10 @@ files:
61
47
  - vendor/jodconverter/slf4j-jdk14-1.5.6.jar
62
48
  - vendor/jodconverter/unoil-3.0.1.jar
63
49
  - vendor/logging.properties
64
- - vendor/pdfbox.jar
65
50
  - docsplit.gemspec
66
51
  - LICENSE
67
52
  - README
68
- has_rdoc: true
53
+ has_rdoc: false
69
54
  homepage: http://documentcloud.github.com/docsplit/
70
55
  licenses: []
71
56
 
@@ -75,27 +60,23 @@ rdoc_options: []
75
60
  require_paths:
76
61
  - lib
77
62
  required_ruby_version: !ruby/object:Gem::Requirement
78
- none: false
79
63
  requirements:
80
64
  - - ">="
81
65
  - !ruby/object:Gem::Version
82
- hash: 3
83
66
  segments:
84
67
  - 0
85
68
  version: "0"
86
69
  required_rubygems_version: !ruby/object:Gem::Requirement
87
- none: false
88
70
  requirements:
89
71
  - - ">="
90
72
  - !ruby/object:Gem::Version
91
- hash: 3
92
73
  segments:
93
74
  - 0
94
75
  version: "0"
95
76
  requirements: []
96
77
 
97
78
  rubyforge_project: docsplit
98
- rubygems_version: 1.3.7
79
+ rubygems_version: 1.3.6
99
80
  signing_key:
100
81
  specification_version: 3
101
82
  summary: Break Apart Documents into Images, Text, Pages and PDFs
@@ -1,63 +0,0 @@
1
- package org.documentcloud;
2
-
3
- import java.util.List;
4
- import java.io.IOException;
5
- import java.text.SimpleDateFormat;
6
-
7
- import org.apache.pdfbox.pdmodel.PDDocument;
8
- import org.apache.pdfbox.pdmodel.PDDocumentInformation;
9
-
10
- // Extracts metadata from a PDF file.
11
- public class ExtractInfo extends Extractor {
12
-
13
- private PDDocument doc;
14
- private PDDocumentInformation info;
15
- private String key;
16
-
17
- // The list of metadata keys we know how to extract.
18
- private enum Keys {
19
- AUTHOR, DATE, CREATOR, KEYWORDS, PRODUCER, SUBJECT, TITLE, LENGTH
20
- }
21
-
22
- // The mainline.
23
- public static void main(String[] args) {
24
- (new ExtractInfo()).run(args);
25
- }
26
-
27
- // The first argument is always the name of the metadata key.
28
- protected void parseArguments(List<String> args) {
29
- super.parseArguments(args);
30
- key = args.remove(0).toUpperCase();
31
- }
32
-
33
- // Extract the configured bit of metadata from a PDF, decrypting if necessary.
34
- public void extract(String pdfPath) {
35
- try {
36
- doc = PDDocument.load(pdfPath, false);
37
- decrypt(doc);
38
- info = doc.getDocumentInformation();
39
- String val = extractInfo();
40
- if (val != null) System.out.println(val);
41
- doc.close();
42
- } catch(IOException e) {
43
- System.out.println(e.getMessage());
44
- System.exit(1);
45
- }
46
- }
47
-
48
- // Use the PDDocumentInformation object to fetch metadata values as strings.
49
- public String extractInfo() throws IOException {
50
- switch(Keys.valueOf(key)) {
51
- case AUTHOR: return info.getAuthor();
52
- case DATE: return new SimpleDateFormat("yyyy-MM-dd").format(info.getCreationDate().getTime());
53
- case CREATOR: return info.getCreator();
54
- case KEYWORDS: return info.getKeywords();
55
- case PRODUCER: return info.getProducer();
56
- case SUBJECT: return info.getSubject();
57
- case TITLE: return info.getTitle();
58
- case LENGTH: return String.valueOf(doc.getNumberOfPages());
59
- default: return null;
60
- }
61
- }
62
-
63
- }
@@ -1,54 +0,0 @@
1
- package org.documentcloud;
2
-
3
- import java.util.List;
4
- import java.io.File;
5
- import java.io.FileOutputStream;
6
- import java.io.IOException;
7
-
8
- import org.apache.pdfbox.pdmodel.PDDocument;
9
- import org.apache.pdfbox.util.Splitter;
10
- import org.apache.pdfbox.pdfwriter.COSWriter;
11
- import org.apache.pdfbox.exceptions.COSVisitorException;
12
-
13
- // Use PDFBox's Splitter to break apart a large PDF into individual pages.
14
- public class ExtractPages extends Extractor {
15
-
16
- private PDDocument doc;
17
- private String basename;
18
-
19
- // The mainline.
20
- public static void main(String[] args) {
21
- (new ExtractPages()).run(args);
22
- }
23
-
24
- // Extract each page of the given PDF.
25
- public void extract(String pdfPath) {
26
- try {
27
- basename = getBasename(pdfPath);
28
- doc = PDDocument.load(pdfPath);
29
- decrypt(doc);
30
- List pages = (new Splitter()).split(doc);
31
- if (pageNumbers != null) {
32
- for (Integer num : pageNumbers) writePage((PDDocument) pages.get(num.intValue()- 1), num.intValue());
33
- } else {
34
- for (int i=0; i<pages.size(); i++) writePage((PDDocument) pages.get(i), i + 1);
35
- }
36
- doc.close();
37
- } catch(Exception e) {
38
- System.out.println(e.getMessage());
39
- System.exit(1);
40
- }
41
- }
42
-
43
- // Writes out a page as a single-page PDF.
44
- private void writePage(PDDocument page, int pageNumber) throws IOException, COSVisitorException {
45
- String pageName = basename + "_" + String.valueOf(pageNumber) + ".pdf";
46
- FileOutputStream out = new FileOutputStream(outputFile(pageName));
47
- COSWriter writer = new COSWriter(out);
48
- writer.write(page);
49
- out.close();
50
- writer.close();
51
- page.close();
52
- }
53
-
54
- }
@@ -1,80 +0,0 @@
1
- package org.documentcloud;
2
-
3
- import java.util.List;
4
- import java.io.File;
5
- import java.io.FileOutputStream;
6
- import java.io.IOException;
7
- import java.io.OutputStreamWriter;
8
-
9
- import org.apache.pdfbox.pdmodel.PDDocument;
10
- import org.apache.pdfbox.util.PDFTextStripper;
11
-
12
- // Uses PDFBox's PDFTextStripper to extract the full, plain, UTF-8 text of a
13
- // PDF document. Pass --pages to write out the plain text for each individual
14
- // page; --pages-only to omit the text for the entire document.
15
- public class ExtractText extends Extractor {
16
-
17
- private PDDocument doc;
18
- private String basename;
19
-
20
- // The mainline.
21
- public static void main(String[] args) {
22
- (new ExtractText()).run(args);
23
- }
24
-
25
- // Extract the plain text for a PDF, and write it into the requested output
26
- // sizes.
27
- public void extract(String pdfPath) {
28
- try {
29
- basename = getBasename(pdfPath);
30
- doc = PDDocument.load(pdfPath, false);
31
- decrypt(doc);
32
- if (allPages || (pageNumbers != null)) {
33
- writePageText();
34
- } else {
35
- writeFullText();
36
- }
37
- doc.close();
38
- } catch(IOException e) {
39
- System.out.println(e.getMessage());
40
- System.exit(1);
41
- }
42
- }
43
-
44
- // Write out the extracted full text for the entire PDF.
45
- public void writeFullText() throws IOException {
46
- OutputStreamWriter output = new OutputStreamWriter(new FileOutputStream(outputFile(basename + ".txt")), "UTF-8");
47
- extractTextForPageRange(output, 1, Integer.MAX_VALUE);
48
- output.close();
49
- }
50
-
51
- // Write out the full text for each specified page.
52
- public void writePageText() throws IOException {
53
- if (pageNumbers != null) {
54
- for (Integer num : pageNumbers) writePageText(num.intValue());
55
- } else {
56
- int pages = doc.getNumberOfPages();
57
- for (int i=1; i<=pages; i++) writePageText(i);
58
- }
59
- }
60
-
61
- // Write out the full text for a single page.
62
- public void writePageText(int pageNumber) throws IOException {
63
- File outfile = outputFile(basename + "_" + String.valueOf(pageNumber) + ".txt");
64
- OutputStreamWriter output = new OutputStreamWriter(new FileOutputStream(outfile), "UTF-8");
65
- extractTextForPageRange(output, pageNumber, pageNumber);
66
- output.close();
67
- }
68
-
69
- // Internal method to writes out text from the PDF for a given page range
70
- // to a provided output stream.
71
- private void extractTextForPageRange(OutputStreamWriter output, int startPage, int endPage) throws IOException {
72
- PDFTextStripper stripper = new PDFTextStripper("UTF-8");
73
- stripper.setSortByPosition(false);
74
- stripper.setShouldSeparateByBeads(true);
75
- stripper.setStartPage(startPage);
76
- stripper.setEndPage(endPage);
77
- stripper.writeText(doc, output);
78
- }
79
-
80
- }
@@ -1,91 +0,0 @@
1
- package org.documentcloud;
2
-
3
- import java.io.File;
4
- import java.util.List;
5
- import java.util.Arrays;
6
- import java.util.ArrayList;
7
- import java.util.Iterator;
8
-
9
- import org.apache.pdfbox.pdmodel.PDDocument;
10
-
11
- // The base Extractor class contains the common functionality needed to run
12
- // command-line extractors.
13
- public abstract class Extractor {
14
-
15
- protected File output;
16
- protected boolean allPages = false;
17
- protected ArrayList<Integer> pageNumbers;
18
-
19
- // Running an extractor consists of converting the arguments array into a
20
- // more manageable List, parsing arguments, and extracting pdfs.
21
- public void run(String[] arguments) {
22
- List<String> args = new ArrayList<String>(Arrays.asList(arguments));
23
- parseArguments(args);
24
- Iterator<String> iter = args.iterator();
25
- while(iter.hasNext()) extract(iter.next());
26
- }
27
-
28
- // Subclasses must override "extract" to perform their specific extraction.
29
- public abstract void extract(String pdfPath);
30
-
31
- // The default "parseArguments" method handles common arguments.
32
- protected void parseArguments(List<String> args) {
33
- int dirLoc = args.indexOf("--output");
34
- if (dirLoc >= 0) {
35
- output = new File(args.remove(dirLoc + 1));
36
- args.remove(dirLoc);
37
- }
38
- int pagesLoc = args.indexOf("--pages");
39
- if (pagesLoc >= 0) {
40
- parsePages(args.remove(pagesLoc + 1));
41
- args.remove(pagesLoc);
42
- }
43
- }
44
-
45
- // Utility function to get the basename of a file path.
46
- // After File.basename in Ruby.
47
- public String getBasename(String pdfPath) {
48
- String basename = new File(pdfPath).getName();
49
- return basename.substring(0, basename.lastIndexOf('.'));
50
- }
51
-
52
- // Get a reference to an output file, placed inside any configured directories,
53
- // while ensuring that parent directories exist.
54
- public File outputFile(String path) {
55
- File file = output != null ? new File(output, path) : new File(path);
56
- File parent = file.getParentFile();
57
- if (parent != null) parent.mkdirs();
58
- return file;
59
- }
60
-
61
- // Decrypt a non-passworded but still encrypted document.
62
- public void decrypt(PDDocument doc) {
63
- if (!doc.isEncrypted()) return;
64
- try {
65
- doc.decrypt("");
66
- } catch (Exception e) {
67
- System.out.println("Error decrypting document, details: " + e.getMessage());
68
- System.exit(1);
69
- }
70
- }
71
-
72
- private void parsePages(String pageList) {
73
- if (pageList.equals("all")) {
74
- allPages = true;
75
- return;
76
- }
77
- pageNumbers = new ArrayList<Integer>();
78
- String[] groups = pageList.split(",");
79
- for (String group : groups) {
80
- if (group.contains("-")) {
81
- String[] range = group.split("-");
82
- int start = Integer.parseInt(range[0]);
83
- int end = Integer.parseInt(range[1]);
84
- for (int i=start; i<=end; i++) pageNumbers.add(new Integer(i));
85
- } else {
86
- pageNumbers.add(new Integer(Integer.parseInt(group)));
87
- }
88
- }
89
- }
90
-
91
- }
@@ -1,31 +0,0 @@
1
- module Docsplit
2
-
3
- module ArgumentParser
4
-
5
- # Flatten an options hash into an arguments string suitable for the command
6
- # line.
7
- def parse_options(opts)
8
- opts.map {|k, v| ["--#{k}", normalize_value(v)] }.flatten.join(' ')
9
- end
10
-
11
- # Normalize a value in an options hash for the command line.
12
- # Ranges look like: 1-10, Arrays like: 1,2,3.
13
- def normalize_value(value)
14
- case value
15
- when Range then normalize_range(value)
16
- when Array then value.map! {|v| v.is_a?(Range) ? normalize_range(v) : v }.join(',')
17
- else value.to_s
18
- end
19
- end
20
-
21
- # Serialize a Ruby range into it's command-line equivalent.
22
- def normalize_range(range)
23
- arr = range.to_a
24
- arr.empty? ? range.first.to_s : "#{range.first}-#{arr.last}"
25
- end
26
-
27
- end
28
-
29
- extend ArgumentParser
30
-
31
- end
data/vendor/bcmail.jar DELETED
Binary file
data/vendor/bcprov.jar DELETED
Binary file
Binary file
data/vendor/fontbox.jar DELETED
Binary file
data/vendor/pdfbox.jar DELETED
Binary file