docsplit 0.2.0 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
data/LICENSE CHANGED
@@ -1,5 +1,4 @@
1
1
  JODConverter is licensed under the LGPL: gnu.org/licenses/lgpl.html
2
- PDFBox is licensed under the Apache 2 License: apache.org/licenses/LICENSE-2.0
3
2
 
4
3
  Copyright (c) 2009 Jeremy Ashkenas, DocumentCloud
5
4
 
data/docsplit.gemspec CHANGED
@@ -1,7 +1,7 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = 'docsplit'
3
- s.version = '0.2.0' # Keep version in sync with docsplit.rb
4
- s.date = '2010-7-29'
3
+ s.version = '0.3.0' # Keep version in sync with docsplit.rb
4
+ s.date = '2010-8-5'
5
5
 
6
6
  s.homepage = "http://documentcloud.github.com/docsplit/"
7
7
  s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
data/lib/docsplit.rb CHANGED
@@ -1,7 +1,7 @@
1
1
  # The Docsplit module delegates to the Java PDF extractors.
2
2
  module Docsplit
3
3
 
4
- VERSION = '0.2.0' # Keep in sync with gemspec.
4
+ VERSION = '0.3.0' # Keep in sync with gemspec.
5
5
 
6
6
  ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
7
7
 
@@ -13,6 +13,20 @@ module Docsplit
13
13
 
14
14
  METADATA_KEYS = [:author, :date, :creator, :keywords, :producer, :subject, :title, :length]
15
15
 
16
+ DEPENDENCIES = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :tesseract => false}
17
+
18
+ # Check for all dependencies, and warn of their absence.
19
+ dirs = ENV['PATH'].split(File::PATH_SEPARATOR)
20
+ DEPENDENCIES.each_key do |dep|
21
+ dirs.each do |dir|
22
+ if File.executable?(File.join(dir, dep.to_s))
23
+ DEPENDENCIES[dep] = true
24
+ break
25
+ end
26
+ end
27
+ warn "Warning: Docsplit dependency #{dep} not found." if !DEPENDENCIES[dep]
28
+ end
29
+
16
30
  # Raise an ExtractionFailed exception when the PDF is encrypted, or otherwise
17
31
  # broke.
18
32
  class ExtractionFailed < StandardError; end
@@ -20,7 +34,7 @@ module Docsplit
20
34
  # Use the ExtractPages Java class to burst a PDF into single pages.
21
35
  def self.extract_pages(pdfs, opts={})
22
36
  pdfs = ensure_pdfs(pdfs)
23
- run "org.documentcloud.ExtractPages", pdfs, opts
37
+ PageExtractor.new.extract(pdfs, opts)
24
38
  end
25
39
 
26
40
  # Use the ExtractText Java class to write out all embedded text.
@@ -50,8 +64,7 @@ module Docsplit
50
64
  instance_eval <<-EOS
51
65
  def self.extract_#{key}(pdfs, opts={})
52
66
  pdfs = ensure_pdfs(pdfs)
53
- result = run "org.documentcloud.ExtractInfo #{key}", pdfs, opts, true
54
- :#{key} == :length ? result.to_i : result
67
+ InfoExtractor.new.extract(:#{key}, pdfs, opts)
55
68
  end
56
69
  EOS
57
70
  end
@@ -62,18 +75,28 @@ module Docsplit
62
75
  # Runs a Java command, with quieted logging, and the classpath set properly.
63
76
  def self.run(command, pdfs, opts, return_output=false)
64
77
  pdfs = [pdfs].flatten.map{|pdf| "\"#{pdf}\""}.join(' ')
65
- args = parse_options(opts)
66
- cmd = "java #{HEADLESS} #{LOGGING} -cp #{CLASSPATH} #{command} #{args} #{pdfs} 2>&1"
78
+ cmd = "java #{HEADLESS} #{LOGGING} -cp #{CLASSPATH} #{command} #{pdfs} 2>&1"
67
79
  result = `#{cmd}`.chomp
68
80
  raise ExtractionFailed, result if $? != 0
69
81
  return return_output ? (result.empty? ? nil : result) : true
70
82
  end
71
83
 
84
+ # Normalize a value in an options hash for the command line.
85
+ # Ranges look like: 1-10, Arrays like: 1,2,3.
86
+ def self.normalize_value(value)
87
+ case value
88
+ when Range then normalize_range(value)
89
+ when Array then value.map! {|v| v.is_a?(Range) ? normalize_range(v) : v }.join(',')
90
+ else value.to_s
91
+ end
92
+ end
93
+
72
94
  end
73
95
 
74
96
  require 'tmpdir'
75
97
  require 'fileutils'
76
98
  require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
77
- require "#{Docsplit::ROOT}/lib/docsplit/argument_parser"
78
99
  require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
79
100
  require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"
101
+ require "#{Docsplit::ROOT}/lib/docsplit/page_extractor"
102
+ require "#{Docsplit::ROOT}/lib/docsplit/info_extractor"
@@ -8,7 +8,7 @@ module Docsplit
8
8
 
9
9
  BANNER = <<-EOS
10
10
  docsplit breaks apart documents into images, text, or individual pages.
11
- It wraps PDFBox, GraphicsMagick, and JODConverter.
11
+ It wraps GraphicsMagick, Poppler, PDFTK, and JODConverter.
12
12
 
13
13
  Usage:
14
14
  docsplit COMMAND [OPTIONS] path/to/doc.pdf
@@ -71,7 +71,7 @@ Options:
71
71
  # Use the OptionParser library to parse out all supported options. Return
72
72
  # options formatted for the Ruby API.
73
73
  def parse_options
74
- @options = {}
74
+ @options = {:ocr => :default}
75
75
  @option_parser = OptionParser.new do |opts|
76
76
  opts.on('-o', '--output [DIR]', 'set the directory for all output') do |d|
77
77
  @options[:output] = d
@@ -85,8 +85,14 @@ Options:
85
85
  opts.on('-f', '--format [FORMAT]', 'set image format (pdf, jpg, gif...)') do |t|
86
86
  @options[:format] = t.split(',')
87
87
  end
88
+ opts.on('--[no-]ocr', 'force OCR to be used, or disable OCR') do |o|
89
+ @options[:ocr] = o
90
+ end
91
+ opts.on('-r', '--rolling', 'generate images from each previous image') do |r|
92
+ @options[:rolling] = true
93
+ end
88
94
  opts.on_tail('-v', '--version', 'display docsplit version') do
89
- puts "docsplit version #{Docsplit::VERSION}"
95
+ puts "Docsplit version #{Docsplit::VERSION}"
90
96
  exit
91
97
  end
92
98
  opts.on_tail('-h', '--help', 'display this help message') do
@@ -4,26 +4,37 @@ module Docsplit
4
4
  # nicely sized images.
5
5
  class ImageExtractor
6
6
 
7
- DENSITY_ARG = "-density 150"
8
- MEMORY_ARGS = "-limit memory 128MiB -limit map 256MiB"
9
- DEFAULT_FORMAT = :png
7
+ DENSITY_ARG = "-density 150"
8
+ MEMORY_ARGS = "-limit memory 128MiB -limit map 256MiB"
9
+ DEFAULT_FORMAT = :png
10
10
 
11
11
  # Extract a list of PDFs as rasterized page images, according to the
12
12
  # configuration in options.
13
13
  def extract(pdfs, options)
14
14
  @pdfs = [pdfs].flatten
15
15
  extract_options(options)
16
- @pdfs.each {|p| @sizes.each {|s| @formats.each {|f| convert(p, s, f) }}}
16
+ @pdfs.each do |pdf|
17
+ previous = nil
18
+ @sizes.each_with_index do |size, i|
19
+ @formats.each {|format| convert(pdf, size, format, previous) }
20
+ previous = size if @rolling
21
+ end
22
+ end
17
23
  end
18
24
 
19
25
  # Convert a single PDF into page images at the specified size and format.
20
- def convert(pdf, size, format)
26
+ def convert(pdf, size, format, previous=nil)
21
27
  basename = File.basename(pdf, File.extname(pdf))
22
- subfolder = @sizes.length > 1 ? size.to_s : ''
23
- directory = File.join(@output, subfolder)
28
+ directory = directory_for(size)
24
29
  FileUtils.mkdir_p(directory) unless File.exists?(directory)
25
30
  out_file = File.join(directory, "#{basename}_%05d.#{format}")
26
- cmd = "gm convert +adjoin #{MEMORY_ARGS} #{DENSITY_ARG} #{resize_arg(size)} #{quality_arg(format)} \"#{pdf}#{pages_arg}\" \"#{out_file}\" 2>&1"
31
+ common = "#{MEMORY_ARGS} #{DENSITY_ARG} #{resize_arg(size)} #{quality_arg(format)}"
32
+ if previous
33
+ FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
34
+ cmd = "OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1"
35
+ else
36
+ cmd = "OMP_NUM_THREADS=2 gm convert +adjoin #{common} \"#{pdf}#{pages_arg}\" \"#{out_file}\" 2>&1"
37
+ end
27
38
  result = `#{cmd}`.chomp
28
39
  raise ExtractionFailed, result if $? != 0
29
40
  renumber_images(out_file, format)
@@ -39,6 +50,14 @@ module Docsplit
39
50
  @formats = [options[:format] || DEFAULT_FORMAT].flatten
40
51
  @sizes = [options[:size]].flatten.compact
41
52
  @sizes = [nil] if @sizes.empty?
53
+ @rolling = !!options[:rolling]
54
+ end
55
+
56
+ # If there's only one size requested, generate the images directly into
57
+ # the output directory. Multiple sizes each get a directory of their own.
58
+ def directory_for(size)
59
+ path = @sizes.length == 1 ? @output : File.join(@output, size)
60
+ File.expand_path(path)
42
61
  end
43
62
 
44
63
  # Generate the resize argument.
@@ -0,0 +1,32 @@
1
+ module Docsplit
2
+
3
+ # Delegates to **pdfinfo** in order to extract information about a PDF file.
4
+ class InfoExtractor
5
+
6
+ # Regex matchers for different bits of information.
7
+ MATCHERS = {
8
+ :author => /^Author:\s+([^\n]+)/,
9
+ :date => /^CreationDate:\s+([^\n]+)/,
10
+ :creator => /^Creator:\s+([^\n]+)/,
11
+ :keywords => /^Keywords:\s+([^\n]+)/,
12
+ :producer => /^Producer:\s+([^\n]+)/,
13
+ :subject => /^Subject:\s+([^\n]+)/,
14
+ :title => /^Title:\s+([^\n]+)/,
15
+ :length => /^Pages:\s+([^\n]+)/,
16
+ }
17
+
18
+ # Pull out a single datum from a pdf.
19
+ def extract(key, pdfs, opts)
20
+ pdf = [pdfs].flatten.first
21
+ cmd = "pdfinfo #{pdf} 2>&1"
22
+ result = `#{cmd}`.chomp
23
+ raise ExtractionFailed, result if $? != 0
24
+ match = result.match(MATCHERS[key])
25
+ answer = match && match[1]
26
+ answer = answer.to_i if answer && key == :length
27
+ answer
28
+ end
29
+
30
+ end
31
+
32
+ end
@@ -0,0 +1,31 @@
1
+ module Docsplit
2
+
3
+ # Delegates to **pdftk** in order to create bursted single pages from
4
+ # a PDF document.
5
+ class PageExtractor
6
+
7
+ # Burst a list of pdfs into single pages, as `pdfname_pagenumber.pdf`.
8
+ def extract(pdfs, opts)
9
+ extract_options opts
10
+ [pdfs].flatten.each do |pdf|
11
+ pdf_name = File.basename(pdf, File.extname(pdf))
12
+ page_path = File.join(@output, "#{pdf_name}_%d.pdf")
13
+ FileUtils.mkdir_p @output unless File.exists?(@output)
14
+ cmd = "pdftk #{pdf} burst output #{page_path} 2>&1"
15
+ result = `#{cmd}`.chomp
16
+ FileUtils.rm('doc_data.txt') if File.exists?('doc_data.txt')
17
+ raise ExtractionFailed, result if $? != 0
18
+ result
19
+ end
20
+ end
21
+
22
+
23
+ private
24
+
25
+ def extract_options(options)
26
+ @output = options[:output] || '.'
27
+ end
28
+
29
+ end
30
+
31
+ end
@@ -1,53 +1,111 @@
1
1
  module Docsplit
2
-
2
+
3
+ # Delegates to **pdftotext** and **tesseract** in order to extract text from
4
+ # PDF documents. The `--ocr` and `--no-ocr` flags can be used to force or
5
+ # forbid OCR extraction, but by default the heuristic works like this:
6
+ #
7
+ # * Check for the presence of fonts in the PDF. If no fonts are detected,
8
+ # OCR is used automatically.
9
+ # * Extract the text of each page with **pdftotext**; if the page has fewer
10
+ # than 100 bytes of text (a scanned image page, or a page that just
11
+ # contains a filename and a page number), then add it to the list of
12
+ # `@pages_to_ocr`.
13
+ # * Re-OCR each page in the `@pages_to_ocr` list at the end.
14
+ #
3
15
  class TextExtractor
4
-
5
- PAGE_COUNT_MATCHER = /Pages:\s+(\d+?)\n/
6
-
16
+
17
+ NO_TEXT_DETECTED = /---------\n\Z/
18
+
19
+ OCR_FLAGS = '-density 200x200 -colorspace GRAY'
20
+
21
+ MIN_TEXT_PER_PAGE = 100 # in bytes
22
+
23
+ def initialize
24
+ @tiffs_generated = false
25
+ @pages_to_ocr = []
26
+ end
27
+
28
+ # Extract text from a list of PDFs.
7
29
  def extract(pdfs, opts)
8
30
  extract_options opts
9
- pdfs = [pdfs].flatten
10
- pdfs.each do |pdf|
11
- pdf_name = File.basename(pdf, File.extname(pdf))
12
- text_path = File.join(@output, "#{pdf_name}.txt")
13
- FileUtils.mkdir_p @output
14
-
15
- if @pages
16
- pages = (@pages == 'all') ? 1..get_pages(pdf) : @pages
17
- pages.each do |page|
18
- extract_page pdf, page, pdf_name
19
- end
31
+ FileUtils.mkdir_p @output unless File.exists?(@output)
32
+ [pdfs].flatten.each do |pdf|
33
+ @pdf_name = File.basename(pdf, File.extname(pdf))
34
+ pages = (@pages == 'all') ? 1..Docsplit.extract_length(pdf) : @pages
35
+ if @force_ocr || (!@forbid_ocr && !contains_text?(pdf))
36
+ extract_from_ocr(pdf, pages)
20
37
  else
21
- cmd = "pdftotext -enc UTF-8 #{pdf} #{text_path}"
22
- result = `#{cmd}`.chomp
23
- raise ExtractionFailed, result if $? != 0
38
+ extract_from_pdf(pdf, pages)
39
+ if !@forbid_ocr && DEPENDENCIES[:tesseract] && !@pages_to_ocr.empty?
40
+ extract_from_ocr(pdf, @pages_to_ocr)
41
+ end
42
+ end
43
+ end
44
+ FileUtils.remove_entry_secure @tempdir if @tempdir
45
+ end
46
+
47
+ # Does a PDF have any text embedded?
48
+ def contains_text?(pdf)
49
+ fonts = `pdffonts #{pdf} 2>&1`
50
+ !fonts.match(NO_TEXT_DETECTED)
51
+ end
52
+
53
+ # Extract a page range worth of text from a PDF, directly.
54
+ def extract_from_pdf(pdf, pages)
55
+ return extract_full(pdf) unless pages
56
+ pages.each {|page| extract_page(pdf, page) }
57
+ end
58
+
59
+ # Extract a page range worth of text from a PDF via OCR.
60
+ def extract_from_ocr(pdf, pages)
61
+ @tempdir ||= Dir.mktmpdir
62
+ base_path = File.join(@output, @pdf_name)
63
+ if pages
64
+ run "gm convert +adjoin #{OCR_FLAGS} #{pdf} #{@tempdir}/#{@pdf_name}_%d.tif 2>&1" unless @tiffs_generated
65
+ @tiffs_generated = true
66
+ pages.each do |page|
67
+ run "tesseract #{@tempdir}/#{@pdf_name}_#{page - 1}.tif #{base_path}_#{page} 2>&1"
24
68
  end
69
+ else
70
+ tiff = "#{@tempdir}/#{@pdf_name}.tif"
71
+ run "gm convert #{OCR_FLAGS} #{pdf} #{tiff} 2>&1"
72
+ run "tesseract #{tiff} #{base_path} -l eng 2>&1"
25
73
  end
26
74
  end
27
-
28
- def extract_page(pdf, page, pdf_name)
29
- text_path = File.join(@output, "#{pdf_name}_#{page}.txt")
30
- cmd = "pdftotext -enc UTF-8 -f #{page} -l #{page} #{pdf} #{text_path}"
31
- result = `#{cmd}`.chomp
75
+
76
+
77
+ private
78
+
79
+ # Run an external process and raise an exception if it fails.
80
+ def run(command)
81
+ result = `#{command}`
32
82
  raise ExtractionFailed, result if $? != 0
33
83
  result
34
84
  end
35
85
 
36
- def get_pages(pdf_path)
37
- info = `pdfinfo #{pdf_path}`
38
- raise ExtractionFailed, result if $? != 0
39
- match = info.match(PAGE_COUNT_MATCHER)
40
- raise ExtractionFailed if match.nil?
41
- match[1].to_i
86
+ # Extract the full contents of a pdf as a single file, directly.
87
+ def extract_full(pdf)
88
+ text_path = File.join(@output, "#{@pdf_name}.txt")
89
+ run "pdftotext -enc UTF-8 #{pdf} #{text_path} 2>&1"
42
90
  end
43
-
44
- private
45
-
91
+
92
+ # Extract the contents of a single page of text, directly, adding it to
93
+ # the `@pages_to_ocr` list if the text length is inadequate.
94
+ def extract_page(pdf, page)
95
+ text_path = File.join(@output, "#{@pdf_name}_#{page}.txt")
96
+ run "pdftotext -enc UTF-8 -f #{page} -l #{page} #{pdf} #{text_path} 2>&1"
97
+ unless @forbid_ocr
98
+ @pages_to_ocr.push(page) if File.read(text_path).length < MIN_TEXT_PER_PAGE
99
+ end
100
+ end
101
+
46
102
  def extract_options(options)
47
- @output = options[:output] || '.'
48
- @pages = options[:pages]
103
+ @output = options[:output] || '.'
104
+ @pages = options[:pages]
105
+ @force_ocr = options[:ocr] == true
106
+ @forbid_ocr = options[:ocr] == false
49
107
  end
50
108
 
51
109
  end
52
-
110
+
53
111
  end
metadata CHANGED
@@ -1,13 +1,12 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: docsplit
3
3
  version: !ruby/object:Gem::Version
4
- hash: 23
5
4
  prerelease: false
6
5
  segments:
7
6
  - 0
8
- - 2
7
+ - 3
9
8
  - 0
10
- version: 0.2.0
9
+ version: 0.3.0
11
10
  platform: ruby
12
11
  authors:
13
12
  - Jeremy Ashkenas
@@ -16,7 +15,7 @@ autorequire:
16
15
  bindir: bin
17
16
  cert_chain: []
18
17
 
19
- date: 2010-07-29 00:00:00 -04:00
18
+ date: 2010-08-05 00:00:00 -04:00
20
19
  default_executable:
21
20
  dependencies: []
22
21
 
@@ -29,27 +28,14 @@ extensions: []
29
28
  extra_rdoc_files: []
30
29
 
31
30
  files:
32
- - build/org/documentcloud/ExtractInfo$1.class
33
- - build/org/documentcloud/ExtractInfo$Keys.class
34
- - build/org/documentcloud/ExtractInfo.class
35
- - build/org/documentcloud/Extractor.class
36
- - build/org/documentcloud/ExtractPages.class
37
- - build/org/documentcloud/ExtractText.class
38
- - lib/docsplit/argument_parser.rb
39
31
  - lib/docsplit/command_line.rb
40
- - lib/docsplit/ExtractInfo.java
41
- - lib/docsplit/Extractor.java
42
- - lib/docsplit/ExtractPages.java
43
- - lib/docsplit/ExtractText.java
44
32
  - lib/docsplit/image_extractor.rb
33
+ - lib/docsplit/info_extractor.rb
34
+ - lib/docsplit/page_extractor.rb
45
35
  - lib/docsplit/text_extractor.rb
46
36
  - lib/docsplit/transparent_pdfs.rb
47
37
  - lib/docsplit.rb
48
38
  - bin/docsplit
49
- - vendor/bcmail.jar
50
- - vendor/bcprov.jar
51
- - vendor/commons-logging.jar
52
- - vendor/fontbox.jar
53
39
  - vendor/jodconverter/commons-cli-1.2.jar
54
40
  - vendor/jodconverter/commons-io-1.4.jar
55
41
  - vendor/jodconverter/jodconverter-2.2.2.jar
@@ -61,11 +47,10 @@ files:
61
47
  - vendor/jodconverter/slf4j-jdk14-1.5.6.jar
62
48
  - vendor/jodconverter/unoil-3.0.1.jar
63
49
  - vendor/logging.properties
64
- - vendor/pdfbox.jar
65
50
  - docsplit.gemspec
66
51
  - LICENSE
67
52
  - README
68
- has_rdoc: true
53
+ has_rdoc: false
69
54
  homepage: http://documentcloud.github.com/docsplit/
70
55
  licenses: []
71
56
 
@@ -75,27 +60,23 @@ rdoc_options: []
75
60
  require_paths:
76
61
  - lib
77
62
  required_ruby_version: !ruby/object:Gem::Requirement
78
- none: false
79
63
  requirements:
80
64
  - - ">="
81
65
  - !ruby/object:Gem::Version
82
- hash: 3
83
66
  segments:
84
67
  - 0
85
68
  version: "0"
86
69
  required_rubygems_version: !ruby/object:Gem::Requirement
87
- none: false
88
70
  requirements:
89
71
  - - ">="
90
72
  - !ruby/object:Gem::Version
91
- hash: 3
92
73
  segments:
93
74
  - 0
94
75
  version: "0"
95
76
  requirements: []
96
77
 
97
78
  rubyforge_project: docsplit
98
- rubygems_version: 1.3.7
79
+ rubygems_version: 1.3.6
99
80
  signing_key:
100
81
  specification_version: 3
101
82
  summary: Break Apart Documents into Images, Text, Pages and PDFs
@@ -1,63 +0,0 @@
1
- package org.documentcloud;
2
-
3
- import java.util.List;
4
- import java.io.IOException;
5
- import java.text.SimpleDateFormat;
6
-
7
- import org.apache.pdfbox.pdmodel.PDDocument;
8
- import org.apache.pdfbox.pdmodel.PDDocumentInformation;
9
-
10
- // Extracts metadata from a PDF file.
11
- public class ExtractInfo extends Extractor {
12
-
13
- private PDDocument doc;
14
- private PDDocumentInformation info;
15
- private String key;
16
-
17
- // The list of metadata keys we know how to extract.
18
- private enum Keys {
19
- AUTHOR, DATE, CREATOR, KEYWORDS, PRODUCER, SUBJECT, TITLE, LENGTH
20
- }
21
-
22
- // The mainline.
23
- public static void main(String[] args) {
24
- (new ExtractInfo()).run(args);
25
- }
26
-
27
- // The first argument is always the name of the metadata key.
28
- protected void parseArguments(List<String> args) {
29
- super.parseArguments(args);
30
- key = args.remove(0).toUpperCase();
31
- }
32
-
33
- // Extract the configured bit of metadata from a PDF, decrypting if necessary.
34
- public void extract(String pdfPath) {
35
- try {
36
- doc = PDDocument.load(pdfPath, false);
37
- decrypt(doc);
38
- info = doc.getDocumentInformation();
39
- String val = extractInfo();
40
- if (val != null) System.out.println(val);
41
- doc.close();
42
- } catch(IOException e) {
43
- System.out.println(e.getMessage());
44
- System.exit(1);
45
- }
46
- }
47
-
48
- // Use the PDDocumentInformation object to fetch metadata values as strings.
49
- public String extractInfo() throws IOException {
50
- switch(Keys.valueOf(key)) {
51
- case AUTHOR: return info.getAuthor();
52
- case DATE: return new SimpleDateFormat("yyyy-MM-dd").format(info.getCreationDate().getTime());
53
- case CREATOR: return info.getCreator();
54
- case KEYWORDS: return info.getKeywords();
55
- case PRODUCER: return info.getProducer();
56
- case SUBJECT: return info.getSubject();
57
- case TITLE: return info.getTitle();
58
- case LENGTH: return String.valueOf(doc.getNumberOfPages());
59
- default: return null;
60
- }
61
- }
62
-
63
- }
@@ -1,54 +0,0 @@
1
- package org.documentcloud;
2
-
3
- import java.util.List;
4
- import java.io.File;
5
- import java.io.FileOutputStream;
6
- import java.io.IOException;
7
-
8
- import org.apache.pdfbox.pdmodel.PDDocument;
9
- import org.apache.pdfbox.util.Splitter;
10
- import org.apache.pdfbox.pdfwriter.COSWriter;
11
- import org.apache.pdfbox.exceptions.COSVisitorException;
12
-
13
- // Use PDFBox's Splitter to break apart a large PDF into individual pages.
14
- public class ExtractPages extends Extractor {
15
-
16
- private PDDocument doc;
17
- private String basename;
18
-
19
- // The mainline.
20
- public static void main(String[] args) {
21
- (new ExtractPages()).run(args);
22
- }
23
-
24
- // Extract each page of the given PDF.
25
- public void extract(String pdfPath) {
26
- try {
27
- basename = getBasename(pdfPath);
28
- doc = PDDocument.load(pdfPath);
29
- decrypt(doc);
30
- List pages = (new Splitter()).split(doc);
31
- if (pageNumbers != null) {
32
- for (Integer num : pageNumbers) writePage((PDDocument) pages.get(num.intValue()- 1), num.intValue());
33
- } else {
34
- for (int i=0; i<pages.size(); i++) writePage((PDDocument) pages.get(i), i + 1);
35
- }
36
- doc.close();
37
- } catch(Exception e) {
38
- System.out.println(e.getMessage());
39
- System.exit(1);
40
- }
41
- }
42
-
43
- // Writes out a page as a single-page PDF.
44
- private void writePage(PDDocument page, int pageNumber) throws IOException, COSVisitorException {
45
- String pageName = basename + "_" + String.valueOf(pageNumber) + ".pdf";
46
- FileOutputStream out = new FileOutputStream(outputFile(pageName));
47
- COSWriter writer = new COSWriter(out);
48
- writer.write(page);
49
- out.close();
50
- writer.close();
51
- page.close();
52
- }
53
-
54
- }
@@ -1,80 +0,0 @@
1
- package org.documentcloud;
2
-
3
- import java.util.List;
4
- import java.io.File;
5
- import java.io.FileOutputStream;
6
- import java.io.IOException;
7
- import java.io.OutputStreamWriter;
8
-
9
- import org.apache.pdfbox.pdmodel.PDDocument;
10
- import org.apache.pdfbox.util.PDFTextStripper;
11
-
12
- // Uses PDFBox's PDFTextStripper to extract the full, plain, UTF-8 text of a
13
- // PDF document. Pass --pages to write out the plain text for each individual
14
- // page; --pages-only to omit the text for the entire document.
15
- public class ExtractText extends Extractor {
16
-
17
- private PDDocument doc;
18
- private String basename;
19
-
20
- // The mainline.
21
- public static void main(String[] args) {
22
- (new ExtractText()).run(args);
23
- }
24
-
25
- // Extract the plain text for a PDF, and write it into the requested output
26
- // sizes.
27
- public void extract(String pdfPath) {
28
- try {
29
- basename = getBasename(pdfPath);
30
- doc = PDDocument.load(pdfPath, false);
31
- decrypt(doc);
32
- if (allPages || (pageNumbers != null)) {
33
- writePageText();
34
- } else {
35
- writeFullText();
36
- }
37
- doc.close();
38
- } catch(IOException e) {
39
- System.out.println(e.getMessage());
40
- System.exit(1);
41
- }
42
- }
43
-
44
- // Write out the extracted full text for the entire PDF.
45
- public void writeFullText() throws IOException {
46
- OutputStreamWriter output = new OutputStreamWriter(new FileOutputStream(outputFile(basename + ".txt")), "UTF-8");
47
- extractTextForPageRange(output, 1, Integer.MAX_VALUE);
48
- output.close();
49
- }
50
-
51
- // Write out the full text for each specified page.
52
- public void writePageText() throws IOException {
53
- if (pageNumbers != null) {
54
- for (Integer num : pageNumbers) writePageText(num.intValue());
55
- } else {
56
- int pages = doc.getNumberOfPages();
57
- for (int i=1; i<=pages; i++) writePageText(i);
58
- }
59
- }
60
-
61
- // Write out the full text for a single page.
62
- public void writePageText(int pageNumber) throws IOException {
63
- File outfile = outputFile(basename + "_" + String.valueOf(pageNumber) + ".txt");
64
- OutputStreamWriter output = new OutputStreamWriter(new FileOutputStream(outfile), "UTF-8");
65
- extractTextForPageRange(output, pageNumber, pageNumber);
66
- output.close();
67
- }
68
-
69
- // Internal method that writes out text from the PDF for a given page range
70
- // to a provided output stream.
71
- private void extractTextForPageRange(OutputStreamWriter output, int startPage, int endPage) throws IOException {
72
- PDFTextStripper stripper = new PDFTextStripper("UTF-8");
73
- stripper.setSortByPosition(false);
74
- stripper.setShouldSeparateByBeads(true);
75
- stripper.setStartPage(startPage);
76
- stripper.setEndPage(endPage);
77
- stripper.writeText(doc, output);
78
- }
79
-
80
- }
@@ -1,91 +0,0 @@
1
- package org.documentcloud;
2
-
3
- import java.io.File;
4
- import java.util.List;
5
- import java.util.Arrays;
6
- import java.util.ArrayList;
7
- import java.util.Iterator;
8
-
9
- import org.apache.pdfbox.pdmodel.PDDocument;
10
-
11
- // The base Extractor class contains the common functionality needed to run
12
- // command-line extractors.
13
- public abstract class Extractor {
14
-
15
- protected File output;
16
- protected boolean allPages = false;
17
- protected ArrayList<Integer> pageNumbers;
18
-
19
- // Running an extractor consists of converting the arguments array into a
20
- // more manageable List, parsing arguments, and extracting pdfs.
21
- public void run(String[] arguments) {
22
- List<String> args = new ArrayList<String>(Arrays.asList(arguments));
23
- parseArguments(args);
24
- Iterator<String> iter = args.iterator();
25
- while(iter.hasNext()) extract(iter.next());
26
- }
27
-
28
- // Subclasses must override "extract" to perform their specific extraction.
29
- public abstract void extract(String pdfPath);
30
-
31
- // The default "parseArguments" method handles common arguments.
32
- protected void parseArguments(List<String> args) {
33
- int dirLoc = args.indexOf("--output");
34
- if (dirLoc >= 0) {
35
- output = new File(args.remove(dirLoc + 1));
36
- args.remove(dirLoc);
37
- }
38
- int pagesLoc = args.indexOf("--pages");
39
- if (pagesLoc >= 0) {
40
- parsePages(args.remove(pagesLoc + 1));
41
- args.remove(pagesLoc);
42
- }
43
- }
44
-
45
- // Utility function to get the basename of a file path.
46
- // After File.basename in Ruby.
47
- public String getBasename(String pdfPath) {
48
- String basename = new File(pdfPath).getName();
49
- return basename.substring(0, basename.lastIndexOf('.'));
50
- }
51
-
52
- // Get a reference to an output file, placed inside any configured directories,
53
- // while ensuring that parent directories exist.
54
- public File outputFile(String path) {
55
- File file = output != null ? new File(output, path) : new File(path);
56
- File parent = file.getParentFile();
57
- if (parent != null) parent.mkdirs();
58
- return file;
59
- }
60
-
61
- // Decrypt a non-passworded but still encrypted document.
62
- public void decrypt(PDDocument doc) {
63
- if (!doc.isEncrypted()) return;
64
- try {
65
- doc.decrypt("");
66
- } catch (Exception e) {
67
- System.out.println("Error decrypting document, details: " + e.getMessage());
68
- System.exit(1);
69
- }
70
- }
71
-
72
- private void parsePages(String pageList) {
73
- if (pageList.equals("all")) {
74
- allPages = true;
75
- return;
76
- }
77
- pageNumbers = new ArrayList<Integer>();
78
- String[] groups = pageList.split(",");
79
- for (String group : groups) {
80
- if (group.contains("-")) {
81
- String[] range = group.split("-");
82
- int start = Integer.parseInt(range[0]);
83
- int end = Integer.parseInt(range[1]);
84
- for (int i=start; i<=end; i++) pageNumbers.add(new Integer(i));
85
- } else {
86
- pageNumbers.add(new Integer(Integer.parseInt(group)));
87
- }
88
- }
89
- }
90
-
91
- }
@@ -1,31 +0,0 @@
1
- module Docsplit
2
-
3
- module ArgumentParser
4
-
5
- # Flatten an options hash into an arguments string suitable for the command
6
- # line.
7
- def parse_options(opts)
8
- opts.map {|k, v| ["--#{k}", normalize_value(v)] }.flatten.join(' ')
9
- end
10
-
11
- # Normalize a value in an options hash for the command line.
12
- # Ranges look like: 1-10, Arrays like: 1,2,3.
13
- def normalize_value(value)
14
- case value
15
- when Range then normalize_range(value)
16
- when Array then value.map! {|v| v.is_a?(Range) ? normalize_range(v) : v }.join(',')
17
- else value.to_s
18
- end
19
- end
20
-
21
- # Serialize a Ruby range into its command-line equivalent.
22
- def normalize_range(range)
23
- arr = range.to_a
24
- arr.empty? ? range.first.to_s : "#{range.first}-#{arr.last}"
25
- end
26
-
27
- end
28
-
29
- extend ArgumentParser
30
-
31
- end
data/vendor/bcmail.jar DELETED
Binary file
data/vendor/bcprov.jar DELETED
Binary file
Binary file
data/vendor/fontbox.jar DELETED
Binary file
data/vendor/pdfbox.jar DELETED
Binary file