docsplit 0.6.0 → 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/docsplit.gemspec +3 -3
- data/lib/docsplit.rb +8 -5
- data/lib/docsplit/text_extractor.rb +3 -2
- data/vendor/conf/document-formats.js +0 -3
- data/vendor/jodconverter/jodconverter-core-3.0-beta-4.jar +0 -0
- data/vendor/jodconverter/json-20090211.jar +0 -0
- data/vendor/jodconverter/juh-3.2.1.jar +0 -0
- data/vendor/jodconverter/jurt-3.2.1.jar +0 -0
- data/vendor/jodconverter/ridl-3.2.1.jar +0 -0
- data/vendor/jodconverter/unoil-3.2.1.jar +0 -0
- metadata +14 -16
- data/vendor/jodconverter/jodconverter-core-3.0-beta-3.jar +0 -0
- data/vendor/jodconverter/json-20080701.jar +0 -0
- data/vendor/jodconverter/juh-3.1.0.jar +0 -0
- data/vendor/jodconverter/jurt-3.1.0.jar +0 -0
- data/vendor/jodconverter/ridl-3.1.0.jar +0 -0
- data/vendor/jodconverter/unoil-3.1.0.jar +0 -0
data/docsplit.gemspec
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
Gem::Specification.new do |s|
|
|
2
2
|
s.name = 'docsplit'
|
|
3
|
-
s.version = '0.6.0'
|
|
4
|
-
s.date = '2011-
|
|
3
|
+
s.version = '0.6.1' # Keep version in sync with docsplit.rb
|
|
4
|
+
s.date = '2011-11-18'
|
|
5
5
|
|
|
6
6
|
s.homepage = "http://documentcloud.github.com/docsplit/"
|
|
7
7
|
s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
|
|
@@ -12,7 +12,7 @@ Gem::Specification.new do |s|
|
|
|
12
12
|
metadata (title, author, number of pages...)
|
|
13
13
|
EOS
|
|
14
14
|
|
|
15
|
-
s.authors = ['Jeremy Ashkenas', 'Samuel Clay']
|
|
15
|
+
s.authors = ['Jeremy Ashkenas', 'Samuel Clay', 'Ted Han']
|
|
16
16
|
s.email = 'jeremy@documentcloud.org'
|
|
17
17
|
s.rubyforge_project = 'docsplit'
|
|
18
18
|
|
data/lib/docsplit.rb
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
# The Docsplit module delegates to the Java PDF extractors.
|
|
2
2
|
module Docsplit
|
|
3
3
|
|
|
4
|
-
VERSION = '0.6.0'
|
|
4
|
+
VERSION = '0.6.1' # Keep in sync with gemspec.
|
|
5
5
|
|
|
6
6
|
ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
|
|
7
7
|
|
|
@@ -11,11 +11,14 @@ module Docsplit
|
|
|
11
11
|
|
|
12
12
|
HEADLESS = "-Djava.awt.headless=true"
|
|
13
13
|
|
|
14
|
-
|
|
14
|
+
office ||= "/usr/lib/openoffice" if File.exists? '/usr/lib/openoffice'
|
|
15
|
+
office ||= "/usr/lib/libreoffice" if File.exists? '/usr/lib/libreoffice'
|
|
16
|
+
|
|
17
|
+
OFFICE = RUBY_PLATFORM.match(/darwin/i) ? '' : "-Doffice.home=#{office}"
|
|
15
18
|
|
|
16
19
|
METADATA_KEYS = [:author, :date, :creator, :keywords, :producer, :subject, :title, :length]
|
|
17
20
|
|
|
18
|
-
GM_FORMATS = [
|
|
21
|
+
GM_FORMATS = ["image/gif", "image/jpeg", "image/png", "image/x-ms-bmp", "image/svg+xml", "image/tiff", "image/x-portable-bitmap", "application/postscript", "image/x-portable-pixmap"]
|
|
19
22
|
|
|
20
23
|
DEPENDENCIES = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :tesseract => false}
|
|
21
24
|
|
|
@@ -65,10 +68,10 @@ module Docsplit
|
|
|
65
68
|
basename = File.basename(doc, ext)
|
|
66
69
|
escaped_doc, escaped_out, escaped_basename = [doc, out, basename].map(&ESCAPE)
|
|
67
70
|
|
|
68
|
-
if
|
|
71
|
+
if GM_FORMATS.include?(`file -b --mime #{doc}`.strip.split(/[:;]\s+/)[0])
|
|
69
72
|
`gm convert #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf`
|
|
70
73
|
else
|
|
71
|
-
options = "-jar #{ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-
|
|
74
|
+
options = "-jar #{ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-4.jar -r #{ROOT}/vendor/conf/document-formats.js"
|
|
72
75
|
run "#{options} #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf", [], {}
|
|
73
76
|
end
|
|
74
77
|
end
|
|
@@ -66,7 +66,7 @@ module Docsplit
|
|
|
66
66
|
escaped_tiff = ESCAPE[tiff]
|
|
67
67
|
file = "#{base_path}_#{page}"
|
|
68
68
|
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
|
|
69
|
-
run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l
|
|
69
|
+
run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} 2>&1"
|
|
70
70
|
clean_text(file + '.txt') if @clean_ocr
|
|
71
71
|
FileUtils.remove_entry_secure tiff
|
|
72
72
|
end
|
|
@@ -74,7 +74,7 @@ module Docsplit
|
|
|
74
74
|
tiff = "#{tempdir}/#{@pdf_name}.tif"
|
|
75
75
|
escaped_tiff = ESCAPE[tiff]
|
|
76
76
|
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
|
|
77
|
-
run "tesseract #{escaped_tiff} #{base_path} -l
|
|
77
|
+
run "tesseract #{escaped_tiff} #{base_path} -l #{@language} 2>&1"
|
|
78
78
|
clean_text(base_path + '.txt') if @clean_ocr
|
|
79
79
|
end
|
|
80
80
|
ensure
|
|
@@ -122,6 +122,7 @@ module Docsplit
|
|
|
122
122
|
@force_ocr = options[:ocr] == true
|
|
123
123
|
@forbid_ocr = options[:ocr] == false
|
|
124
124
|
@clean_ocr = !(options[:clean] == false)
|
|
125
|
+
@language = options[:language] || 'eng'
|
|
125
126
|
end
|
|
126
127
|
|
|
127
128
|
end
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
metadata
CHANGED
|
@@ -1,22 +1,23 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: docsplit
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
|
|
5
|
-
prerelease:
|
|
4
|
+
prerelease: false
|
|
6
5
|
segments:
|
|
7
6
|
- 0
|
|
8
7
|
- 6
|
|
9
|
-
-
|
|
10
|
-
version: 0.6.0
|
|
8
|
+
- 1
|
|
9
|
+
version: 0.6.1
|
|
11
10
|
platform: ruby
|
|
12
11
|
authors:
|
|
13
12
|
- Jeremy Ashkenas
|
|
14
13
|
- Samuel Clay
|
|
14
|
+
- Ted Han
|
|
15
15
|
autorequire:
|
|
16
16
|
bindir: bin
|
|
17
17
|
cert_chain: []
|
|
18
18
|
|
|
19
|
-
date: 2011-
|
|
19
|
+
date: 2011-11-18 00:00:00 -06:00
|
|
20
|
+
default_executable:
|
|
20
21
|
dependencies: []
|
|
21
22
|
|
|
22
23
|
description: " Docsplit is a command-line utility and Ruby library for splitting apart\n documents into their component parts: searchable UTF-8 plain text, page\n images or thumbnails in any format, PDFs, single pages, and document\n metadata (title, author, number of pages...)\n"
|
|
@@ -40,16 +41,17 @@ files:
|
|
|
40
41
|
- vendor/conf/document-formats.js
|
|
41
42
|
- vendor/jodconverter/commons-cli-1.1.jar
|
|
42
43
|
- vendor/jodconverter/commons-io-1.4.jar
|
|
43
|
-
- vendor/jodconverter/jodconverter-core-3.0-beta-3.jar
|
|
44
|
-
- vendor/jodconverter/json-20080701.jar
|
|
45
|
-
- vendor/jodconverter/juh-3.1.0.jar
|
|
46
|
-
- vendor/jodconverter/jurt-3.1.0.jar
|
|
47
|
-
- vendor/jodconverter/ridl-3.1.0.jar
|
|
48
|
-
- vendor/jodconverter/unoil-3.1.0.jar
|
|
44
|
+
- vendor/jodconverter/jodconverter-core-3.0-beta-4.jar
|
|
45
|
+
- vendor/jodconverter/json-20090211.jar
|
|
46
|
+
- vendor/jodconverter/juh-3.2.1.jar
|
|
47
|
+
- vendor/jodconverter/jurt-3.2.1.jar
|
|
48
|
+
- vendor/jodconverter/ridl-3.2.1.jar
|
|
49
|
+
- vendor/jodconverter/unoil-3.2.1.jar
|
|
49
50
|
- vendor/logging.properties
|
|
50
51
|
- docsplit.gemspec
|
|
51
52
|
- LICENSE
|
|
52
53
|
- README
|
|
54
|
+
has_rdoc: true
|
|
53
55
|
homepage: http://documentcloud.github.com/docsplit/
|
|
54
56
|
licenses: []
|
|
55
57
|
|
|
@@ -59,27 +61,23 @@ rdoc_options: []
|
|
|
59
61
|
require_paths:
|
|
60
62
|
- lib
|
|
61
63
|
required_ruby_version: !ruby/object:Gem::Requirement
|
|
62
|
-
none: false
|
|
63
64
|
requirements:
|
|
64
65
|
- - ">="
|
|
65
66
|
- !ruby/object:Gem::Version
|
|
66
|
-
hash: 3
|
|
67
67
|
segments:
|
|
68
68
|
- 0
|
|
69
69
|
version: "0"
|
|
70
70
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
71
|
-
none: false
|
|
72
71
|
requirements:
|
|
73
72
|
- - ">="
|
|
74
73
|
- !ruby/object:Gem::Version
|
|
75
|
-
hash: 3
|
|
76
74
|
segments:
|
|
77
75
|
- 0
|
|
78
76
|
version: "0"
|
|
79
77
|
requirements: []
|
|
80
78
|
|
|
81
79
|
rubyforge_project: docsplit
|
|
82
|
-
rubygems_version: 1.
|
|
80
|
+
rubygems_version: 1.3.6
|
|
83
81
|
signing_key:
|
|
84
82
|
specification_version: 3
|
|
85
83
|
summary: Break Apart Documents into Images, Text, Pages and PDFs
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|