docsplit 0.3.1 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/docsplit.gemspec +2 -2
- data/lib/docsplit.rb +1 -1
- data/lib/docsplit/image_extractor.rb +4 -2
- data/lib/docsplit/text_extractor.rb +6 -6
- data/lib/docsplit/transparent_pdfs.rb +3 -3
- metadata +9 -4
data/docsplit.gemspec
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
Gem::Specification.new do |s|
|
2
2
|
s.name = 'docsplit'
|
3
|
-
s.version = '0.3.1'
|
4
|
-
s.date = '2010-8-
|
3
|
+
s.version = '0.3.3' # Keep version in sync with docsplit.rb
|
4
|
+
s.date = '2010-8-17'
|
5
5
|
|
6
6
|
s.homepage = "http://documentcloud.github.com/docsplit/"
|
7
7
|
s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
|
data/lib/docsplit.rb
CHANGED
@@ -24,6 +24,7 @@ module Docsplit
|
|
24
24
|
|
25
25
|
# Convert a single PDF into page images at the specified size and format.
|
26
26
|
def convert(pdf, size, format, previous=nil)
|
27
|
+
tempdir = Dir.mktmpdir
|
27
28
|
basename = File.basename(pdf, File.extname(pdf))
|
28
29
|
directory = directory_for(size)
|
29
30
|
FileUtils.mkdir_p(directory) unless File.exists?(directory)
|
@@ -31,13 +32,14 @@ module Docsplit
|
|
31
32
|
common = "#{MEMORY_ARGS} #{DENSITY_ARG} #{resize_arg(size)} #{quality_arg(format)}"
|
32
33
|
if previous
|
33
34
|
FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
|
34
|
-
cmd = "OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1"
|
35
|
+
cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1"
|
35
36
|
else
|
36
|
-
cmd = "OMP_NUM_THREADS=2 gm convert +adjoin #{common} \"#{pdf}#{pages_arg}\" \"#{out_file}\" 2>&1"
|
37
|
+
cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{common} \"#{pdf}#{pages_arg}\" \"#{out_file}\" 2>&1"
|
37
38
|
end
|
38
39
|
result = `#{cmd}`.chomp
|
39
40
|
raise ExtractionFailed, result if $? != 0
|
40
41
|
renumber_images(out_file, format)
|
42
|
+
FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
|
41
43
|
end
|
42
44
|
|
43
45
|
|
@@ -42,7 +42,6 @@ module Docsplit
|
|
42
42
|
end
|
43
43
|
end
|
44
44
|
end
|
45
|
-
FileUtils.remove_entry_secure @tempdir if @tempdir
|
46
45
|
end
|
47
46
|
|
48
47
|
# Does a PDF have any text embedded?
|
@@ -59,19 +58,20 @@ module Docsplit
|
|
59
58
|
|
60
59
|
# Extract a page range worth of text from a PDF via OCR.
|
61
60
|
def extract_from_ocr(pdf, pages)
|
62
|
-
|
61
|
+
tempdir = Dir.mktmpdir
|
63
62
|
base_path = File.join(@output, @pdf_name)
|
64
63
|
if pages
|
65
|
-
run "OMP_NUM_THREADS=2 gm convert +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{
|
64
|
+
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{tempdir}/#{@pdf_name}_%d.tif 2>&1" unless @tiffs_generated
|
66
65
|
@tiffs_generated = true
|
67
66
|
pages.each do |page|
|
68
|
-
run "tesseract #{
|
67
|
+
run "tesseract #{tempdir}/#{@pdf_name}_#{page - 1}.tif #{base_path}_#{page} 2>&1"
|
69
68
|
end
|
70
69
|
else
|
71
|
-
tiff = "#{
|
72
|
-
run "OMP_NUM_THREADS=2 gm convert #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{tiff} 2>&1"
|
70
|
+
tiff = "#{tempdir}/#{@pdf_name}.tif"
|
71
|
+
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{tiff} 2>&1"
|
73
72
|
run "tesseract #{tiff} #{base_path} -l eng 2>&1"
|
74
73
|
end
|
74
|
+
FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
|
75
75
|
end
|
76
76
|
|
77
77
|
|
@@ -12,9 +12,9 @@ module Docsplit
|
|
12
12
|
if ext.downcase == '.pdf'
|
13
13
|
doc
|
14
14
|
else
|
15
|
-
|
16
|
-
extract_pdf([doc], {:output =>
|
17
|
-
File.join(
|
15
|
+
tempdir = File.join(Dir.tmpdir, 'docsplit')
|
16
|
+
extract_pdf([doc], {:output => tempdir})
|
17
|
+
File.join(tempdir, File.basename(doc, ext) + '.pdf')
|
18
18
|
end
|
19
19
|
end
|
20
20
|
end
|
metadata
CHANGED
@@ -1,12 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: docsplit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
+
hash: 21
|
4
5
|
prerelease: false
|
5
6
|
segments:
|
6
7
|
- 0
|
7
8
|
- 3
|
8
|
-
- 1
|
9
|
-
version: 0.3.1
|
9
|
+
- 3
|
10
|
+
version: 0.3.3
|
10
11
|
platform: ruby
|
11
12
|
authors:
|
12
13
|
- Jeremy Ashkenas
|
@@ -15,7 +16,7 @@ autorequire:
|
|
15
16
|
bindir: bin
|
16
17
|
cert_chain: []
|
17
18
|
|
18
|
-
date: 2010-08-
|
19
|
+
date: 2010-08-17 00:00:00 -04:00
|
19
20
|
default_executable:
|
20
21
|
dependencies: []
|
21
22
|
|
@@ -60,23 +61,27 @@ rdoc_options: []
|
|
60
61
|
require_paths:
|
61
62
|
- lib
|
62
63
|
required_ruby_version: !ruby/object:Gem::Requirement
|
64
|
+
none: false
|
63
65
|
requirements:
|
64
66
|
- - ">="
|
65
67
|
- !ruby/object:Gem::Version
|
68
|
+
hash: 3
|
66
69
|
segments:
|
67
70
|
- 0
|
68
71
|
version: "0"
|
69
72
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
70
74
|
requirements:
|
71
75
|
- - ">="
|
72
76
|
- !ruby/object:Gem::Version
|
77
|
+
hash: 3
|
73
78
|
segments:
|
74
79
|
- 0
|
75
80
|
version: "0"
|
76
81
|
requirements: []
|
77
82
|
|
78
83
|
rubyforge_project: docsplit
|
79
|
-
rubygems_version: 1.3.
|
84
|
+
rubygems_version: 1.3.7
|
80
85
|
signing_key:
|
81
86
|
specification_version: 3
|
82
87
|
summary: Break Apart Documents into Images, Text, Pages and PDFs
|