docsplit 0.3.1 → 0.3.3
Sign up to get free protection for your applications and to get access to all the features.
- data/docsplit.gemspec +2 -2
- data/lib/docsplit.rb +1 -1
- data/lib/docsplit/image_extractor.rb +4 -2
- data/lib/docsplit/text_extractor.rb +6 -6
- data/lib/docsplit/transparent_pdfs.rb +3 -3
- metadata +9 -4
data/docsplit.gemspec
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
Gem::Specification.new do |s|
|
2
2
|
s.name = 'docsplit'
|
3
|
-
s.version = '0.3.1'
|
4
|
-
s.date = '2010-8-
|
3
|
+
s.version = '0.3.3' # Keep version in sync with docsplit.rb
|
4
|
+
s.date = '2010-8-17'
|
5
5
|
|
6
6
|
s.homepage = "http://documentcloud.github.com/docsplit/"
|
7
7
|
s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
|
data/lib/docsplit.rb
CHANGED
@@ -24,6 +24,7 @@ module Docsplit
|
|
24
24
|
|
25
25
|
# Convert a single PDF into page images at the specified size and format.
|
26
26
|
def convert(pdf, size, format, previous=nil)
|
27
|
+
tempdir = Dir.mktmpdir
|
27
28
|
basename = File.basename(pdf, File.extname(pdf))
|
28
29
|
directory = directory_for(size)
|
29
30
|
FileUtils.mkdir_p(directory) unless File.exists?(directory)
|
@@ -31,13 +32,14 @@ module Docsplit
|
|
31
32
|
common = "#{MEMORY_ARGS} #{DENSITY_ARG} #{resize_arg(size)} #{quality_arg(format)}"
|
32
33
|
if previous
|
33
34
|
FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
|
34
|
-
cmd = "OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1"
|
35
|
+
cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1"
|
35
36
|
else
|
36
|
-
cmd = "OMP_NUM_THREADS=2 gm convert +adjoin #{common} \"#{pdf}#{pages_arg}\" \"#{out_file}\" 2>&1"
|
37
|
+
cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{common} \"#{pdf}#{pages_arg}\" \"#{out_file}\" 2>&1"
|
37
38
|
end
|
38
39
|
result = `#{cmd}`.chomp
|
39
40
|
raise ExtractionFailed, result if $? != 0
|
40
41
|
renumber_images(out_file, format)
|
42
|
+
FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
|
41
43
|
end
|
42
44
|
|
43
45
|
|
@@ -42,7 +42,6 @@ module Docsplit
|
|
42
42
|
end
|
43
43
|
end
|
44
44
|
end
|
45
|
-
FileUtils.remove_entry_secure @tempdir if @tempdir
|
46
45
|
end
|
47
46
|
|
48
47
|
# Does a PDF have any text embedded?
|
@@ -59,19 +58,20 @@ module Docsplit
|
|
59
58
|
|
60
59
|
# Extract a page range worth of text from a PDF via OCR.
|
61
60
|
def extract_from_ocr(pdf, pages)
|
62
|
-
|
61
|
+
tempdir = Dir.mktmpdir
|
63
62
|
base_path = File.join(@output, @pdf_name)
|
64
63
|
if pages
|
65
|
-
run "OMP_NUM_THREADS=2 gm convert +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{
|
64
|
+
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{tempdir}/#{@pdf_name}_%d.tif 2>&1" unless @tiffs_generated
|
66
65
|
@tiffs_generated = true
|
67
66
|
pages.each do |page|
|
68
|
-
run "tesseract #{
|
67
|
+
run "tesseract #{tempdir}/#{@pdf_name}_#{page - 1}.tif #{base_path}_#{page} 2>&1"
|
69
68
|
end
|
70
69
|
else
|
71
|
-
tiff = "#{
|
72
|
-
run "OMP_NUM_THREADS=2 gm convert #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{tiff} 2>&1"
|
70
|
+
tiff = "#{tempdir}/#{@pdf_name}.tif"
|
71
|
+
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{tiff} 2>&1"
|
73
72
|
run "tesseract #{tiff} #{base_path} -l eng 2>&1"
|
74
73
|
end
|
74
|
+
FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
|
75
75
|
end
|
76
76
|
|
77
77
|
|
@@ -12,9 +12,9 @@ module Docsplit
|
|
12
12
|
if ext.downcase == '.pdf'
|
13
13
|
doc
|
14
14
|
else
|
15
|
-
|
16
|
-
extract_pdf([doc], {:output =>
|
17
|
-
File.join(
|
15
|
+
tempdir = File.join(Dir.tmpdir, 'docsplit')
|
16
|
+
extract_pdf([doc], {:output => tempdir})
|
17
|
+
File.join(tempdir, File.basename(doc, ext) + '.pdf')
|
18
18
|
end
|
19
19
|
end
|
20
20
|
end
|
metadata
CHANGED
@@ -1,12 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: docsplit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
+
hash: 21
|
4
5
|
prerelease: false
|
5
6
|
segments:
|
6
7
|
- 0
|
7
8
|
- 3
|
8
|
-
- 1
|
9
|
-
version: 0.3.1
|
9
|
+
- 3
|
10
|
+
version: 0.3.3
|
10
11
|
platform: ruby
|
11
12
|
authors:
|
12
13
|
- Jeremy Ashkenas
|
@@ -15,7 +16,7 @@ autorequire:
|
|
15
16
|
bindir: bin
|
16
17
|
cert_chain: []
|
17
18
|
|
18
|
-
date: 2010-08-
|
19
|
+
date: 2010-08-17 00:00:00 -04:00
|
19
20
|
default_executable:
|
20
21
|
dependencies: []
|
21
22
|
|
@@ -60,23 +61,27 @@ rdoc_options: []
|
|
60
61
|
require_paths:
|
61
62
|
- lib
|
62
63
|
required_ruby_version: !ruby/object:Gem::Requirement
|
64
|
+
none: false
|
63
65
|
requirements:
|
64
66
|
- - ">="
|
65
67
|
- !ruby/object:Gem::Version
|
68
|
+
hash: 3
|
66
69
|
segments:
|
67
70
|
- 0
|
68
71
|
version: "0"
|
69
72
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
70
74
|
requirements:
|
71
75
|
- - ">="
|
72
76
|
- !ruby/object:Gem::Version
|
77
|
+
hash: 3
|
73
78
|
segments:
|
74
79
|
- 0
|
75
80
|
version: "0"
|
76
81
|
requirements: []
|
77
82
|
|
78
83
|
rubyforge_project: docsplit
|
79
|
-
rubygems_version: 1.3.
|
84
|
+
rubygems_version: 1.3.7
|
80
85
|
signing_key:
|
81
86
|
specification_version: 3
|
82
87
|
summary: Break Apart Documents into Images, Text, Pages and PDFs
|