docsplit 0.1.3 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +3 -0
- data/docsplit.gemspec +3 -3
- data/lib/docsplit.rb +3 -2
- data/lib/docsplit/text_extractor.rb +53 -0
- metadata +13 -6
data/LICENSE
CHANGED
data/docsplit.gemspec
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
Gem::Specification.new do |s|
|
2
2
|
s.name = 'docsplit'
|
3
|
-
s.version = '0.1.3'
|
4
|
-
s.date = '2010-
|
3
|
+
s.version = '0.2.0' # Keep version in sync with docsplit.rb
|
4
|
+
s.date = '2010-7-29'
|
5
5
|
|
6
6
|
s.homepage = "http://documentcloud.github.com/docsplit/"
|
7
7
|
s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
|
@@ -12,7 +12,7 @@ Gem::Specification.new do |s|
|
|
12
12
|
metadata (title, author, number of pages...)
|
13
13
|
EOS
|
14
14
|
|
15
|
-
s.authors = ['Jeremy Ashkenas']
|
15
|
+
s.authors = ['Jeremy Ashkenas', 'Samuel Clay']
|
16
16
|
s.email = 'jeremy@documentcloud.org'
|
17
17
|
s.rubyforge_project = 'docsplit'
|
18
18
|
s.has_rdoc = false
|
data/lib/docsplit.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# The Docsplit module delegates to the Java PDF extractors.
|
2
2
|
module Docsplit
|
3
3
|
|
4
|
-
VERSION = '0.1.3'
|
4
|
+
VERSION = '0.2.0' # Keep in sync with gemspec.
|
5
5
|
|
6
6
|
ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
|
7
7
|
|
@@ -26,7 +26,7 @@ module Docsplit
|
|
26
26
|
# Use the ExtractText Java class to write out all embedded text.
|
27
27
|
def self.extract_text(pdfs, opts={})
|
28
28
|
pdfs = ensure_pdfs(pdfs)
|
29
|
-
|
29
|
+
TextExtractor.new.extract(pdfs, opts)
|
30
30
|
end
|
31
31
|
|
32
32
|
# Use the ExtractImages Java class to rasterize a PDF into each page's image.
|
@@ -76,3 +76,4 @@ require 'fileutils'
|
|
76
76
|
require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
|
77
77
|
require "#{Docsplit::ROOT}/lib/docsplit/argument_parser"
|
78
78
|
require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
|
79
|
+
require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"
|
@@ -0,0 +1,53 @@
|
|
1
|
+
require 'shellwords'

module Docsplit

  # Extracts the text of PDF files by shelling out to the +pdftotext+
  # command-line tool (and to +pdfinfo+ when a page count is needed).
  class TextExtractor

    # Captures the page count from the "Pages:" line of +pdfinfo+ output.
    PAGE_COUNT_MATCHER = /Pages:\s+(\d+?)\n/

    # Writes the text of each PDF into the output directory.
    #
    # pdfs - a single PDF path or a (possibly nested) list of PDF paths.
    # opts - options hash:
    #        :output - directory that receives the text files (default '.').
    #        :pages  - 'all', or an enumerable of page numbers. When given,
    #                  one "<name>_<page>.txt" file is written per page;
    #                  otherwise a single "<name>.txt" is written per PDF.
    #
    # Returns the flattened list of PDF paths.
    # Raises ExtractionFailed if any external command exits unsuccessfully.
    def extract(pdfs, opts)
      extract_options opts
      [pdfs].flatten.each do |pdf|
        pdf_name = File.basename(pdf, File.extname(pdf))
        FileUtils.mkdir_p @output

        if @pages
          # 'all' is expanded lazily, so pdfinfo only runs when it is needed.
          pages = (@pages == 'all') ? 1..get_pages(pdf) : @pages
          pages.each { |page| extract_page pdf, page, pdf_name }
        else
          text_path = File.join(@output, "#{pdf_name}.txt")
          run "pdftotext -enc UTF-8 #{escape(pdf)} #{escape(text_path)}"
        end
      end
    end

    # Writes the text of a single page to "<pdf_name>_<page>.txt" inside the
    # output directory.
    #
    # Returns the chomped standard output of the pdftotext command.
    # Raises ExtractionFailed if pdftotext exits unsuccessfully.
    def extract_page(pdf, page, pdf_name)
      text_path = File.join(@output, "#{pdf_name}_#{page}.txt")
      page_arg  = escape(page)
      run("pdftotext -enc UTF-8 -f #{page_arg} -l #{page_arg} #{escape(pdf)} #{escape(text_path)}").chomp
    end

    # Asks pdfinfo for the number of pages in the PDF at pdf_path.
    #
    # Returns the page count as an Integer.
    # Raises ExtractionFailed if pdfinfo exits unsuccessfully or its output
    # contains no "Pages:" line.
    def get_pages(pdf_path)
      # The output is deliberately not chomped: PAGE_COUNT_MATCHER needs the
      # newline that terminates the "Pages:" line.
      info  = run("pdfinfo #{escape(pdf_path)}")
      match = info.match(PAGE_COUNT_MATCHER)
      raise ExtractionFailed if match.nil?
      match[1].to_i
    end

    private

    # Stores the :output directory (default '.') and the :pages selection.
    def extract_options(options)
      @output = options[:output] || '.'
      @pages  = options[:pages]
    end

    # Runs a shell command and returns its raw standard output.
    # Raises ExtractionFailed, carrying the chomped output as its message,
    # when the command does not exit successfully.
    def run(cmd)
      output = `#{cmd}`
      raise ExtractionFailed, output.chomp unless $?.success?
      output
    end

    # Quotes a value so the shell treats it as one literal argument; this
    # keeps paths containing spaces or shell metacharacters intact.
    def escape(value)
      Shellwords.escape(value.to_s)
    end

  end

end
|
metadata
CHANGED
@@ -1,20 +1,22 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: docsplit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
+
hash: 23
|
4
5
|
prerelease: false
|
5
6
|
segments:
|
6
7
|
- 0
|
7
|
-
-
|
8
|
-
-
|
9
|
-
version: 0.1.3
|
8
|
+
- 2
|
9
|
+
- 0
|
10
|
+
version: 0.2.0
|
10
11
|
platform: ruby
|
11
12
|
authors:
|
12
13
|
- Jeremy Ashkenas
|
14
|
+
- Samuel Clay
|
13
15
|
autorequire:
|
14
16
|
bindir: bin
|
15
17
|
cert_chain: []
|
16
18
|
|
17
|
-
date: 2010-
|
19
|
+
date: 2010-07-29 00:00:00 -04:00
|
18
20
|
default_executable:
|
19
21
|
dependencies: []
|
20
22
|
|
@@ -40,6 +42,7 @@ files:
|
|
40
42
|
- lib/docsplit/ExtractPages.java
|
41
43
|
- lib/docsplit/ExtractText.java
|
42
44
|
- lib/docsplit/image_extractor.rb
|
45
|
+
- lib/docsplit/text_extractor.rb
|
43
46
|
- lib/docsplit/transparent_pdfs.rb
|
44
47
|
- lib/docsplit.rb
|
45
48
|
- bin/docsplit
|
@@ -62,7 +65,7 @@ files:
|
|
62
65
|
- docsplit.gemspec
|
63
66
|
- LICENSE
|
64
67
|
- README
|
65
|
-
has_rdoc:
|
68
|
+
has_rdoc: true
|
66
69
|
homepage: http://documentcloud.github.com/docsplit/
|
67
70
|
licenses: []
|
68
71
|
|
@@ -72,23 +75,27 @@ rdoc_options: []
|
|
72
75
|
require_paths:
|
73
76
|
- lib
|
74
77
|
required_ruby_version: !ruby/object:Gem::Requirement
|
78
|
+
none: false
|
75
79
|
requirements:
|
76
80
|
- - ">="
|
77
81
|
- !ruby/object:Gem::Version
|
82
|
+
hash: 3
|
78
83
|
segments:
|
79
84
|
- 0
|
80
85
|
version: "0"
|
81
86
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
87
|
+
none: false
|
82
88
|
requirements:
|
83
89
|
- - ">="
|
84
90
|
- !ruby/object:Gem::Version
|
91
|
+
hash: 3
|
85
92
|
segments:
|
86
93
|
- 0
|
87
94
|
version: "0"
|
88
95
|
requirements: []
|
89
96
|
|
90
97
|
rubyforge_project: docsplit
|
91
|
-
rubygems_version: 1.3.
|
98
|
+
rubygems_version: 1.3.7
|
92
99
|
signing_key:
|
93
100
|
specification_version: 3
|
94
101
|
summary: Break Apart Documents into Images, Text, Pages and PDFs
|