docsplit 0.1.3 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
data/LICENSE CHANGED
@@ -1,3 +1,6 @@
1
+ JODConverter is licensed under the LGPL: gnu.org/licenses/lgpl.html
2
+ PDFBox is licensed under the Apache 2 License: apache.org/licenses/LICENSE-2.0
3
+
1
4
  Copyright (c) 2009 Jeremy Ashkenas, DocumentCloud
2
5
 
3
6
  Permission is hereby granted, free of charge, to any person
data/docsplit.gemspec CHANGED
@@ -1,7 +1,7 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = 'docsplit'
3
- s.version = '0.1.3' # Keep version in sync with jammit.rb
4
- s.date = '2010-4-27'
3
+ s.version = '0.2.0' # Keep version in sync with docsplit.rb
4
+ s.date = '2010-7-29'
5
5
 
6
6
  s.homepage = "http://documentcloud.github.com/docsplit/"
7
7
  s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
@@ -12,7 +12,7 @@ Gem::Specification.new do |s|
12
12
  metadata (title, author, number of pages...)
13
13
  EOS
14
14
 
15
- s.authors = ['Jeremy Ashkenas']
15
+ s.authors = ['Jeremy Ashkenas', 'Samuel Clay']
16
16
  s.email = 'jeremy@documentcloud.org'
17
17
  s.rubyforge_project = 'docsplit'
18
18
  s.has_rdoc = false
data/lib/docsplit.rb CHANGED
@@ -1,7 +1,7 @@
1
1
  # The Docsplit module delegates to the Java PDF extractors.
2
2
  module Docsplit
3
3
 
4
- VERSION = '0.1.3' # Keep in sync with gemspec.
4
+ VERSION = '0.2.0' # Keep in sync with gemspec.
5
5
 
6
6
  ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
7
7
 
@@ -26,7 +26,7 @@ module Docsplit
26
26
  # Use the ExtractText Java class to write out all embedded text.
27
27
  def self.extract_text(pdfs, opts={})
28
28
  pdfs = ensure_pdfs(pdfs)
29
- run "org.documentcloud.ExtractText", pdfs, opts
29
+ TextExtractor.new.extract(pdfs, opts)
30
30
  end
31
31
 
32
32
  # Use the ExtractImages Java class to rasterize a PDF into each page's image.
@@ -76,3 +76,4 @@ require 'fileutils'
76
76
  require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
77
77
  require "#{Docsplit::ROOT}/lib/docsplit/argument_parser"
78
78
  require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
79
+ require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"
@@ -0,0 +1,53 @@
1
+ module Docsplit
2
+
3
+ class TextExtractor
4
+
5
+ PAGE_COUNT_MATCHER = /Pages:\s+(\d+?)\n/
6
+
7
+ def extract(pdfs, opts)
8
+ extract_options opts
9
+ pdfs = [pdfs].flatten
10
+ pdfs.each do |pdf|
11
+ pdf_name = File.basename(pdf, File.extname(pdf))
12
+ text_path = File.join(@output, "#{pdf_name}.txt")
13
+ FileUtils.mkdir_p @output
14
+
15
+ if @pages
16
+ pages = (@pages == 'all') ? 1..get_pages(pdf) : @pages
17
+ pages.each do |page|
18
+ extract_page pdf, page, pdf_name
19
+ end
20
+ else
21
+ cmd = "pdftotext -enc UTF-8 #{pdf} #{text_path}"
22
+ result = `#{cmd}`.chomp
23
+ raise ExtractionFailed, result if $? != 0
24
+ end
25
+ end
26
+ end
27
+
28
+ def extract_page(pdf, page, pdf_name)
29
+ text_path = File.join(@output, "#{pdf_name}_#{page}.txt")
30
+ cmd = "pdftotext -enc UTF-8 -f #{page} -l #{page} #{pdf} #{text_path}"
31
+ result = `#{cmd}`.chomp
32
+ raise ExtractionFailed, result if $? != 0
33
+ result
34
+ end
35
+
36
+ def get_pages(pdf_path)
37
+ info = `pdfinfo #{pdf_path}`
38
+ raise ExtractionFailed, result if $? != 0
39
+ match = info.match(PAGE_COUNT_MATCHER)
40
+ raise ExtractionFailed if match.nil?
41
+ match[1].to_i
42
+ end
43
+
44
+ private
45
+
46
+ def extract_options(options)
47
+ @output = options[:output] || '.'
48
+ @pages = options[:pages]
49
+ end
50
+
51
+ end
52
+
53
+ end
metadata CHANGED
@@ -1,20 +1,22 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: docsplit
3
3
  version: !ruby/object:Gem::Version
4
+ hash: 23
4
5
  prerelease: false
5
6
  segments:
6
7
  - 0
7
- - 1
8
- - 3
9
- version: 0.1.3
8
+ - 2
9
+ - 0
10
+ version: 0.2.0
10
11
  platform: ruby
11
12
  authors:
12
13
  - Jeremy Ashkenas
14
+ - Samuel Clay
13
15
  autorequire:
14
16
  bindir: bin
15
17
  cert_chain: []
16
18
 
17
- date: 2010-04-27 00:00:00 -04:00
19
+ date: 2010-07-29 00:00:00 -04:00
18
20
  default_executable:
19
21
  dependencies: []
20
22
 
@@ -40,6 +42,7 @@ files:
40
42
  - lib/docsplit/ExtractPages.java
41
43
  - lib/docsplit/ExtractText.java
42
44
  - lib/docsplit/image_extractor.rb
45
+ - lib/docsplit/text_extractor.rb
43
46
  - lib/docsplit/transparent_pdfs.rb
44
47
  - lib/docsplit.rb
45
48
  - bin/docsplit
@@ -62,7 +65,7 @@ files:
62
65
  - docsplit.gemspec
63
66
  - LICENSE
64
67
  - README
65
- has_rdoc: false
68
+ has_rdoc: true
66
69
  homepage: http://documentcloud.github.com/docsplit/
67
70
  licenses: []
68
71
 
@@ -72,23 +75,27 @@ rdoc_options: []
72
75
  require_paths:
73
76
  - lib
74
77
  required_ruby_version: !ruby/object:Gem::Requirement
78
+ none: false
75
79
  requirements:
76
80
  - - ">="
77
81
  - !ruby/object:Gem::Version
82
+ hash: 3
78
83
  segments:
79
84
  - 0
80
85
  version: "0"
81
86
  required_rubygems_version: !ruby/object:Gem::Requirement
87
+ none: false
82
88
  requirements:
83
89
  - - ">="
84
90
  - !ruby/object:Gem::Version
91
+ hash: 3
85
92
  segments:
86
93
  - 0
87
94
  version: "0"
88
95
  requirements: []
89
96
 
90
97
  rubyforge_project: docsplit
91
- rubygems_version: 1.3.6
98
+ rubygems_version: 1.3.7
92
99
  signing_key:
93
100
  specification_version: 3
94
101
  summary: Break Apart Documents into Images, Text, Pages and PDFs