docsplit 0.6.0 → 0.6.1

Sign up to get free protection for your applications and to get access to all the features.
data/docsplit.gemspec CHANGED
@@ -1,7 +1,7 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = 'docsplit'
3
- s.version = '0.6.0' # Keep version in sync with docsplit.rb
4
- s.date = '2011-09-13'
3
+ s.version = '0.6.1' # Keep version in sync with docsplit.rb
4
+ s.date = '2011-11-18'
5
5
 
6
6
  s.homepage = "http://documentcloud.github.com/docsplit/"
7
7
  s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
@@ -12,7 +12,7 @@ Gem::Specification.new do |s|
12
12
  metadata (title, author, number of pages...)
13
13
  EOS
14
14
 
15
- s.authors = ['Jeremy Ashkenas', 'Samuel Clay']
15
+ s.authors = ['Jeremy Ashkenas', 'Samuel Clay', 'Ted Han']
16
16
  s.email = 'jeremy@documentcloud.org'
17
17
  s.rubyforge_project = 'docsplit'
18
18
 
data/lib/docsplit.rb CHANGED
@@ -1,7 +1,7 @@
1
1
  # The Docsplit module delegates to the Java PDF extractors.
2
2
  module Docsplit
3
3
 
4
- VERSION = '0.6.0' # Keep in sync with gemspec.
4
+ VERSION = '0.6.1' # Keep in sync with gemspec.
5
5
 
6
6
  ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
7
7
 
@@ -11,11 +11,14 @@ module Docsplit
11
11
 
12
12
  HEADLESS = "-Djava.awt.headless=true"
13
13
 
14
- OFFICE = RUBY_PLATFORM.match(/darwin/i) ? '' : '-Doffice.home=/usr/lib/openoffice'
14
+ office ||= "/usr/lib/openoffice" if File.exists? '/usr/lib/openoffice'
15
+ office ||= "/usr/lib/libreoffice" if File.exists? '/usr/lib/libreoffice'
16
+
17
+ OFFICE = RUBY_PLATFORM.match(/darwin/i) ? '' : "-Doffice.home=#{office}"
15
18
 
16
19
  METADATA_KEYS = [:author, :date, :creator, :keywords, :producer, :subject, :title, :length]
17
20
 
18
- GM_FORMATS = [:png, :gif, :jpg, :jpeg, :tif, :tiff, :bmp, :pnm, :ppm, :svg, :eps]
21
+ GM_FORMATS = ["image/gif", "image/jpeg", "image/png", "image/x-ms-bmp", "image/svg+xml", "image/tiff", "image/x-portable-bitmap", "application/postscript", "image/x-portable-pixmap"]
19
22
 
20
23
  DEPENDENCIES = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :tesseract => false}
21
24
 
@@ -65,10 +68,10 @@ module Docsplit
65
68
  basename = File.basename(doc, ext)
66
69
  escaped_doc, escaped_out, escaped_basename = [doc, out, basename].map(&ESCAPE)
67
70
 
68
- if ext.length > 0 && GM_FORMATS.include?(ext.sub(/^\./, '').downcase.to_sym)
71
+ if GM_FORMATS.include?(`file -b --mime #{doc}`.strip.split(/[:;]\s+/)[0])
69
72
  `gm convert #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf`
70
73
  else
71
- options = "-jar #{ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-3.jar -r #{ROOT}/vendor/conf/document-formats.js"
74
+ options = "-jar #{ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-4.jar -r #{ROOT}/vendor/conf/document-formats.js"
72
75
  run "#{options} #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf", [], {}
73
76
  end
74
77
  end
@@ -66,7 +66,7 @@ module Docsplit
66
66
  escaped_tiff = ESCAPE[tiff]
67
67
  file = "#{base_path}_#{page}"
68
68
  run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
69
- run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l eng 2>&1"
69
+ run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} 2>&1"
70
70
  clean_text(file + '.txt') if @clean_ocr
71
71
  FileUtils.remove_entry_secure tiff
72
72
  end
@@ -74,7 +74,7 @@ module Docsplit
74
74
  tiff = "#{tempdir}/#{@pdf_name}.tif"
75
75
  escaped_tiff = ESCAPE[tiff]
76
76
  run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
77
- run "tesseract #{escaped_tiff} #{base_path} -l eng 2>&1"
77
+ run "tesseract #{escaped_tiff} #{base_path} -l #{@language} 2>&1"
78
78
  clean_text(base_path + '.txt') if @clean_ocr
79
79
  end
80
80
  ensure
@@ -122,6 +122,7 @@ module Docsplit
122
122
  @force_ocr = options[:ocr] == true
123
123
  @forbid_ocr = options[:ocr] == false
124
124
  @clean_ocr = !(options[:clean] == false)
125
+ @language = options[:language] || 'eng'
125
126
  end
126
127
 
127
128
  end
@@ -1,6 +1,3 @@
1
- //
2
- // JODConverter Document Formats Configuration
3
- //
4
1
  [
5
2
  {
6
3
  "name": "Portable Document Format",
Binary file
Binary file
Binary file
Binary file
metadata CHANGED
@@ -1,22 +1,23 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: docsplit
3
3
  version: !ruby/object:Gem::Version
4
- hash: 7
5
- prerelease:
4
+ prerelease: false
6
5
  segments:
7
6
  - 0
8
7
  - 6
9
- - 0
10
- version: 0.6.0
8
+ - 1
9
+ version: 0.6.1
11
10
  platform: ruby
12
11
  authors:
13
12
  - Jeremy Ashkenas
14
13
  - Samuel Clay
14
+ - Ted Han
15
15
  autorequire:
16
16
  bindir: bin
17
17
  cert_chain: []
18
18
 
19
- date: 2011-09-13 00:00:00 Z
19
+ date: 2011-11-18 00:00:00 -06:00
20
+ default_executable:
20
21
  dependencies: []
21
22
 
22
23
  description: " Docsplit is a command-line utility and Ruby library for splitting apart\n documents into their component parts: searchable UTF-8 plain text, page\n images or thumbnails in any format, PDFs, single pages, and document\n metadata (title, author, number of pages...)\n"
@@ -40,16 +41,17 @@ files:
40
41
  - vendor/conf/document-formats.js
41
42
  - vendor/jodconverter/commons-cli-1.1.jar
42
43
  - vendor/jodconverter/commons-io-1.4.jar
43
- - vendor/jodconverter/jodconverter-core-3.0-beta-3.jar
44
- - vendor/jodconverter/json-20080701.jar
45
- - vendor/jodconverter/juh-3.1.0.jar
46
- - vendor/jodconverter/jurt-3.1.0.jar
47
- - vendor/jodconverter/ridl-3.1.0.jar
48
- - vendor/jodconverter/unoil-3.1.0.jar
44
+ - vendor/jodconverter/jodconverter-core-3.0-beta-4.jar
45
+ - vendor/jodconverter/json-20090211.jar
46
+ - vendor/jodconverter/juh-3.2.1.jar
47
+ - vendor/jodconverter/jurt-3.2.1.jar
48
+ - vendor/jodconverter/ridl-3.2.1.jar
49
+ - vendor/jodconverter/unoil-3.2.1.jar
49
50
  - vendor/logging.properties
50
51
  - docsplit.gemspec
51
52
  - LICENSE
52
53
  - README
54
+ has_rdoc: true
53
55
  homepage: http://documentcloud.github.com/docsplit/
54
56
  licenses: []
55
57
 
@@ -59,27 +61,23 @@ rdoc_options: []
59
61
  require_paths:
60
62
  - lib
61
63
  required_ruby_version: !ruby/object:Gem::Requirement
62
- none: false
63
64
  requirements:
64
65
  - - ">="
65
66
  - !ruby/object:Gem::Version
66
- hash: 3
67
67
  segments:
68
68
  - 0
69
69
  version: "0"
70
70
  required_rubygems_version: !ruby/object:Gem::Requirement
71
- none: false
72
71
  requirements:
73
72
  - - ">="
74
73
  - !ruby/object:Gem::Version
75
- hash: 3
76
74
  segments:
77
75
  - 0
78
76
  version: "0"
79
77
  requirements: []
80
78
 
81
79
  rubyforge_project: docsplit
82
- rubygems_version: 1.7.2
80
+ rubygems_version: 1.3.6
83
81
  signing_key:
84
82
  specification_version: 3
85
83
  summary: Break Apart Documents into Images, Text, Pages and PDFs
Binary file
Binary file
Binary file
Binary file
Binary file