docsplit 0.6.0 → 0.6.1

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the respective public registry.
data/docsplit.gemspec CHANGED
@@ -1,7 +1,7 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = 'docsplit'
3
- s.version = '0.6.0' # Keep version in sync with docsplit.rb
4
- s.date = '2011-09-13'
3
+ s.version = '0.6.1' # Keep version in sync with docsplit.rb
4
+ s.date = '2011-11-18'
5
5
 
6
6
  s.homepage = "http://documentcloud.github.com/docsplit/"
7
7
  s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
@@ -12,7 +12,7 @@ Gem::Specification.new do |s|
12
12
  metadata (title, author, number of pages...)
13
13
  EOS
14
14
 
15
- s.authors = ['Jeremy Ashkenas', 'Samuel Clay']
15
+ s.authors = ['Jeremy Ashkenas', 'Samuel Clay', 'Ted Han']
16
16
  s.email = 'jeremy@documentcloud.org'
17
17
  s.rubyforge_project = 'docsplit'
18
18
 
data/lib/docsplit.rb CHANGED
@@ -1,7 +1,7 @@
1
1
  # The Docsplit module delegates to the Java PDF extractors.
2
2
  module Docsplit
3
3
 
4
- VERSION = '0.6.0' # Keep in sync with gemspec.
4
+ VERSION = '0.6.1' # Keep in sync with gemspec.
5
5
 
6
6
  ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
7
7
 
@@ -11,11 +11,14 @@ module Docsplit
11
11
 
12
12
  HEADLESS = "-Djava.awt.headless=true"
13
13
 
14
- OFFICE = RUBY_PLATFORM.match(/darwin/i) ? '' : '-Doffice.home=/usr/lib/openoffice'
14
+ office ||= "/usr/lib/openoffice" if File.exists? '/usr/lib/openoffice'
15
+ office ||= "/usr/lib/libreoffice" if File.exists? '/usr/lib/libreoffice'
16
+
17
+ OFFICE = RUBY_PLATFORM.match(/darwin/i) ? '' : "-Doffice.home=#{office}"
15
18
 
16
19
  METADATA_KEYS = [:author, :date, :creator, :keywords, :producer, :subject, :title, :length]
17
20
 
18
- GM_FORMATS = [:png, :gif, :jpg, :jpeg, :tif, :tiff, :bmp, :pnm, :ppm, :svg, :eps]
21
+ GM_FORMATS = ["image/gif", "image/jpeg", "image/png", "image/x-ms-bmp", "image/svg+xml", "image/tiff", "image/x-portable-bitmap", "application/postscript", "image/x-portable-pixmap"]
19
22
 
20
23
  DEPENDENCIES = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :tesseract => false}
21
24
 
@@ -65,10 +68,10 @@ module Docsplit
65
68
  basename = File.basename(doc, ext)
66
69
  escaped_doc, escaped_out, escaped_basename = [doc, out, basename].map(&ESCAPE)
67
70
 
68
- if ext.length > 0 && GM_FORMATS.include?(ext.sub(/^\./, '').downcase.to_sym)
71
+ if GM_FORMATS.include?(`file -b --mime #{doc}`.strip.split(/[:;]\s+/)[0])
69
72
  `gm convert #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf`
70
73
  else
71
- options = "-jar #{ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-3.jar -r #{ROOT}/vendor/conf/document-formats.js"
74
+ options = "-jar #{ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-4.jar -r #{ROOT}/vendor/conf/document-formats.js"
72
75
  run "#{options} #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf", [], {}
73
76
  end
74
77
  end
@@ -66,7 +66,7 @@ module Docsplit
66
66
  escaped_tiff = ESCAPE[tiff]
67
67
  file = "#{base_path}_#{page}"
68
68
  run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
69
- run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l eng 2>&1"
69
+ run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} 2>&1"
70
70
  clean_text(file + '.txt') if @clean_ocr
71
71
  FileUtils.remove_entry_secure tiff
72
72
  end
@@ -74,7 +74,7 @@ module Docsplit
74
74
  tiff = "#{tempdir}/#{@pdf_name}.tif"
75
75
  escaped_tiff = ESCAPE[tiff]
76
76
  run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
77
- run "tesseract #{escaped_tiff} #{base_path} -l eng 2>&1"
77
+ run "tesseract #{escaped_tiff} #{base_path} -l #{@language} 2>&1"
78
78
  clean_text(base_path + '.txt') if @clean_ocr
79
79
  end
80
80
  ensure
@@ -122,6 +122,7 @@ module Docsplit
122
122
  @force_ocr = options[:ocr] == true
123
123
  @forbid_ocr = options[:ocr] == false
124
124
  @clean_ocr = !(options[:clean] == false)
125
+ @language = options[:language] || 'eng'
125
126
  end
126
127
 
127
128
  end
@@ -1,6 +1,3 @@
1
- //
2
- // JODConverter Document Formats Configuration
3
- //
4
1
  [
5
2
  {
6
3
  "name": "Portable Document Format",
Binary file
Binary file
Binary file
Binary file
metadata CHANGED
@@ -1,22 +1,23 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: docsplit
3
3
  version: !ruby/object:Gem::Version
4
- hash: 7
5
- prerelease:
4
+ prerelease: false
6
5
  segments:
7
6
  - 0
8
7
  - 6
9
- - 0
10
- version: 0.6.0
8
+ - 1
9
+ version: 0.6.1
11
10
  platform: ruby
12
11
  authors:
13
12
  - Jeremy Ashkenas
14
13
  - Samuel Clay
14
+ - Ted Han
15
15
  autorequire:
16
16
  bindir: bin
17
17
  cert_chain: []
18
18
 
19
- date: 2011-09-13 00:00:00 Z
19
+ date: 2011-11-18 00:00:00 -06:00
20
+ default_executable:
20
21
  dependencies: []
21
22
 
22
23
  description: " Docsplit is a command-line utility and Ruby library for splitting apart\n documents into their component parts: searchable UTF-8 plain text, page\n images or thumbnails in any format, PDFs, single pages, and document\n metadata (title, author, number of pages...)\n"
@@ -40,16 +41,17 @@ files:
40
41
  - vendor/conf/document-formats.js
41
42
  - vendor/jodconverter/commons-cli-1.1.jar
42
43
  - vendor/jodconverter/commons-io-1.4.jar
43
- - vendor/jodconverter/jodconverter-core-3.0-beta-3.jar
44
- - vendor/jodconverter/json-20080701.jar
45
- - vendor/jodconverter/juh-3.1.0.jar
46
- - vendor/jodconverter/jurt-3.1.0.jar
47
- - vendor/jodconverter/ridl-3.1.0.jar
48
- - vendor/jodconverter/unoil-3.1.0.jar
44
+ - vendor/jodconverter/jodconverter-core-3.0-beta-4.jar
45
+ - vendor/jodconverter/json-20090211.jar
46
+ - vendor/jodconverter/juh-3.2.1.jar
47
+ - vendor/jodconverter/jurt-3.2.1.jar
48
+ - vendor/jodconverter/ridl-3.2.1.jar
49
+ - vendor/jodconverter/unoil-3.2.1.jar
49
50
  - vendor/logging.properties
50
51
  - docsplit.gemspec
51
52
  - LICENSE
52
53
  - README
54
+ has_rdoc: true
53
55
  homepage: http://documentcloud.github.com/docsplit/
54
56
  licenses: []
55
57
 
@@ -59,27 +61,23 @@ rdoc_options: []
59
61
  require_paths:
60
62
  - lib
61
63
  required_ruby_version: !ruby/object:Gem::Requirement
62
- none: false
63
64
  requirements:
64
65
  - - ">="
65
66
  - !ruby/object:Gem::Version
66
- hash: 3
67
67
  segments:
68
68
  - 0
69
69
  version: "0"
70
70
  required_rubygems_version: !ruby/object:Gem::Requirement
71
- none: false
72
71
  requirements:
73
72
  - - ">="
74
73
  - !ruby/object:Gem::Version
75
- hash: 3
76
74
  segments:
77
75
  - 0
78
76
  version: "0"
79
77
  requirements: []
80
78
 
81
79
  rubyforge_project: docsplit
82
- rubygems_version: 1.7.2
80
+ rubygems_version: 1.3.6
83
81
  signing_key:
84
82
  specification_version: 3
85
83
  summary: Break Apart Documents into Images, Text, Pages and PDFs
Binary file
Binary file
Binary file
Binary file
Binary file