docsplit 0.7.2 → 0.7.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA512:
3
+ data.tar.gz: e50c360ba40c1aa51cd563d9a08d0c5a444b50c4e97635133aec13f17fcc4e541e20d2bb9867ca316486df9a501852ed482801bed26986e88dbc472bd2609d65
4
+ metadata.gz: 5da217d234a3b390963f2c6259ab859ecf805d71c60138c335eebdd9ae4c166cce63e825cee2e10c7e3029c47e4530e53b2d84dbdc91ab222c0a98c6f342d32a
5
+ SHA1:
6
+ data.tar.gz: 03c2ec2439773679080a055463d35e7004d1decf
7
+ metadata.gz: ddd8d4475b4a34fcfd2abb013d2824295718aa8c
data/LICENSE CHANGED
@@ -1,6 +1,7 @@
1
1
  JODConverter ius licensed under the LGPL: gnu.org/licenses/lgpl.html
2
2
 
3
- Copyright (c) 2009 Jeremy Ashkenas, DocumentCloud
3
+ Copyright (c) 2009-2011 Jeremy Ashkenas, DocumentCloud
4
+ Copyright (c) 2011-2013 Jeremy Ashkenas, DocumentCloud and Investigative Reporters & Editors
4
5
 
5
6
  Permission is hereby granted, free of charge, to any person
6
7
  obtaining a copy of this software and associated documentation
data/docsplit.gemspec CHANGED
@@ -1,7 +1,7 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = 'docsplit'
3
- s.version = '0.7.2' # Keep version in sync with docsplit.rb
4
- s.date = '2013-02-21'
3
+ s.version = '0.7.3' # Keep version in sync with docsplit.rb
4
+ s.date = '2014-02-16'
5
5
 
6
6
  s.homepage = "http://documentcloud.github.com/docsplit/"
7
7
  s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
@@ -15,6 +15,7 @@ Gem::Specification.new do |s|
15
15
  s.authors = ['Jeremy Ashkenas', 'Samuel Clay', 'Ted Han']
16
16
  s.email = 'opensource@documentcloud.org'
17
17
  s.rubyforge_project = 'docsplit'
18
+ s.license = 'MIT'
18
19
 
19
20
  s.require_paths = ['lib']
20
21
  s.executables = ['docsplit']
data/lib/docsplit.rb CHANGED
@@ -5,7 +5,7 @@ require 'shellwords'
5
5
  # The Docsplit module delegates to the Java PDF extractors.
6
6
  module Docsplit
7
7
 
8
- VERSION = '0.7.2' # Keep in sync with gemspec.
8
+ VERSION = '0.7.3' # Keep in sync with gemspec.
9
9
 
10
10
  ESCAPE = lambda {|x| Shellwords.shellescape(x) }
11
11
 
@@ -96,6 +96,7 @@ Options:
96
96
  end
97
97
  opts.on('-l', '--language [LANGUAGE]', 'set the language (ISO 639-2/T code) for text extraction') do |l|
98
98
  @options[:language] = l
99
+ @options[:clean] = false
99
100
  end
100
101
  opts.on('-r', '--rolling', 'generate images from each previous image') do |r|
101
102
  @options[:rolling] = true
@@ -27,7 +27,7 @@ module Docsplit
27
27
  raise ExtractionFailed, result if $? != 0
28
28
  # ruby 1.8 (iconv) and 1.9 (String#encode) :
29
29
  if String.method_defined?(:encode)
30
- result.encode!('UTF-8', 'UTF-8', :invalid => :replace)
30
+ result.encode!('UTF-8', 'binary', :invalid => :replace, :undef => :replace, :replace => "") unless result.valid_encoding?
31
31
  else
32
32
  require 'iconv' unless defined?(Iconv)
33
33
  ic = Iconv.new('UTF-8//IGNORE','UTF-8')
@@ -19,7 +19,12 @@ module Docsplit
19
19
  # The first line of the help output holds the name and version number
20
20
  # of the office software to be used for extraction.
21
21
  def version_string
22
- @@help ||= `#{office_executable} -h 2>&1`.split("\n").first
22
+ versionstr = `#{office_executable} -h 2>&1`.split("\n").first
23
+ if !!versionstr.match(/[0-9]*/)
24
+ versionstr = `#{office_executable} --version`.split("\n").first
25
+ end
26
+ @@help ||= versionstr
27
+
23
28
  end
24
29
  def libre_office?
25
30
  !!version_string.match(/^LibreOffice/)
@@ -37,7 +42,7 @@ module Docsplit
37
42
  if windows?
38
43
  office_names = ["LibreOffice 3", "LibreOffice 4", "OpenOffice.org 3"]
39
44
  program_files_path = ENV["CommonProgramFiles"]
40
- search_paths = office_name.map{ |program| File.join(program_files_path, program) }
45
+ search_paths = office_names.map{ |program| File.join(program_files_path, program) }
41
46
  elsif osx?
42
47
  search_paths = %w(
43
48
  /Applications/LibreOffice.app/Contents
@@ -46,8 +51,10 @@ module Docsplit
46
51
  else # probably linux/unix
47
52
  search_paths = %w(
48
53
  /usr/lib/libreoffice
54
+ /usr/lib64/libreoffice
49
55
  /opt/libreoffice
50
56
  /usr/lib/openoffice
57
+ /usr/lib64/openoffice
51
58
  /opt/openoffice.org3
52
59
  )
53
60
  end
@@ -35,8 +35,13 @@ module Docsplit
35
35
  # For the time being, `clean` uses the regular StringScanner, and not the
36
36
  # multibyte-aware version, coercing to ASCII first.
37
37
  def clean(text)
38
- require 'iconv' unless defined?(Iconv)
39
- text = Iconv.iconv('ascii//translit//ignore', 'utf-8', text).first
38
+ if String.method_defined?(:encode)
39
+ text.encode!('ascii', :invalid => :replace, :undef => :replace, :replace => '?')
40
+ else
41
+ require 'iconv' unless defined?(Iconv)
42
+ text = Iconv.iconv('ascii//translit//ignore', 'utf-8', text).first
43
+ end
44
+
40
45
  scanner = StringScanner.new(text)
41
46
  cleaned = []
42
47
  spaced = false
metadata CHANGED
@@ -1,28 +1,28 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: docsplit
3
- version: !ruby/object:Gem::Version
4
- version: 0.7.2
5
- prerelease:
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.7.3
6
5
  platform: ruby
7
- authors:
6
+ authors:
8
7
  - Jeremy Ashkenas
9
8
  - Samuel Clay
10
9
  - Ted Han
11
10
  autorequire:
12
11
  bindir: bin
13
12
  cert_chain: []
14
- date: 2013-02-21 00:00:00.000000000 Z
13
+
14
+ date: 2014-02-16 00:00:00 Z
15
15
  dependencies: []
16
- description: ! " Docsplit is a command-line utility and Ruby library for splitting
17
- apart\n documents into their component parts: searchable UTF-8 plain text, page\n
18
- \ images or thumbnails in any format, PDFs, single pages, and document\n metadata
19
- (title, author, number of pages...)\n"
16
+
17
+ description: " Docsplit is a command-line utility and Ruby library for splitting apart\n documents into their component parts: searchable UTF-8 plain text, page\n images or thumbnails in any format, PDFs, single pages, and document\n metadata (title, author, number of pages...)\n"
20
18
  email: opensource@documentcloud.org
21
- executables:
19
+ executables:
22
20
  - docsplit
23
21
  extensions: []
22
+
24
23
  extra_rdoc_files: []
25
- files:
24
+
25
+ files:
26
26
  - lib/docsplit/command_line.rb
27
27
  - lib/docsplit/image_extractor.rb
28
28
  - lib/docsplit/info_extractor.rb
@@ -47,28 +47,30 @@ files:
47
47
  - LICENSE
48
48
  - README
49
49
  homepage: http://documentcloud.github.com/docsplit/
50
- licenses: []
50
+ licenses:
51
+ - MIT
52
+ metadata: {}
53
+
51
54
  post_install_message:
52
55
  rdoc_options: []
53
- require_paths:
56
+
57
+ require_paths:
54
58
  - lib
55
- required_ruby_version: !ruby/object:Gem::Requirement
56
- none: false
57
- requirements:
58
- - - ! '>='
59
- - !ruby/object:Gem::Version
60
- version: '0'
61
- required_rubygems_version: !ruby/object:Gem::Requirement
62
- none: false
63
- requirements:
64
- - - ! '>='
65
- - !ruby/object:Gem::Version
66
- version: '0'
59
+ required_ruby_version: !ruby/object:Gem::Requirement
60
+ requirements:
61
+ - &id001
62
+ - ">="
63
+ - !ruby/object:Gem::Version
64
+ version: "0"
65
+ required_rubygems_version: !ruby/object:Gem::Requirement
66
+ requirements:
67
+ - *id001
67
68
  requirements: []
69
+
68
70
  rubyforge_project: docsplit
69
- rubygems_version: 1.8.24
71
+ rubygems_version: 2.0.13
70
72
  signing_key:
71
- specification_version: 3
73
+ specification_version: 4
72
74
  summary: Break Apart Documents into Images, Text, Pages and PDFs
73
75
  test_files: []
74
- has_rdoc:
76
+