docsplit 0.7.2 → 0.7.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA512:
3
+ data.tar.gz: e50c360ba40c1aa51cd563d9a08d0c5a444b50c4e97635133aec13f17fcc4e541e20d2bb9867ca316486df9a501852ed482801bed26986e88dbc472bd2609d65
4
+ metadata.gz: 5da217d234a3b390963f2c6259ab859ecf805d71c60138c335eebdd9ae4c166cce63e825cee2e10c7e3029c47e4530e53b2d84dbdc91ab222c0a98c6f342d32a
5
+ SHA1:
6
+ data.tar.gz: 03c2ec2439773679080a055463d35e7004d1decf
7
+ metadata.gz: ddd8d4475b4a34fcfd2abb013d2824295718aa8c
data/LICENSE CHANGED
@@ -1,6 +1,7 @@
1
1
  JODConverter is licensed under the LGPL: gnu.org/licenses/lgpl.html
2
2
 
3
- Copyright (c) 2009 Jeremy Ashkenas, DocumentCloud
3
+ Copyright (c) 2009-2011 Jeremy Ashkenas, DocumentCloud
4
+ Copyright (c) 2011-2013 Jeremy Ashkenas, DocumentCloud and Investigative Reporters & Editors
4
5
 
5
6
  Permission is hereby granted, free of charge, to any person
6
7
  obtaining a copy of this software and associated documentation
data/docsplit.gemspec CHANGED
@@ -1,7 +1,7 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = 'docsplit'
3
- s.version = '0.7.2' # Keep version in sync with docsplit.rb
4
- s.date = '2013-02-21'
3
+ s.version = '0.7.3' # Keep version in sync with docsplit.rb
4
+ s.date = '2014-02-16'
5
5
 
6
6
  s.homepage = "http://documentcloud.github.com/docsplit/"
7
7
  s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
@@ -15,6 +15,7 @@ Gem::Specification.new do |s|
15
15
  s.authors = ['Jeremy Ashkenas', 'Samuel Clay', 'Ted Han']
16
16
  s.email = 'opensource@documentcloud.org'
17
17
  s.rubyforge_project = 'docsplit'
18
+ s.license = 'MIT'
18
19
 
19
20
  s.require_paths = ['lib']
20
21
  s.executables = ['docsplit']
data/lib/docsplit.rb CHANGED
@@ -5,7 +5,7 @@ require 'shellwords'
5
5
  # The Docsplit module delegates to the Java PDF extractors.
6
6
  module Docsplit
7
7
 
8
- VERSION = '0.7.2' # Keep in sync with gemspec.
8
+ VERSION = '0.7.3' # Keep in sync with gemspec.
9
9
 
10
10
  ESCAPE = lambda {|x| Shellwords.shellescape(x) }
11
11
 
@@ -96,6 +96,7 @@ Options:
96
96
  end
97
97
  opts.on('-l', '--language [LANGUAGE]', 'set the language (ISO 639-2/T code) for text extraction') do |l|
98
98
  @options[:language] = l
99
+ @options[:clean] = false
99
100
  end
100
101
  opts.on('-r', '--rolling', 'generate images from each previous image') do |r|
101
102
  @options[:rolling] = true
@@ -27,7 +27,7 @@ module Docsplit
27
27
  raise ExtractionFailed, result if $? != 0
28
28
  # ruby 1.8 (iconv) and 1.9 (String#encode) :
29
29
  if String.method_defined?(:encode)
30
- result.encode!('UTF-8', 'UTF-8', :invalid => :replace)
30
+ result.encode!('UTF-8', 'binary', :invalid => :replace, :undef => :replace, :replace => "") unless result.valid_encoding?
31
31
  else
32
32
  require 'iconv' unless defined?(Iconv)
33
33
  ic = Iconv.new('UTF-8//IGNORE','UTF-8')
@@ -19,7 +19,12 @@ module Docsplit
19
19
  # The first line of the help output holds the name and version number
20
20
  # of the office software to be used for extraction.
21
21
  def version_string
22
- @@help ||= `#{office_executable} -h 2>&1`.split("\n").first
22
+ versionstr = `#{office_executable} -h 2>&1`.split("\n").first
23
+ if !!versionstr.match(/[0-9]*/)
24
+ versionstr = `#{office_executable} --version`.split("\n").first
25
+ end
26
+ @@help ||= versionstr
27
+
23
28
  end
24
29
  def libre_office?
25
30
  !!version_string.match(/^LibreOffice/)
@@ -37,7 +42,7 @@ module Docsplit
37
42
  if windows?
38
43
  office_names = ["LibreOffice 3", "LibreOffice 4", "OpenOffice.org 3"]
39
44
  program_files_path = ENV["CommonProgramFiles"]
40
- search_paths = office_name.map{ |program| File.join(program_files_path, program) }
45
+ search_paths = office_names.map{ |program| File.join(program_files_path, program) }
41
46
  elsif osx?
42
47
  search_paths = %w(
43
48
  /Applications/LibreOffice.app/Contents
@@ -46,8 +51,10 @@ module Docsplit
46
51
  else # probably linux/unix
47
52
  search_paths = %w(
48
53
  /usr/lib/libreoffice
54
+ /usr/lib64/libreoffice
49
55
  /opt/libreoffice
50
56
  /usr/lib/openoffice
57
+ /usr/lib64/openoffice
51
58
  /opt/openoffice.org3
52
59
  )
53
60
  end
@@ -35,8 +35,13 @@ module Docsplit
35
35
  # For the time being, `clean` uses the regular StringScanner, and not the
36
36
  # multibyte-aware version, coercing to ASCII first.
37
37
  def clean(text)
38
- require 'iconv' unless defined?(Iconv)
39
- text = Iconv.iconv('ascii//translit//ignore', 'utf-8', text).first
38
+ if String.method_defined?(:encode)
39
+ text.encode!('ascii', :invalid => :replace, :undef => :replace, :replace => '?')
40
+ else
41
+ require 'iconv' unless defined?(Iconv)
42
+ text = Iconv.iconv('ascii//translit//ignore', 'utf-8', text).first
43
+ end
44
+
40
45
  scanner = StringScanner.new(text)
41
46
  cleaned = []
42
47
  spaced = false
metadata CHANGED
@@ -1,28 +1,28 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: docsplit
3
- version: !ruby/object:Gem::Version
4
- version: 0.7.2
5
- prerelease:
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.7.3
6
5
  platform: ruby
7
- authors:
6
+ authors:
8
7
  - Jeremy Ashkenas
9
8
  - Samuel Clay
10
9
  - Ted Han
11
10
  autorequire:
12
11
  bindir: bin
13
12
  cert_chain: []
14
- date: 2013-02-21 00:00:00.000000000 Z
13
+
14
+ date: 2014-02-16 00:00:00 Z
15
15
  dependencies: []
16
- description: ! " Docsplit is a command-line utility and Ruby library for splitting
17
- apart\n documents into their component parts: searchable UTF-8 plain text, page\n
18
- \ images or thumbnails in any format, PDFs, single pages, and document\n metadata
19
- (title, author, number of pages...)\n"
16
+
17
+ description: " Docsplit is a command-line utility and Ruby library for splitting apart\n documents into their component parts: searchable UTF-8 plain text, page\n images or thumbnails in any format, PDFs, single pages, and document\n metadata (title, author, number of pages...)\n"
20
18
  email: opensource@documentcloud.org
21
- executables:
19
+ executables:
22
20
  - docsplit
23
21
  extensions: []
22
+
24
23
  extra_rdoc_files: []
25
- files:
24
+
25
+ files:
26
26
  - lib/docsplit/command_line.rb
27
27
  - lib/docsplit/image_extractor.rb
28
28
  - lib/docsplit/info_extractor.rb
@@ -47,28 +47,30 @@ files:
47
47
  - LICENSE
48
48
  - README
49
49
  homepage: http://documentcloud.github.com/docsplit/
50
- licenses: []
50
+ licenses:
51
+ - MIT
52
+ metadata: {}
53
+
51
54
  post_install_message:
52
55
  rdoc_options: []
53
- require_paths:
56
+
57
+ require_paths:
54
58
  - lib
55
- required_ruby_version: !ruby/object:Gem::Requirement
56
- none: false
57
- requirements:
58
- - - ! '>='
59
- - !ruby/object:Gem::Version
60
- version: '0'
61
- required_rubygems_version: !ruby/object:Gem::Requirement
62
- none: false
63
- requirements:
64
- - - ! '>='
65
- - !ruby/object:Gem::Version
66
- version: '0'
59
+ required_ruby_version: !ruby/object:Gem::Requirement
60
+ requirements:
61
+ - &id001
62
+ - ">="
63
+ - !ruby/object:Gem::Version
64
+ version: "0"
65
+ required_rubygems_version: !ruby/object:Gem::Requirement
66
+ requirements:
67
+ - *id001
67
68
  requirements: []
69
+
68
70
  rubyforge_project: docsplit
69
- rubygems_version: 1.8.24
71
+ rubygems_version: 2.0.13
70
72
  signing_key:
71
- specification_version: 3
73
+ specification_version: 4
72
74
  summary: Break Apart Documents into Images, Text, Pages and PDFs
73
75
  test_files: []
74
- has_rdoc:
76
+