henkei 2.4.0.1 → 2.4.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
 - data/.github/workflows/test.yml +3 -3
 - data/.rubocop.yml +11 -1
 - data/henkei.gemspec +2 -3
 - data/jar/{tika-app-2.4.0.jar → tika-app-2.4.1.jar} +0 -0
 - data/lib/henkei/version.rb +1 -1
 - data/lib/henkei.rb +3 -3
 - data/spec/henkei_spec.rb +27 -33
 - metadata +6 -20
 
    
        checksums.yaml
    CHANGED
    
    | 
         @@ -1,7 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            ---
         
     | 
| 
       2 
2 
     | 
    
         
             
            SHA256:
         
     | 
| 
       3 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       4 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 3 
     | 
    
         
            +
              metadata.gz: e5f5bee2529b8b7ea72cef8962f1bf4ce87d7988e9c76a2481a72e555d949490
         
     | 
| 
      
 4 
     | 
    
         
            +
              data.tar.gz: c54a7262d038b9c32d667f44111b912e0d58ad8c058314539e3b68588acfbc81
         
     | 
| 
       5 
5 
     | 
    
         
             
            SHA512:
         
     | 
| 
       6 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       7 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 6 
     | 
    
         
            +
              metadata.gz: 483996169fb05e873aec30fe2d30c0e80f1b35e34ff1442c01a79a42b3374c4262233ed4e71e3cdd552fa7d8e12124609de0a762c05d478e4337e910091bd3c8
         
     | 
| 
      
 7 
     | 
    
         
            +
              data.tar.gz: 11d71da34a976c8fe7bc8ddd57161c4ee2407a852b2ea1ddb1ee5e8ef4013976ce094b97bf9b02842331b1bf3e6f574c2ef24b83a9106b2448ccf9fad2e0980f
         
     | 
    
        data/.github/workflows/test.yml
    CHANGED
    
    | 
         @@ -14,10 +14,10 @@ jobs: 
     | 
|
| 
       14 
14 
     | 
    
         
             
                runs-on: ubuntu-latest
         
     | 
| 
       15 
15 
     | 
    
         
             
                strategy:
         
     | 
| 
       16 
16 
     | 
    
         
             
                  matrix:
         
     | 
| 
       17 
     | 
    
         
            -
                    ruby-version: ['2. 
     | 
| 
      
 17 
     | 
    
         
            +
                    ruby-version: ['2.7', '3.0', '3.1', '3.2']
         
     | 
| 
       18 
18 
     | 
    
         | 
| 
       19 
19 
     | 
    
         
             
                steps:
         
     | 
| 
       20 
     | 
    
         
            -
                  - uses: actions/checkout@ 
     | 
| 
      
 20 
     | 
    
         
            +
                  - uses: actions/checkout@v3
         
     | 
| 
       21 
21 
     | 
    
         | 
| 
       22 
22 
     | 
    
         
             
                  - name: Set up Ruby
         
     | 
| 
       23 
23 
     | 
    
         
             
                    uses: ruby/setup-ruby@v1
         
     | 
| 
         @@ -32,6 +32,6 @@ jobs: 
     | 
|
| 
       32 
32 
     | 
    
         
             
                    run: bundle exec rspec
         
     | 
| 
       33 
33 
     | 
    
         | 
| 
       34 
34 
     | 
    
         
             
                  - name: Test & publish code coverage
         
     | 
| 
       35 
     | 
    
         
            -
                    uses: paambaati/codeclimate-action@v3. 
     | 
| 
      
 35 
     | 
    
         
            +
                    uses: paambaati/codeclimate-action@v3.2.0
         
     | 
| 
       36 
36 
     | 
    
         
             
                    env:
         
     | 
| 
       37 
37 
     | 
    
         
             
                      CC_TEST_REPORTER_ID: bb96c1ff9dc66724c38fb4eb54486dd72dc88a7fd6e727c034b9cf8d747d069e
         
     | 
    
        data/.rubocop.yml
    CHANGED
    
    | 
         @@ -1,6 +1,10 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            require:
         
     | 
| 
      
 2 
     | 
    
         
            +
              - rubocop-rake
         
     | 
| 
      
 3 
     | 
    
         
            +
              - rubocop-rspec
         
     | 
| 
      
 4 
     | 
    
         
            +
             
     | 
| 
       1 
5 
     | 
    
         
             
            AllCops:
         
     | 
| 
       2 
6 
     | 
    
         
             
              NewCops: enable
         
     | 
| 
       3 
     | 
    
         
            -
              TargetRubyVersion: 2. 
     | 
| 
      
 7 
     | 
    
         
            +
              TargetRubyVersion: 2.7
         
     | 
| 
       4 
8 
     | 
    
         | 
| 
       5 
9 
     | 
    
         
             
            Layout/EmptyLinesAroundAttributeAccessor:
         
     | 
| 
       6 
10 
     | 
    
         
             
              Enabled: true
         
     | 
| 
         @@ -30,6 +34,12 @@ Metrics/BlockLength: 
     | 
|
| 
       30 
34 
     | 
    
         
             
            Metrics/MethodLength:
         
     | 
| 
       31 
35 
     | 
    
         
             
              Max: 15
         
     | 
| 
       32 
36 
     | 
    
         | 
| 
      
 37 
     | 
    
         
            +
            RSpec/ExampleLength:
         
     | 
| 
      
 38 
     | 
    
         
            +
              Max: 10
         
     | 
| 
      
 39 
     | 
    
         
            +
             
     | 
| 
      
 40 
     | 
    
         
            +
            RSpec/MultipleExpectations:
         
     | 
| 
      
 41 
     | 
    
         
            +
              Max: 3
         
     | 
| 
      
 42 
     | 
    
         
            +
             
     | 
| 
       33 
43 
     | 
    
         
             
            Style/ClassVars:
         
     | 
| 
       34 
44 
     | 
    
         
             
              Enabled: false
         
     | 
| 
       35 
45 
     | 
    
         | 
    
        data/henkei.gemspec
    CHANGED
    
    | 
         @@ -5,7 +5,7 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib) 
     | 
|
| 
       5 
5 
     | 
    
         | 
| 
       6 
6 
     | 
    
         
             
            require 'henkei/version'
         
     | 
| 
       7 
7 
     | 
    
         | 
| 
       8 
     | 
    
         
            -
            Gem::Specification.new do |spec| # rubocop:disable Metrics/BlockLength
     | 
| 
      
 8 
     | 
    
         
            +
            Gem::Specification.new do |spec|
         
     | 
| 
       9 
9 
     | 
    
         
             
              spec.name          = 'henkei'
         
     | 
| 
       10 
10 
     | 
    
         
             
              spec.version       = Henkei::VERSION
         
     | 
| 
       11 
11 
     | 
    
         
             
              spec.authors       = ['Erol Fornoles', 'Andrew Bromwich']
         
     | 
| 
         @@ -15,7 +15,7 @@ Gem::Specification.new do |spec| # rubocop:disable Metrics/BlockLength 
     | 
|
| 
       15 
15 
     | 
    
         
             
                                   '(.doc, .docx, .pages, .odt, .rtf, .pdf) using Apache Tika toolkit'
         
     | 
| 
       16 
16 
     | 
    
         
             
              spec.homepage      = 'https://github.com/abrom/henkei'
         
     | 
| 
       17 
17 
     | 
    
         
             
              spec.license       = 'MIT'
         
     | 
| 
       18 
     | 
    
         
            -
              spec.required_ruby_version = ['>= 2. 
     | 
| 
      
 18 
     | 
    
         
            +
              spec.required_ruby_version = ['>= 2.7.0', '< 3.3.0']
         
     | 
| 
       19 
19 
     | 
    
         | 
| 
       20 
20 
     | 
    
         
             
              # Prevent pushing this gem to RubyGems.org by setting 'allowed_push_host', or
         
     | 
| 
       21 
21 
     | 
    
         
             
              # delete this section to allow pushing this gem to any host.
         
     | 
| 
         @@ -38,7 +38,6 @@ Gem::Specification.new do |spec| # rubocop:disable Metrics/BlockLength 
     | 
|
| 
       38 
38 
     | 
    
         
             
              spec.add_development_dependency 'rspec', '~> 3.7'
         
     | 
| 
       39 
39 
     | 
    
         
             
              spec.add_development_dependency 'rubocop', '~> 1.26'
         
     | 
| 
       40 
40 
     | 
    
         
             
              spec.add_development_dependency 'rubocop-performance', '~> 1.13'
         
     | 
| 
       41 
     | 
    
         
            -
              spec.add_development_dependency 'rubocop-rails', '~> 2.14'
         
     | 
| 
       42 
41 
     | 
    
         
             
              spec.add_development_dependency 'rubocop-rake', '~> 0.6'
         
     | 
| 
       43 
42 
     | 
    
         
             
              spec.add_development_dependency 'rubocop-rspec', '~> 2.9'
         
     | 
| 
       44 
43 
     | 
    
         
             
              spec.add_development_dependency 'simplecov', '~> 0.15', '< 0.18'
         
     | 
| 
         Binary file 
     | 
    
        data/lib/henkei/version.rb
    CHANGED
    
    
    
        data/lib/henkei.rb
    CHANGED
    
    | 
         @@ -25,14 +25,14 @@ require 'open3' 
     | 
|
| 
       25 
25 
     | 
    
         
             
            # Read text and metadata from files and documents using Apache Tika toolkit
         
     | 
| 
       26 
26 
     | 
    
         
             
            class Henkei # rubocop:disable Metrics/ClassLength
         
     | 
| 
       27 
27 
     | 
    
         
             
              GEM_PATH = File.dirname(File.dirname(__FILE__))
         
     | 
| 
       28 
     | 
    
         
            -
              JAR_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-app-2.4.0.jar')
     | 
| 
      
 28 
     | 
    
         
            +
              JAR_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-app-2.4.1.jar')
         
     | 
| 
       29 
29 
     | 
    
         
             
              CONFIG_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-config.xml')
         
     | 
| 
       30 
30 
     | 
    
         
             
              CONFIG_WITHOUT_OCR_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-config-without-ocr.xml')
         
     | 
| 
       31 
31 
     | 
    
         | 
| 
       32 
32 
     | 
    
         
             
              def self.mimetype(content_type)
         
     | 
| 
       33 
33 
     | 
    
         
             
                if Henkei.configuration.mime_library == 'mime/types' && defined?(MIME::Types)
         
     | 
| 
       34 
     | 
    
         
            -
                  warn '[DEPRECATION] `mime/types` is deprecated. Please use `mini_mime` instead.'\
         
     | 
| 
       35 
     | 
    
         
            -
                       ' Use Henkei.configure and assign "mini_mime" to `mime_library`.'
     | 
| 
      
 34 
     | 
    
         
            +
                  warn '[DEPRECATION] `mime/types` is deprecated. Please use `mini_mime` instead. ' \
         
     | 
| 
      
 35 
     | 
    
         
            +
                       'Use Henkei.configure and assign "mini_mime" to `mime_library`.'
         
     | 
| 
       36 
36 
     | 
    
         
             
                  MIME::Types[content_type].first
         
     | 
| 
       37 
37 
     | 
    
         
             
                else
         
     | 
| 
       38 
38 
     | 
    
         
             
                  MiniMime.lookup_by_content_type(content_type).tap do |object|
         
     | 
    
        data/spec/henkei_spec.rb
    CHANGED
    
    | 
         @@ -20,13 +20,13 @@ describe Henkei do 
     | 
|
| 
       20 
20 
     | 
    
         | 
| 
       21 
21 
     | 
    
         
             
              describe '.read' do
         
     | 
| 
       22 
22 
     | 
    
         
             
                it 'reads text' do
         
     | 
| 
       23 
     | 
    
         
            -
                  text =  
     | 
| 
      
 23 
     | 
    
         
            +
                  text = described_class.read :text, data
         
     | 
| 
       24 
24 
     | 
    
         | 
| 
       25 
25 
     | 
    
         
             
                  expect(text).to include 'The quick brown fox jumped over the lazy cat.'
         
     | 
| 
       26 
26 
     | 
    
         
             
                end
         
     | 
| 
       27 
27 
     | 
    
         | 
| 
       28 
28 
     | 
    
         
             
                it 'reads metadata' do
         
     | 
| 
       29 
     | 
    
         
            -
                  metadata =  
     | 
| 
      
 29 
     | 
    
         
            +
                  metadata = described_class.read :metadata, data
         
     | 
| 
       30 
30 
     | 
    
         | 
| 
       31 
31 
     | 
    
         
             
                  expect(metadata['Content-Type']).to(
         
     | 
| 
       32 
32 
     | 
    
         
             
                    eq 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
         
     | 
| 
         @@ -35,13 +35,13 @@ describe Henkei do 
     | 
|
| 
       35 
35 
     | 
    
         | 
| 
       36 
36 
     | 
    
         
             
                it 'reads metadata values with colons as strings' do
         
     | 
| 
       37 
37 
     | 
    
         
             
                  data = File.read 'spec/samples/sample-metadata-values-with-colons.doc'
         
     | 
| 
       38 
     | 
    
         
            -
                  metadata =  
     | 
| 
      
 38 
     | 
    
         
            +
                  metadata = described_class.read :metadata, data
         
     | 
| 
       39 
39 
     | 
    
         | 
| 
       40 
40 
     | 
    
         
             
                  expect(metadata['dc:title']).to eq 'problem: test'
         
     | 
| 
       41 
41 
     | 
    
         
             
                end
         
     | 
| 
       42 
42 
     | 
    
         | 
| 
       43 
43 
     | 
    
         
             
                it 'reads mimetype' do
         
     | 
| 
       44 
     | 
    
         
            -
                  mimetype =  
     | 
| 
      
 44 
     | 
    
         
            +
                  mimetype = described_class.read :mimetype, data
         
     | 
| 
       45 
45 
     | 
    
         | 
| 
       46 
46 
     | 
    
         
             
                  expect(mimetype.content_type).to(
         
     | 
| 
       47 
47 
     | 
    
         
             
                    eq 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
         
     | 
| 
         @@ -53,7 +53,7 @@ describe Henkei do 
     | 
|
| 
       53 
53 
     | 
    
         
             
                  let(:data) { File.read 'spec/samples/pipe-error.png' }
         
     | 
| 
       54 
54 
     | 
    
         | 
| 
       55 
55 
     | 
    
         
             
                  it 'returns an empty result' do
         
     | 
| 
       56 
     | 
    
         
            -
                    text =  
     | 
| 
      
 56 
     | 
    
         
            +
                    text = described_class.read :text, data
         
     | 
| 
       57 
57 
     | 
    
         | 
| 
       58 
58 
     | 
    
         
             
                    expect(text).to eq ''
         
     | 
| 
       59 
59 
     | 
    
         
             
                  end
         
     | 
| 
         @@ -61,15 +61,12 @@ describe Henkei do 
     | 
|
| 
       61 
61 
     | 
    
         
             
                  unless ci?
         
     | 
| 
       62 
62 
     | 
    
         
             
                    context 'when `include_ocr` is enabled' do
         
     | 
| 
       63 
63 
     | 
    
         
             
                      it 'returns parsed plain text in the image' do
         
     | 
| 
       64 
     | 
    
         
            -
                        text =  
     | 
| 
      
 64 
     | 
    
         
            +
                        text = described_class.read :text, data, include_ocr: true
         
     | 
| 
       65 
65 
     | 
    
         | 
| 
       66 
66 
     | 
    
         
             
                        expect(text).to include <<~TEXT
         
     | 
| 
       67 
67 
     | 
    
         
             
                          West Side
         
     | 
| 
       68 
68 
     | 
    
         | 
| 
       69 
69 
     | 
    
         
             
                          Sea Island
         
     | 
| 
       70 
     | 
    
         
            -
                          PP
         
     | 
| 
       71 
     | 
    
         
            -
             
     | 
| 
       72 
     | 
    
         
            -
                          Richmond
         
     | 
| 
       73 
70 
     | 
    
         
             
                        TEXT
         
     | 
| 
       74 
71 
     | 
    
         
             
                      end
         
     | 
| 
       75 
72 
     | 
    
         
             
                    end
         
     | 
| 
         @@ -79,11 +76,11 @@ describe Henkei do 
     | 
|
| 
       79 
76 
     | 
    
         | 
| 
       80 
77 
     | 
    
         
             
              describe '.new' do
         
     | 
| 
       81 
78 
     | 
    
         
             
                it 'requires parameters' do
         
     | 
| 
       82 
     | 
    
         
            -
                  expect {  
     | 
| 
      
 79 
     | 
    
         
            +
                  expect { described_class.new }.to raise_error ArgumentError
         
     | 
| 
       83 
80 
     | 
    
         
             
                end
         
     | 
| 
       84 
81 
     | 
    
         | 
| 
       85 
82 
     | 
    
         
             
                it 'accepts a root path' do
         
     | 
| 
       86 
     | 
    
         
            -
                  henkei =  
     | 
| 
      
 83 
     | 
    
         
            +
                  henkei = described_class.new File.join(Henkei::GEM_PATH, 'spec/samples/sample.pages')
         
     | 
| 
       87 
84 
     | 
    
         | 
| 
       88 
85 
     | 
    
         
             
                  expect(henkei).to be_path
         
     | 
| 
       89 
86 
     | 
    
         
             
                  expect(henkei).not_to be_uri
         
     | 
| 
         @@ -91,7 +88,7 @@ describe Henkei do 
     | 
|
| 
       91 
88 
     | 
    
         
             
                end
         
     | 
| 
       92 
89 
     | 
    
         | 
| 
       93 
90 
     | 
    
         
             
                it 'accepts a relative path' do
         
     | 
| 
       94 
     | 
    
         
            -
                  henkei =  
     | 
| 
      
 91 
     | 
    
         
            +
                  henkei = described_class.new 'spec/samples/sample.pages'
         
     | 
| 
       95 
92 
     | 
    
         | 
| 
       96 
93 
     | 
    
         
             
                  expect(henkei).to be_path
         
     | 
| 
       97 
94 
     | 
    
         
             
                  expect(henkei).not_to be_uri
         
     | 
| 
         @@ -99,7 +96,7 @@ describe Henkei do 
     | 
|
| 
       99 
96 
     | 
    
         
             
                end
         
     | 
| 
       100 
97 
     | 
    
         | 
| 
       101 
98 
     | 
    
         
             
                it 'accepts a path with spaces' do
         
     | 
| 
       102 
     | 
    
         
            -
                  henkei =  
     | 
| 
      
 99 
     | 
    
         
            +
                  henkei = described_class.new 'spec/samples/sample filename with spaces.pages'
         
     | 
| 
       103 
100 
     | 
    
         | 
| 
       104 
101 
     | 
    
         
             
                  expect(henkei).to be_path
         
     | 
| 
       105 
102 
     | 
    
         
             
                  expect(henkei).not_to be_uri
         
     | 
| 
         @@ -107,7 +104,7 @@ describe Henkei do 
     | 
|
| 
       107 
104 
     | 
    
         
             
                end
         
     | 
| 
       108 
105 
     | 
    
         | 
| 
       109 
106 
     | 
    
         
             
                it 'accepts a URI' do
         
     | 
| 
       110 
     | 
    
         
            -
                  henkei =  
     | 
| 
      
 107 
     | 
    
         
            +
                  henkei = described_class.new 'http://svn.apache.org/repos/asf/poi/trunk/test-data/document/sample.docx'
         
     | 
| 
       111 
108 
     | 
    
         | 
| 
       112 
109 
     | 
    
         
             
                  expect(henkei).to be_uri
         
     | 
| 
       113 
110 
     | 
    
         
             
                  expect(henkei).not_to be_path
         
     | 
| 
         @@ -116,7 +113,7 @@ describe Henkei do 
     | 
|
| 
       116 
113 
     | 
    
         | 
| 
       117 
114 
     | 
    
         
             
                it 'accepts a stream or object that can be read' do
         
     | 
| 
       118 
115 
     | 
    
         
             
                  File.open 'spec/samples/sample.pages', 'r' do |file|
         
     | 
| 
       119 
     | 
    
         
            -
                    henkei =  
     | 
| 
      
 116 
     | 
    
         
            +
                    henkei = described_class.new file
         
     | 
| 
       120 
117 
     | 
    
         | 
| 
       121 
118 
     | 
    
         
             
                    expect(henkei).to be_stream
         
     | 
| 
       122 
119 
     | 
    
         
             
                    expect(henkei).not_to be_path
         
     | 
| 
         @@ -125,38 +122,38 @@ describe Henkei do 
     | 
|
| 
       125 
122 
     | 
    
         
             
                end
         
     | 
| 
       126 
123 
     | 
    
         | 
| 
       127 
124 
     | 
    
         
             
                it 'refuses a path to a missing file' do
         
     | 
| 
       128 
     | 
    
         
            -
                  expect {  
     | 
| 
      
 125 
     | 
    
         
            +
                  expect { described_class.new 'test/sample/missing.pages' }.to raise_error Errno::ENOENT
         
     | 
| 
       129 
126 
     | 
    
         
             
                end
         
     | 
| 
       130 
127 
     | 
    
         | 
| 
       131 
128 
     | 
    
         
             
                it 'refuses other objects' do
         
     | 
| 
       132 
129 
     | 
    
         
             
                  [nil, 1, 1.1].each do |object|
         
     | 
| 
       133 
     | 
    
         
            -
                    expect {  
     | 
| 
      
 130 
     | 
    
         
            +
                    expect { described_class.new object }.to raise_error TypeError
         
     | 
| 
       134 
131 
     | 
    
         
             
                  end
         
     | 
| 
       135 
132 
     | 
    
         
             
                end
         
     | 
| 
       136 
133 
     | 
    
         
             
              end
         
     | 
| 
       137 
134 
     | 
    
         | 
| 
       138 
135 
     | 
    
         
             
              describe '.creation_date' do
         
     | 
| 
       139 
     | 
    
         
            -
                let(:henkei) {  
     | 
| 
      
 136 
     | 
    
         
            +
                let(:henkei) { described_class.new 'spec/samples/sample.pages' }
         
     | 
| 
       140 
137 
     | 
    
         | 
| 
       141 
     | 
    
         
            -
                it ' 
     | 
| 
      
 138 
     | 
    
         
            +
                it 'returns a Time' do
         
     | 
| 
       142 
139 
     | 
    
         
             
                  expect(henkei.creation_date).to be_a Time
         
     | 
| 
       143 
140 
     | 
    
         
             
                end
         
     | 
| 
       144 
141 
     | 
    
         
             
              end
         
     | 
| 
       145 
142 
     | 
    
         | 
| 
       146 
143 
     | 
    
         
             
              describe '.java' do
         
     | 
| 
       147 
144 
     | 
    
         
             
                specify 'with no specified JAVA_HOME' do
         
     | 
| 
       148 
     | 
    
         
            -
                  expect( 
     | 
| 
      
 145 
     | 
    
         
            +
                  expect(described_class.send(:java_path)).to eq 'java'
         
     | 
| 
       149 
146 
     | 
    
         
             
                end
         
     | 
| 
       150 
147 
     | 
    
         | 
| 
       151 
148 
     | 
    
         
             
                specify 'with a specified JAVA_HOME' do
         
     | 
| 
       152 
149 
     | 
    
         
             
                  ENV['JAVA_HOME'] = '/path/to/java/home'
         
     | 
| 
       153 
150 
     | 
    
         | 
| 
       154 
     | 
    
         
            -
                  expect( 
     | 
| 
      
 151 
     | 
    
         
            +
                  expect(described_class.send(:java_path)).to eq '/path/to/java/home/bin/java'
         
     | 
| 
       155 
152 
     | 
    
         
             
                end
         
     | 
| 
       156 
153 
     | 
    
         
             
              end
         
     | 
| 
       157 
154 
     | 
    
         | 
| 
       158 
     | 
    
         
            -
              context 'initialized with a given path' do
         
     | 
| 
       159 
     | 
    
         
            -
                let(:henkei) {  
     | 
| 
      
 155 
     | 
    
         
            +
              context 'when initialized with a given path' do
         
     | 
| 
      
 156 
     | 
    
         
            +
                let(:henkei) { described_class.new 'spec/samples/sample.pages' }
         
     | 
| 
       160 
157 
     | 
    
         | 
| 
       161 
158 
     | 
    
         
             
                specify '#text reads text' do
         
     | 
| 
       162 
159 
     | 
    
         
             
                  expect(henkei.text).to include 'The quick brown fox jumped over the lazy cat.'
         
     | 
| 
         @@ -167,7 +164,7 @@ describe Henkei do 
     | 
|
| 
       167 
164 
     | 
    
         
             
                end
         
     | 
| 
       168 
165 
     | 
    
         | 
| 
       169 
166 
     | 
    
         
             
                context 'when passing in the `pipe-error.png` test file' do
         
     | 
| 
       170 
     | 
    
         
            -
                  let(:henkei) {  
     | 
| 
      
 167 
     | 
    
         
            +
                  let(:henkei) { described_class.new 'spec/samples/pipe-error.png' }
         
     | 
| 
       171 
168 
     | 
    
         | 
| 
       172 
169 
     | 
    
         
             
                  it '#text returns an empty result' do
         
     | 
| 
       173 
170 
     | 
    
         
             
                    expect(henkei.text).to eq ''
         
     | 
| 
         @@ -189,9 +186,6 @@ describe Henkei do 
     | 
|
| 
       189 
186 
     | 
    
         
             
                          West Side
         
     | 
| 
       190 
187 
     | 
    
         | 
| 
       191 
188 
     | 
    
         
             
                          Sea Island
         
     | 
| 
       192 
     | 
    
         
            -
                          PP
         
     | 
| 
       193 
     | 
    
         
            -
             
     | 
| 
       194 
     | 
    
         
            -
                          Richmond
         
     | 
| 
       195 
189 
     | 
    
         
             
                        TEXT
         
     | 
| 
       196 
190 
     | 
    
         
             
                      end
         
     | 
| 
       197 
191 
     | 
    
         | 
| 
         @@ -199,7 +193,7 @@ describe Henkei do 
     | 
|
| 
       199 
193 
     | 
    
         
             
                        expect(henkei.html(include_ocr: true)).to include '<meta name="tiff:ImageWidth" content="792"/>'
         
     | 
| 
       200 
194 
     | 
    
         | 
| 
       201 
195 
     | 
    
         
             
                        html_body = Nokogiri::HTML(henkei.html(include_ocr: true)).at_xpath('//body')
         
     | 
| 
       202 
     | 
    
         
            -
                        [' 
     | 
| 
      
 196 
     | 
    
         
            +
                        ['West Side', 'Sea Island', 'Richmond', 'Steveston'].each do |location|
         
     | 
| 
       203 
197 
     | 
    
         
             
                          expect(html_body.text).to include location
         
     | 
| 
       204 
198 
     | 
    
         
             
                        end
         
     | 
| 
       205 
199 
     | 
    
         
             
                      end
         
     | 
| 
         @@ -208,8 +202,8 @@ describe Henkei do 
     | 
|
| 
       208 
202 
     | 
    
         
             
                end
         
     | 
| 
       209 
203 
     | 
    
         
             
              end
         
     | 
| 
       210 
204 
     | 
    
         | 
| 
       211 
     | 
    
         
            -
              context 'initialized with a given URI' do
         
     | 
| 
       212 
     | 
    
         
            -
                let(:henkei) {  
     | 
| 
      
 205 
     | 
    
         
            +
              context 'when initialized with a given URI' do
         
     | 
| 
      
 206 
     | 
    
         
            +
                let(:henkei) { described_class.new 'http://svn.apache.org/repos/asf/poi/trunk/test-data/document/sample.docx' }
         
     | 
| 
       213 
207 
     | 
    
         | 
| 
       214 
208 
     | 
    
         
             
                specify '#text reads text' do
         
     | 
| 
       215 
209 
     | 
    
         
             
                  expect(henkei.text).to include 'Lorem ipsum dolor sit amet, consectetuer adipiscing elit.'
         
     | 
| 
         @@ -222,8 +216,8 @@ describe Henkei do 
     | 
|
| 
       222 
216 
     | 
    
         
             
                end
         
     | 
| 
       223 
217 
     | 
    
         
             
              end
         
     | 
| 
       224 
218 
     | 
    
         | 
| 
       225 
     | 
    
         
            -
              context 'initialized with a given stream' do
         
     | 
| 
       226 
     | 
    
         
            -
                let(:henkei) {  
     | 
| 
      
 219 
     | 
    
         
            +
              context 'when initialized with a given stream' do
         
     | 
| 
      
 220 
     | 
    
         
            +
                let(:henkei) { described_class.new File.open('spec/samples/sample.pages', 'rb') }
         
     | 
| 
       227 
221 
     | 
    
         | 
| 
       228 
222 
     | 
    
         
             
                specify '#text reads text' do
         
     | 
| 
       229 
223 
     | 
    
         
             
                  expect(henkei.text).to include 'The quick brown fox jumped over the lazy cat.'
         
     | 
| 
         @@ -235,7 +229,7 @@ describe Henkei do 
     | 
|
| 
       235 
229 
     | 
    
         
             
              end
         
     | 
| 
       236 
230 
     | 
    
         | 
| 
       237 
231 
     | 
    
         
             
              context 'when source is a remote PDF' do
         
     | 
| 
       238 
     | 
    
         
            -
                let(:henkei) {  
     | 
| 
      
 232 
     | 
    
         
            +
                let(:henkei) { described_class.new 'https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf' }
         
     | 
| 
       239 
233 
     | 
    
         | 
| 
       240 
234 
     | 
    
         
             
                specify '#text reads text' do
         
     | 
| 
       241 
235 
     | 
    
         
             
                  expect(henkei.text).to include 'Dummy PDF file'
         
     | 
    
        metadata
    CHANGED
    
    | 
         @@ -1,7 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            --- !ruby/object:Gem::Specification
         
     | 
| 
       2 
2 
     | 
    
         
             
            name: henkei
         
     | 
| 
       3 
3 
     | 
    
         
             
            version: !ruby/object:Gem::Version
         
     | 
| 
       4 
     | 
    
         
            -
              version: 2.4.0.1
     | 
| 
      
 4 
     | 
    
         
            +
              version: 2.4.1.1
         
     | 
| 
       5 
5 
     | 
    
         
             
            platform: ruby
         
     | 
| 
       6 
6 
     | 
    
         
             
            authors:
         
     | 
| 
       7 
7 
     | 
    
         
             
            - Erol Fornoles
         
     | 
| 
         @@ -9,7 +9,7 @@ authors: 
     | 
|
| 
       9 
9 
     | 
    
         
             
            autorequire:
         
     | 
| 
       10 
10 
     | 
    
         
             
            bindir: bin
         
     | 
| 
       11 
11 
     | 
    
         
             
            cert_chain: []
         
     | 
| 
       12 
     | 
    
         
            -
            date:  
     | 
| 
      
 12 
     | 
    
         
            +
            date: 2023-01-22 00:00:00.000000000 Z
         
     | 
| 
       13 
13 
     | 
    
         
             
            dependencies:
         
     | 
| 
       14 
14 
     | 
    
         
             
            - !ruby/object:Gem::Dependency
         
     | 
| 
       15 
15 
     | 
    
         
             
              name: json
         
     | 
| 
         @@ -149,20 +149,6 @@ dependencies: 
     | 
|
| 
       149 
149 
     | 
    
         
             
                - - "~>"
         
     | 
| 
       150 
150 
     | 
    
         
             
                  - !ruby/object:Gem::Version
         
     | 
| 
       151 
151 
     | 
    
         
             
                    version: '1.13'
         
     | 
| 
       152 
     | 
    
         
            -
            - !ruby/object:Gem::Dependency
         
     | 
| 
       153 
     | 
    
         
            -
              name: rubocop-rails
         
     | 
| 
       154 
     | 
    
         
            -
              requirement: !ruby/object:Gem::Requirement
         
     | 
| 
       155 
     | 
    
         
            -
                requirements:
         
     | 
| 
       156 
     | 
    
         
            -
                - - "~>"
         
     | 
| 
       157 
     | 
    
         
            -
                  - !ruby/object:Gem::Version
         
     | 
| 
       158 
     | 
    
         
            -
                    version: '2.14'
         
     | 
| 
       159 
     | 
    
         
            -
              type: :development
         
     | 
| 
       160 
     | 
    
         
            -
              prerelease: false
         
     | 
| 
       161 
     | 
    
         
            -
              version_requirements: !ruby/object:Gem::Requirement
         
     | 
| 
       162 
     | 
    
         
            -
                requirements:
         
     | 
| 
       163 
     | 
    
         
            -
                - - "~>"
         
     | 
| 
       164 
     | 
    
         
            -
                  - !ruby/object:Gem::Version
         
     | 
| 
       165 
     | 
    
         
            -
                    version: '2.14'
         
     | 
| 
       166 
152 
     | 
    
         
             
            - !ruby/object:Gem::Dependency
         
     | 
| 
       167 
153 
     | 
    
         
             
              name: rubocop-rake
         
     | 
| 
       168 
154 
     | 
    
         
             
              requirement: !ruby/object:Gem::Requirement
         
     | 
| 
         @@ -231,7 +217,7 @@ files: 
     | 
|
| 
       231 
217 
     | 
    
         
             
            - Rakefile
         
     | 
| 
       232 
218 
     | 
    
         
             
            - bin/console
         
     | 
| 
       233 
219 
     | 
    
         
             
            - henkei.gemspec
         
     | 
| 
       234 
     | 
    
         
            -
            - jar/tika-app-2.4.0.jar
     | 
| 
      
 220 
     | 
    
         
            +
            - jar/tika-app-2.4.1.jar
         
     | 
| 
       235 
221 
     | 
    
         
             
            - jar/tika-config-without-ocr.xml
         
     | 
| 
       236 
222 
     | 
    
         
             
            - jar/tika-config.xml
         
     | 
| 
       237 
223 
     | 
    
         
             
            - lib/henkei.rb
         
     | 
| 
         @@ -259,17 +245,17 @@ required_ruby_version: !ruby/object:Gem::Requirement 
     | 
|
| 
       259 
245 
     | 
    
         
             
              requirements:
         
     | 
| 
       260 
246 
     | 
    
         
             
              - - ">="
         
     | 
| 
       261 
247 
     | 
    
         
             
                - !ruby/object:Gem::Version
         
     | 
| 
       262 
     | 
    
         
            -
                  version: 2. 
     | 
| 
      
 248 
     | 
    
         
            +
                  version: 2.7.0
         
     | 
| 
       263 
249 
     | 
    
         
             
              - - "<"
         
     | 
| 
       264 
250 
     | 
    
         
             
                - !ruby/object:Gem::Version
         
     | 
| 
       265 
     | 
    
         
            -
                  version: 3. 
     | 
| 
      
 251 
     | 
    
         
            +
                  version: 3.3.0
         
     | 
| 
       266 
252 
     | 
    
         
             
            required_rubygems_version: !ruby/object:Gem::Requirement
         
     | 
| 
       267 
253 
     | 
    
         
             
              requirements:
         
     | 
| 
       268 
254 
     | 
    
         
             
              - - ">="
         
     | 
| 
       269 
255 
     | 
    
         
             
                - !ruby/object:Gem::Version
         
     | 
| 
       270 
256 
     | 
    
         
             
                  version: '0'
         
     | 
| 
       271 
257 
     | 
    
         
             
            requirements: []
         
     | 
| 
       272 
     | 
    
         
            -
            rubygems_version: 3. 
     | 
| 
      
 258 
     | 
    
         
            +
            rubygems_version: 3.4.1
         
     | 
| 
       273 
259 
     | 
    
         
             
            signing_key:
         
     | 
| 
       274 
260 
     | 
    
         
             
            specification_version: 4
         
     | 
| 
       275 
261 
     | 
    
         
             
            summary: Read text and metadata from files and documents (.doc, .docx, .pages, .odt,
         
     |