simple_text_extract 1.1.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.ruby-version +1 -1
- data/Gemfile.lock +17 -14
- data/README.md +2 -1
- data/lib/simple_text_extract/format_extractor/zip_extract.rb +24 -0
- data/lib/simple_text_extract/format_extractor_factory.rb +4 -1
- data/lib/simple_text_extract/text_extractor.rb +1 -2
- data/lib/simple_text_extract/version.rb +1 -1
- data/lib/simple_text_extract.rb +1 -1
- data/simple_text_extract.gemspec +3 -2
- metadata +25 -11
- data/simple_text_extract-1.0.2.gem +0 -0
    
        checksums.yaml
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            ---
         | 
| 2 2 | 
             
            SHA256:
         | 
| 3 | 
            -
              metadata.gz:  | 
| 4 | 
            -
              data.tar.gz:  | 
| 3 | 
            +
              metadata.gz: f3d117bf20380fae2d755e2a258189520479785207c3d27115ab3e9c90e84e51
         | 
| 4 | 
            +
              data.tar.gz: 96b634b6b061520a25be360f62eac98dabde4b1ca3e723703b40bf1d7dbc11d2
         | 
| 5 5 | 
             
            SHA512:
         | 
| 6 | 
            -
              metadata.gz:  | 
| 7 | 
            -
              data.tar.gz:  | 
| 6 | 
            +
              metadata.gz: 171f01d876c6fc30abf68268c1dd69bd56d135af56cda361027efc3ee12482afcaeafeb376cf6ff3d0ad231c92a694a873e1162525f74ca405a404192858a78e
         | 
| 7 | 
            +
              data.tar.gz: 2fad0ef23c0036fb1b9257221f22bafd9f2b0760eba7407a115931705ebf7b2d42f2f09b0c0fa5ef028500e2bb4ac68e59a5f5873c97037ca2dccbe00a24cc62
         | 
    
        data/.ruby-version
    CHANGED
    
    | @@ -1 +1 @@ | |
| 1 | 
            -
             | 
| 1 | 
            +
            3.0.1
         | 
    
        data/Gemfile.lock
    CHANGED
    
    | @@ -1,26 +1,29 @@ | |
| 1 1 | 
             
            PATH
         | 
| 2 2 | 
             
              remote: .
         | 
| 3 3 | 
             
              specs:
         | 
| 4 | 
            -
                simple_text_extract (1. | 
| 5 | 
            -
                  roo (~> 2. | 
| 6 | 
            -
                   | 
| 4 | 
            +
                simple_text_extract (1.3.0)
         | 
| 5 | 
            +
                  roo (~> 2.9.0)
         | 
| 6 | 
            +
                  rubyzip (~> 2.3.2)
         | 
| 7 | 
            +
                  spreadsheet (~> 1.3.0)
         | 
| 7 8 |  | 
| 8 9 | 
             
            GEM
         | 
| 9 10 | 
             
              remote: https://rubygems.org/
         | 
| 10 11 | 
             
              specs:
         | 
| 11 | 
            -
                mini_portile2 (2. | 
| 12 | 
            -
                minitest (5. | 
| 13 | 
            -
                mocha (1. | 
| 14 | 
            -
                nokogiri (1. | 
| 15 | 
            -
                  mini_portile2 (~> 2. | 
| 16 | 
            -
             | 
| 17 | 
            -
                 | 
| 12 | 
            +
                mini_portile2 (2.8.0)
         | 
| 13 | 
            +
                minitest (5.15.0)
         | 
| 14 | 
            +
                mocha (1.13.0)
         | 
| 15 | 
            +
                nokogiri (1.13.3)
         | 
| 16 | 
            +
                  mini_portile2 (~> 2.8.0)
         | 
| 17 | 
            +
                  racc (~> 1.4)
         | 
| 18 | 
            +
                racc (1.6.0)
         | 
| 19 | 
            +
                rake (13.0.6)
         | 
| 20 | 
            +
                roo (2.9.0)
         | 
| 18 21 | 
             
                  nokogiri (~> 1)
         | 
| 19 22 | 
             
                  rubyzip (>= 1.3.0, < 3.0.0)
         | 
| 20 23 | 
             
                ruby-ole (1.2.12.2)
         | 
| 21 | 
            -
                rubyzip (2.3. | 
| 22 | 
            -
                spreadsheet (1. | 
| 23 | 
            -
                  ruby-ole | 
| 24 | 
            +
                rubyzip (2.3.2)
         | 
| 25 | 
            +
                spreadsheet (1.3.0)
         | 
| 26 | 
            +
                  ruby-ole
         | 
| 24 27 |  | 
| 25 28 | 
             
            PLATFORMS
         | 
| 26 29 | 
             
              ruby
         | 
| @@ -32,4 +35,4 @@ DEPENDENCIES | |
| 32 35 | 
             
              simple_text_extract!
         | 
| 33 36 |  | 
| 34 37 | 
             
            BUNDLED WITH
         | 
| 35 | 
            -
               2. | 
| 38 | 
            +
               2.2.15
         | 
    
        data/README.md
    CHANGED
    
    | @@ -51,7 +51,7 @@ You can choose to use SimpleTextExtract without the following dependencies, but | |
| 51 51 | 
             
            `pdf` parsing requires `poppler-utils`
         | 
| 52 52 | 
             
            - `brew install poppler`
         | 
| 53 53 |  | 
| 54 | 
            -
            `doc` parsing requires `antiword`
         | 
| 54 | 
            +
            `doc` parsing requires `antiword` and `unzip`
         | 
| 55 55 | 
             
            - `brew install antiword`
         | 
| 56 56 |  | 
| 57 57 | 
             
            `xlsx` and `xls` parsing requires `ssconvert` which is part of `gnumeric`
         | 
| @@ -76,6 +76,7 @@ In your `Aptfile`, add: | |
| 76 76 | 
             
            ```
         | 
| 77 77 | 
             
            antiword
         | 
| 78 78 | 
             
            gnumeric
         | 
| 79 | 
            +
            unzip
         | 
| 79 80 | 
             
            ```
         | 
| 80 81 |  | 
| 81 82 | 
             
            * There is currently an [issue](https://github.com/heroku/heroku-buildpack-google-chrome/issues/59) with the heroku-18 stack that requires additional dependencies added to the Aptfile to get `gnumeric` to work properly.  You can reference the linked issue above to figure out those dependencies, or downgrade to heroku-16 until it is fixed.
         | 
| @@ -0,0 +1,24 @@ | |
| 1 | 
            +
            # frozen_string_literal: true
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            module SimpleTextExtract
         | 
| 4 | 
            +
              module FormatExtractor
         | 
| 5 | 
            +
                class ZipExtract < Base
         | 
| 6 | 
            +
                  def extract
         | 
| 7 | 
            +
                    require "zip"
         | 
| 8 | 
            +
             | 
| 9 | 
            +
                    result = []
         | 
| 10 | 
            +
                    Zip::File.open(file) do |zip_file|
         | 
| 11 | 
            +
                      zip_file.each do |entry|
         | 
| 12 | 
            +
                        result << entry.name
         | 
| 13 | 
            +
                        result << SimpleTextExtract.extract(
         | 
| 14 | 
            +
                          raw: entry.get_input_stream.read,
         | 
| 15 | 
            +
                          filename: entry.name
         | 
| 16 | 
            +
                        )
         | 
| 17 | 
            +
                      end
         | 
| 18 | 
            +
                    end
         | 
| 19 | 
            +
             | 
| 20 | 
            +
                    result.join(" ")
         | 
| 21 | 
            +
                  end
         | 
| 22 | 
            +
                end
         | 
| 23 | 
            +
              end
         | 
| 24 | 
            +
            end
         | 
| @@ -7,11 +7,14 @@ require "simple_text_extract/format_extractor/xls_x" | |
| 7 7 | 
             
            require "simple_text_extract/format_extractor/xls"
         | 
| 8 8 | 
             
            require "simple_text_extract/format_extractor/doc_x"
         | 
| 9 9 | 
             
            require "simple_text_extract/format_extractor/doc"
         | 
| 10 | 
            +
            require "simple_text_extract/format_extractor/zip_extract"
         | 
| 10 11 |  | 
| 11 12 | 
             
            module SimpleTextExtract
         | 
| 12 13 | 
             
              class FormatExtractorFactory
         | 
| 13 | 
            -
                def self.call(file) | 
| 14 | 
            +
                def self.call(file)
         | 
| 14 15 | 
             
                  case file.path
         | 
| 16 | 
            +
                  when /.zip$/i
         | 
| 17 | 
            +
                    FormatExtractor::ZipExtract.new(file)
         | 
| 15 18 | 
             
                  when /(.txt$|.csv$)/i
         | 
| 16 19 | 
             
                    FormatExtractor::PlainText.new(file)
         | 
| 17 20 | 
             
                  when /.pdf$/i
         | 
| @@ -26,7 +26,6 @@ module SimpleTextExtract | |
| 26 26 |  | 
| 27 27 | 
             
                  def extract
         | 
| 28 28 | 
             
                    return unless file
         | 
| 29 | 
            -
                    return unless file
         | 
| 30 29 |  | 
| 31 30 | 
             
                    begin
         | 
| 32 31 | 
             
                      FormatExtractorFactory.call(file).extract
         | 
| @@ -38,7 +37,7 @@ module SimpleTextExtract | |
| 38 37 | 
             
                  end
         | 
| 39 38 |  | 
| 40 39 | 
             
                  def cleanup
         | 
| 41 | 
            -
                    return unless file. | 
| 40 | 
            +
                    return unless file.instance_of?(Tempfile)
         | 
| 42 41 |  | 
| 43 42 | 
             
                    file.close
         | 
| 44 43 | 
             
                    file.unlink
         | 
    
        data/lib/simple_text_extract.rb
    CHANGED
    
    | @@ -5,7 +5,7 @@ require "simple_text_extract/text_extractor" | |
| 5 5 | 
             
            require "simple_text_extract/format_extractor_factory"
         | 
| 6 6 |  | 
| 7 7 | 
             
            module SimpleTextExtract
         | 
| 8 | 
            -
              SUPPORTED_FILETYPES = ["xls", "xlsx", "doc", "docx", "txt", "pdf", "csv"].freeze
         | 
| 8 | 
            +
              SUPPORTED_FILETYPES = ["xls", "xlsx", "doc", "docx", "txt", "pdf", "csv", "zip"].freeze
         | 
| 9 9 |  | 
| 10 10 | 
             
              class Error < StandardError; end
         | 
| 11 11 |  | 
    
        data/simple_text_extract.gemspec
    CHANGED
    
    | @@ -28,8 +28,9 @@ Gem::Specification.new do |spec| | |
| 28 28 | 
             
              spec.requirements << "pdftotext/poppler"
         | 
| 29 29 | 
             
              spec.required_ruby_version = ">= 2.5"
         | 
| 30 30 |  | 
| 31 | 
            -
              spec.add_runtime_dependency "roo", "~> 2. | 
| 32 | 
            -
              spec.add_runtime_dependency "spreadsheet", "~> 1. | 
| 31 | 
            +
              spec.add_runtime_dependency "roo", "~> 2.9.0"
         | 
| 32 | 
            +
              spec.add_runtime_dependency "spreadsheet", "~> 1.3.0"
         | 
| 33 | 
            +
              spec.add_runtime_dependency "rubyzip", "~> 2.3.2"
         | 
| 33 34 |  | 
| 34 35 | 
             
              spec.add_development_dependency "rake", "~> 13.0"
         | 
| 35 36 | 
             
              spec.add_development_dependency "minitest", "~> 5.0"
         | 
    
        metadata
    CHANGED
    
    | @@ -1,14 +1,14 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification
         | 
| 2 2 | 
             
            name: simple_text_extract
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            -
              version:  | 
| 4 | 
            +
              version: 2.0.0
         | 
| 5 5 | 
             
            platform: ruby
         | 
| 6 6 | 
             
            authors:
         | 
| 7 7 | 
             
            - Nick Weiland
         | 
| 8 | 
            -
            autorequire: | 
| 8 | 
            +
            autorequire:
         | 
| 9 9 | 
             
            bindir: exe
         | 
| 10 10 | 
             
            cert_chain: []
         | 
| 11 | 
            -
            date:  | 
| 11 | 
            +
            date: 2022-03-20 00:00:00.000000000 Z
         | 
| 12 12 | 
             
            dependencies:
         | 
| 13 13 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 14 14 | 
             
              name: roo
         | 
| @@ -16,28 +16,42 @@ dependencies: | |
| 16 16 | 
             
                requirements:
         | 
| 17 17 | 
             
                - - "~>"
         | 
| 18 18 | 
             
                  - !ruby/object:Gem::Version
         | 
| 19 | 
            -
                    version: 2. | 
| 19 | 
            +
                    version: 2.9.0
         | 
| 20 20 | 
             
              type: :runtime
         | 
| 21 21 | 
             
              prerelease: false
         | 
| 22 22 | 
             
              version_requirements: !ruby/object:Gem::Requirement
         | 
| 23 23 | 
             
                requirements:
         | 
| 24 24 | 
             
                - - "~>"
         | 
| 25 25 | 
             
                  - !ruby/object:Gem::Version
         | 
| 26 | 
            -
                    version: 2. | 
| 26 | 
            +
                    version: 2.9.0
         | 
| 27 27 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 28 28 | 
             
              name: spreadsheet
         | 
| 29 29 | 
             
              requirement: !ruby/object:Gem::Requirement
         | 
| 30 30 | 
             
                requirements:
         | 
| 31 31 | 
             
                - - "~>"
         | 
| 32 32 | 
             
                  - !ruby/object:Gem::Version
         | 
| 33 | 
            -
                    version: 1. | 
| 33 | 
            +
                    version: 1.3.0
         | 
| 34 34 | 
             
              type: :runtime
         | 
| 35 35 | 
             
              prerelease: false
         | 
| 36 36 | 
             
              version_requirements: !ruby/object:Gem::Requirement
         | 
| 37 37 | 
             
                requirements:
         | 
| 38 38 | 
             
                - - "~>"
         | 
| 39 39 | 
             
                  - !ruby/object:Gem::Version
         | 
| 40 | 
            -
                    version: 1. | 
| 40 | 
            +
                    version: 1.3.0
         | 
| 41 | 
            +
            - !ruby/object:Gem::Dependency
         | 
| 42 | 
            +
              name: rubyzip
         | 
| 43 | 
            +
              requirement: !ruby/object:Gem::Requirement
         | 
| 44 | 
            +
                requirements:
         | 
| 45 | 
            +
                - - "~>"
         | 
| 46 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 47 | 
            +
                    version: 2.3.2
         | 
| 48 | 
            +
              type: :runtime
         | 
| 49 | 
            +
              prerelease: false
         | 
| 50 | 
            +
              version_requirements: !ruby/object:Gem::Requirement
         | 
| 51 | 
            +
                requirements:
         | 
| 52 | 
            +
                - - "~>"
         | 
| 53 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 54 | 
            +
                    version: 2.3.2
         | 
| 41 55 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 42 56 | 
             
              name: rake
         | 
| 43 57 | 
             
              requirement: !ruby/object:Gem::Requirement
         | 
| @@ -107,17 +121,17 @@ files: | |
| 107 121 | 
             
            - lib/simple_text_extract/format_extractor/plain_text.rb
         | 
| 108 122 | 
             
            - lib/simple_text_extract/format_extractor/xls.rb
         | 
| 109 123 | 
             
            - lib/simple_text_extract/format_extractor/xls_x.rb
         | 
| 124 | 
            +
            - lib/simple_text_extract/format_extractor/zip_extract.rb
         | 
| 110 125 | 
             
            - lib/simple_text_extract/format_extractor_factory.rb
         | 
| 111 126 | 
             
            - lib/simple_text_extract/text_extractor.rb
         | 
| 112 127 | 
             
            - lib/simple_text_extract/version.rb
         | 
| 113 | 
            -
            - simple_text_extract-1.0.2.gem
         | 
| 114 128 | 
             
            - simple_text_extract.gemspec
         | 
| 115 129 | 
             
            - tags
         | 
| 116 130 | 
             
            homepage: https://github.com/weilandia/simple_text_extract
         | 
| 117 131 | 
             
            licenses:
         | 
| 118 132 | 
             
            - MIT
         | 
| 119 133 | 
             
            metadata: {}
         | 
| 120 | 
            -
            post_install_message: | 
| 134 | 
            +
            post_install_message:
         | 
| 121 135 | 
             
            rdoc_options: []
         | 
| 122 136 | 
             
            require_paths:
         | 
| 123 137 | 
             
            - lib
         | 
| @@ -134,8 +148,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement | |
| 134 148 | 
             
            requirements:
         | 
| 135 149 | 
             
            - antiword
         | 
| 136 150 | 
             
            - pdftotext/poppler
         | 
| 137 | 
            -
            rubygems_version: 3. | 
| 138 | 
            -
            signing_key: | 
| 151 | 
            +
            rubygems_version: 3.2.15
         | 
| 152 | 
            +
            signing_key:
         | 
| 139 153 | 
             
            specification_version: 4
         | 
| 140 154 | 
             
            summary: Attempts to quickly extract text from various file types before resorting
         | 
| 141 155 | 
             
              to something more extreme like Apache Tika.
         | 
| Binary file |