simple_text_extract 1.1.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 35eee7886cc545693f85facc69bbe8b99ea141f0af3ad520f6f75d2bc61eddd7
4
- data.tar.gz: 5a4b8e4a4dfe54535805f56e986fe36b5470889a96960b2bf8acd1ee4c94b084
3
+ metadata.gz: 93414a5f260468367e16054bfcbcebbd93962c90a2d6903622a622aa567f754d
4
+ data.tar.gz: af3fc14fcc1b3c532f5f54c7dfebdb8d4fd4da5de691fe4561b5e39bd6c04fe0
5
5
  SHA512:
6
- metadata.gz: 9ace8965b9567c8d9e85f2d52912c8081da83a2dc0ff9c463f6ac28450e39ca874125e501196f546930615b4d6271abad88dc3fe8b91f0ade1253c0468652eca
7
- data.tar.gz: 05d9819fd5835b8307ae6ece9b0f4efa8b00d615eaf6fcc054dc38959780761ed4cf40711d7b91de38a9e3390904dead968c0a4a0ccc9221a8ec5122a6f9f0b8
6
+ metadata.gz: 1219e1404ea893772da0905b7bd808f703240cdc9e7f875a287c4e7886e67ea8fbc4c07a2a55a95a8db3f828d7f7ad87a67832e7834618c62537df3d42e013da
7
+ data.tar.gz: c052e8f2640f8aa9ac452b9cfcad32695add2c1b374daf4dee712f34592a9d1decaae453dbeff1f473fdcd12df01f1a1f9e7751a03925514e91976bb9d654f10
data/.ruby-version CHANGED
@@ -1 +1 @@
1
- 2.6.5
1
+ 3.0.0
data/Gemfile.lock CHANGED
@@ -1,19 +1,22 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- simple_text_extract (1.0.0)
4
+ simple_text_extract (1.2.0)
5
5
  roo (~> 2.8.2)
6
+ rubyzip (>= 1.0.0)
6
7
  spreadsheet (~> 1.1.8)
7
8
 
8
9
  GEM
9
10
  remote: https://rubygems.org/
10
11
  specs:
11
- mini_portile2 (2.4.0)
12
- minitest (5.14.0)
13
- mocha (1.11.2)
14
- nokogiri (1.10.9)
15
- mini_portile2 (~> 2.4.0)
16
- rake (13.0.1)
12
+ mini_portile2 (2.5.1)
13
+ minitest (5.14.4)
14
+ mocha (1.12.0)
15
+ nokogiri (1.11.6)
16
+ mini_portile2 (~> 2.5.0)
17
+ racc (~> 1.4)
18
+ racc (1.5.2)
19
+ rake (13.0.3)
17
20
  roo (2.8.3)
18
21
  nokogiri (~> 1)
19
22
  rubyzip (>= 1.3.0, < 3.0.0)
@@ -32,4 +35,4 @@ DEPENDENCIES
32
35
  simple_text_extract!
33
36
 
34
37
  BUNDLED WITH
35
- 2.0.2
38
+ 2.2.3
data/README.md CHANGED
@@ -51,7 +51,7 @@ You can choose to use SimpleTextExtract without the following dependencies, but
51
51
  `pdf` parsing requires `poppler-utils`
52
52
  - `brew install poppler`
53
53
 
54
- `doc` parsing requires `antiword`
54
+ `doc` parsing requires `antiword` and `unzip`
55
55
  - `brew install antiword`
56
56
 
57
57
  `xlsx` and `xls` parsing requires `ssconvert` which is part of `gnumeric`
@@ -76,6 +76,7 @@ In your `Aptfile`, add:
76
76
  ```
77
77
  antiword
78
78
  gnumeric
79
+ unzip
79
80
  ```
80
81
 
81
82
  * There is currently an [issue](https://github.com/heroku/heroku-buildpack-google-chrome/issues/59) with the heroku-18 stack that requires additional dependencies added to the Aptfile to get `gnumeric` to work properly. You can reference the linked issue above to figure out those dependencies, or downgrade to heroku-16 until it is fixed.
@@ -5,7 +5,7 @@ require "simple_text_extract/text_extractor"
5
5
  require "simple_text_extract/format_extractor_factory"
6
6
 
7
7
  module SimpleTextExtract
8
- SUPPORTED_FILETYPES = ["xls", "xlsx", "doc", "docx", "txt", "pdf", "csv"].freeze
8
+ SUPPORTED_FILETYPES = ["xls", "xlsx", "doc", "docx", "txt", "pdf", "csv", "zip"].freeze
9
9
 
10
10
  class Error < StandardError; end
11
11
 
@@ -0,0 +1,24 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SimpleTextExtract
4
+ module FormatExtractor
5
+ class ZipExtract < Base
6
+ def extract
7
+ require "zip"
8
+
9
+ result = []
10
+ Zip::File.open(file) do |zip_file|
11
+ zip_file.each do |entry|
12
+ result << entry.name
13
+ result << SimpleTextExtract.extract(
14
+ raw: entry.get_input_stream.read,
15
+ filename: entry.name
16
+ )
17
+ end
18
+ end
19
+
20
+ result.join(" ")
21
+ end
22
+ end
23
+ end
24
+ end
@@ -7,11 +7,14 @@ require "simple_text_extract/format_extractor/xls_x"
7
7
  require "simple_text_extract/format_extractor/xls"
8
8
  require "simple_text_extract/format_extractor/doc_x"
9
9
  require "simple_text_extract/format_extractor/doc"
10
+ require "simple_text_extract/format_extractor/zip_extract"
10
11
 
11
12
  module SimpleTextExtract
12
13
  class FormatExtractorFactory
13
- def self.call(file) # rubocop:disable Metrics/MethodLength, Metrics/CyclomaticComplexity
14
+ def self.call(file)
14
15
  case file.path
16
+ when /.zip$/i
17
+ FormatExtractor::ZipExtract.new(file)
15
18
  when /(.txt$|.csv$)/i
16
19
  FormatExtractor::PlainText.new(file)
17
20
  when /.pdf$/i
@@ -26,7 +26,6 @@ module SimpleTextExtract
26
26
 
27
27
  def extract
28
28
  return unless file
29
- return unless file
30
29
 
31
30
  begin
32
31
  FormatExtractorFactory.call(file).extract
@@ -38,7 +37,7 @@ module SimpleTextExtract
38
37
  end
39
38
 
40
39
  def cleanup
41
- return unless file.class == Tempfile
40
+ return unless file.instance_of?(Tempfile)
42
41
 
43
42
  file.close
44
43
  file.unlink
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module SimpleTextExtract
4
- VERSION = "1.1.0"
4
+ VERSION = "1.2.0"
5
5
  end
@@ -30,6 +30,7 @@ Gem::Specification.new do |spec|
30
30
 
31
31
  spec.add_runtime_dependency "roo", "~> 2.8.2"
32
32
  spec.add_runtime_dependency "spreadsheet", "~> 1.1.8"
33
+ spec.add_runtime_dependency "rubyzip", ">= 1.0.0"
33
34
 
34
35
  spec.add_development_dependency "rake", "~> 13.0"
35
36
  spec.add_development_dependency "minitest", "~> 5.0"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: simple_text_extract
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.0
4
+ version: 1.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nick Weiland
8
- autorequire:
8
+ autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-07-29 00:00:00.000000000 Z
11
+ date: 2021-05-26 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: roo
@@ -38,6 +38,20 @@ dependencies:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
40
  version: 1.1.8
41
+ - !ruby/object:Gem::Dependency
42
+ name: rubyzip
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: 1.0.0
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: 1.0.0
41
55
  - !ruby/object:Gem::Dependency
42
56
  name: rake
43
57
  requirement: !ruby/object:Gem::Requirement
@@ -107,6 +121,7 @@ files:
107
121
  - lib/simple_text_extract/format_extractor/plain_text.rb
108
122
  - lib/simple_text_extract/format_extractor/xls.rb
109
123
  - lib/simple_text_extract/format_extractor/xls_x.rb
124
+ - lib/simple_text_extract/format_extractor/zip_extract.rb
110
125
  - lib/simple_text_extract/format_extractor_factory.rb
111
126
  - lib/simple_text_extract/text_extractor.rb
112
127
  - lib/simple_text_extract/version.rb
@@ -117,7 +132,7 @@ homepage: https://github.com/weilandia/simple_text_extract
117
132
  licenses:
118
133
  - MIT
119
134
  metadata: {}
120
- post_install_message:
135
+ post_install_message:
121
136
  rdoc_options: []
122
137
  require_paths:
123
138
  - lib
@@ -134,8 +149,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
134
149
  requirements:
135
150
  - antiword
136
151
  - pdftotext/poppler
137
- rubygems_version: 3.0.3
138
- signing_key:
152
+ rubygems_version: 3.2.3
153
+ signing_key:
139
154
  specification_version: 4
140
155
  summary: Attempts to quickly extract text from various file types before resorting
141
156
  to something more extreme like Apache Tika.