simple_text_extract 1.0.1 → 1.3.0

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions exactly as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 28beca26a3231ec93af472f0aafdf46c2fcadf33ab79a0925f980f9d8ebfcde3
4
- data.tar.gz: 7c1e424163c080ac59bf1414911b10a1dffbdc6d1e34deac4e7a5488f93d09fa
3
+ metadata.gz: 3d5ee7883d1447c45d4293ff8ac500334f195d3c9e6f88451490787cbf59e728
4
+ data.tar.gz: 42a88c4c258eedc9c382d80c4b72e8c5deacc46b4490dc7bc1f1d029b8dba2f1
5
5
  SHA512:
6
- metadata.gz: 41d850e3f186c323757964593028fd46efbfe8cae2031cf0c86785ee0395640f21205b053ff6f36bce5df68e711dbb6cd2565a5f0e4e3c5a2f8256c3648b4215
7
- data.tar.gz: ecd4b2d7bb87d4e4aab0b36b6bcc06e1e9b04bccfe6caad90d69145a7810f912524e1a2c7fecc6aad8f36ca8b5558f1817d4d18eec4e2fd24e5e8410256399b0
6
+ metadata.gz: 1a050e414a45fc766dbd89b3d468c1e67d687185c009caf1a41cd80f4ca8476a82aafdce29461bb8d11c367275a1de0955c39911f12d8760f10cb9fc6f1634f8
7
+ data.tar.gz: e4c49c9fc926a39d9db0cd6bbdd014e70ef480816a19d242ae541505534a3224ab960e9374459fff884b2f6b4690cf3f21cca85f2c6ff03ff9c5126ba1eb829d
data/.ruby-version CHANGED
@@ -1 +1 @@
1
- 2.6.5
1
+ 3.0.1
data/Gemfile.lock CHANGED
@@ -1,26 +1,29 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- simple_text_extract (1.0.0)
5
- roo (~> 2.8.2)
6
- spreadsheet (~> 1.1.8)
4
+ simple_text_extract (1.3.0)
5
+ roo (~> 2.8.3)
6
+ rubyzip (~> 2.3.2)
7
+ spreadsheet (~> 1.3.0)
7
8
 
8
9
  GEM
9
10
  remote: https://rubygems.org/
10
11
  specs:
11
- mini_portile2 (2.4.0)
12
- minitest (5.14.0)
13
- mocha (1.11.2)
14
- nokogiri (1.10.9)
15
- mini_portile2 (~> 2.4.0)
16
- rake (13.0.1)
12
+ mini_portile2 (2.7.1)
13
+ minitest (5.15.0)
14
+ mocha (1.13.0)
15
+ nokogiri (1.13.0)
16
+ mini_portile2 (~> 2.7.0)
17
+ racc (~> 1.4)
18
+ racc (1.6.0)
19
+ rake (13.0.6)
17
20
  roo (2.8.3)
18
21
  nokogiri (~> 1)
19
22
  rubyzip (>= 1.3.0, < 3.0.0)
20
23
  ruby-ole (1.2.12.2)
21
- rubyzip (2.3.0)
22
- spreadsheet (1.1.9)
23
- ruby-ole (>= 1.0)
24
+ rubyzip (2.3.2)
25
+ spreadsheet (1.3.0)
26
+ ruby-ole
24
27
 
25
28
  PLATFORMS
26
29
  ruby
@@ -32,4 +35,4 @@ DEPENDENCIES
32
35
  simple_text_extract!
33
36
 
34
37
  BUNDLED WITH
35
- 2.0.2
38
+ 2.2.15
data/README.md CHANGED
@@ -9,6 +9,7 @@ SimpleTextExtract handles parsing text from:
9
9
  - `.doc`
10
10
  - `.xlsx`
11
11
  - `.xls`
12
+ - `.csv`
12
13
  - `.txt` 😜
13
14
 
14
15
  If no text is parsed (for `pdf`), or a file format is not supported (like images), then `nil` is returned and you can move on to the heavy-duty tools like [Henkei](https://github.com/abrom/henkei) 💪.
@@ -50,7 +51,7 @@ You can choose to use SimpleTextExtract without the following dependencies, but
50
51
  `pdf` parsing requires `poppler-utils`
51
52
  - `brew install poppler`
52
53
 
53
- `doc` parsing requires `antiword`
54
+ `doc` parsing requires `antiword` and `unzip`
54
55
  - `brew install antiword`
55
56
 
56
57
  `xlsx` and `xls` parsing requires `ssconvert` which is part of `gnumeric`
@@ -75,6 +76,7 @@ In your `Aptfile`, add:
75
76
  ```
76
77
  antiword
77
78
  gnumeric
79
+ unzip
78
80
  ```
79
81
 
80
82
  * There is currently an [issue](https://github.com/heroku/heroku-buildpack-google-chrome/issues/59) with the heroku-18 stack that requires additional dependencies added to the Aptfile to get `gnumeric` to work properly. You can reference the linked issue above to figure out those dependencies, or downgrade to heroku-16 until it is fixed.
@@ -4,7 +4,7 @@ module SimpleTextExtract
4
4
  module FormatExtractor
5
5
  class Doc < Base
6
6
  def extract
7
- return nil if missing_dependency?('antiword')
7
+ return nil if missing_dependency?("antiword")
8
8
 
9
9
  `antiword #{Shellwords.escape(file.path)}`
10
10
  end
@@ -0,0 +1,24 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SimpleTextExtract
4
+ module FormatExtractor
5
+ class ZipExtract < Base
6
+ def extract
7
+ require "zip"
8
+
9
+ result = []
10
+ Zip::File.open(file) do |zip_file|
11
+ zip_file.each do |entry|
12
+ result << entry.name
13
+ result << SimpleTextExtract.extract(
14
+ raw: entry.get_input_stream.read,
15
+ filename: entry.name
16
+ )
17
+ end
18
+ end
19
+
20
+ result.join(" ")
21
+ end
22
+ end
23
+ end
24
+ end
@@ -7,12 +7,15 @@ require "simple_text_extract/format_extractor/xls_x"
7
7
  require "simple_text_extract/format_extractor/xls"
8
8
  require "simple_text_extract/format_extractor/doc_x"
9
9
  require "simple_text_extract/format_extractor/doc"
10
+ require "simple_text_extract/format_extractor/zip_extract"
10
11
 
11
12
  module SimpleTextExtract
12
13
  class FormatExtractorFactory
13
- def self.call(file) # rubocop:disable Metrics/MethodLength, Metrics/CyclomaticComplexity
14
+ def self.call(file)
14
15
  case file.path
15
- when /.txt$/i
16
+ when /.zip$/i
17
+ FormatExtractor::ZipExtract.new(file)
18
+ when /(.txt$|.csv$)/i
16
19
  FormatExtractor::PlainText.new(file)
17
20
  when /.pdf$/i
18
21
  FormatExtractor::PDF.new(file)
@@ -29,13 +29,15 @@ module SimpleTextExtract
29
29
 
30
30
  begin
31
31
  FormatExtractorFactory.call(file).extract
32
+ rescue StandardError
33
+ nil
32
34
  ensure
33
35
  cleanup
34
36
  end
35
37
  end
36
38
 
37
39
  def cleanup
38
- return unless file.class == Tempfile
40
+ return unless file.instance_of?(Tempfile)
39
41
 
40
42
  file.close
41
43
  file.unlink
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module SimpleTextExtract
4
- VERSION = "1.0.1"
4
+ VERSION = "1.3.0"
5
5
  end
@@ -5,7 +5,7 @@ require "simple_text_extract/text_extractor"
5
5
  require "simple_text_extract/format_extractor_factory"
6
6
 
7
7
  module SimpleTextExtract
8
- SUPPORTED_FILETYPES = ["xls", "xlsx", "doc", "docx", "txt", "pdf"].freeze
8
+ SUPPORTED_FILETYPES = ["xls", "xlsx", "doc", "docx", "txt", "pdf", "csv", "zip"].freeze
9
9
 
10
10
  class Error < StandardError; end
11
11
 
@@ -28,8 +28,9 @@ Gem::Specification.new do |spec|
28
28
  spec.requirements << "pdftotext/poppler"
29
29
  spec.required_ruby_version = ">= 2.5"
30
30
 
31
- spec.add_runtime_dependency "roo", "~> 2.8.2"
32
- spec.add_runtime_dependency "spreadsheet", "~> 1.1.8"
31
+ spec.add_runtime_dependency "roo", "~> 2.8.3"
32
+ spec.add_runtime_dependency "spreadsheet", "~> 1.3.0"
33
+ spec.add_runtime_dependency "rubyzip", "~> 2.3.2"
33
34
 
34
35
  spec.add_development_dependency "rake", "~> 13.0"
35
36
  spec.add_development_dependency "minitest", "~> 5.0"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: simple_text_extract
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.1
4
+ version: 1.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nick Weiland
8
- autorequire:
8
+ autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-03-31 00:00:00.000000000 Z
11
+ date: 2022-01-07 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: roo
@@ -16,28 +16,42 @@ dependencies:
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: 2.8.2
19
+ version: 2.8.3
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: 2.8.2
26
+ version: 2.8.3
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: spreadsheet
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
31
  - - "~>"
32
32
  - !ruby/object:Gem::Version
33
- version: 1.1.8
33
+ version: 1.3.0
34
34
  type: :runtime
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
- version: 1.1.8
40
+ version: 1.3.0
41
+ - !ruby/object:Gem::Dependency
42
+ name: rubyzip
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: 2.3.2
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: 2.3.2
41
55
  - !ruby/object:Gem::Dependency
42
56
  name: rake
43
57
  requirement: !ruby/object:Gem::Requirement
@@ -107,6 +121,7 @@ files:
107
121
  - lib/simple_text_extract/format_extractor/plain_text.rb
108
122
  - lib/simple_text_extract/format_extractor/xls.rb
109
123
  - lib/simple_text_extract/format_extractor/xls_x.rb
124
+ - lib/simple_text_extract/format_extractor/zip_extract.rb
110
125
  - lib/simple_text_extract/format_extractor_factory.rb
111
126
  - lib/simple_text_extract/text_extractor.rb
112
127
  - lib/simple_text_extract/version.rb
@@ -116,7 +131,7 @@ homepage: https://github.com/weilandia/simple_text_extract
116
131
  licenses:
117
132
  - MIT
118
133
  metadata: {}
119
- post_install_message:
134
+ post_install_message:
120
135
  rdoc_options: []
121
136
  require_paths:
122
137
  - lib
@@ -133,8 +148,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
133
148
  requirements:
134
149
  - antiword
135
150
  - pdftotext/poppler
136
- rubygems_version: 3.0.6
137
- signing_key:
151
+ rubygems_version: 3.2.15
152
+ signing_key:
138
153
  specification_version: 4
139
154
  summary: Attempts to quickly extract text from various file types before resorting
140
155
  to something more extreme like Apache Tika.