simple_text_extract 1.1.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 35eee7886cc545693f85facc69bbe8b99ea141f0af3ad520f6f75d2bc61eddd7
4
- data.tar.gz: 5a4b8e4a4dfe54535805f56e986fe36b5470889a96960b2bf8acd1ee4c94b084
3
+ metadata.gz: f3d117bf20380fae2d755e2a258189520479785207c3d27115ab3e9c90e84e51
4
+ data.tar.gz: 96b634b6b061520a25be360f62eac98dabde4b1ca3e723703b40bf1d7dbc11d2
5
5
  SHA512:
6
- metadata.gz: 9ace8965b9567c8d9e85f2d52912c8081da83a2dc0ff9c463f6ac28450e39ca874125e501196f546930615b4d6271abad88dc3fe8b91f0ade1253c0468652eca
7
- data.tar.gz: 05d9819fd5835b8307ae6ece9b0f4efa8b00d615eaf6fcc054dc38959780761ed4cf40711d7b91de38a9e3390904dead968c0a4a0ccc9221a8ec5122a6f9f0b8
6
+ metadata.gz: 171f01d876c6fc30abf68268c1dd69bd56d135af56cda361027efc3ee12482afcaeafeb376cf6ff3d0ad231c92a694a873e1162525f74ca405a404192858a78e
7
+ data.tar.gz: 2fad0ef23c0036fb1b9257221f22bafd9f2b0760eba7407a115931705ebf7b2d42f2f09b0c0fa5ef028500e2bb4ac68e59a5f5873c97037ca2dccbe00a24cc62
data/.ruby-version CHANGED
@@ -1 +1 @@
1
- 2.6.5
1
+ 3.0.1
data/Gemfile.lock CHANGED
@@ -1,26 +1,29 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- simple_text_extract (1.0.0)
5
- roo (~> 2.8.2)
6
- spreadsheet (~> 1.1.8)
4
+ simple_text_extract (1.3.0)
5
+ roo (~> 2.9.0)
6
+ rubyzip (~> 2.3.2)
7
+ spreadsheet (~> 1.3.0)
7
8
 
8
9
  GEM
9
10
  remote: https://rubygems.org/
10
11
  specs:
11
- mini_portile2 (2.4.0)
12
- minitest (5.14.0)
13
- mocha (1.11.2)
14
- nokogiri (1.10.9)
15
- mini_portile2 (~> 2.4.0)
16
- rake (13.0.1)
17
- roo (2.8.3)
12
+ mini_portile2 (2.8.0)
13
+ minitest (5.15.0)
14
+ mocha (1.13.0)
15
+ nokogiri (1.13.3)
16
+ mini_portile2 (~> 2.8.0)
17
+ racc (~> 1.4)
18
+ racc (1.6.0)
19
+ rake (13.0.6)
20
+ roo (2.9.0)
18
21
  nokogiri (~> 1)
19
22
  rubyzip (>= 1.3.0, < 3.0.0)
20
23
  ruby-ole (1.2.12.2)
21
- rubyzip (2.3.0)
22
- spreadsheet (1.1.9)
23
- ruby-ole (>= 1.0)
24
+ rubyzip (2.3.2)
25
+ spreadsheet (1.3.0)
26
+ ruby-ole
24
27
 
25
28
  PLATFORMS
26
29
  ruby
@@ -32,4 +35,4 @@ DEPENDENCIES
32
35
  simple_text_extract!
33
36
 
34
37
  BUNDLED WITH
35
- 2.0.2
38
+ 2.2.15
data/README.md CHANGED
@@ -51,7 +51,7 @@ You can choose to use SimpleTextExtract without the following dependencies, but
51
51
  `pdf` parsing requires `poppler-utils`
52
52
  - `brew install poppler`
53
53
 
54
- `doc` parsing requires `antiword`
54
+ `doc` parsing requires `antiword` and `unzip`
55
55
  - `brew install antiword`
56
56
 
57
57
  `xlsx` and `xls` parsing requires `ssconvert` which is part of `gnumeric`
@@ -76,6 +76,7 @@ In your `Aptfile`, add:
76
76
  ```
77
77
  antiword
78
78
  gnumeric
79
+ unzip
79
80
  ```
80
81
 
81
82
  * There is currently an [issue](https://github.com/heroku/heroku-buildpack-google-chrome/issues/59) with the heroku-18 stack that requires additional dependencies added to the Aptfile to get `gnumeric` to work properly. You can reference the linked issue above to figure out those dependencies, or downgrade to heroku-16 until it is fixed.
@@ -0,0 +1,24 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SimpleTextExtract
4
+ module FormatExtractor
5
+ class ZipExtract < Base
6
+ def extract
7
+ require "zip"
8
+
9
+ result = []
10
+ Zip::File.open(file) do |zip_file|
11
+ zip_file.each do |entry|
12
+ result << entry.name
13
+ result << SimpleTextExtract.extract(
14
+ raw: entry.get_input_stream.read,
15
+ filename: entry.name
16
+ )
17
+ end
18
+ end
19
+
20
+ result.join(" ")
21
+ end
22
+ end
23
+ end
24
+ end
@@ -7,11 +7,14 @@ require "simple_text_extract/format_extractor/xls_x"
7
7
  require "simple_text_extract/format_extractor/xls"
8
8
  require "simple_text_extract/format_extractor/doc_x"
9
9
  require "simple_text_extract/format_extractor/doc"
10
+ require "simple_text_extract/format_extractor/zip_extract"
10
11
 
11
12
  module SimpleTextExtract
12
13
  class FormatExtractorFactory
13
- def self.call(file) # rubocop:disable Metrics/MethodLength, Metrics/CyclomaticComplexity
14
+ def self.call(file)
14
15
  case file.path
16
+ when /.zip$/i
17
+ FormatExtractor::ZipExtract.new(file)
15
18
  when /(.txt$|.csv$)/i
16
19
  FormatExtractor::PlainText.new(file)
17
20
  when /.pdf$/i
@@ -26,7 +26,6 @@ module SimpleTextExtract
26
26
 
27
27
  def extract
28
28
  return unless file
29
- return unless file
30
29
 
31
30
  begin
32
31
  FormatExtractorFactory.call(file).extract
@@ -38,7 +37,7 @@ module SimpleTextExtract
38
37
  end
39
38
 
40
39
  def cleanup
41
- return unless file.class == Tempfile
40
+ return unless file.instance_of?(Tempfile)
42
41
 
43
42
  file.close
44
43
  file.unlink
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module SimpleTextExtract
4
- VERSION = "1.1.0"
4
+ VERSION = "2.0.0"
5
5
  end
@@ -5,7 +5,7 @@ require "simple_text_extract/text_extractor"
5
5
  require "simple_text_extract/format_extractor_factory"
6
6
 
7
7
  module SimpleTextExtract
8
- SUPPORTED_FILETYPES = ["xls", "xlsx", "doc", "docx", "txt", "pdf", "csv"].freeze
8
+ SUPPORTED_FILETYPES = ["xls", "xlsx", "doc", "docx", "txt", "pdf", "csv", "zip"].freeze
9
9
 
10
10
  class Error < StandardError; end
11
11
 
@@ -28,8 +28,9 @@ Gem::Specification.new do |spec|
28
28
  spec.requirements << "pdftotext/poppler"
29
29
  spec.required_ruby_version = ">= 2.5"
30
30
 
31
- spec.add_runtime_dependency "roo", "~> 2.8.2"
32
- spec.add_runtime_dependency "spreadsheet", "~> 1.1.8"
31
+ spec.add_runtime_dependency "roo", "~> 2.9.0"
32
+ spec.add_runtime_dependency "spreadsheet", "~> 1.3.0"
33
+ spec.add_runtime_dependency "rubyzip", "~> 2.3.2"
33
34
 
34
35
  spec.add_development_dependency "rake", "~> 13.0"
35
36
  spec.add_development_dependency "minitest", "~> 5.0"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: simple_text_extract
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.0
4
+ version: 2.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nick Weiland
8
- autorequire:
8
+ autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-07-29 00:00:00.000000000 Z
11
+ date: 2022-03-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: roo
@@ -16,28 +16,42 @@ dependencies:
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: 2.8.2
19
+ version: 2.9.0
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: 2.8.2
26
+ version: 2.9.0
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: spreadsheet
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
31
  - - "~>"
32
32
  - !ruby/object:Gem::Version
33
- version: 1.1.8
33
+ version: 1.3.0
34
34
  type: :runtime
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
- version: 1.1.8
40
+ version: 1.3.0
41
+ - !ruby/object:Gem::Dependency
42
+ name: rubyzip
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: 2.3.2
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: 2.3.2
41
55
  - !ruby/object:Gem::Dependency
42
56
  name: rake
43
57
  requirement: !ruby/object:Gem::Requirement
@@ -107,17 +121,17 @@ files:
107
121
  - lib/simple_text_extract/format_extractor/plain_text.rb
108
122
  - lib/simple_text_extract/format_extractor/xls.rb
109
123
  - lib/simple_text_extract/format_extractor/xls_x.rb
124
+ - lib/simple_text_extract/format_extractor/zip_extract.rb
110
125
  - lib/simple_text_extract/format_extractor_factory.rb
111
126
  - lib/simple_text_extract/text_extractor.rb
112
127
  - lib/simple_text_extract/version.rb
113
- - simple_text_extract-1.0.2.gem
114
128
  - simple_text_extract.gemspec
115
129
  - tags
116
130
  homepage: https://github.com/weilandia/simple_text_extract
117
131
  licenses:
118
132
  - MIT
119
133
  metadata: {}
120
- post_install_message:
134
+ post_install_message:
121
135
  rdoc_options: []
122
136
  require_paths:
123
137
  - lib
@@ -134,8 +148,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
134
148
  requirements:
135
149
  - antiword
136
150
  - pdftotext/poppler
137
- rubygems_version: 3.0.3
138
- signing_key:
151
+ rubygems_version: 3.2.15
152
+ signing_key:
139
153
  specification_version: 4
140
154
  summary: Attempts to quickly extract text from various file types before resorting
141
155
  to something more extreme like Apache Tika.
Binary file