simple_text_extract 1.0.1 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.ruby-version +1 -1
- data/Gemfile.lock +16 -13
- data/README.md +3 -1
- data/lib/simple_text_extract/format_extractor/doc.rb +1 -1
- data/lib/simple_text_extract/format_extractor/zip_extract.rb +24 -0
- data/lib/simple_text_extract/format_extractor_factory.rb +5 -2
- data/lib/simple_text_extract/text_extractor.rb +3 -1
- data/lib/simple_text_extract/version.rb +1 -1
- data/lib/simple_text_extract.rb +1 -1
- data/simple_text_extract.gemspec +3 -2
- metadata +25 -10
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3d5ee7883d1447c45d4293ff8ac500334f195d3c9e6f88451490787cbf59e728
|
4
|
+
data.tar.gz: 42a88c4c258eedc9c382d80c4b72e8c5deacc46b4490dc7bc1f1d029b8dba2f1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1a050e414a45fc766dbd89b3d468c1e67d687185c009caf1a41cd80f4ca8476a82aafdce29461bb8d11c367275a1de0955c39911f12d8760f10cb9fc6f1634f8
|
7
|
+
data.tar.gz: e4c49c9fc926a39d9db0cd6bbdd014e70ef480816a19d242ae541505534a3224ab960e9374459fff884b2f6b4690cf3f21cca85f2c6ff03ff9c5126ba1eb829d
|
data/.ruby-version
CHANGED
@@ -1 +1 @@
|
|
1
|
-
|
1
|
+
3.0.1
|
data/Gemfile.lock
CHANGED
@@ -1,26 +1,29 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
simple_text_extract (1.
|
5
|
-
roo (~> 2.8.
|
6
|
-
|
4
|
+
simple_text_extract (1.3.0)
|
5
|
+
roo (~> 2.8.3)
|
6
|
+
rubyzip (~> 2.3.2)
|
7
|
+
spreadsheet (~> 1.3.0)
|
7
8
|
|
8
9
|
GEM
|
9
10
|
remote: https://rubygems.org/
|
10
11
|
specs:
|
11
|
-
mini_portile2 (2.
|
12
|
-
minitest (5.
|
13
|
-
mocha (1.
|
14
|
-
nokogiri (1.
|
15
|
-
mini_portile2 (~> 2.
|
16
|
-
|
12
|
+
mini_portile2 (2.7.1)
|
13
|
+
minitest (5.15.0)
|
14
|
+
mocha (1.13.0)
|
15
|
+
nokogiri (1.13.0)
|
16
|
+
mini_portile2 (~> 2.7.0)
|
17
|
+
racc (~> 1.4)
|
18
|
+
racc (1.6.0)
|
19
|
+
rake (13.0.6)
|
17
20
|
roo (2.8.3)
|
18
21
|
nokogiri (~> 1)
|
19
22
|
rubyzip (>= 1.3.0, < 3.0.0)
|
20
23
|
ruby-ole (1.2.12.2)
|
21
|
-
rubyzip (2.3.
|
22
|
-
spreadsheet (1.
|
23
|
-
ruby-ole
|
24
|
+
rubyzip (2.3.2)
|
25
|
+
spreadsheet (1.3.0)
|
26
|
+
ruby-ole
|
24
27
|
|
25
28
|
PLATFORMS
|
26
29
|
ruby
|
@@ -32,4 +35,4 @@ DEPENDENCIES
|
|
32
35
|
simple_text_extract!
|
33
36
|
|
34
37
|
BUNDLED WITH
|
35
|
-
2.
|
38
|
+
2.2.15
|
data/README.md
CHANGED
@@ -9,6 +9,7 @@ SimpleTextExtract handles parsing text from:
|
|
9
9
|
- `.doc`
|
10
10
|
- `.xlsx`
|
11
11
|
- `.xls`
|
12
|
+
- `.csv`
|
12
13
|
- `.txt` 😜
|
13
14
|
|
14
15
|
If no text is parsed (for `pdf`), or a file format is not supported (like images), then `nil` is returned and you can move on to the heavy-duty tools like [Henkei](https://github.com/abrom/henkei) 💪.
|
@@ -50,7 +51,7 @@ You can choose to use SimpleTextExtract without the following dependencies, but
|
|
50
51
|
`pdf` parsing requires `poppler-utils`
|
51
52
|
- `brew install poppler`
|
52
53
|
|
53
|
-
`doc` parsing requires `antiword`
|
54
|
+
`doc` parsing requires `antiword` and `unzip`
|
54
55
|
- `brew install antiword`
|
55
56
|
|
56
57
|
`xlsx` and `xls` parsing requires `ssconvert` which is part of `gnumeric`
|
@@ -75,6 +76,7 @@ In your `Aptfile`, add:
|
|
75
76
|
```
|
76
77
|
antiword
|
77
78
|
gnumeric
|
79
|
+
unzip
|
78
80
|
```
|
79
81
|
|
80
82
|
* There is currently an [issue](https://github.com/heroku/heroku-buildpack-google-chrome/issues/59) with the heroku-18 stack that requires additional dependencies added to the Aptfile to get `gnumeric` to work properly. You can reference the linked issue above to figure out those dependencies, or downgrade to heroku-16 until it is fixed.
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module SimpleTextExtract
|
4
|
+
module FormatExtractor
|
5
|
+
class ZipExtract < Base
|
6
|
+
def extract
|
7
|
+
require "zip"
|
8
|
+
|
9
|
+
result = []
|
10
|
+
Zip::File.open(file) do |zip_file|
|
11
|
+
zip_file.each do |entry|
|
12
|
+
result << entry.name
|
13
|
+
result << SimpleTextExtract.extract(
|
14
|
+
raw: entry.get_input_stream.read,
|
15
|
+
filename: entry.name
|
16
|
+
)
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
result.join(" ")
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
@@ -7,12 +7,15 @@ require "simple_text_extract/format_extractor/xls_x"
|
|
7
7
|
require "simple_text_extract/format_extractor/xls"
|
8
8
|
require "simple_text_extract/format_extractor/doc_x"
|
9
9
|
require "simple_text_extract/format_extractor/doc"
|
10
|
+
require "simple_text_extract/format_extractor/zip_extract"
|
10
11
|
|
11
12
|
module SimpleTextExtract
|
12
13
|
class FormatExtractorFactory
|
13
|
-
def self.call(file)
|
14
|
+
def self.call(file)
|
14
15
|
case file.path
|
15
|
-
when /.
|
16
|
+
when /.zip$/i
|
17
|
+
FormatExtractor::ZipExtract.new(file)
|
18
|
+
when /(.txt$|.csv$)/i
|
16
19
|
FormatExtractor::PlainText.new(file)
|
17
20
|
when /.pdf$/i
|
18
21
|
FormatExtractor::PDF.new(file)
|
@@ -29,13 +29,15 @@ module SimpleTextExtract
|
|
29
29
|
|
30
30
|
begin
|
31
31
|
FormatExtractorFactory.call(file).extract
|
32
|
+
rescue StandardError
|
33
|
+
nil
|
32
34
|
ensure
|
33
35
|
cleanup
|
34
36
|
end
|
35
37
|
end
|
36
38
|
|
37
39
|
def cleanup
|
38
|
-
return unless file.
|
40
|
+
return unless file.instance_of?(Tempfile)
|
39
41
|
|
40
42
|
file.close
|
41
43
|
file.unlink
|
data/lib/simple_text_extract.rb
CHANGED
@@ -5,7 +5,7 @@ require "simple_text_extract/text_extractor"
|
|
5
5
|
require "simple_text_extract/format_extractor_factory"
|
6
6
|
|
7
7
|
module SimpleTextExtract
|
8
|
-
SUPPORTED_FILETYPES = ["xls", "xlsx", "doc", "docx", "txt", "pdf"].freeze
|
8
|
+
SUPPORTED_FILETYPES = ["xls", "xlsx", "doc", "docx", "txt", "pdf", "csv", "zip"].freeze
|
9
9
|
|
10
10
|
class Error < StandardError; end
|
11
11
|
|
data/simple_text_extract.gemspec
CHANGED
@@ -28,8 +28,9 @@ Gem::Specification.new do |spec|
|
|
28
28
|
spec.requirements << "pdftotext/poppler"
|
29
29
|
spec.required_ruby_version = ">= 2.5"
|
30
30
|
|
31
|
-
spec.add_runtime_dependency "roo", "~> 2.8.
|
32
|
-
spec.add_runtime_dependency "spreadsheet", "~> 1.
|
31
|
+
spec.add_runtime_dependency "roo", "~> 2.8.3"
|
32
|
+
spec.add_runtime_dependency "spreadsheet", "~> 1.3.0"
|
33
|
+
spec.add_runtime_dependency "rubyzip", "~> 2.3.2"
|
33
34
|
|
34
35
|
spec.add_development_dependency "rake", "~> 13.0"
|
35
36
|
spec.add_development_dependency "minitest", "~> 5.0"
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: simple_text_extract
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0
|
4
|
+
version: 1.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Nick Weiland
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-01-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: roo
|
@@ -16,28 +16,42 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: 2.8.
|
19
|
+
version: 2.8.3
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: 2.8.
|
26
|
+
version: 2.8.3
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: spreadsheet
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
31
|
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: 1.
|
33
|
+
version: 1.3.0
|
34
34
|
type: :runtime
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: 1.
|
40
|
+
version: 1.3.0
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rubyzip
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: 2.3.2
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: 2.3.2
|
41
55
|
- !ruby/object:Gem::Dependency
|
42
56
|
name: rake
|
43
57
|
requirement: !ruby/object:Gem::Requirement
|
@@ -107,6 +121,7 @@ files:
|
|
107
121
|
- lib/simple_text_extract/format_extractor/plain_text.rb
|
108
122
|
- lib/simple_text_extract/format_extractor/xls.rb
|
109
123
|
- lib/simple_text_extract/format_extractor/xls_x.rb
|
124
|
+
- lib/simple_text_extract/format_extractor/zip_extract.rb
|
110
125
|
- lib/simple_text_extract/format_extractor_factory.rb
|
111
126
|
- lib/simple_text_extract/text_extractor.rb
|
112
127
|
- lib/simple_text_extract/version.rb
|
@@ -116,7 +131,7 @@ homepage: https://github.com/weilandia/simple_text_extract
|
|
116
131
|
licenses:
|
117
132
|
- MIT
|
118
133
|
metadata: {}
|
119
|
-
post_install_message:
|
134
|
+
post_install_message:
|
120
135
|
rdoc_options: []
|
121
136
|
require_paths:
|
122
137
|
- lib
|
@@ -133,8 +148,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
133
148
|
requirements:
|
134
149
|
- antiword
|
135
150
|
- pdftotext/poppler
|
136
|
-
rubygems_version: 3.
|
137
|
-
signing_key:
|
151
|
+
rubygems_version: 3.2.15
|
152
|
+
signing_key:
|
138
153
|
specification_version: 4
|
139
154
|
summary: Attempts to quickly extract text from various file types before resorting
|
140
155
|
to something more extreme like Apache Tika.
|