simple_text_extract 0.2.1 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.ruby-version +1 -1
- data/Gemfile +0 -2
- data/Gemfile.lock +18 -24
- data/README.md +8 -4
- data/Rakefile +1 -1
- data/bin/console +0 -3
- data/lib/simple_text_extract.rb +4 -6
- data/lib/simple_text_extract/format_extractor/doc.rb +1 -1
- data/lib/simple_text_extract/format_extractor/xls.rb +0 -1
- data/lib/simple_text_extract/format_extractor/xls_x.rb +1 -1
- data/lib/simple_text_extract/format_extractor/zip_extract.rb +24 -0
- data/lib/simple_text_extract/format_extractor_factory.rb +5 -2
- data/lib/simple_text_extract/text_extractor.rb +41 -11
- data/lib/simple_text_extract/version.rb +1 -1
- data/simple_text_extract-1.0.2.gem +0 -0
- data/simple_text_extract.gemspec +3 -3
- metadata +18 -19
- data/lib/simple_text_extract/file_extractor.rb +0 -17
- data/lib/simple_text_extract/tempfile_extractor.rb +0 -34
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 93414a5f260468367e16054bfcbcebbd93962c90a2d6903622a622aa567f754d
|
4
|
+
data.tar.gz: af3fc14fcc1b3c532f5f54c7dfebdb8d4fd4da5de691fe4561b5e39bd6c04fe0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1219e1404ea893772da0905b7bd808f703240cdc9e7f875a287c4e7886e67ea8fbc4c07a2a55a95a8db3f828d7f7ad87a67832e7834618c62537df3d42e013da
|
7
|
+
data.tar.gz: c052e8f2640f8aa9ac452b9cfcad32695add2c1b374daf4dee712f34592a9d1decaae453dbeff1f473fdcd12df01f1a1f9e7751a03925514e91976bb9d654f10
|
data/.ruby-version
CHANGED
@@ -1 +1 @@
|
|
1
|
-
|
1
|
+
3.0.0
|
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,44 +1,38 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
simple_text_extract (
|
5
|
-
roo (~> 2.8)
|
4
|
+
simple_text_extract (1.2.0)
|
5
|
+
roo (~> 2.8.2)
|
6
|
+
rubyzip (>= 1.0.0)
|
6
7
|
spreadsheet (~> 1.1.8)
|
7
8
|
|
8
9
|
GEM
|
9
10
|
remote: https://rubygems.org/
|
10
11
|
specs:
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
pry (0.12.2)
|
21
|
-
coderay (~> 1.1.0)
|
22
|
-
method_source (~> 0.9.0)
|
23
|
-
rake (10.5.0)
|
24
|
-
roo (2.8.1)
|
12
|
+
mini_portile2 (2.5.1)
|
13
|
+
minitest (5.14.4)
|
14
|
+
mocha (1.12.0)
|
15
|
+
nokogiri (1.11.6)
|
16
|
+
mini_portile2 (~> 2.5.0)
|
17
|
+
racc (~> 1.4)
|
18
|
+
racc (1.5.2)
|
19
|
+
rake (13.0.3)
|
20
|
+
roo (2.8.3)
|
25
21
|
nokogiri (~> 1)
|
26
|
-
rubyzip (>= 1.
|
27
|
-
ruby-ole (1.2.12.
|
28
|
-
rubyzip (
|
29
|
-
spreadsheet (1.1.
|
22
|
+
rubyzip (>= 1.3.0, < 3.0.0)
|
23
|
+
ruby-ole (1.2.12.2)
|
24
|
+
rubyzip (2.3.0)
|
25
|
+
spreadsheet (1.1.9)
|
30
26
|
ruby-ole (>= 1.0)
|
31
27
|
|
32
28
|
PLATFORMS
|
33
29
|
ruby
|
34
30
|
|
35
31
|
DEPENDENCIES
|
36
|
-
bundler (~> 1.17)
|
37
32
|
minitest (~> 5.0)
|
38
33
|
mocha
|
39
|
-
|
40
|
-
rake (~> 10.0)
|
34
|
+
rake (~> 13.0)
|
41
35
|
simple_text_extract!
|
42
36
|
|
43
37
|
BUNDLED WITH
|
44
|
-
|
38
|
+
2.2.3
|
data/README.md
CHANGED
@@ -9,6 +9,7 @@ SimpleTextExtract handles parsing text from:
|
|
9
9
|
- `.doc`
|
10
10
|
- `.xlsx`
|
11
11
|
- `.xls`
|
12
|
+
- `.csv`
|
12
13
|
- `.txt` 😜
|
13
14
|
|
14
15
|
If no text is parsed (for `pdf`), or a file format is not supported (like images), then `nil` is returned and you can move on to the heavy-duty tools like [Henkei](https://github.com/abrom/henkei) 💪.
|
@@ -34,11 +35,13 @@ Or install it yourself as:
|
|
34
35
|
Text can be parsed from raw file content or files in the filesystem by calling `SimpleTextExtract.extract`:
|
35
36
|
|
36
37
|
```ruby
|
37
|
-
#
|
38
|
-
|
38
|
+
# using ActiveStorage >= 6
|
39
|
+
extract = attachment.open { |tmp| SimpleTextExtract.extract(tempfile: tmp) }
|
40
|
+
# raw file content or when ActiveStorage < 6
|
41
|
+
extract = SimpleTextExtract.extract(filename: attachment.blob.filename, raw: attachment.download)
|
39
42
|
|
40
43
|
# filesystem
|
41
|
-
SimpleTextExtract.extract(filepath: "path_to_file.pdf")
|
44
|
+
extract = SimpleTextExtract.extract(filepath: "path_to_file.pdf")
|
42
45
|
```
|
43
46
|
|
44
47
|
### Usage Dependencies
|
@@ -48,7 +51,7 @@ You can choose to use SimpleTextExtract without the following dependencies, but
|
|
48
51
|
`pdf` parsing requires `poppler-utils`
|
49
52
|
- `brew install poppler`
|
50
53
|
|
51
|
-
`doc` parsing requires `antiword`
|
54
|
+
`doc` parsing requires `antiword` and `unzip`
|
52
55
|
- `brew install antiword`
|
53
56
|
|
54
57
|
`xlsx` and `xls` parsing requires `ssconvert` which is part of `gnumeric`
|
@@ -73,6 +76,7 @@ In your `Aptfile`, add:
|
|
73
76
|
```
|
74
77
|
antiword
|
75
78
|
gnumeric
|
79
|
+
unzip
|
76
80
|
```
|
77
81
|
|
78
82
|
* There is currently an [issue](https://github.com/heroku/heroku-buildpack-google-chrome/issues/59) with the heroku-18 stack that requires additional dependencies added to the Aptfile to get `gnumeric` to work properly. You can reference the linked issue above to figure out those dependencies, or downgrade to heroku-16 until it is fixed.
|
data/Rakefile
CHANGED
data/bin/console
CHANGED
data/lib/simple_text_extract.rb
CHANGED
@@ -2,20 +2,18 @@
|
|
2
2
|
|
3
3
|
require "simple_text_extract/version"
|
4
4
|
require "simple_text_extract/text_extractor"
|
5
|
-
require "simple_text_extract/file_extractor"
|
6
|
-
require "simple_text_extract/tempfile_extractor"
|
7
5
|
require "simple_text_extract/format_extractor_factory"
|
8
6
|
|
9
7
|
module SimpleTextExtract
|
10
|
-
SUPPORTED_FILETYPES = ["xls", "xlsx", "doc", "docx", "txt", "pdf"]
|
8
|
+
SUPPORTED_FILETYPES = ["xls", "xlsx", "doc", "docx", "txt", "pdf", "csv", "zip"].freeze
|
11
9
|
|
12
10
|
class Error < StandardError; end
|
13
11
|
|
14
|
-
def self.extract(filename: nil, raw: nil, filepath: nil)
|
15
|
-
TextExtractor.
|
12
|
+
def self.extract(filename: nil, raw: nil, filepath: nil, tempfile: nil)
|
13
|
+
TextExtractor.new(filename: filename, raw: raw, filepath: filepath, tempfile: tempfile).to_s
|
16
14
|
end
|
17
15
|
|
18
16
|
def self.supports?(filename: nil)
|
19
|
-
SUPPORTED_FILETYPES.include?(filename.split(".")
|
17
|
+
SUPPORTED_FILETYPES.include?(filename.split(".").last)
|
20
18
|
end
|
21
19
|
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module SimpleTextExtract
|
4
|
+
module FormatExtractor
|
5
|
+
class ZipExtract < Base
|
6
|
+
def extract
|
7
|
+
require "zip"
|
8
|
+
|
9
|
+
result = []
|
10
|
+
Zip::File.open(file) do |zip_file|
|
11
|
+
zip_file.each do |entry|
|
12
|
+
result << entry.name
|
13
|
+
result << SimpleTextExtract.extract(
|
14
|
+
raw: entry.get_input_stream.read,
|
15
|
+
filename: entry.name
|
16
|
+
)
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
result.join(" ")
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
@@ -7,12 +7,15 @@ require "simple_text_extract/format_extractor/xls_x"
|
|
7
7
|
require "simple_text_extract/format_extractor/xls"
|
8
8
|
require "simple_text_extract/format_extractor/doc_x"
|
9
9
|
require "simple_text_extract/format_extractor/doc"
|
10
|
+
require "simple_text_extract/format_extractor/zip_extract"
|
10
11
|
|
11
12
|
module SimpleTextExtract
|
12
13
|
class FormatExtractorFactory
|
13
|
-
def self.call(file)
|
14
|
+
def self.call(file)
|
14
15
|
case file.path
|
15
|
-
when /.
|
16
|
+
when /.zip$/i
|
17
|
+
FormatExtractor::ZipExtract.new(file)
|
18
|
+
when /(.txt$|.csv$)/i
|
16
19
|
FormatExtractor::PlainText.new(file)
|
17
20
|
when /.pdf$/i
|
18
21
|
FormatExtractor::PDF.new(file)
|
@@ -2,24 +2,54 @@
|
|
2
2
|
|
3
3
|
module SimpleTextExtract
|
4
4
|
class TextExtractor
|
5
|
-
|
6
|
-
if !filename.nil? && !raw.nil?
|
7
|
-
TempfileExtractor.new(filename: filename.to_s, raw: raw).extract
|
8
|
-
elsif !filepath.nil? && File.exist?(filepath)
|
9
|
-
FileExtractor.new(filepath: filepath).extract
|
10
|
-
end
|
11
|
-
end
|
5
|
+
attr_reader :file
|
12
6
|
|
13
|
-
def
|
14
|
-
|
15
|
-
|
7
|
+
def initialize(filename: nil, raw: nil, filepath: nil, tempfile: nil)
|
8
|
+
@file = get_file(filename: filename, raw: raw, filepath: filepath, tempfile: tempfile)
|
9
|
+
end
|
16
10
|
|
17
|
-
|
11
|
+
def to_s
|
12
|
+
@to_s ||= extract.to_s
|
18
13
|
end
|
19
14
|
|
20
15
|
private
|
21
16
|
|
17
|
+
def get_file(filename:, raw:, filepath:, tempfile:)
|
18
|
+
if tempfile&.class == Tempfile
|
19
|
+
tempfile
|
20
|
+
elsif !filename.nil? && !raw.nil?
|
21
|
+
write_tempfile(filename: filename.to_s, raw: raw)
|
22
|
+
elsif !filepath.nil? && File.exist?(filepath)
|
23
|
+
File.new(filepath)
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
def extract
|
28
|
+
return unless file
|
29
|
+
|
30
|
+
begin
|
31
|
+
FormatExtractorFactory.call(file).extract
|
32
|
+
rescue StandardError
|
33
|
+
nil
|
34
|
+
ensure
|
35
|
+
cleanup
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
22
39
|
def cleanup
|
40
|
+
return unless file.instance_of?(Tempfile)
|
41
|
+
|
42
|
+
file.close
|
43
|
+
file.unlink
|
44
|
+
end
|
45
|
+
|
46
|
+
def write_tempfile(filename:, raw:)
|
47
|
+
filename = filename.split(".").yield_self { |parts| [parts[0], ".#{parts[1]}"] }
|
48
|
+
file = Tempfile.new(filename)
|
49
|
+
raw = String.new(raw, encoding: Encoding::UTF_8)
|
50
|
+
|
51
|
+
file.write(raw)
|
52
|
+
file.tap(&:rewind)
|
23
53
|
end
|
24
54
|
end
|
25
55
|
end
|
Binary file
|
data/simple_text_extract.gemspec
CHANGED
@@ -28,11 +28,11 @@ Gem::Specification.new do |spec|
|
|
28
28
|
spec.requirements << "pdftotext/poppler"
|
29
29
|
spec.required_ruby_version = ">= 2.5"
|
30
30
|
|
31
|
-
spec.add_runtime_dependency "roo", "~> 2.8"
|
31
|
+
spec.add_runtime_dependency "roo", "~> 2.8.2"
|
32
32
|
spec.add_runtime_dependency "spreadsheet", "~> 1.1.8"
|
33
|
+
spec.add_runtime_dependency "rubyzip", ">= 1.0.0"
|
33
34
|
|
34
|
-
spec.add_development_dependency "
|
35
|
-
spec.add_development_dependency "rake", "~> 10.0"
|
35
|
+
spec.add_development_dependency "rake", "~> 13.0"
|
36
36
|
spec.add_development_dependency "minitest", "~> 5.0"
|
37
37
|
spec.add_development_dependency "mocha"
|
38
38
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: simple_text_extract
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 1.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Nick Weiland
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-05-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: roo
|
@@ -16,14 +16,14 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version:
|
19
|
+
version: 2.8.2
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version:
|
26
|
+
version: 2.8.2
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: spreadsheet
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
@@ -39,33 +39,33 @@ dependencies:
|
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: 1.1.8
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
42
|
+
name: rubyzip
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
|
-
- - "
|
45
|
+
- - ">="
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version:
|
48
|
-
type: :
|
47
|
+
version: 1.0.0
|
48
|
+
type: :runtime
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
|
-
- - "
|
52
|
+
- - ">="
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version:
|
54
|
+
version: 1.0.0
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
56
|
name: rake
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
59
|
- - "~>"
|
60
60
|
- !ruby/object:Gem::Version
|
61
|
-
version: '
|
61
|
+
version: '13.0'
|
62
62
|
type: :development
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
|
-
version: '
|
68
|
+
version: '13.0'
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: minitest
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
@@ -114,7 +114,6 @@ files:
|
|
114
114
|
- bin/console
|
115
115
|
- bin/setup
|
116
116
|
- lib/simple_text_extract.rb
|
117
|
-
- lib/simple_text_extract/file_extractor.rb
|
118
117
|
- lib/simple_text_extract/format_extractor/base.rb
|
119
118
|
- lib/simple_text_extract/format_extractor/doc.rb
|
120
119
|
- lib/simple_text_extract/format_extractor/doc_x.rb
|
@@ -122,17 +121,18 @@ files:
|
|
122
121
|
- lib/simple_text_extract/format_extractor/plain_text.rb
|
123
122
|
- lib/simple_text_extract/format_extractor/xls.rb
|
124
123
|
- lib/simple_text_extract/format_extractor/xls_x.rb
|
124
|
+
- lib/simple_text_extract/format_extractor/zip_extract.rb
|
125
125
|
- lib/simple_text_extract/format_extractor_factory.rb
|
126
|
-
- lib/simple_text_extract/tempfile_extractor.rb
|
127
126
|
- lib/simple_text_extract/text_extractor.rb
|
128
127
|
- lib/simple_text_extract/version.rb
|
128
|
+
- simple_text_extract-1.0.2.gem
|
129
129
|
- simple_text_extract.gemspec
|
130
130
|
- tags
|
131
131
|
homepage: https://github.com/weilandia/simple_text_extract
|
132
132
|
licenses:
|
133
133
|
- MIT
|
134
134
|
metadata: {}
|
135
|
-
post_install_message:
|
135
|
+
post_install_message:
|
136
136
|
rdoc_options: []
|
137
137
|
require_paths:
|
138
138
|
- lib
|
@@ -149,9 +149,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
149
149
|
requirements:
|
150
150
|
- antiword
|
151
151
|
- pdftotext/poppler
|
152
|
-
|
153
|
-
|
154
|
-
signing_key:
|
152
|
+
rubygems_version: 3.2.3
|
153
|
+
signing_key:
|
155
154
|
specification_version: 4
|
156
155
|
summary: Attempts to quickly extract text from various file types before resorting
|
157
156
|
to something more extreme like Apache Tika.
|
@@ -1,17 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
module SimpleTextExtract
|
4
|
-
class FileExtractor < TextExtractor
|
5
|
-
attr_reader :filepath
|
6
|
-
|
7
|
-
def initialize(filepath:)
|
8
|
-
@filepath = filepath
|
9
|
-
end
|
10
|
-
|
11
|
-
private
|
12
|
-
|
13
|
-
def file
|
14
|
-
@file ||= File.new(filepath)
|
15
|
-
end
|
16
|
-
end
|
17
|
-
end
|
@@ -1,34 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
module SimpleTextExtract
|
4
|
-
class TempfileExtractor < TextExtractor
|
5
|
-
attr_reader :filename, :raw
|
6
|
-
|
7
|
-
def initialize(filename:, raw:)
|
8
|
-
@filename = filename
|
9
|
-
@raw = String.new(raw, encoding: Encoding::UTF_8)
|
10
|
-
|
11
|
-
write_raw
|
12
|
-
end
|
13
|
-
|
14
|
-
private
|
15
|
-
|
16
|
-
def file
|
17
|
-
@file ||= Tempfile.new(filepath)
|
18
|
-
end
|
19
|
-
|
20
|
-
def write_raw
|
21
|
-
file.write(raw)
|
22
|
-
file.rewind
|
23
|
-
end
|
24
|
-
|
25
|
-
def cleanup
|
26
|
-
file.close
|
27
|
-
file.unlink
|
28
|
-
end
|
29
|
-
|
30
|
-
def filepath
|
31
|
-
@filepath ||= filename.split(".").yield_self { |parts| [parts[0], ".#{parts[1]}"] }
|
32
|
-
end
|
33
|
-
end
|
34
|
-
end
|