simple_text_extract 0.3.0 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.ruby-version +1 -1
- data/Gemfile +0 -2
- data/Gemfile.lock +11 -20
- data/README.md +5 -3
- data/bin/console +0 -3
- data/lib/simple_text_extract.rb +3 -5
- data/lib/simple_text_extract/format_extractor/xls.rb +0 -1
- data/lib/simple_text_extract/text_extractor.rb +39 -11
- data/lib/simple_text_extract/version.rb +1 -1
- data/simple_text_extract.gemspec +2 -3
- metadata +7 -23
- data/lib/simple_text_extract/file_extractor.rb +0 -17
- data/lib/simple_text_extract/tempfile_extractor.rb +0 -34
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 28beca26a3231ec93af472f0aafdf46c2fcadf33ab79a0925f980f9d8ebfcde3
|
4
|
+
data.tar.gz: 7c1e424163c080ac59bf1414911b10a1dffbdc6d1e34deac4e7a5488f93d09fa
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 41d850e3f186c323757964593028fd46efbfe8cae2031cf0c86785ee0395640f21205b053ff6f36bce5df68e711dbb6cd2565a5f0e4e3c5a2f8256c3648b4215
|
7
|
+
data.tar.gz: ecd4b2d7bb87d4e4aab0b36b6bcc06e1e9b04bccfe6caad90d69145a7810f912524e1a2c7fecc6aad8f36ca8b5558f1817d4d18eec4e2fd24e5e8410256399b0
|
data/.ruby-version
CHANGED
@@ -1 +1 @@
|
|
1
|
-
2.6.
|
1
|
+
2.6.5
|
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,31 +1,24 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
simple_text_extract (0.
|
5
|
-
roo (~> 2.8)
|
4
|
+
simple_text_extract (1.0.0)
|
5
|
+
roo (~> 2.8.2)
|
6
6
|
spreadsheet (~> 1.1.8)
|
7
7
|
|
8
8
|
GEM
|
9
9
|
remote: https://rubygems.org/
|
10
10
|
specs:
|
11
|
-
coderay (1.1.2)
|
12
|
-
metaclass (0.0.4)
|
13
|
-
method_source (0.9.2)
|
14
11
|
mini_portile2 (2.4.0)
|
15
|
-
minitest (5.
|
16
|
-
mocha (1.
|
17
|
-
|
18
|
-
nokogiri (1.10.3)
|
12
|
+
minitest (5.14.0)
|
13
|
+
mocha (1.11.2)
|
14
|
+
nokogiri (1.10.9)
|
19
15
|
mini_portile2 (~> 2.4.0)
|
20
|
-
|
21
|
-
|
22
|
-
method_source (~> 0.9.0)
|
23
|
-
rake (10.5.0)
|
24
|
-
roo (2.8.2)
|
16
|
+
rake (13.0.1)
|
17
|
+
roo (2.8.3)
|
25
18
|
nokogiri (~> 1)
|
26
|
-
rubyzip (>= 1.
|
19
|
+
rubyzip (>= 1.3.0, < 3.0.0)
|
27
20
|
ruby-ole (1.2.12.2)
|
28
|
-
rubyzip (
|
21
|
+
rubyzip (2.3.0)
|
29
22
|
spreadsheet (1.1.9)
|
30
23
|
ruby-ole (>= 1.0)
|
31
24
|
|
@@ -33,12 +26,10 @@ PLATFORMS
|
|
33
26
|
ruby
|
34
27
|
|
35
28
|
DEPENDENCIES
|
36
|
-
bundler (~> 1.17)
|
37
29
|
minitest (~> 5.0)
|
38
30
|
mocha
|
39
|
-
|
40
|
-
rake (~> 10.0)
|
31
|
+
rake (~> 13.0)
|
41
32
|
simple_text_extract!
|
42
33
|
|
43
34
|
BUNDLED WITH
|
44
|
-
|
35
|
+
2.0.2
|
data/README.md
CHANGED
@@ -34,11 +34,13 @@ Or install it yourself as:
|
|
34
34
|
Text can be parsed from raw file content or from files in the filesystem by calling `SimpleTextExtract.extract`:
|
35
35
|
|
36
36
|
```ruby
|
37
|
-
#
|
38
|
-
|
37
|
+
# using ActiveStorage >= 6
|
38
|
+
extract = attachment.open { |tmp| SimpleTextExtract.extract(tempfile: tmp) }
|
39
|
+
# raw file content or when ActiveStorage < 6
|
40
|
+
extract = SimpleTextExtract.extract(filename: attachment.blob.filename, raw: attachment.download)
|
39
41
|
|
40
42
|
# filesystem
|
41
|
-
SimpleTextExtract.extract(filepath: "path_to_file.pdf")
|
43
|
+
extract = SimpleTextExtract.extract(filepath: "path_to_file.pdf")
|
42
44
|
```
|
43
45
|
|
44
46
|
### Usage Dependencies
|
data/bin/console
CHANGED
data/lib/simple_text_extract.rb
CHANGED
@@ -2,8 +2,6 @@
|
|
2
2
|
|
3
3
|
require "simple_text_extract/version"
|
4
4
|
require "simple_text_extract/text_extractor"
|
5
|
-
require "simple_text_extract/file_extractor"
|
6
|
-
require "simple_text_extract/tempfile_extractor"
|
7
5
|
require "simple_text_extract/format_extractor_factory"
|
8
6
|
|
9
7
|
module SimpleTextExtract
|
@@ -11,11 +9,11 @@ module SimpleTextExtract
|
|
11
9
|
|
12
10
|
class Error < StandardError; end
|
13
11
|
|
14
|
-
def self.extract(filename: nil, raw: nil, filepath: nil)
|
15
|
-
TextExtractor.
|
12
|
+
def self.extract(filename: nil, raw: nil, filepath: nil, tempfile: nil)
|
13
|
+
TextExtractor.new(filename: filename, raw: raw, filepath: filepath, tempfile: tempfile).to_s
|
16
14
|
end
|
17
15
|
|
18
16
|
def self.supports?(filename: nil)
|
19
|
-
SUPPORTED_FILETYPES.include?(filename.split(".")
|
17
|
+
SUPPORTED_FILETYPES.include?(filename.split(".").last)
|
20
18
|
end
|
21
19
|
end
|
@@ -2,24 +2,52 @@
|
|
2
2
|
|
3
3
|
module SimpleTextExtract
|
4
4
|
class TextExtractor
|
5
|
-
|
6
|
-
if !filename.nil? && !raw.nil?
|
7
|
-
TempfileExtractor.new(filename: filename.to_s, raw: raw).extract
|
8
|
-
elsif !filepath.nil? && File.exist?(filepath)
|
9
|
-
FileExtractor.new(filepath: filepath).extract
|
10
|
-
end
|
11
|
-
end
|
5
|
+
attr_reader :file
|
12
6
|
|
13
|
-
def
|
14
|
-
|
15
|
-
|
7
|
+
def initialize(filename: nil, raw: nil, filepath: nil, tempfile: nil)
|
8
|
+
@file = get_file(filename: filename, raw: raw, filepath: filepath, tempfile: tempfile)
|
9
|
+
end
|
16
10
|
|
17
|
-
|
11
|
+
def to_s
|
12
|
+
@to_s ||= extract.to_s
|
18
13
|
end
|
19
14
|
|
20
15
|
private
|
21
16
|
|
17
|
+
def get_file(filename:, raw:, filepath:, tempfile:)
|
18
|
+
if tempfile&.class == Tempfile
|
19
|
+
tempfile
|
20
|
+
elsif !filename.nil? && !raw.nil?
|
21
|
+
write_tempfile(filename: filename.to_s, raw: raw)
|
22
|
+
elsif !filepath.nil? && File.exist?(filepath)
|
23
|
+
File.new(filepath)
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
def extract
|
28
|
+
return unless file
|
29
|
+
|
30
|
+
begin
|
31
|
+
FormatExtractorFactory.call(file).extract
|
32
|
+
ensure
|
33
|
+
cleanup
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
22
37
|
def cleanup
|
38
|
+
return unless file.class == Tempfile
|
39
|
+
|
40
|
+
file.close
|
41
|
+
file.unlink
|
42
|
+
end
|
43
|
+
|
44
|
+
def write_tempfile(filename:, raw:)
|
45
|
+
filename = filename.split(".").yield_self { |parts| [parts[0], ".#{parts[1]}"] }
|
46
|
+
file = Tempfile.new(filename)
|
47
|
+
raw = String.new(raw, encoding: Encoding::UTF_8)
|
48
|
+
|
49
|
+
file.write(raw)
|
50
|
+
file.tap(&:rewind)
|
23
51
|
end
|
24
52
|
end
|
25
53
|
end
|
data/simple_text_extract.gemspec
CHANGED
@@ -28,11 +28,10 @@ Gem::Specification.new do |spec|
|
|
28
28
|
spec.requirements << "pdftotext/poppler"
|
29
29
|
spec.required_ruby_version = ">= 2.5"
|
30
30
|
|
31
|
-
spec.add_runtime_dependency "roo", "~> 2.8"
|
31
|
+
spec.add_runtime_dependency "roo", "~> 2.8.2"
|
32
32
|
spec.add_runtime_dependency "spreadsheet", "~> 1.1.8"
|
33
33
|
|
34
|
-
spec.add_development_dependency "
|
35
|
-
spec.add_development_dependency "rake", "~> 10.0"
|
34
|
+
spec.add_development_dependency "rake", "~> 13.0"
|
36
35
|
spec.add_development_dependency "minitest", "~> 5.0"
|
37
36
|
spec.add_development_dependency "mocha"
|
38
37
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: simple_text_extract
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 1.0.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Nick Weiland
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-03-31 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: roo
|
@@ -16,14 +16,14 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version:
|
19
|
+
version: 2.8.2
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version:
|
26
|
+
version: 2.8.2
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: spreadsheet
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
@@ -38,34 +38,20 @@ dependencies:
|
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: 1.1.8
|
41
|
-
- !ruby/object:Gem::Dependency
|
42
|
-
name: bundler
|
43
|
-
requirement: !ruby/object:Gem::Requirement
|
44
|
-
requirements:
|
45
|
-
- - "~>"
|
46
|
-
- !ruby/object:Gem::Version
|
47
|
-
version: '1.17'
|
48
|
-
type: :development
|
49
|
-
prerelease: false
|
50
|
-
version_requirements: !ruby/object:Gem::Requirement
|
51
|
-
requirements:
|
52
|
-
- - "~>"
|
53
|
-
- !ruby/object:Gem::Version
|
54
|
-
version: '1.17'
|
55
41
|
- !ruby/object:Gem::Dependency
|
56
42
|
name: rake
|
57
43
|
requirement: !ruby/object:Gem::Requirement
|
58
44
|
requirements:
|
59
45
|
- - "~>"
|
60
46
|
- !ruby/object:Gem::Version
|
61
|
-
version: '
|
47
|
+
version: '13.0'
|
62
48
|
type: :development
|
63
49
|
prerelease: false
|
64
50
|
version_requirements: !ruby/object:Gem::Requirement
|
65
51
|
requirements:
|
66
52
|
- - "~>"
|
67
53
|
- !ruby/object:Gem::Version
|
68
|
-
version: '
|
54
|
+
version: '13.0'
|
69
55
|
- !ruby/object:Gem::Dependency
|
70
56
|
name: minitest
|
71
57
|
requirement: !ruby/object:Gem::Requirement
|
@@ -114,7 +100,6 @@ files:
|
|
114
100
|
- bin/console
|
115
101
|
- bin/setup
|
116
102
|
- lib/simple_text_extract.rb
|
117
|
-
- lib/simple_text_extract/file_extractor.rb
|
118
103
|
- lib/simple_text_extract/format_extractor/base.rb
|
119
104
|
- lib/simple_text_extract/format_extractor/doc.rb
|
120
105
|
- lib/simple_text_extract/format_extractor/doc_x.rb
|
@@ -123,7 +108,6 @@ files:
|
|
123
108
|
- lib/simple_text_extract/format_extractor/xls.rb
|
124
109
|
- lib/simple_text_extract/format_extractor/xls_x.rb
|
125
110
|
- lib/simple_text_extract/format_extractor_factory.rb
|
126
|
-
- lib/simple_text_extract/tempfile_extractor.rb
|
127
111
|
- lib/simple_text_extract/text_extractor.rb
|
128
112
|
- lib/simple_text_extract/version.rb
|
129
113
|
- simple_text_extract.gemspec
|
@@ -149,7 +133,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
149
133
|
requirements:
|
150
134
|
- antiword
|
151
135
|
- pdftotext/poppler
|
152
|
-
rubygems_version: 3.0.
|
136
|
+
rubygems_version: 3.0.6
|
153
137
|
signing_key:
|
154
138
|
specification_version: 4
|
155
139
|
summary: Attempts to quickly extract text from various file types before resorting
|
@@ -1,17 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
module SimpleTextExtract
|
4
|
-
class FileExtractor < TextExtractor
|
5
|
-
attr_reader :filepath
|
6
|
-
|
7
|
-
def initialize(filepath:)
|
8
|
-
@filepath = filepath
|
9
|
-
end
|
10
|
-
|
11
|
-
private
|
12
|
-
|
13
|
-
def file
|
14
|
-
@file ||= File.new(filepath)
|
15
|
-
end
|
16
|
-
end
|
17
|
-
end
|
@@ -1,34 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
module SimpleTextExtract
|
4
|
-
class TempfileExtractor < TextExtractor
|
5
|
-
attr_reader :filename, :raw
|
6
|
-
|
7
|
-
def initialize(filename:, raw:)
|
8
|
-
@filename = filename
|
9
|
-
@raw = String.new(raw, encoding: Encoding::UTF_8)
|
10
|
-
|
11
|
-
write_raw
|
12
|
-
end
|
13
|
-
|
14
|
-
private
|
15
|
-
|
16
|
-
def file
|
17
|
-
@file ||= Tempfile.new(filepath)
|
18
|
-
end
|
19
|
-
|
20
|
-
def write_raw
|
21
|
-
file.write(raw)
|
22
|
-
file.rewind
|
23
|
-
end
|
24
|
-
|
25
|
-
def cleanup
|
26
|
-
file.close
|
27
|
-
file.unlink
|
28
|
-
end
|
29
|
-
|
30
|
-
def filepath
|
31
|
-
@filepath ||= filename.split(".").yield_self { |parts| [parts[0], ".#{parts[1]}"] }
|
32
|
-
end
|
33
|
-
end
|
34
|
-
end
|