simple_text_extract 0.3.0 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a5970667ea2f6f6ac902a474e59a7f90a0a19ea2078ece54ef01b75a391e2325
4
- data.tar.gz: '01559ba36c772154566e78b374f0f1d6995ec9f179ed192d6b56afb1cef304f0'
3
+ metadata.gz: 28beca26a3231ec93af472f0aafdf46c2fcadf33ab79a0925f980f9d8ebfcde3
4
+ data.tar.gz: 7c1e424163c080ac59bf1414911b10a1dffbdc6d1e34deac4e7a5488f93d09fa
5
5
  SHA512:
6
- metadata.gz: df13326f81af45c023cac0e059a77fc54d958f9ccaf3c77d62a86d7d4746e71a8acfd7be56eacd2dde1fc29e35f134857b331585c9dd9903db6b479b0a8bcfc4
7
- data.tar.gz: 4713fb6e3c52225b38acaa9bee1d4e2bb2bbf582fa943ed16fdc46bfaeb540cfa0056068282a5993206ecdd00fcbaf79796843d4acdc9f21c96edb72b909b7d3
6
+ metadata.gz: 41d850e3f186c323757964593028fd46efbfe8cae2031cf0c86785ee0395640f21205b053ff6f36bce5df68e711dbb6cd2565a5f0e4e3c5a2f8256c3648b4215
7
+ data.tar.gz: ecd4b2d7bb87d4e4aab0b36b6bcc06e1e9b04bccfe6caad90d69145a7810f912524e1a2c7fecc6aad8f36ca8b5558f1817d4d18eec4e2fd24e5e8410256399b0
data/.ruby-version CHANGED
@@ -1 +1 @@
1
- 2.6.1
1
+ 2.6.5
data/Gemfile CHANGED
@@ -4,6 +4,4 @@ source "https://rubygems.org"
4
4
 
5
5
  git_source(:github) { |repo_name| "https://github.com/#{repo_name}" }
6
6
 
7
- gem "pry"
8
-
9
7
  gemspec
data/Gemfile.lock CHANGED
@@ -1,31 +1,24 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- simple_text_extract (0.2.1)
5
- roo (~> 2.8)
4
+ simple_text_extract (1.0.0)
5
+ roo (~> 2.8.2)
6
6
  spreadsheet (~> 1.1.8)
7
7
 
8
8
  GEM
9
9
  remote: https://rubygems.org/
10
10
  specs:
11
- coderay (1.1.2)
12
- metaclass (0.0.4)
13
- method_source (0.9.2)
14
11
  mini_portile2 (2.4.0)
15
- minitest (5.11.3)
16
- mocha (1.8.0)
17
- metaclass (~> 0.0.1)
18
- nokogiri (1.10.3)
12
+ minitest (5.14.0)
13
+ mocha (1.11.2)
14
+ nokogiri (1.10.9)
19
15
  mini_portile2 (~> 2.4.0)
20
- pry (0.12.2)
21
- coderay (~> 1.1.0)
22
- method_source (~> 0.9.0)
23
- rake (10.5.0)
24
- roo (2.8.2)
16
+ rake (13.0.1)
17
+ roo (2.8.3)
25
18
  nokogiri (~> 1)
26
- rubyzip (>= 1.2.1, < 2.0.0)
19
+ rubyzip (>= 1.3.0, < 3.0.0)
27
20
  ruby-ole (1.2.12.2)
28
- rubyzip (1.2.3)
21
+ rubyzip (2.3.0)
29
22
  spreadsheet (1.1.9)
30
23
  ruby-ole (>= 1.0)
31
24
 
@@ -33,12 +26,10 @@ PLATFORMS
33
26
  ruby
34
27
 
35
28
  DEPENDENCIES
36
- bundler (~> 1.17)
37
29
  minitest (~> 5.0)
38
30
  mocha
39
- pry
40
- rake (~> 10.0)
31
+ rake (~> 13.0)
41
32
  simple_text_extract!
42
33
 
43
34
  BUNDLED WITH
44
- 1.17.2
35
+ 2.0.2
data/README.md CHANGED
@@ -34,11 +34,13 @@ Or install it yourself as:
34
34
  Text can be parsed from raw file content or files in the filesystem t by calling `SimpleTextExtract.extract`:
35
35
 
36
36
  ```ruby
37
- # raw file content using ActiveStorage
38
- SimpleTextExtract.extract(filename: attachment.blob.filename, raw: attachment.download)
37
+ # using ActiveStorage >= 6
38
+ extract = attachment.open { |tmp| SimpleTextExtract.extract(tempfile: tmp) }
39
+ # raw file content or when ActiveStorage < 6
40
+ extract = SimpleTextExtract.extract(filename: attachment.blob.filename, raw: attachment.download)
39
41
 
40
42
  # filesystem
41
- SimpleTextExtract.extract(filepath: "path_to_file.pdf")
43
+ extract = SimpleTextExtract.extract(filepath: "path_to_file.pdf")
42
44
  ```
43
45
 
44
46
  ### Usage Dependencies
data/bin/console CHANGED
@@ -6,8 +6,5 @@ require "simple_text_extract"
6
6
  # You can add fixtures and/or initialization code here to make experimenting
7
7
  # with your gem easier. You can also use a different console, if you like.
8
8
 
9
- require "pry"
10
- Pry.start
11
-
12
9
  require "irb"
13
10
  IRB.start(__FILE__)
@@ -2,8 +2,6 @@
2
2
 
3
3
  require "simple_text_extract/version"
4
4
  require "simple_text_extract/text_extractor"
5
- require "simple_text_extract/file_extractor"
6
- require "simple_text_extract/tempfile_extractor"
7
5
  require "simple_text_extract/format_extractor_factory"
8
6
 
9
7
  module SimpleTextExtract
@@ -11,11 +9,11 @@ module SimpleTextExtract
11
9
 
12
10
  class Error < StandardError; end
13
11
 
14
- def self.extract(filename: nil, raw: nil, filepath: nil)
15
- TextExtractor.call(filename: filename, raw: raw, filepath: filepath).to_s
12
+ def self.extract(filename: nil, raw: nil, filepath: nil, tempfile: nil)
13
+ TextExtractor.new(filename: filename, raw: raw, filepath: filepath, tempfile: tempfile).to_s
16
14
  end
17
15
 
18
16
  def self.supports?(filename: nil)
19
- SUPPORTED_FILETYPES.include?(filename.split(".")[1])
17
+ SUPPORTED_FILETYPES.include?(filename.split(".").last)
20
18
  end
21
19
  end
@@ -8,7 +8,6 @@ module SimpleTextExtract
8
8
 
9
9
  spreadsheet = Spreadsheet.open(file)
10
10
  text = []
11
-
12
11
  spreadsheet.worksheets.each do |sheet|
13
12
  text << sheet.name
14
13
  text << sheet.rows
@@ -2,24 +2,52 @@
2
2
 
3
3
  module SimpleTextExtract
4
4
  class TextExtractor
5
- def self.call(filename: nil, raw: nil, filepath: nil)
6
- if !filename.nil? && !raw.nil?
7
- TempfileExtractor.new(filename: filename.to_s, raw: raw).extract
8
- elsif !filepath.nil? && File.exist?(filepath)
9
- FileExtractor.new(filepath: filepath).extract
10
- end
11
- end
5
+ attr_reader :file
12
6
 
13
- def extract
14
- text = FormatExtractorFactory.call(file).extract
15
- cleanup
7
+ def initialize(filename: nil, raw: nil, filepath: nil, tempfile: nil)
8
+ @file = get_file(filename: filename, raw: raw, filepath: filepath, tempfile: tempfile)
9
+ end
16
10
 
17
- text
11
+ def to_s
12
+ @to_s ||= extract.to_s
18
13
  end
19
14
 
20
15
  private
21
16
 
17
+ def get_file(filename:, raw:, filepath:, tempfile:)
18
+ if tempfile&.class == Tempfile
19
+ tempfile
20
+ elsif !filename.nil? && !raw.nil?
21
+ write_tempfile(filename: filename.to_s, raw: raw)
22
+ elsif !filepath.nil? && File.exist?(filepath)
23
+ File.new(filepath)
24
+ end
25
+ end
26
+
27
+ def extract
28
+ return unless file
29
+
30
+ begin
31
+ FormatExtractorFactory.call(file).extract
32
+ ensure
33
+ cleanup
34
+ end
35
+ end
36
+
22
37
  def cleanup
38
+ return unless file.class == Tempfile
39
+
40
+ file.close
41
+ file.unlink
42
+ end
43
+
44
+ def write_tempfile(filename:, raw:)
45
+ filename = filename.split(".").yield_self { |parts| [parts[0], ".#{parts[1]}"] }
46
+ file = Tempfile.new(filename)
47
+ raw = String.new(raw, encoding: Encoding::UTF_8)
48
+
49
+ file.write(raw)
50
+ file.tap(&:rewind)
23
51
  end
24
52
  end
25
53
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module SimpleTextExtract
4
- VERSION = "0.3.0"
4
+ VERSION = "1.0.1"
5
5
  end
@@ -28,11 +28,10 @@ Gem::Specification.new do |spec|
28
28
  spec.requirements << "pdftotext/poppler"
29
29
  spec.required_ruby_version = ">= 2.5"
30
30
 
31
- spec.add_runtime_dependency "roo", "~> 2.8"
31
+ spec.add_runtime_dependency "roo", "~> 2.8.2"
32
32
  spec.add_runtime_dependency "spreadsheet", "~> 1.1.8"
33
33
 
34
- spec.add_development_dependency "bundler", "~> 1.17"
35
- spec.add_development_dependency "rake", "~> 10.0"
34
+ spec.add_development_dependency "rake", "~> 13.0"
36
35
  spec.add_development_dependency "minitest", "~> 5.0"
37
36
  spec.add_development_dependency "mocha"
38
37
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: simple_text_extract
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 1.0.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nick Weiland
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-07-24 00:00:00.000000000 Z
11
+ date: 2020-03-31 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: roo
@@ -16,14 +16,14 @@ dependencies:
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '2.8'
19
+ version: 2.8.2
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: '2.8'
26
+ version: 2.8.2
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: spreadsheet
29
29
  requirement: !ruby/object:Gem::Requirement
@@ -38,34 +38,20 @@ dependencies:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
40
  version: 1.1.8
41
- - !ruby/object:Gem::Dependency
42
- name: bundler
43
- requirement: !ruby/object:Gem::Requirement
44
- requirements:
45
- - - "~>"
46
- - !ruby/object:Gem::Version
47
- version: '1.17'
48
- type: :development
49
- prerelease: false
50
- version_requirements: !ruby/object:Gem::Requirement
51
- requirements:
52
- - - "~>"
53
- - !ruby/object:Gem::Version
54
- version: '1.17'
55
41
  - !ruby/object:Gem::Dependency
56
42
  name: rake
57
43
  requirement: !ruby/object:Gem::Requirement
58
44
  requirements:
59
45
  - - "~>"
60
46
  - !ruby/object:Gem::Version
61
- version: '10.0'
47
+ version: '13.0'
62
48
  type: :development
63
49
  prerelease: false
64
50
  version_requirements: !ruby/object:Gem::Requirement
65
51
  requirements:
66
52
  - - "~>"
67
53
  - !ruby/object:Gem::Version
68
- version: '10.0'
54
+ version: '13.0'
69
55
  - !ruby/object:Gem::Dependency
70
56
  name: minitest
71
57
  requirement: !ruby/object:Gem::Requirement
@@ -114,7 +100,6 @@ files:
114
100
  - bin/console
115
101
  - bin/setup
116
102
  - lib/simple_text_extract.rb
117
- - lib/simple_text_extract/file_extractor.rb
118
103
  - lib/simple_text_extract/format_extractor/base.rb
119
104
  - lib/simple_text_extract/format_extractor/doc.rb
120
105
  - lib/simple_text_extract/format_extractor/doc_x.rb
@@ -123,7 +108,6 @@ files:
123
108
  - lib/simple_text_extract/format_extractor/xls.rb
124
109
  - lib/simple_text_extract/format_extractor/xls_x.rb
125
110
  - lib/simple_text_extract/format_extractor_factory.rb
126
- - lib/simple_text_extract/tempfile_extractor.rb
127
111
  - lib/simple_text_extract/text_extractor.rb
128
112
  - lib/simple_text_extract/version.rb
129
113
  - simple_text_extract.gemspec
@@ -149,7 +133,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
149
133
  requirements:
150
134
  - antiword
151
135
  - pdftotext/poppler
152
- rubygems_version: 3.0.1
136
+ rubygems_version: 3.0.6
153
137
  signing_key:
154
138
  specification_version: 4
155
139
  summary: Attempts to quickly extract text from various file types before resorting
@@ -1,17 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module SimpleTextExtract
4
- class FileExtractor < TextExtractor
5
- attr_reader :filepath
6
-
7
- def initialize(filepath:)
8
- @filepath = filepath
9
- end
10
-
11
- private
12
-
13
- def file
14
- @file ||= File.new(filepath)
15
- end
16
- end
17
- end
@@ -1,34 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module SimpleTextExtract
4
- class TempfileExtractor < TextExtractor
5
- attr_reader :filename, :raw
6
-
7
- def initialize(filename:, raw:)
8
- @filename = filename
9
- @raw = String.new(raw, encoding: Encoding::UTF_8)
10
-
11
- write_raw
12
- end
13
-
14
- private
15
-
16
- def file
17
- @file ||= Tempfile.new(filepath)
18
- end
19
-
20
- def write_raw
21
- file.write(raw)
22
- file.rewind
23
- end
24
-
25
- def cleanup
26
- file.close
27
- file.unlink
28
- end
29
-
30
- def filepath
31
- @filepath ||= filename.split(".").yield_self { |parts| [parts[0], ".#{parts[1]}"] }
32
- end
33
- end
34
- end