simple_text_extract 0.2.1 → 1.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 83da9d28803f321b9a13aeaad4972211d40733b96f6b5fd085e52ab293a19d30
4
- data.tar.gz: 99769610f1adef1d8fbe46647c7253af7859029a362854f4c8d73ec45fa9d8da
3
+ metadata.gz: 93414a5f260468367e16054bfcbcebbd93962c90a2d6903622a622aa567f754d
4
+ data.tar.gz: af3fc14fcc1b3c532f5f54c7dfebdb8d4fd4da5de691fe4561b5e39bd6c04fe0
5
5
  SHA512:
6
- metadata.gz: 6f8dc568cf35fe6519d24dfc9a97a2b3c4d68770d5d489a1a1c4f813307ff7cc2fb973a663656893b448fb2532198f36373827ef202887edb1ad73b0ef53d3e7
7
- data.tar.gz: d334282c216656d91cb038d020e4c1da67ca563b708bf4356f0005ff8d1ec2f1dae1ea58c5427828a2a593b2ef238325caab4fdbdf9d3575ca8ca5e14b1791ca
6
+ metadata.gz: 1219e1404ea893772da0905b7bd808f703240cdc9e7f875a287c4e7886e67ea8fbc4c07a2a55a95a8db3f828d7f7ad87a67832e7834618c62537df3d42e013da
7
+ data.tar.gz: c052e8f2640f8aa9ac452b9cfcad32695add2c1b374daf4dee712f34592a9d1decaae453dbeff1f473fdcd12df01f1a1f9e7751a03925514e91976bb9d654f10
data/.ruby-version CHANGED
@@ -1 +1 @@
1
- 2.5.3
1
+ 3.0.0
data/Gemfile CHANGED
@@ -4,6 +4,4 @@ source "https://rubygems.org"
4
4
 
5
5
  git_source(:github) { |repo_name| "https://github.com/#{repo_name}" }
6
6
 
7
- gem "pry"
8
-
9
7
  gemspec
data/Gemfile.lock CHANGED
@@ -1,44 +1,38 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- simple_text_extract (0.2.0)
5
- roo (~> 2.8)
4
+ simple_text_extract (1.2.0)
5
+ roo (~> 2.8.2)
6
+ rubyzip (>= 1.0.0)
6
7
  spreadsheet (~> 1.1.8)
7
8
 
8
9
  GEM
9
10
  remote: https://rubygems.org/
10
11
  specs:
11
- coderay (1.1.2)
12
- metaclass (0.0.4)
13
- method_source (0.9.2)
14
- mini_portile2 (2.4.0)
15
- minitest (5.11.3)
16
- mocha (1.8.0)
17
- metaclass (~> 0.0.1)
18
- nokogiri (1.10.1)
19
- mini_portile2 (~> 2.4.0)
20
- pry (0.12.2)
21
- coderay (~> 1.1.0)
22
- method_source (~> 0.9.0)
23
- rake (10.5.0)
24
- roo (2.8.1)
12
+ mini_portile2 (2.5.1)
13
+ minitest (5.14.4)
14
+ mocha (1.12.0)
15
+ nokogiri (1.11.6)
16
+ mini_portile2 (~> 2.5.0)
17
+ racc (~> 1.4)
18
+ racc (1.5.2)
19
+ rake (13.0.3)
20
+ roo (2.8.3)
25
21
  nokogiri (~> 1)
26
- rubyzip (>= 1.2.1, < 2.0.0)
27
- ruby-ole (1.2.12.1)
28
- rubyzip (1.2.2)
29
- spreadsheet (1.1.8)
22
+ rubyzip (>= 1.3.0, < 3.0.0)
23
+ ruby-ole (1.2.12.2)
24
+ rubyzip (2.3.0)
25
+ spreadsheet (1.1.9)
30
26
  ruby-ole (>= 1.0)
31
27
 
32
28
  PLATFORMS
33
29
  ruby
34
30
 
35
31
  DEPENDENCIES
36
- bundler (~> 1.17)
37
32
  minitest (~> 5.0)
38
33
  mocha
39
- pry
40
- rake (~> 10.0)
34
+ rake (~> 13.0)
41
35
  simple_text_extract!
42
36
 
43
37
  BUNDLED WITH
44
- 1.17.2
38
+ 2.2.3
data/README.md CHANGED
@@ -9,6 +9,7 @@ SimpleTextExtract handles parsing text from:
9
9
  - `.doc`
10
10
  - `.xlsx`
11
11
  - `.xls`
12
+ - `.csv`
12
13
  - `.txt` 😜
13
14
 
14
15
  If no text is parsed (for `pdf`), or a file format is not supported (like images), then `nil` is returned and you can move on to the heavy-duty tools like [Henkei](https://github.com/abrom/henkei) 💪.
@@ -34,11 +35,13 @@ Or install it yourself as:
34
35
  Text can be parsed from raw file content or from files in the filesystem by calling `SimpleTextExtract.extract`:
35
36
 
36
37
  ```ruby
37
- # raw file content using ActiveStorage
38
- SimpleTextExtract.extract(filename: attachment.blob.filename, raw: attachment.download)
38
+ # using ActiveStorage >= 6
39
+ extract = attachment.open { |tmp| SimpleTextExtract.extract(tempfile: tmp) }
40
+ # raw file content or when ActiveStorage < 6
41
+ extract = SimpleTextExtract.extract(filename: attachment.blob.filename, raw: attachment.download)
39
42
 
40
43
  # filesystem
41
- SimpleTextExtract.extract(filepath: "path_to_file.pdf")
44
+ extract = SimpleTextExtract.extract(filepath: "path_to_file.pdf")
42
45
  ```
43
46
 
44
47
  ### Usage Dependencies
@@ -48,7 +51,7 @@ You can choose to use SimpleTextExtract without the following dependencies, but
48
51
  `pdf` parsing requires `poppler-utils`
49
52
  - `brew install poppler`
50
53
 
51
- `doc` parsing requires `antiword`
54
+ `doc` parsing requires `antiword` and `unzip`
52
55
  - `brew install antiword`
53
56
 
54
57
  `xlsx` and `xls` parsing requires `ssconvert` which is part of `gnumeric`
@@ -73,6 +76,7 @@ In your `Aptfile`, add:
73
76
  ```
74
77
  antiword
75
78
  gnumeric
79
+ unzip
76
80
  ```
77
81
 
78
82
  * There is currently an [issue](https://github.com/heroku/heroku-buildpack-google-chrome/issues/59) with the heroku-18 stack that requires additional dependencies added to the Aptfile to get `gnumeric` to work properly. You can reference the linked issue above to figure out those dependencies, or downgrade to heroku-16 until it is fixed.
data/Rakefile CHANGED
@@ -7,4 +7,4 @@ Rake::TestTask.new(:test) do |t|
7
7
  t.test_files = FileList["test/**/*_test.rb"]
8
8
  end
9
9
 
10
- task :default => :test
10
+ task default: :test
data/bin/console CHANGED
@@ -6,8 +6,5 @@ require "simple_text_extract"
6
6
  # You can add fixtures and/or initialization code here to make experimenting
7
7
  # with your gem easier. You can also use a different console, if you like.
8
8
 
9
- require "pry"
10
- Pry.start
11
-
12
9
  require "irb"
13
10
  IRB.start(__FILE__)
@@ -2,20 +2,18 @@
2
2
 
3
3
  require "simple_text_extract/version"
4
4
  require "simple_text_extract/text_extractor"
5
- require "simple_text_extract/file_extractor"
6
- require "simple_text_extract/tempfile_extractor"
7
5
  require "simple_text_extract/format_extractor_factory"
8
6
 
9
7
  module SimpleTextExtract
10
- SUPPORTED_FILETYPES = ["xls", "xlsx", "doc", "docx", "txt", "pdf"]
8
+ SUPPORTED_FILETYPES = ["xls", "xlsx", "doc", "docx", "txt", "pdf", "csv", "zip"].freeze
11
9
 
12
10
  class Error < StandardError; end
13
11
 
14
- def self.extract(filename: nil, raw: nil, filepath: nil)
15
- TextExtractor.call(filename: filename, raw: raw, filepath: filepath).to_s
12
+ def self.extract(filename: nil, raw: nil, filepath: nil, tempfile: nil)
13
+ TextExtractor.new(filename: filename, raw: raw, filepath: filepath, tempfile: tempfile).to_s
16
14
  end
17
15
 
18
16
  def self.supports?(filename: nil)
19
- SUPPORTED_FILETYPES.include?(filename.split(".")[1])
17
+ SUPPORTED_FILETYPES.include?(filename.split(".").last)
20
18
  end
21
19
  end
@@ -4,7 +4,7 @@ module SimpleTextExtract
4
4
  module FormatExtractor
5
5
  class Doc < Base
6
6
  def extract
7
- return nil if missing_dependency?('antiword')
7
+ return nil if missing_dependency?("antiword")
8
8
 
9
9
  `antiword #{Shellwords.escape(file.path)}`
10
10
  end
@@ -8,7 +8,6 @@ module SimpleTextExtract
8
8
 
9
9
  spreadsheet = Spreadsheet.open(file)
10
10
  text = []
11
-
12
11
  spreadsheet.worksheets.each do |sheet|
13
12
  text << sheet.name
14
13
  text << sheet.rows
@@ -6,7 +6,7 @@ module SimpleTextExtract
6
6
  def extract
7
7
  require "roo"
8
8
 
9
- spreadsheet = Roo::Spreadsheet.open(file)
9
+ spreadsheet = Roo::Spreadsheet.open(file, only_visible_sheets: true)
10
10
 
11
11
  text = []
12
12
 
@@ -0,0 +1,24 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SimpleTextExtract
4
+ module FormatExtractor
5
+ class ZipExtract < Base
6
+ def extract
7
+ require "zip"
8
+
9
+ result = []
10
+ Zip::File.open(file) do |zip_file|
11
+ zip_file.each do |entry|
12
+ result << entry.name
13
+ result << SimpleTextExtract.extract(
14
+ raw: entry.get_input_stream.read,
15
+ filename: entry.name
16
+ )
17
+ end
18
+ end
19
+
20
+ result.join(" ")
21
+ end
22
+ end
23
+ end
24
+ end
@@ -7,12 +7,15 @@ require "simple_text_extract/format_extractor/xls_x"
7
7
  require "simple_text_extract/format_extractor/xls"
8
8
  require "simple_text_extract/format_extractor/doc_x"
9
9
  require "simple_text_extract/format_extractor/doc"
10
+ require "simple_text_extract/format_extractor/zip_extract"
10
11
 
11
12
  module SimpleTextExtract
12
13
  class FormatExtractorFactory
13
- def self.call(file) # rubocop:disable Metrics/MethodLength
14
+ def self.call(file)
14
15
  case file.path
15
- when /.txt$/i
16
+ when /.zip$/i
17
+ FormatExtractor::ZipExtract.new(file)
18
+ when /(.txt$|.csv$)/i
16
19
  FormatExtractor::PlainText.new(file)
17
20
  when /.pdf$/i
18
21
  FormatExtractor::PDF.new(file)
@@ -2,24 +2,54 @@
2
2
 
3
3
  module SimpleTextExtract
4
4
  class TextExtractor
5
- def self.call(filename: nil, raw: nil, filepath: nil)
6
- if !filename.nil? && !raw.nil?
7
- TempfileExtractor.new(filename: filename.to_s, raw: raw).extract
8
- elsif !filepath.nil? && File.exist?(filepath)
9
- FileExtractor.new(filepath: filepath).extract
10
- end
11
- end
5
+ attr_reader :file
12
6
 
13
- def extract
14
- text = FormatExtractorFactory.call(file).extract
15
- cleanup
7
+ def initialize(filename: nil, raw: nil, filepath: nil, tempfile: nil)
8
+ @file = get_file(filename: filename, raw: raw, filepath: filepath, tempfile: tempfile)
9
+ end
16
10
 
17
- text
11
+ def to_s
12
+ @to_s ||= extract.to_s
18
13
  end
19
14
 
20
15
  private
21
16
 
17
+ def get_file(filename:, raw:, filepath:, tempfile:)
18
+ if tempfile&.class == Tempfile
19
+ tempfile
20
+ elsif !filename.nil? && !raw.nil?
21
+ write_tempfile(filename: filename.to_s, raw: raw)
22
+ elsif !filepath.nil? && File.exist?(filepath)
23
+ File.new(filepath)
24
+ end
25
+ end
26
+
27
+ def extract
28
+ return unless file
29
+
30
+ begin
31
+ FormatExtractorFactory.call(file).extract
32
+ rescue StandardError
33
+ nil
34
+ ensure
35
+ cleanup
36
+ end
37
+ end
38
+
22
39
  def cleanup
40
+ return unless file.instance_of?(Tempfile)
41
+
42
+ file.close
43
+ file.unlink
44
+ end
45
+
46
+ def write_tempfile(filename:, raw:)
47
+ filename = filename.split(".").yield_self { |parts| [parts[0], ".#{parts[1]}"] }
48
+ file = Tempfile.new(filename)
49
+ raw = String.new(raw, encoding: Encoding::UTF_8)
50
+
51
+ file.write(raw)
52
+ file.tap(&:rewind)
23
53
  end
24
54
  end
25
55
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module SimpleTextExtract
4
- VERSION = "0.2.1"
4
+ VERSION = "1.2.0"
5
5
  end
Binary file
@@ -28,11 +28,11 @@ Gem::Specification.new do |spec|
28
28
  spec.requirements << "pdftotext/poppler"
29
29
  spec.required_ruby_version = ">= 2.5"
30
30
 
31
- spec.add_runtime_dependency "roo", "~> 2.8"
31
+ spec.add_runtime_dependency "roo", "~> 2.8.2"
32
32
  spec.add_runtime_dependency "spreadsheet", "~> 1.1.8"
33
+ spec.add_runtime_dependency "rubyzip", ">= 1.0.0"
33
34
 
34
- spec.add_development_dependency "bundler", "~> 1.17"
35
- spec.add_development_dependency "rake", "~> 10.0"
35
+ spec.add_development_dependency "rake", "~> 13.0"
36
36
  spec.add_development_dependency "minitest", "~> 5.0"
37
37
  spec.add_development_dependency "mocha"
38
38
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: simple_text_extract
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 1.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nick Weiland
8
- autorequire:
8
+ autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-01-28 00:00:00.000000000 Z
11
+ date: 2021-05-26 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: roo
@@ -16,14 +16,14 @@ dependencies:
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '2.8'
19
+ version: 2.8.2
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: '2.8'
26
+ version: 2.8.2
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: spreadsheet
29
29
  requirement: !ruby/object:Gem::Requirement
@@ -39,33 +39,33 @@ dependencies:
39
39
  - !ruby/object:Gem::Version
40
40
  version: 1.1.8
41
41
  - !ruby/object:Gem::Dependency
42
- name: bundler
42
+ name: rubyzip
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
- - - "~>"
45
+ - - ">="
46
46
  - !ruby/object:Gem::Version
47
- version: '1.17'
48
- type: :development
47
+ version: 1.0.0
48
+ type: :runtime
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
- - - "~>"
52
+ - - ">="
53
53
  - !ruby/object:Gem::Version
54
- version: '1.17'
54
+ version: 1.0.0
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: rake
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
59
  - - "~>"
60
60
  - !ruby/object:Gem::Version
61
- version: '10.0'
61
+ version: '13.0'
62
62
  type: :development
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
- version: '10.0'
68
+ version: '13.0'
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: minitest
71
71
  requirement: !ruby/object:Gem::Requirement
@@ -114,7 +114,6 @@ files:
114
114
  - bin/console
115
115
  - bin/setup
116
116
  - lib/simple_text_extract.rb
117
- - lib/simple_text_extract/file_extractor.rb
118
117
  - lib/simple_text_extract/format_extractor/base.rb
119
118
  - lib/simple_text_extract/format_extractor/doc.rb
120
119
  - lib/simple_text_extract/format_extractor/doc_x.rb
@@ -122,17 +121,18 @@ files:
122
121
  - lib/simple_text_extract/format_extractor/plain_text.rb
123
122
  - lib/simple_text_extract/format_extractor/xls.rb
124
123
  - lib/simple_text_extract/format_extractor/xls_x.rb
124
+ - lib/simple_text_extract/format_extractor/zip_extract.rb
125
125
  - lib/simple_text_extract/format_extractor_factory.rb
126
- - lib/simple_text_extract/tempfile_extractor.rb
127
126
  - lib/simple_text_extract/text_extractor.rb
128
127
  - lib/simple_text_extract/version.rb
128
+ - simple_text_extract-1.0.2.gem
129
129
  - simple_text_extract.gemspec
130
130
  - tags
131
131
  homepage: https://github.com/weilandia/simple_text_extract
132
132
  licenses:
133
133
  - MIT
134
134
  metadata: {}
135
- post_install_message:
135
+ post_install_message:
136
136
  rdoc_options: []
137
137
  require_paths:
138
138
  - lib
@@ -149,9 +149,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
149
149
  requirements:
150
150
  - antiword
151
151
  - pdftotext/poppler
152
- rubyforge_project:
153
- rubygems_version: 2.7.6
154
- signing_key:
152
+ rubygems_version: 3.2.3
153
+ signing_key:
155
154
  specification_version: 4
156
155
  summary: Attempts to quickly extract text from various file types before resorting
157
156
  to something more extreme like Apache Tika.
@@ -1,17 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module SimpleTextExtract
4
- class FileExtractor < TextExtractor
5
- attr_reader :filepath
6
-
7
- def initialize(filepath:)
8
- @filepath = filepath
9
- end
10
-
11
- private
12
-
13
- def file
14
- @file ||= File.new(filepath)
15
- end
16
- end
17
- end
@@ -1,34 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module SimpleTextExtract
4
- class TempfileExtractor < TextExtractor
5
- attr_reader :filename, :raw
6
-
7
- def initialize(filename:, raw:)
8
- @filename = filename
9
- @raw = String.new(raw, encoding: Encoding::UTF_8)
10
-
11
- write_raw
12
- end
13
-
14
- private
15
-
16
- def file
17
- @file ||= Tempfile.new(filepath)
18
- end
19
-
20
- def write_raw
21
- file.write(raw)
22
- file.rewind
23
- end
24
-
25
- def cleanup
26
- file.close
27
- file.unlink
28
- end
29
-
30
- def filepath
31
- @filepath ||= filename.split(".").yield_self { |parts| [parts[0], ".#{parts[1]}"] }
32
- end
33
- end
34
- end