simple_text_extract 0.1.3 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: bd06e0bd11dd74c71adb01918b474b714ebd3762785931d14d294052aa3301e5
4
- data.tar.gz: 8fbfff63e6403e4abfc980abd03fce52a1653010c6d401c2507ae253d9391916
3
+ metadata.gz: fa34a31195f18156df695d2fb860a2beb9562f6a8ae76d6f2b6a7d4585e2e306
4
+ data.tar.gz: 3af31eaa54ed98bf8d0355cdf2bdf29f525a97e1ae66573b1c7317cbfcd3c951
5
5
  SHA512:
6
- metadata.gz: 6d2cc814d3b419e9540800752f097b2f9d860e4756e7bd6b8e62f6178ad6cd7967c0f0dd9912a6f900564a686e78564c2c2acb7cee412c9e857ae6ed48cc906e
7
- data.tar.gz: dbb00c6da2de38f9d254486adb98bbce2c607a2c54c8637e2b8c9e08efae746ff8bd2ef384a9b1f7ffc2d4edaabda7527498c63726525aa29ba005f24db03770
6
+ metadata.gz: 3d37e232dd959b4c0897439a29b46f64d598f024b61bba2edc2cdad1f0d14461db77b93a227326697d4913e83ab87b4393234943aef964285b253c652849436d
7
+ data.tar.gz: 44255a841598321a97559ef3a77c3db3e6f6a7d22cb78fc78819aaecef643b8b67c85484b4f10ac979923f28c8f727fcaca69ebec694b3df9fe5f6a9c37cc1f9
data/Gemfile.lock CHANGED
@@ -1,7 +1,9 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- simple_text_extract (0.1.1)
4
+ simple_text_extract (0.1.3)
5
+ roo (~> 2.8)
6
+ spreadsheet (~> 1.1.8)
5
7
 
6
8
  GEM
7
9
  remote: https://rubygems.org/
@@ -9,13 +11,23 @@ GEM
9
11
  coderay (1.1.2)
10
12
  metaclass (0.0.4)
11
13
  method_source (0.9.2)
14
+ mini_portile2 (2.4.0)
12
15
  minitest (5.11.3)
13
16
  mocha (1.8.0)
14
17
  metaclass (~> 0.0.1)
18
+ nokogiri (1.10.1)
19
+ mini_portile2 (~> 2.4.0)
15
20
  pry (0.12.2)
16
21
  coderay (~> 1.1.0)
17
22
  method_source (~> 0.9.0)
18
23
  rake (10.5.0)
24
+ roo (2.8.1)
25
+ nokogiri (~> 1)
26
+ rubyzip (>= 1.2.1, < 2.0.0)
27
+ ruby-ole (1.2.12.1)
28
+ rubyzip (1.2.2)
29
+ spreadsheet (1.1.8)
30
+ ruby-ole (>= 1.0)
19
31
 
20
32
  PLATFORMS
21
33
  ruby
data/README.md CHANGED
@@ -51,9 +51,6 @@ You can choose to use SimpleTextExtract without the following dependencies, but
51
51
  `doc` parsing requires `antiword`
52
52
  - `brew install antiword`
53
53
 
54
- `xlsx` and `xls` parsing requires `ssconvert` which is part of `gnumeric`
55
- - `brew install gnumeric`
56
-
57
54
  ### Usage on Heroku
58
55
 
59
56
  To use on Heroku you'll have to add some custom buildpacks.
@@ -72,7 +69,6 @@ To add `antiword` as a dependency on Heroku, install the [heroku-buildpack-apt](
72
69
  In your `Aptfile`, add:
73
70
  ```
74
71
  antiword
75
- gnumeric
76
72
  ```
77
73
 
78
74
  ## Benchmarks
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SimpleTextExtract
4
+ module FormatExtractor
5
+ class Xls < Base
6
+ def extract
7
+ require "spreadsheet"
8
+
9
+ spreadsheet = Spreadsheet.open(file)
10
+ text = []
11
+
12
+ spreadsheet.worksheets.each do |sheet|
13
+ text << sheet.name
14
+ text << sheet.rows
15
+ end
16
+
17
+ text.flatten.join(" ")
18
+ end
19
+ end
20
+ end
21
+ end
@@ -4,16 +4,18 @@ module SimpleTextExtract
4
4
  module FormatExtractor
5
5
  class XlsX < Base
6
6
  def extract
7
- return nil if missing_dependency?("ssconvert")
7
+ require "roo"
8
8
 
9
- extract_filepath = "#{file.path.split(".")[0]}.txt"
9
+ spreadsheet = Roo::Spreadsheet.open(file)
10
10
 
11
- `ssconvert -O 'separator=" "' #{Shellwords.escape(file.path)} #{extract_filepath}`
11
+ text = []
12
12
 
13
- text = File.read(extract_filepath)
14
- File.unlink(extract_filepath)
13
+ spreadsheet.each_with_pagename do |name, sheet|
14
+ text << name
15
+ 1.upto(sheet.last_row.to_i) { |row| text << sheet.row(row) }
16
+ end
15
17
 
16
- text
18
+ text.flatten.join(" ")
17
19
  end
18
20
  end
19
21
  end
@@ -4,6 +4,7 @@ require "simple_text_extract/format_extractor/base"
4
4
  require "simple_text_extract/format_extractor/plain_text"
5
5
  require "simple_text_extract/format_extractor/pdf"
6
6
  require "simple_text_extract/format_extractor/xls_x"
7
+ require "simple_text_extract/format_extractor/xls"
7
8
  require "simple_text_extract/format_extractor/doc_x"
8
9
  require "simple_text_extract/format_extractor/doc"
9
10
 
@@ -19,8 +20,10 @@ module SimpleTextExtract
19
20
  FormatExtractor::DocX.new(file)
20
21
  when /.doc$/i
21
22
  FormatExtractor::Doc.new(file)
22
- when /(.xlsx$|.xls$)/i
23
+ when /.xlsx$/i
23
24
  FormatExtractor::XlsX.new(file)
25
+ when /.xls$/i
26
+ FormatExtractor::Xls.new(file)
24
27
  else
25
28
  FormatExtractor::Base.new(file)
26
29
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module SimpleTextExtract
4
- VERSION = "0.1.3"
4
+ VERSION = "0.2.0"
5
5
  end
@@ -24,10 +24,13 @@ Gem::Specification.new do |spec|
24
24
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
25
25
  spec.require_paths = ["lib"]
26
26
 
27
- spec.requirements << "Antiword"
27
+ spec.requirements << "antiword"
28
28
  spec.requirements << "pdftotext/poppler"
29
29
  spec.required_ruby_version = ">= 2.5"
30
30
 
31
+ spec.add_runtime_dependency "roo", "~> 2.8"
32
+ spec.add_runtime_dependency "spreadsheet", "~> 1.1.8"
33
+
31
34
  spec.add_development_dependency "bundler", "~> 1.17"
32
35
  spec.add_development_dependency "rake", "~> 10.0"
33
36
  spec.add_development_dependency "minitest", "~> 5.0"
metadata CHANGED
@@ -1,15 +1,43 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: simple_text_extract
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nick Weiland
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-01-24 00:00:00.000000000 Z
11
+ date: 2019-01-25 00:00:00.000000000 Z
12
12
  dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: roo
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '2.8'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '2.8'
27
+ - !ruby/object:Gem::Dependency
28
+ name: spreadsheet
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: 1.1.8
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: 1.1.8
13
41
  - !ruby/object:Gem::Dependency
14
42
  name: bundler
15
43
  requirement: !ruby/object:Gem::Requirement
@@ -92,6 +120,7 @@ files:
92
120
  - lib/simple_text_extract/format_extractor/doc_x.rb
93
121
  - lib/simple_text_extract/format_extractor/pdf.rb
94
122
  - lib/simple_text_extract/format_extractor/plain_text.rb
123
+ - lib/simple_text_extract/format_extractor/xls.rb
95
124
  - lib/simple_text_extract/format_extractor/xls_x.rb
96
125
  - lib/simple_text_extract/format_extractor_factory.rb
97
126
  - lib/simple_text_extract/tempfile_extractor.rb
@@ -118,7 +147,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
118
147
  - !ruby/object:Gem::Version
119
148
  version: '0'
120
149
  requirements:
121
- - Antiword
150
+ - antiword
122
151
  - pdftotext/poppler
123
152
  rubyforge_project:
124
153
  rubygems_version: 2.7.6