simple_text_extract 0.1.3 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: bd06e0bd11dd74c71adb01918b474b714ebd3762785931d14d294052aa3301e5
4
- data.tar.gz: 8fbfff63e6403e4abfc980abd03fce52a1653010c6d401c2507ae253d9391916
3
+ metadata.gz: fa34a31195f18156df695d2fb860a2beb9562f6a8ae76d6f2b6a7d4585e2e306
4
+ data.tar.gz: 3af31eaa54ed98bf8d0355cdf2bdf29f525a97e1ae66573b1c7317cbfcd3c951
5
5
  SHA512:
6
- metadata.gz: 6d2cc814d3b419e9540800752f097b2f9d860e4756e7bd6b8e62f6178ad6cd7967c0f0dd9912a6f900564a686e78564c2c2acb7cee412c9e857ae6ed48cc906e
7
- data.tar.gz: dbb00c6da2de38f9d254486adb98bbce2c607a2c54c8637e2b8c9e08efae746ff8bd2ef384a9b1f7ffc2d4edaabda7527498c63726525aa29ba005f24db03770
6
+ metadata.gz: 3d37e232dd959b4c0897439a29b46f64d598f024b61bba2edc2cdad1f0d14461db77b93a227326697d4913e83ab87b4393234943aef964285b253c652849436d
7
+ data.tar.gz: 44255a841598321a97559ef3a77c3db3e6f6a7d22cb78fc78819aaecef643b8b67c85484b4f10ac979923f28c8f727fcaca69ebec694b3df9fe5f6a9c37cc1f9
data/Gemfile.lock CHANGED
@@ -1,7 +1,9 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- simple_text_extract (0.1.1)
4
+ simple_text_extract (0.1.3)
5
+ roo (~> 2.8)
6
+ spreadsheet (~> 1.1.8)
5
7
 
6
8
  GEM
7
9
  remote: https://rubygems.org/
@@ -9,13 +11,23 @@ GEM
9
11
  coderay (1.1.2)
10
12
  metaclass (0.0.4)
11
13
  method_source (0.9.2)
14
+ mini_portile2 (2.4.0)
12
15
  minitest (5.11.3)
13
16
  mocha (1.8.0)
14
17
  metaclass (~> 0.0.1)
18
+ nokogiri (1.10.1)
19
+ mini_portile2 (~> 2.4.0)
15
20
  pry (0.12.2)
16
21
  coderay (~> 1.1.0)
17
22
  method_source (~> 0.9.0)
18
23
  rake (10.5.0)
24
+ roo (2.8.1)
25
+ nokogiri (~> 1)
26
+ rubyzip (>= 1.2.1, < 2.0.0)
27
+ ruby-ole (1.2.12.1)
28
+ rubyzip (1.2.2)
29
+ spreadsheet (1.1.8)
30
+ ruby-ole (>= 1.0)
19
31
 
20
32
  PLATFORMS
21
33
  ruby
data/README.md CHANGED
@@ -51,9 +51,6 @@ You can choose to use SimpleTextExtract without the following dependencies, but
51
51
  `doc` parsing requires `antiword`
52
52
  - `brew install antiword`
53
53
 
54
- `xlsx` and `xls` parsing requires `ssconvert` which is part of `gnumeric`
55
- - `brew install gnumeric`
56
-
57
54
  ### Usage on Heroku
58
55
 
59
56
  To use on Heroku you'll have to add some custom buildpacks.
@@ -72,7 +69,6 @@ To add `antiword` as a dependency on Heroku, install the [heroku-buildpack-apt](
72
69
  In your `Aptfile`, add:
73
70
  ```
74
71
  antiword
75
- gnumeric
76
72
  ```
77
73
 
78
74
  ## Benchmarks
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SimpleTextExtract
4
+ module FormatExtractor
5
+ class Xls < Base
6
+ def extract
7
+ require "spreadsheet"
8
+
9
+ spreadsheet = Spreadsheet.open(file)
10
+ text = []
11
+
12
+ spreadsheet.worksheets.each do |sheet|
13
+ text << sheet.name
14
+ text << sheet.rows
15
+ end
16
+
17
+ text.flatten.join(" ")
18
+ end
19
+ end
20
+ end
21
+ end
@@ -4,16 +4,18 @@ module SimpleTextExtract
4
4
  module FormatExtractor
5
5
  class XlsX < Base
6
6
  def extract
7
- return nil if missing_dependency?("ssconvert")
7
+ require "roo"
8
8
 
9
- extract_filepath = "#{file.path.split(".")[0]}.txt"
9
+ spreadsheet = Roo::Spreadsheet.open(file)
10
10
 
11
- `ssconvert -O 'separator=" "' #{Shellwords.escape(file.path)} #{extract_filepath}`
11
+ text = []
12
12
 
13
- text = File.read(extract_filepath)
14
- File.unlink(extract_filepath)
13
+ spreadsheet.each_with_pagename do |name, sheet|
14
+ text << name
15
+ 1.upto(sheet.last_row.to_i) { |row| text << sheet.row(row) }
16
+ end
15
17
 
16
- text
18
+ text.flatten.join(" ")
17
19
  end
18
20
  end
19
21
  end
@@ -4,6 +4,7 @@ require "simple_text_extract/format_extractor/base"
4
4
  require "simple_text_extract/format_extractor/plain_text"
5
5
  require "simple_text_extract/format_extractor/pdf"
6
6
  require "simple_text_extract/format_extractor/xls_x"
7
+ require "simple_text_extract/format_extractor/xls"
7
8
  require "simple_text_extract/format_extractor/doc_x"
8
9
  require "simple_text_extract/format_extractor/doc"
9
10
 
@@ -19,8 +20,10 @@ module SimpleTextExtract
19
20
  FormatExtractor::DocX.new(file)
20
21
  when /.doc$/i
21
22
  FormatExtractor::Doc.new(file)
22
- when /(.xlsx$|.xls$)/i
23
+ when /.xlsx$/i
23
24
  FormatExtractor::XlsX.new(file)
25
+ when /.xls$/i
26
+ FormatExtractor::Xls.new(file)
24
27
  else
25
28
  FormatExtractor::Base.new(file)
26
29
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module SimpleTextExtract
4
- VERSION = "0.1.3"
4
+ VERSION = "0.2.0"
5
5
  end
@@ -24,10 +24,13 @@ Gem::Specification.new do |spec|
24
24
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
25
25
  spec.require_paths = ["lib"]
26
26
 
27
- spec.requirements << "Antiword"
27
+ spec.requirements << "antiword"
28
28
  spec.requirements << "pdftotext/poppler"
29
29
  spec.required_ruby_version = ">= 2.5"
30
30
 
31
+ spec.add_runtime_dependency "roo", "~> 2.8"
32
+ spec.add_runtime_dependency "spreadsheet", "~> 1.1.8"
33
+
31
34
  spec.add_development_dependency "bundler", "~> 1.17"
32
35
  spec.add_development_dependency "rake", "~> 10.0"
33
36
  spec.add_development_dependency "minitest", "~> 5.0"
metadata CHANGED
@@ -1,15 +1,43 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: simple_text_extract
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nick Weiland
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-01-24 00:00:00.000000000 Z
11
+ date: 2019-01-25 00:00:00.000000000 Z
12
12
  dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: roo
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '2.8'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '2.8'
27
+ - !ruby/object:Gem::Dependency
28
+ name: spreadsheet
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: 1.1.8
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: 1.1.8
13
41
  - !ruby/object:Gem::Dependency
14
42
  name: bundler
15
43
  requirement: !ruby/object:Gem::Requirement
@@ -92,6 +120,7 @@ files:
92
120
  - lib/simple_text_extract/format_extractor/doc_x.rb
93
121
  - lib/simple_text_extract/format_extractor/pdf.rb
94
122
  - lib/simple_text_extract/format_extractor/plain_text.rb
123
+ - lib/simple_text_extract/format_extractor/xls.rb
95
124
  - lib/simple_text_extract/format_extractor/xls_x.rb
96
125
  - lib/simple_text_extract/format_extractor_factory.rb
97
126
  - lib/simple_text_extract/tempfile_extractor.rb
@@ -118,7 +147,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
118
147
  - !ruby/object:Gem::Version
119
148
  version: '0'
120
149
  requirements:
121
- - Antiword
150
+ - antiword
122
151
  - pdftotext/poppler
123
152
  rubyforge_project:
124
153
  rubygems_version: 2.7.6