simple_text_extract 0.1.3 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +13 -1
- data/README.md +0 -4
- data/lib/simple_text_extract/format_extractor/xls.rb +21 -0
- data/lib/simple_text_extract/format_extractor/xls_x.rb +8 -6
- data/lib/simple_text_extract/format_extractor_factory.rb +4 -1
- data/lib/simple_text_extract/version.rb +1 -1
- data/simple_text_extract.gemspec +4 -1
- metadata +32 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: fa34a31195f18156df695d2fb860a2beb9562f6a8ae76d6f2b6a7d4585e2e306
|
4
|
+
data.tar.gz: 3af31eaa54ed98bf8d0355cdf2bdf29f525a97e1ae66573b1c7317cbfcd3c951
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3d37e232dd959b4c0897439a29b46f64d598f024b61bba2edc2cdad1f0d14461db77b93a227326697d4913e83ab87b4393234943aef964285b253c652849436d
|
7
|
+
data.tar.gz: 44255a841598321a97559ef3a77c3db3e6f6a7d22cb78fc78819aaecef643b8b67c85484b4f10ac979923f28c8f727fcaca69ebec694b3df9fe5f6a9c37cc1f9
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,9 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
simple_text_extract (0.1.
|
4
|
+
simple_text_extract (0.1.3)
|
5
|
+
roo (~> 2.8)
|
6
|
+
spreadsheet (~> 1.1.8)
|
5
7
|
|
6
8
|
GEM
|
7
9
|
remote: https://rubygems.org/
|
@@ -9,13 +11,23 @@ GEM
|
|
9
11
|
coderay (1.1.2)
|
10
12
|
metaclass (0.0.4)
|
11
13
|
method_source (0.9.2)
|
14
|
+
mini_portile2 (2.4.0)
|
12
15
|
minitest (5.11.3)
|
13
16
|
mocha (1.8.0)
|
14
17
|
metaclass (~> 0.0.1)
|
18
|
+
nokogiri (1.10.1)
|
19
|
+
mini_portile2 (~> 2.4.0)
|
15
20
|
pry (0.12.2)
|
16
21
|
coderay (~> 1.1.0)
|
17
22
|
method_source (~> 0.9.0)
|
18
23
|
rake (10.5.0)
|
24
|
+
roo (2.8.1)
|
25
|
+
nokogiri (~> 1)
|
26
|
+
rubyzip (>= 1.2.1, < 2.0.0)
|
27
|
+
ruby-ole (1.2.12.1)
|
28
|
+
rubyzip (1.2.2)
|
29
|
+
spreadsheet (1.1.8)
|
30
|
+
ruby-ole (>= 1.0)
|
19
31
|
|
20
32
|
PLATFORMS
|
21
33
|
ruby
|
data/README.md
CHANGED
@@ -51,9 +51,6 @@ You can choose to use SimpleTextExtract without the following dependencies, but
|
|
51
51
|
`doc` parsing requires `antiword`
|
52
52
|
- `brew install antiword`
|
53
53
|
|
54
|
-
`xlsx` and `xls` parsing requires `ssconvert` which is part of `gnumeric`
|
55
|
-
- `brew install gnumeric`
|
56
|
-
|
57
54
|
### Usage on Heroku
|
58
55
|
|
59
56
|
To use on Heroku you'll have to add some custom buildpacks.
|
@@ -72,7 +69,6 @@ To add `antiword` as a dependency on Heroku, install the [heroku-buildpack-apt](
|
|
72
69
|
In your `Aptfile`, add:
|
73
70
|
```
|
74
71
|
antiword
|
75
|
-
gnumeric
|
76
72
|
```
|
77
73
|
|
78
74
|
## Benchmarks
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module SimpleTextExtract
|
4
|
+
module FormatExtractor
|
5
|
+
class Xls < Base
|
6
|
+
def extract
|
7
|
+
require "spreadsheet"
|
8
|
+
|
9
|
+
spreadsheet = Spreadsheet.open(file)
|
10
|
+
text = []
|
11
|
+
|
12
|
+
spreadsheet.worksheets.each do |sheet|
|
13
|
+
text << sheet.name
|
14
|
+
text << sheet.rows
|
15
|
+
end
|
16
|
+
|
17
|
+
text.flatten.join(" ")
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
@@ -4,16 +4,18 @@ module SimpleTextExtract
|
|
4
4
|
module FormatExtractor
|
5
5
|
class XlsX < Base
|
6
6
|
def extract
|
7
|
-
|
7
|
+
require "roo"
|
8
8
|
|
9
|
-
|
9
|
+
spreadsheet = Roo::Spreadsheet.open(file)
|
10
10
|
|
11
|
-
|
11
|
+
text = []
|
12
12
|
|
13
|
-
|
14
|
-
|
13
|
+
spreadsheet.each_with_pagename do |name, sheet|
|
14
|
+
text << name
|
15
|
+
1.upto(sheet.last_row.to_i) { |row| text << sheet.row(row) }
|
16
|
+
end
|
15
17
|
|
16
|
-
text
|
18
|
+
text.flatten.join(" ")
|
17
19
|
end
|
18
20
|
end
|
19
21
|
end
|
@@ -4,6 +4,7 @@ require "simple_text_extract/format_extractor/base"
|
|
4
4
|
require "simple_text_extract/format_extractor/plain_text"
|
5
5
|
require "simple_text_extract/format_extractor/pdf"
|
6
6
|
require "simple_text_extract/format_extractor/xls_x"
|
7
|
+
require "simple_text_extract/format_extractor/xls"
|
7
8
|
require "simple_text_extract/format_extractor/doc_x"
|
8
9
|
require "simple_text_extract/format_extractor/doc"
|
9
10
|
|
@@ -19,8 +20,10 @@ module SimpleTextExtract
|
|
19
20
|
FormatExtractor::DocX.new(file)
|
20
21
|
when /.doc$/i
|
21
22
|
FormatExtractor::Doc.new(file)
|
22
|
-
when
|
23
|
+
when /.xlsx$/i
|
23
24
|
FormatExtractor::XlsX.new(file)
|
25
|
+
when /.xls$/i
|
26
|
+
FormatExtractor::Xls.new(file)
|
24
27
|
else
|
25
28
|
FormatExtractor::Base.new(file)
|
26
29
|
end
|
data/simple_text_extract.gemspec
CHANGED
@@ -24,10 +24,13 @@ Gem::Specification.new do |spec|
|
|
24
24
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
25
25
|
spec.require_paths = ["lib"]
|
26
26
|
|
27
|
-
spec.requirements << "
|
27
|
+
spec.requirements << "antiword"
|
28
28
|
spec.requirements << "pdftotext/poppler"
|
29
29
|
spec.required_ruby_version = ">= 2.5"
|
30
30
|
|
31
|
+
spec.add_runtime_dependency "roo", "~> 2.8"
|
32
|
+
spec.add_runtime_dependency "spreadsheet", "~> 1.1.8"
|
33
|
+
|
31
34
|
spec.add_development_dependency "bundler", "~> 1.17"
|
32
35
|
spec.add_development_dependency "rake", "~> 10.0"
|
33
36
|
spec.add_development_dependency "minitest", "~> 5.0"
|
metadata
CHANGED
@@ -1,15 +1,43 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: simple_text_extract
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.3
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Nick Weiland
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-01-
|
11
|
+
date: 2019-01-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: roo
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '2.8'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '2.8'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: spreadsheet
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 1.1.8
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 1.1.8
|
13
41
|
- !ruby/object:Gem::Dependency
|
14
42
|
name: bundler
|
15
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -92,6 +120,7 @@ files:
|
|
92
120
|
- lib/simple_text_extract/format_extractor/doc_x.rb
|
93
121
|
- lib/simple_text_extract/format_extractor/pdf.rb
|
94
122
|
- lib/simple_text_extract/format_extractor/plain_text.rb
|
123
|
+
- lib/simple_text_extract/format_extractor/xls.rb
|
95
124
|
- lib/simple_text_extract/format_extractor/xls_x.rb
|
96
125
|
- lib/simple_text_extract/format_extractor_factory.rb
|
97
126
|
- lib/simple_text_extract/tempfile_extractor.rb
|
@@ -118,7 +147,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
118
147
|
- !ruby/object:Gem::Version
|
119
148
|
version: '0'
|
120
149
|
requirements:
|
121
|
-
-
|
150
|
+
- antiword
|
122
151
|
- pdftotext/poppler
|
123
152
|
rubyforge_project:
|
124
153
|
rubygems_version: 2.7.6
|