simple_text_extract 0.1.3 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +13 -1
- data/README.md +0 -4
- data/lib/simple_text_extract/format_extractor/xls.rb +21 -0
- data/lib/simple_text_extract/format_extractor/xls_x.rb +8 -6
- data/lib/simple_text_extract/format_extractor_factory.rb +4 -1
- data/lib/simple_text_extract/version.rb +1 -1
- data/simple_text_extract.gemspec +4 -1
- metadata +32 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: fa34a31195f18156df695d2fb860a2beb9562f6a8ae76d6f2b6a7d4585e2e306
|
4
|
+
data.tar.gz: 3af31eaa54ed98bf8d0355cdf2bdf29f525a97e1ae66573b1c7317cbfcd3c951
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3d37e232dd959b4c0897439a29b46f64d598f024b61bba2edc2cdad1f0d14461db77b93a227326697d4913e83ab87b4393234943aef964285b253c652849436d
|
7
|
+
data.tar.gz: 44255a841598321a97559ef3a77c3db3e6f6a7d22cb78fc78819aaecef643b8b67c85484b4f10ac979923f28c8f727fcaca69ebec694b3df9fe5f6a9c37cc1f9
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,9 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
simple_text_extract (0.1.
|
4
|
+
simple_text_extract (0.1.3)
|
5
|
+
roo (~> 2.8)
|
6
|
+
spreadsheet (~> 1.1.8)
|
5
7
|
|
6
8
|
GEM
|
7
9
|
remote: https://rubygems.org/
|
@@ -9,13 +11,23 @@ GEM
|
|
9
11
|
coderay (1.1.2)
|
10
12
|
metaclass (0.0.4)
|
11
13
|
method_source (0.9.2)
|
14
|
+
mini_portile2 (2.4.0)
|
12
15
|
minitest (5.11.3)
|
13
16
|
mocha (1.8.0)
|
14
17
|
metaclass (~> 0.0.1)
|
18
|
+
nokogiri (1.10.1)
|
19
|
+
mini_portile2 (~> 2.4.0)
|
15
20
|
pry (0.12.2)
|
16
21
|
coderay (~> 1.1.0)
|
17
22
|
method_source (~> 0.9.0)
|
18
23
|
rake (10.5.0)
|
24
|
+
roo (2.8.1)
|
25
|
+
nokogiri (~> 1)
|
26
|
+
rubyzip (>= 1.2.1, < 2.0.0)
|
27
|
+
ruby-ole (1.2.12.1)
|
28
|
+
rubyzip (1.2.2)
|
29
|
+
spreadsheet (1.1.8)
|
30
|
+
ruby-ole (>= 1.0)
|
19
31
|
|
20
32
|
PLATFORMS
|
21
33
|
ruby
|
data/README.md
CHANGED
@@ -51,9 +51,6 @@ You can choose to use SimpleTextExtract without the following dependencies, but
|
|
51
51
|
`doc` parsing requires `antiword`
|
52
52
|
- `brew install antiword`
|
53
53
|
|
54
|
-
`xlsx` and `xls` parsing requires `ssconvert` which is part of `gnumeric`
|
55
|
-
- `brew install gnumeric`
|
56
|
-
|
57
54
|
### Usage on Heroku
|
58
55
|
|
59
56
|
To use on Heroku you'll have to add some custom buildpacks.
|
@@ -72,7 +69,6 @@ To add `antiword` as a dependency on Heroku, install the [heroku-buildpack-apt](
|
|
72
69
|
In your `Aptfile`, add:
|
73
70
|
```
|
74
71
|
antiword
|
75
|
-
gnumeric
|
76
72
|
```
|
77
73
|
|
78
74
|
## Benchmarks
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module SimpleTextExtract
|
4
|
+
module FormatExtractor
|
5
|
+
class Xls < Base
|
6
|
+
def extract
|
7
|
+
require "spreadsheet"
|
8
|
+
|
9
|
+
spreadsheet = Spreadsheet.open(file)
|
10
|
+
text = []
|
11
|
+
|
12
|
+
spreadsheet.worksheets.each do |sheet|
|
13
|
+
text << sheet.name
|
14
|
+
text << sheet.rows
|
15
|
+
end
|
16
|
+
|
17
|
+
text.flatten.join(" ")
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
@@ -4,16 +4,18 @@ module SimpleTextExtract
|
|
4
4
|
module FormatExtractor
|
5
5
|
class XlsX < Base
|
6
6
|
def extract
|
7
|
-
|
7
|
+
require "roo"
|
8
8
|
|
9
|
-
|
9
|
+
spreadsheet = Roo::Spreadsheet.open(file)
|
10
10
|
|
11
|
-
|
11
|
+
text = []
|
12
12
|
|
13
|
-
|
14
|
-
|
13
|
+
spreadsheet.each_with_pagename do |name, sheet|
|
14
|
+
text << name
|
15
|
+
1.upto(sheet.last_row.to_i) { |row| text << sheet.row(row) }
|
16
|
+
end
|
15
17
|
|
16
|
-
text
|
18
|
+
text.flatten.join(" ")
|
17
19
|
end
|
18
20
|
end
|
19
21
|
end
|
@@ -4,6 +4,7 @@ require "simple_text_extract/format_extractor/base"
|
|
4
4
|
require "simple_text_extract/format_extractor/plain_text"
|
5
5
|
require "simple_text_extract/format_extractor/pdf"
|
6
6
|
require "simple_text_extract/format_extractor/xls_x"
|
7
|
+
require "simple_text_extract/format_extractor/xls"
|
7
8
|
require "simple_text_extract/format_extractor/doc_x"
|
8
9
|
require "simple_text_extract/format_extractor/doc"
|
9
10
|
|
@@ -19,8 +20,10 @@ module SimpleTextExtract
|
|
19
20
|
FormatExtractor::DocX.new(file)
|
20
21
|
when /.doc$/i
|
21
22
|
FormatExtractor::Doc.new(file)
|
22
|
-
when
|
23
|
+
when /.xlsx$/i
|
23
24
|
FormatExtractor::XlsX.new(file)
|
25
|
+
when /.xls$/i
|
26
|
+
FormatExtractor::Xls.new(file)
|
24
27
|
else
|
25
28
|
FormatExtractor::Base.new(file)
|
26
29
|
end
|
data/simple_text_extract.gemspec
CHANGED
@@ -24,10 +24,13 @@ Gem::Specification.new do |spec|
|
|
24
24
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
25
25
|
spec.require_paths = ["lib"]
|
26
26
|
|
27
|
-
spec.requirements << "
|
27
|
+
spec.requirements << "antiword"
|
28
28
|
spec.requirements << "pdftotext/poppler"
|
29
29
|
spec.required_ruby_version = ">= 2.5"
|
30
30
|
|
31
|
+
spec.add_runtime_dependency "roo", "~> 2.8"
|
32
|
+
spec.add_runtime_dependency "spreadsheet", "~> 1.1.8"
|
33
|
+
|
31
34
|
spec.add_development_dependency "bundler", "~> 1.17"
|
32
35
|
spec.add_development_dependency "rake", "~> 10.0"
|
33
36
|
spec.add_development_dependency "minitest", "~> 5.0"
|
metadata
CHANGED
@@ -1,15 +1,43 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: simple_text_extract
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Nick Weiland
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-01-
|
11
|
+
date: 2019-01-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: roo
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '2.8'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '2.8'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: spreadsheet
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 1.1.8
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 1.1.8
|
13
41
|
- !ruby/object:Gem::Dependency
|
14
42
|
name: bundler
|
15
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -92,6 +120,7 @@ files:
|
|
92
120
|
- lib/simple_text_extract/format_extractor/doc_x.rb
|
93
121
|
- lib/simple_text_extract/format_extractor/pdf.rb
|
94
122
|
- lib/simple_text_extract/format_extractor/plain_text.rb
|
123
|
+
- lib/simple_text_extract/format_extractor/xls.rb
|
95
124
|
- lib/simple_text_extract/format_extractor/xls_x.rb
|
96
125
|
- lib/simple_text_extract/format_extractor_factory.rb
|
97
126
|
- lib/simple_text_extract/tempfile_extractor.rb
|
@@ -118,7 +147,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
118
147
|
- !ruby/object:Gem::Version
|
119
148
|
version: '0'
|
120
149
|
requirements:
|
121
|
-
-
|
150
|
+
- antiword
|
122
151
|
- pdftotext/poppler
|
123
152
|
rubyforge_project:
|
124
153
|
rubygems_version: 2.7.6
|