simple_text_extract 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6bfb9a91dc36259a45033d005b1b5a4bc37c941b153235708ca6755d77cce66e
|
4
|
+
data.tar.gz: dad58cb4b7f039d258196a1ce8568e8169214b12d8ff1c024e1fd9f8412fdf5b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0c5923028e2ff87feecddfcc2f6b340d4ffeb20d92bde686d607a1f498a2fe94326cd5f2df514f392641bbea988776955c76c4a12651178c2a32f885fda39c3d
|
7
|
+
data.tar.gz: acfb70f0bc6746011111a39c5fb730e378372e21b7b5f9eafda1f072c0ead5801ac2425d00070e1b2cc6cb78f6df3a93a0d313ae4627af1e3a7de3339baab882
|
data/README.md
CHANGED
@@ -7,6 +7,8 @@ SimpleTextExtract handles parsing text from:
|
|
7
7
|
- `.pdf`
|
8
8
|
- `.docx`
|
9
9
|
- `.doc`
|
10
|
+
- `.xlsx`
|
11
|
+
- `.xls`
|
10
12
|
- `.txt` 😜
|
11
13
|
|
12
14
|
If no text is parsed (for `pdf`), or a file format is not supported (like images), then `nil` is returned and you can move on to the heavy-duty tools like [Henkei](https://github.com/abrom/henkei) 💪.
|
@@ -32,11 +34,11 @@ Or install it yourself as:
|
|
32
34
|
Text can be parsed from raw file content or files in the filesystem by calling `SimpleTextExtract.extract`:
|
33
35
|
|
34
36
|
```ruby
|
35
|
-
|
36
|
-
|
37
|
+
# raw file content using ActiveStorage
|
38
|
+
SimpleTextExtract.extract(filename: attachment.blob.filename, raw: attachment.download)
|
37
39
|
|
38
|
-
|
39
|
-
|
40
|
+
# filesystem
|
41
|
+
SimpleTextExtract.extract(filepath: "path_to_file.pdf")
|
40
42
|
```
|
41
43
|
|
42
44
|
### Usage Dependencies
|
@@ -49,6 +51,9 @@ You can choose to use SimpleTextExtract without the following dependencies, but
|
|
49
51
|
`doc` parsing requires `antiword`
|
50
52
|
- `brew install antiword`
|
51
53
|
|
54
|
+
`xlsx` and `xls` parsing requires `ssconvert` which is part of `gnumeric`
|
55
|
+
- `brew install gnumeric`
|
56
|
+
|
52
57
|
### Usage on Heroku
|
53
58
|
|
54
59
|
To use on Heroku you'll have to add some custom buildpacks.
|
@@ -67,6 +72,7 @@ To add `antiword` as a dependency on Heroku, install the [heroku-buildpack-apt](
|
|
67
72
|
In your `Aptfile`, add:
|
68
73
|
```
|
69
74
|
antiword
|
75
|
+
gnumeric
|
70
76
|
```
|
71
77
|
|
72
78
|
## Development
|
@@ -0,0 +1,20 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module SimpleTextExtract
  module FormatExtractor
    # Extracts text from `.xlsx` / `.xls` spreadsheets by shelling out to
    # `ssconvert` (shipped with gnumeric).
    class XlsX < Base
      # Converts the spreadsheet at `file.path` into a sibling `.txt` file,
      # reads that file back and removes it again.
      #
      # @return [String, nil] the converted text; `nil` when `ssconvert` is not
      #   on the PATH or when the conversion produced no output file.
      def extract
        return nil if `command -v ssconvert`.empty?

        # Strip only the final extension of the basename. Splitting the whole
        # path on "." would cut it at the first dot, which may sit in a
        # directory name (e.g. "/tmp/a.b/sheet.xlsx").
        source_path = file.path
        extract_filepath = File.join(
          File.dirname(source_path),
          "#{File.basename(source_path, ".*")}.txt"
        )

        # Both paths are escaped: either may contain spaces or shell metacharacters.
        `ssconvert -O 'separator=" "' #{Shellwords.escape(source_path)} #{Shellwords.escape(extract_filepath)}`

        # A failed conversion leaves no output file; report "no text" instead of raising.
        File.read(extract_filepath) if File.exist?(extract_filepath)
      ensure
        # extract_filepath is nil when the early return above was taken.
        File.unlink(extract_filepath) if extract_filepath && File.exist?(extract_filepath)
      end
    end
  end
end
|
@@ -3,6 +3,7 @@
|
|
3
3
|
require "simple_text_extract/format_extractor/base"
|
4
4
|
require "simple_text_extract/format_extractor/plain_text"
|
5
5
|
require "simple_text_extract/format_extractor/pdf"
|
6
|
+
require "simple_text_extract/format_extractor/xls_x"
|
6
7
|
require "simple_text_extract/format_extractor/doc_x"
|
7
8
|
require "simple_text_extract/format_extractor/doc"
|
8
9
|
|
@@ -18,6 +19,8 @@ module SimpleTextExtract
|
|
18
19
|
FormatExtractor::DocX.new(file)
|
19
20
|
when /.doc$/i
|
20
21
|
FormatExtractor::Doc.new(file)
|
22
|
+
when /(.xlsx$|.xls$)/i
|
23
|
+
FormatExtractor::XlsX.new(file)
|
21
24
|
else
|
22
25
|
FormatExtractor::Base.new(file)
|
23
26
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: simple_text_extract
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Nick Weiland
|
@@ -78,6 +78,7 @@ files:
|
|
78
78
|
- lib/simple_text_extract/format_extractor/doc_x.rb
|
79
79
|
- lib/simple_text_extract/format_extractor/pdf.rb
|
80
80
|
- lib/simple_text_extract/format_extractor/plain_text.rb
|
81
|
+
- lib/simple_text_extract/format_extractor/xls_x.rb
|
81
82
|
- lib/simple_text_extract/format_extractor_factory.rb
|
82
83
|
- lib/simple_text_extract/tempfile_extractor.rb
|
83
84
|
- lib/simple_text_extract/text_extractor.rb
|