simple_text_extract 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f684f9268c632c7531a7c4e453f8c7f75f895146c50db9c0af5cbb78188839e0
4
- data.tar.gz: '0902f893a23d00b8f3ee22b9a2f386962f1886ee312fc1d851baffc7e3a8d58f'
3
+ metadata.gz: 6bfb9a91dc36259a45033d005b1b5a4bc37c941b153235708ca6755d77cce66e
4
+ data.tar.gz: dad58cb4b7f039d258196a1ce8568e8169214b12d8ff1c024e1fd9f8412fdf5b
5
5
  SHA512:
6
- metadata.gz: 365538ffe696d3ef3345b44e4cc0e64eccf5490c15d667919bbb978d11a4972b84b87b2f603e75eb4782806af805a6d1d857b2436722489210283c8abb111e15
7
- data.tar.gz: ca200f066331919161710e8076d13192225e5e8d5fd2b2b9fccecee8ab1bd1f75c4a5af07bc149f6ceab42030be505e2b959a1c673c78cd9f48f7e34f92defd9
6
+ metadata.gz: 0c5923028e2ff87feecddfcc2f6b340d4ffeb20d92bde686d607a1f498a2fe94326cd5f2df514f392641bbea988776955c76c4a12651178c2a32f885fda39c3d
7
+ data.tar.gz: acfb70f0bc6746011111a39c5fb730e378372e21b7b5f9eafda1f072c0ead5801ac2425d00070e1b2cc6cb78f6df3a93a0d313ae4627af1e3a7de3339baab882
data/README.md CHANGED
@@ -7,6 +7,8 @@ SimpleTextExtract handles parsing text from:
7
7
  - `.pdf`
8
8
  - `.docx`
9
9
  - `.doc`
10
+ - `.xlsx`
11
+ - `.xls`
10
12
  - `.txt` 😜
11
13
 
12
14
  If no text is parsed (for `pdf`), or a file format is not supported (like images), then `nil` is returned and you can move on to the heavy-duty tools like [Henkei](https://github.com/abrom/henkei) 💪.
@@ -32,11 +34,11 @@ Or install it yourself as:
32
34
  Text can be parsed from raw file content or from files in the filesystem by calling `SimpleTextExtract.extract`:
33
35
 
34
36
  ```ruby
35
- # raw file content using ActiveStorage
36
- SimpleTextExtract.extract(filename: attachment.blob.filename, raw: attachment.download)
37
+ # raw file content using ActiveStorage
38
+ SimpleTextExtract.extract(filename: attachment.blob.filename, raw: attachment.download)
37
39
 
38
- # filesystem
39
- SimpleTextExtract.extract(filepath: "path_to_file.pdf")
40
+ # filesystem
41
+ SimpleTextExtract.extract(filepath: "path_to_file.pdf")
40
42
  ```
41
43
 
42
44
  ### Usage Dependencies
@@ -49,6 +51,9 @@ You can choose to use SimpleTextExtract without the following dependencies, but
49
51
  `doc` parsing requires `antiword`
50
52
  - `brew install antiword`
51
53
 
54
+ `xlsx` and `xls` parsing requires `ssconvert` which is part of `gnumeric`
55
+ - `brew install gnumeric`
56
+
52
57
  ### Usage on Heroku
53
58
 
54
59
  To use on Heroku you'll have to add some custom buildpacks.
@@ -67,6 +72,7 @@ To add `antiword` as a dependency on Heroku, install the [heroku-buildpack-apt](
67
72
  In your `Aptfile`, add:
68
73
  ```
69
74
  antiword
75
+ gnumeric
70
76
  ```
71
77
 
72
78
  ## Development
@@ -0,0 +1,20 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SimpleTextExtract
4
+ module FormatExtractor
5
+ class XlsX < Base
6
+ def extract
7
+ return nil if `command -v ssconvert`.empty?
8
+
9
+ extract_filepath = "#{file.path.split(".")[0]}.txt"
10
+
11
+ `ssconvert -O 'separator=" "' #{Shellwords.escape(file.path)} #{extract_filepath}`
12
+
13
+ text = File.read(extract_filepath)
14
+ File.unlink(extract_filepath)
15
+
16
+ text
17
+ end
18
+ end
19
+ end
20
+ end
@@ -3,6 +3,7 @@
3
3
  require "simple_text_extract/format_extractor/base"
4
4
  require "simple_text_extract/format_extractor/plain_text"
5
5
  require "simple_text_extract/format_extractor/pdf"
6
+ require "simple_text_extract/format_extractor/xls_x"
6
7
  require "simple_text_extract/format_extractor/doc_x"
7
8
  require "simple_text_extract/format_extractor/doc"
8
9
 
@@ -18,6 +19,8 @@ module SimpleTextExtract
18
19
  FormatExtractor::DocX.new(file)
19
20
  when /.doc$/i
20
21
  FormatExtractor::Doc.new(file)
22
+ when /(.xlsx$|.xls$)/i
23
+ FormatExtractor::XlsX.new(file)
21
24
  else
22
25
  FormatExtractor::Base.new(file)
23
26
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module SimpleTextExtract
4
- VERSION = "0.1.0"
4
+ VERSION = "0.1.1"
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: simple_text_extract
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nick Weiland
@@ -78,6 +78,7 @@ files:
78
78
  - lib/simple_text_extract/format_extractor/doc_x.rb
79
79
  - lib/simple_text_extract/format_extractor/pdf.rb
80
80
  - lib/simple_text_extract/format_extractor/plain_text.rb
81
+ - lib/simple_text_extract/format_extractor/xls_x.rb
81
82
  - lib/simple_text_extract/format_extractor_factory.rb
82
83
  - lib/simple_text_extract/tempfile_extractor.rb
83
84
  - lib/simple_text_extract/text_extractor.rb