simple_text_extract 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6bfb9a91dc36259a45033d005b1b5a4bc37c941b153235708ca6755d77cce66e
4
- data.tar.gz: dad58cb4b7f039d258196a1ce8568e8169214b12d8ff1c024e1fd9f8412fdf5b
3
+ metadata.gz: 0c6b024db3cda3f2d163a321e838e37a61e95c4531afc8cc540f7ef0e49ca6d6
4
+ data.tar.gz: 017e1d1942669d66c74b43872e174d848e2909907f991b7802052edc7021aa43
5
5
  SHA512:
6
- metadata.gz: 0c5923028e2ff87feecddfcc2f6b340d4ffeb20d92bde686d607a1f498a2fe94326cd5f2df514f392641bbea988776955c76c4a12651178c2a32f885fda39c3d
7
- data.tar.gz: acfb70f0bc6746011111a39c5fb730e378372e21b7b5f9eafda1f072c0ead5801ac2425d00070e1b2cc6cb78f6df3a93a0d313ae4627af1e3a7de3339baab882
6
+ metadata.gz: 76a36c16f31adf9360ccc55b8f1da78b9afd3d5f1d7c89d4139518bd0070f1d3b250eeb11d43af4127057622036086c0dad34970788fbce36088a6b282835b6d
7
+ data.tar.gz: b0691f0e9a1f0cc3efbfa0aa8a8a2fb7d8370e5cbd40b2c3dd9cb1998f1b67bb3e7a5ce8851e11756f6db957ae8b039b838f4a845dc6358c10db1e8984bab964
data/Gemfile.lock CHANGED
@@ -1,14 +1,17 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- simple_text_extract (0.1.0)
4
+ simple_text_extract (0.1.1)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
8
8
  specs:
9
9
  coderay (1.1.2)
10
+ metaclass (0.0.4)
10
11
  method_source (0.9.2)
11
12
  minitest (5.11.3)
13
+ mocha (1.8.0)
14
+ metaclass (~> 0.0.1)
12
15
  pry (0.12.2)
13
16
  coderay (~> 1.1.0)
14
17
  method_source (~> 0.9.0)
@@ -20,6 +23,7 @@ PLATFORMS
20
23
  DEPENDENCIES
21
24
  bundler (~> 1.17)
22
25
  minitest (~> 5.0)
26
+ mocha
23
27
  pry
24
28
  rake (~> 10.0)
25
29
  simple_text_extract!
data/README.md CHANGED
@@ -75,6 +75,20 @@ antiword
75
75
  gnumeric
76
76
  ```
77
77
 
78
+ ## Benchmarks
79
+
80
+ *Benchmarks test extracting text from the same file 50 times (on a MacBook Pro)*
81
+
82
+ | File format | SimpleTextExtract | Henkei (i.e. Yomu/Apache Tika) |
83
+ |-------------|-------------------|--------------------------------|
84
+ | .doc | 1.40s | 74.27s |
85
+ | .docx | 0.78s | 71.44s |
86
+ | .pdf* | 1.73s | 82.86s |
87
+ | .xlsx | 21.99s | 51.89s |
88
+ | .txt | 0.036s | 39.25s |
89
+
90
+ \* SimpleTextExtract's text extraction from PDFs is more limited than Tika's, as Tika can also perform OCR on PDFs with Tesseract.
91
+
78
92
  ## Development
79
93
 
80
94
  After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
@@ -13,6 +13,11 @@ module SimpleTextExtract
13
13
 
14
14
  def extract
15
15
  end
16
+
17
+ def missing_dependency?(command)
18
+ dependency = `command -v #{command}`
19
+ dependency.nil? || dependency.empty?
20
+ end
16
21
  end
17
22
  end
18
23
  end
@@ -4,7 +4,7 @@ module SimpleTextExtract
4
4
  module FormatExtractor
5
5
  class Doc < Base
6
6
  def extract
7
- return nil if `command -v antiword`.empty?
7
+ return nil if missing_dependency?('antiword')
8
8
 
9
9
  `antiword #{Shellwords.escape(file.path)}`
10
10
  end
@@ -4,7 +4,7 @@ module SimpleTextExtract
4
4
  module FormatExtractor
5
5
  class DocX < Base
6
6
  def extract
7
- return nil if `command -v unzip`.empty?
7
+ return nil if missing_dependency?("unzip")
8
8
 
9
9
  `unzip -p #{Shellwords.escape(file.path)} | grep '<w:t' | sed 's/<[^<]*>//g' | grep -v '^[[:space:]]*$'`
10
10
  end
@@ -4,7 +4,7 @@ module SimpleTextExtract
4
4
  module FormatExtractor
5
5
  class PDF < Base
6
6
  def extract
7
- return nil if `command -v pdftotext`.empty?
7
+ return nil if missing_dependency?("pdftotext")
8
8
 
9
9
  `pdftotext #{Shellwords.escape(file.path)} -`
10
10
  end
@@ -4,7 +4,7 @@ module SimpleTextExtract
4
4
  module FormatExtractor
5
5
  class XlsX < Base
6
6
  def extract
7
- return nil if `command -v ssconvert`.empty?
7
+ return nil if missing_dependency?("ssconvert")
8
8
 
9
9
  extract_filepath = "#{file.path.split(".")[0]}.txt"
10
10
 
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module SimpleTextExtract
4
- VERSION = "0.1.1"
4
+ VERSION = "0.1.2"
5
5
  end
@@ -31,4 +31,5 @@ Gem::Specification.new do |spec|
31
31
  spec.add_development_dependency "bundler", "~> 1.17"
32
32
  spec.add_development_dependency "rake", "~> 10.0"
33
33
  spec.add_development_dependency "minitest", "~> 5.0"
34
+ spec.add_development_dependency "mocha"
34
35
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: simple_text_extract
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nick Weiland
@@ -52,6 +52,20 @@ dependencies:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
54
  version: '5.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: mocha
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
55
69
  description: Attempts to quickly extract text from various file types before resorting
56
70
  to something more extreme like Apache Tika. Built with ActiveStorage in mind.
57
71
  email: