simple_text_extract 0.1.1 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6bfb9a91dc36259a45033d005b1b5a4bc37c941b153235708ca6755d77cce66e
4
- data.tar.gz: dad58cb4b7f039d258196a1ce8568e8169214b12d8ff1c024e1fd9f8412fdf5b
3
+ metadata.gz: 0c6b024db3cda3f2d163a321e838e37a61e95c4531afc8cc540f7ef0e49ca6d6
4
+ data.tar.gz: 017e1d1942669d66c74b43872e174d848e2909907f991b7802052edc7021aa43
5
5
  SHA512:
6
- metadata.gz: 0c5923028e2ff87feecddfcc2f6b340d4ffeb20d92bde686d607a1f498a2fe94326cd5f2df514f392641bbea988776955c76c4a12651178c2a32f885fda39c3d
7
- data.tar.gz: acfb70f0bc6746011111a39c5fb730e378372e21b7b5f9eafda1f072c0ead5801ac2425d00070e1b2cc6cb78f6df3a93a0d313ae4627af1e3a7de3339baab882
6
+ metadata.gz: 76a36c16f31adf9360ccc55b8f1da78b9afd3d5f1d7c89d4139518bd0070f1d3b250eeb11d43af4127057622036086c0dad34970788fbce36088a6b282835b6d
7
+ data.tar.gz: b0691f0e9a1f0cc3efbfa0aa8a8a2fb7d8370e5cbd40b2c3dd9cb1998f1b67bb3e7a5ce8851e11756f6db957ae8b039b838f4a845dc6358c10db1e8984bab964
data/Gemfile.lock CHANGED
@@ -1,14 +1,17 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- simple_text_extract (0.1.0)
4
+ simple_text_extract (0.1.1)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
8
8
  specs:
9
9
  coderay (1.1.2)
10
+ metaclass (0.0.4)
10
11
  method_source (0.9.2)
11
12
  minitest (5.11.3)
13
+ mocha (1.8.0)
14
+ metaclass (~> 0.0.1)
12
15
  pry (0.12.2)
13
16
  coderay (~> 1.1.0)
14
17
  method_source (~> 0.9.0)
@@ -20,6 +23,7 @@ PLATFORMS
20
23
  DEPENDENCIES
21
24
  bundler (~> 1.17)
22
25
  minitest (~> 5.0)
26
+ mocha
23
27
  pry
24
28
  rake (~> 10.0)
25
29
  simple_text_extract!
data/README.md CHANGED
@@ -75,6 +75,20 @@ antiword
75
75
  gnumeric
76
76
  ```
77
77
 
78
+ ## Benchmarks
79
+
80
+ *Benchmarks measure extracting text from the same file 50 times (MacBook Pro)*
81
+
82
+ | File format | SimpleTextExtract | Henkei (i.e. Yomu/Apache Tika) |
83
+ |-------------|-------------------|--------------------------------|
84
+ | .doc | 1.40s | 74.27s |
85
+ | .docx | 0.78s | 71.44s |
86
+ | .pdf* | 1.73s | 82.86s |
87
+ | .xlsx | 21.99s | 51.89s |
88
+ | .txt | 0.036s | 39.25s |
89
+
90
+ \* SimpleTextExtract's text extraction from PDFs is more limited than Tika's, as Tika can also perform OCR on PDFs with Tesseract
91
+
78
92
  ## Development
79
93
 
80
94
  After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
@@ -13,6 +13,11 @@ module SimpleTextExtract
13
13
 
14
14
  def extract
15
15
  end
16
+
17
+ def missing_dependency?(command)
18
+ dependency = `command -v #{command}`
19
+ dependency.nil? || dependency.empty?
20
+ end
16
21
  end
17
22
  end
18
23
  end
@@ -4,7 +4,7 @@ module SimpleTextExtract
4
4
  module FormatExtractor
5
5
  class Doc < Base
6
6
  def extract
7
- return nil if `command -v antiword`.empty?
7
+ return nil if missing_dependency?('antiword')
8
8
 
9
9
  `antiword #{Shellwords.escape(file.path)}`
10
10
  end
@@ -4,7 +4,7 @@ module SimpleTextExtract
4
4
  module FormatExtractor
5
5
  class DocX < Base
6
6
  def extract
7
- return nil if `command -v unzip`.empty?
7
+ return nil if missing_dependency?("unzip")
8
8
 
9
9
  `unzip -p #{Shellwords.escape(file.path)} | grep '<w:t' | sed 's/<[^<]*>//g' | grep -v '^[[:space:]]*$'`
10
10
  end
@@ -4,7 +4,7 @@ module SimpleTextExtract
4
4
  module FormatExtractor
5
5
  class PDF < Base
6
6
  def extract
7
- return nil if `command -v pdftotext`.empty?
7
+ return nil if missing_dependency?("pdftotext")
8
8
 
9
9
  `pdftotext #{Shellwords.escape(file.path)} -`
10
10
  end
@@ -4,7 +4,7 @@ module SimpleTextExtract
4
4
  module FormatExtractor
5
5
  class XlsX < Base
6
6
  def extract
7
- return nil if `command -v ssconvert`.empty?
7
+ return nil if missing_dependency?("ssconvert")
8
8
 
9
9
  extract_filepath = "#{file.path.split(".")[0]}.txt"
10
10
 
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module SimpleTextExtract
4
- VERSION = "0.1.1"
4
+ VERSION = "0.1.2"
5
5
  end
@@ -31,4 +31,5 @@ Gem::Specification.new do |spec|
31
31
  spec.add_development_dependency "bundler", "~> 1.17"
32
32
  spec.add_development_dependency "rake", "~> 10.0"
33
33
  spec.add_development_dependency "minitest", "~> 5.0"
34
+ spec.add_development_dependency "mocha"
34
35
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: simple_text_extract
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nick Weiland
@@ -52,6 +52,20 @@ dependencies:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
54
  version: '5.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: mocha
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
55
69
  description: Attempts to quickly extract text from various file types before resorting
56
70
  to something more extreme like Apache Tika. Built with ActiveStorage in mind.
57
71
  email: