simple_text_extract 0.1.1 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +5 -1
- data/README.md +14 -0
- data/lib/simple_text_extract/format_extractor/base.rb +5 -0
- data/lib/simple_text_extract/format_extractor/doc.rb +1 -1
- data/lib/simple_text_extract/format_extractor/doc_x.rb +1 -1
- data/lib/simple_text_extract/format_extractor/pdf.rb +1 -1
- data/lib/simple_text_extract/format_extractor/xls_x.rb +1 -1
- data/lib/simple_text_extract/version.rb +1 -1
- data/simple_text_extract.gemspec +1 -0
- metadata +15 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0c6b024db3cda3f2d163a321e838e37a61e95c4531afc8cc540f7ef0e49ca6d6
|
4
|
+
data.tar.gz: 017e1d1942669d66c74b43872e174d848e2909907f991b7802052edc7021aa43
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 76a36c16f31adf9360ccc55b8f1da78b9afd3d5f1d7c89d4139518bd0070f1d3b250eeb11d43af4127057622036086c0dad34970788fbce36088a6b282835b6d
|
7
|
+
data.tar.gz: b0691f0e9a1f0cc3efbfa0aa8a8a2fb7d8370e5cbd40b2c3dd9cb1998f1b67bb3e7a5ce8851e11756f6db957ae8b039b838f4a845dc6358c10db1e8984bab964
|
data/Gemfile.lock
CHANGED
@@ -1,14 +1,17 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
simple_text_extract (0.1.
|
4
|
+
simple_text_extract (0.1.1)
|
5
5
|
|
6
6
|
GEM
|
7
7
|
remote: https://rubygems.org/
|
8
8
|
specs:
|
9
9
|
coderay (1.1.2)
|
10
|
+
metaclass (0.0.4)
|
10
11
|
method_source (0.9.2)
|
11
12
|
minitest (5.11.3)
|
13
|
+
mocha (1.8.0)
|
14
|
+
metaclass (~> 0.0.1)
|
12
15
|
pry (0.12.2)
|
13
16
|
coderay (~> 1.1.0)
|
14
17
|
method_source (~> 0.9.0)
|
@@ -20,6 +23,7 @@ PLATFORMS
|
|
20
23
|
DEPENDENCIES
|
21
24
|
bundler (~> 1.17)
|
22
25
|
minitest (~> 5.0)
|
26
|
+
mocha
|
23
27
|
pry
|
24
28
|
rake (~> 10.0)
|
25
29
|
simple_text_extract!
|
data/README.md
CHANGED
@@ -75,6 +75,20 @@ antiword
|
|
75
75
|
gnumeric
|
76
76
|
```
|
77
77
|
|
78
|
+
## Benchmarks
|
79
|
+
|
80
|
+
*Each benchmark extracts text from the same file 50 times (run on a MacBook Pro).*
|
81
|
+
|
82
|
+
| File format | SimpleTextExtract | Henkei (i.e. Yomu/Apache Tika) |
|
83
|
+
|-------------|-------------------|--------------------------------|
|
84
|
+
| .doc | 1.40s | 74.27s |
|
85
|
+
| .docx | 0.78s | 71.44s |
|
86
|
+
| .pdf* | 1.73s | 82.86s |
|
87
|
+
| .xlsx | 21.99s | 51.89s |
|
88
|
+
| .txt | 0.036s | 39.25s |
|
89
|
+
|
90
|
+
* SimpleTextExtract's PDF text extraction is more limited than Tika's: Tika can also perform OCR on PDFs using Tesseract.
|
91
|
+
|
78
92
|
## Development
|
79
93
|
|
80
94
|
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
@@ -4,7 +4,7 @@ module SimpleTextExtract
|
|
4
4
|
module FormatExtractor
|
5
5
|
class DocX < Base
|
6
6
|
def extract
|
7
|
-
return nil if
|
7
|
+
return nil if missing_dependency?("unzip")
|
8
8
|
|
9
9
|
`unzip -p #{Shellwords.escape(file.path)} | grep '<w:t' | sed 's/<[^<]*>//g' | grep -v '^[[:space:]]*$'`
|
10
10
|
end
|
data/simple_text_extract.gemspec
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: simple_text_extract
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Nick Weiland
|
@@ -52,6 +52,20 @@ dependencies:
|
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '5.0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: mocha
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
55
69
|
description: Attempts to quickly extract text from various file types before resorting
|
56
70
|
to something more extreme like Apache Tika. Built with ActiveStorage in mind.
|
57
71
|
email:
|