simple_text_extract 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +5 -1
- data/README.md +14 -0
- data/lib/simple_text_extract/format_extractor/base.rb +5 -0
- data/lib/simple_text_extract/format_extractor/doc.rb +1 -1
- data/lib/simple_text_extract/format_extractor/doc_x.rb +1 -1
- data/lib/simple_text_extract/format_extractor/pdf.rb +1 -1
- data/lib/simple_text_extract/format_extractor/xls_x.rb +1 -1
- data/lib/simple_text_extract/version.rb +1 -1
- data/simple_text_extract.gemspec +1 -0
- metadata +15 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0c6b024db3cda3f2d163a321e838e37a61e95c4531afc8cc540f7ef0e49ca6d6
|
4
|
+
data.tar.gz: 017e1d1942669d66c74b43872e174d848e2909907f991b7802052edc7021aa43
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 76a36c16f31adf9360ccc55b8f1da78b9afd3d5f1d7c89d4139518bd0070f1d3b250eeb11d43af4127057622036086c0dad34970788fbce36088a6b282835b6d
|
7
|
+
data.tar.gz: b0691f0e9a1f0cc3efbfa0aa8a8a2fb7d8370e5cbd40b2c3dd9cb1998f1b67bb3e7a5ce8851e11756f6db957ae8b039b838f4a845dc6358c10db1e8984bab964
|
data/Gemfile.lock
CHANGED
@@ -1,14 +1,17 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
simple_text_extract (0.1.
|
4
|
+
simple_text_extract (0.1.1)
|
5
5
|
|
6
6
|
GEM
|
7
7
|
remote: https://rubygems.org/
|
8
8
|
specs:
|
9
9
|
coderay (1.1.2)
|
10
|
+
metaclass (0.0.4)
|
10
11
|
method_source (0.9.2)
|
11
12
|
minitest (5.11.3)
|
13
|
+
mocha (1.8.0)
|
14
|
+
metaclass (~> 0.0.1)
|
12
15
|
pry (0.12.2)
|
13
16
|
coderay (~> 1.1.0)
|
14
17
|
method_source (~> 0.9.0)
|
@@ -20,6 +23,7 @@ PLATFORMS
|
|
20
23
|
DEPENDENCIES
|
21
24
|
bundler (~> 1.17)
|
22
25
|
minitest (~> 5.0)
|
26
|
+
mocha
|
23
27
|
pry
|
24
28
|
rake (~> 10.0)
|
25
29
|
simple_text_extract!
|
data/README.md
CHANGED
@@ -75,6 +75,20 @@ antiword
|
|
75
75
|
gnumeric
|
76
76
|
```
|
77
77
|
|
78
|
+
## Benchmarks
|
79
|
+
|
80
|
+
*Each benchmark extracts text from the same file 50 times (run on a MacBook Pro).*
|
81
|
+
|
82
|
+
| File format | SimpleTextExtract | Henkei (i.e. Yomu/Apache Tika) |
|
83
|
+
|-------------|-------------------|--------------------------------|
|
84
|
+
| .doc | 1.40s | 74.27s |
|
85
|
+
| .docx | 0.78s | 71.44s |
|
86
|
+
| .pdf* | 1.73s | 82.86s |
|
87
|
+
| .xlsx | 21.99s | 51.89s |
|
88
|
+
| .txt | 0.036s | 39.25s |
|
89
|
+
|
90
|
+
\* SimpleTextExtract's PDF text extraction is more limited than Tika's, since Tika can also perform OCR on PDFs using Tesseract.
|
91
|
+
|
78
92
|
## Development
|
79
93
|
|
80
94
|
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
@@ -4,7 +4,7 @@ module SimpleTextExtract
|
|
4
4
|
module FormatExtractor
|
5
5
|
class DocX < Base
|
6
6
|
def extract
|
7
|
-
return nil if
|
7
|
+
return nil if missing_dependency?("unzip")
|
8
8
|
|
9
9
|
`unzip -p #{Shellwords.escape(file.path)} | grep '<w:t' | sed 's/<[^<]*>//g' | grep -v '^[[:space:]]*$'`
|
10
10
|
end
|
data/simple_text_extract.gemspec
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: simple_text_extract
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Nick Weiland
|
@@ -52,6 +52,20 @@ dependencies:
|
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '5.0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: mocha
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
55
69
|
description: Attempts to quickly extract text from various file types before resorting
|
56
70
|
to something more extreme like Apache Tika. Built with ActiveStorage in mind.
|
57
71
|
email:
|