doc_ripper 0.0.4 → 0.0.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +6 -14
- data/.gitignore +9 -13
- data/.rspec +2 -0
- data/Gemfile +1 -0
- data/README.md +22 -12
- data/Rakefile +6 -0
- data/doc_ripper.gemspec +1 -0
- data/lib/doc_ripper.rb +14 -0
- data/lib/doc_ripper/exceptions.rb +7 -0
- data/lib/doc_ripper/ripper/base.rb +2 -1
- data/lib/doc_ripper/text_ripper.rb +2 -0
- data/lib/doc_ripper/version.rb +1 -1
- data/spec/doc_ripper/doc_ripper_spec.rb +57 -0
- data/spec/doc_ripper/ripper/base_spec.rb +9 -0
- data/spec/doc_ripper/text_ripper_spec.rb +7 -0
- data/spec/fixtures/lorem.doc +0 -0
- data/spec/fixtures/lorem.docx +0 -0
- data/spec/fixtures/lorem.pdf +0 -0
- data/spec/fixtures/lorem.txt +1 -0
- data/spec/fixtures/missing_file.txt +0 -0
- data/spec/fixtures/some_missing_path.txt +0 -0
- data/spec/spec_helper.rb +11 -0
- metadata +47 -11
checksums.yaml
CHANGED
@@ -1,15 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
metadata.gz: !binary |-
|
9
|
-
MDczOTMxNDI1ZWFlYmQzM2JlMTU0YjJlYjEwNGQ5ZjM4M2I0ZmYxMWRhOTNl
|
10
|
-
YTkxN2UyY2ZhZWVhYmE3ZWYyNmZlODA3MWJjN2M1ZDI1MjZjMjdhZmQ3ODE0
|
11
|
-
NDIxYzA0ZTA4MWFhOTRhNjcxY2U0NmVhMDM3MGZkN2NiOWY0OTg=
|
12
|
-
data.tar.gz: !binary |-
|
13
|
-
OTA0NmU4ZDliOWY1MjVjYmYzMTJjNjJhZmI4YzUyZWQyYTg1ZDNhMzM0Y2Zm
|
14
|
-
MTZhNGI2NDMzMjE0MGVjN2EwMDE2YWRjNjYzNTAxYWVlZmU3ZGNhZjYzNWE3
|
15
|
-
Y2M5NzU2ZTliNjIwNDI4MzNlN2ZkNjYxM2I3YTRhMzA3Y2FlNjk=
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: e356e467916b8452aeb2121a234b0011302286ee
|
4
|
+
data.tar.gz: 6d7d1bc5c12f8a7de5e8585fbf5f7ca9bffe6735
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: c58f820acc305465e13c19e2e328de856a4e06d7e5c32a74827332002352a6e982286d0261b1e9cf99871084f65a9f2abb5c75227b7392f463325e476fc2df97
|
7
|
+
data.tar.gz: 812a7e6df98b6f247e0bd46e520611ad93180f43ac6e61306d3fe9b6bfc49ee75668609055ca5162fc595a223394b1bc7dc956b0eb189a2368f502a052b25528
|
data/.gitignore
CHANGED
@@ -1,14 +1,10 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
/doc/
|
7
|
-
/pkg/
|
8
|
-
/spec/reports/
|
1
|
+
.ruby-gemset
|
2
|
+
.ruby-version
|
3
|
+
Gemfile.lock
|
4
|
+
*.DS_Store
|
5
|
+
|
9
6
|
/tmp/
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
mkmf.log
|
7
|
+
/spec/tmp/
|
8
|
+
/doc/
|
9
|
+
/rdoc/
|
10
|
+
/coverage/
|
data/.rspec
ADDED
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -1,6 +1,8 @@
|
|
1
|
-
DocRipper
|
1
|
+
# DocRipper
|
2
2
|
|
3
|
-
|
3
|
+
Grab the text from common document formats with a single command. DocRipper is an extremely lightweight Ruby wrapper that parses the text content of common file formats (currently .doc, .docx and .pdf) without needing a large number of dependencies such as an OCR library or OpenOffice/LibreOffice.
|
4
|
+
|
5
|
+
For simple parsing, you'll likely see a large performance improvement with DocRipper over solutions that rely on OpenOffice/LibreOffice for .doc/.docx conversion.
|
4
6
|
|
5
7
|
Need OCR support or in-image text parsing? Take a look at [Docsplit](https://github.com/documentcloud/docsplit).
|
6
8
|
|
@@ -9,28 +11,36 @@ Need OCR support or in-image text parsing? Take a look at [Docsplit](https://git
|
|
9
11
|
```
|
10
12
|
gem install doc_ripper
|
11
13
|
```
|
12
|
-
### Specify a file
|
14
|
+
### Specify the path to a file
|
13
15
|
|
14
16
|
```
|
15
|
-
|
17
|
+
require 'doc_ripper'
|
18
|
+
|
19
|
+
DocRipper::rip('/path/to/file')
|
16
20
|
```
|
17
21
|
|
18
|
-
|
22
|
+
#### If the file cannot be read, nil will be returned.
|
23
|
+
|
19
24
|
```
|
20
|
-
|
21
|
-
|
22
|
-
=> "Document's text"
|
25
|
+
DocRipper::rip('/path/to/missing/file')
|
26
|
+
=> nil
|
23
27
|
```
|
24
28
|
|
25
|
-
|
29
|
+
#### Want to raise an exception? Use #rip!
|
30
|
+
`#rip!` raises an exception if `#rip` returns nil or the file type isn't supported.
|
26
31
|
|
27
32
|
```
|
28
|
-
|
29
|
-
|
30
|
-
=>
|
33
|
+
# invalid file type
|
34
|
+
DocRipper::rip!('/path/to/invalid/file.type')
|
35
|
+
=> DocRipper::UnsupportedFileType
|
36
|
+
|
37
|
+
# missing file
|
38
|
+
DocRipper::rip!('/path/to/missing/file.doc')
|
39
|
+
=> DocRipper::FileNotFound
|
31
40
|
```
|
32
41
|
|
33
42
|
|
43
|
+
|
34
44
|
## Dependencies
|
35
45
|
- Ruby version >= 1.9.2
|
36
46
|
- [Poppler-utils/(pdftotext)](http://poppler.freedesktop.org/) (PDF)
|
data/Rakefile
CHANGED
data/doc_ripper.gemspec
CHANGED
data/lib/doc_ripper.rb
CHANGED
@@ -5,7 +5,21 @@ require "doc_ripper/text_ripper"
|
|
5
5
|
require "doc_ripper/pdf_ripper"
|
6
6
|
require "doc_ripper/docx_ripper"
|
7
7
|
require "doc_ripper/ms_doc_ripper"
|
8
|
+
require "doc_ripper/exceptions"
|
8
9
|
|
9
10
|
module DocRipper
|
11
|
+
class << self
|
12
|
+
def rip(path, options = {})
|
13
|
+
TextRipper.new(path, options).text
|
14
|
+
end
|
10
15
|
|
16
|
+
def rip!(path)
|
17
|
+
text = rip(path, raise: true)
|
18
|
+
if text
|
19
|
+
text
|
20
|
+
else
|
21
|
+
raise FileNotFound
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
11
25
|
end
|
data/lib/doc_ripper/version.rb
CHANGED
data/spec/doc_ripper/doc_ripper_spec.rb
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
module DocRipper
|
4
|
+
describe 'provide a clean api to return the text from a document' do
|
5
|
+
let(:doc_path) { "#{FIXTURE_PATH}lorem.doc" }
|
6
|
+
let(:docx_path) { "#{FIXTURE_PATH}lorem.docx" }
|
7
|
+
let(:pdf_path) { "#{FIXTURE_PATH}lorem.docx" }
|
8
|
+
let(:invalid_path) { "#{FIXTURE_PATH}missing_file.docx" }
|
9
|
+
let(:invalid_file_type) { "#{FIXTURE_PATH}lorem.jpg"}
|
10
|
+
let(:missing_path) { "#{
|
11
|
+
FIXTURE_PATH}some_missing_path.docx" }
|
12
|
+
|
13
|
+
context '#rip' do
|
14
|
+
|
15
|
+
it 'should respond to #rip' do
|
16
|
+
expect(DocRipper.respond_to? :rip).to eq(true)
|
17
|
+
end
|
18
|
+
|
19
|
+
it 'should respond with text to valid file extensions' do
|
20
|
+
expect(DocRipper.rip(doc_path)).not_to eq(nil)
|
21
|
+
expect(DocRipper.rip(docx_path)).not_to eq(nil)
|
22
|
+
expect(DocRipper.rip(pdf_path)).not_to eq(nil)
|
23
|
+
end
|
24
|
+
|
25
|
+
it 'should respond with nil if file is missing' do
|
26
|
+
expect(DocRipper.rip(missing_path)).to eq(nil)
|
27
|
+
end
|
28
|
+
|
29
|
+
|
30
|
+
it 'should respond with nil if the file is the wrong type' do
|
31
|
+
expect(DocRipper.rip(invalid_path)).to eq(nil)
|
32
|
+
end
|
33
|
+
|
34
|
+
it 'should remove the dumped text version of the file' do
|
35
|
+
|
36
|
+
end
|
37
|
+
|
38
|
+
end
|
39
|
+
|
40
|
+
context '#rip!' do
|
41
|
+
|
42
|
+
it 'should respond with an exception if the file is missing' do
|
43
|
+
expect{DocRipper.rip!(invalid_path)}.to raise_error(FileNotFound)
|
44
|
+
end
|
45
|
+
|
46
|
+
it 'should respond with an exception if the file is the wrong type of extension' do
|
47
|
+
expect{DocRipper.rip!(invalid_file_type)}.to raise_error(UnsupportedFileType)
|
48
|
+
end
|
49
|
+
|
50
|
+
it 'should respond with an exception if the text file is nil' do
|
51
|
+
end
|
52
|
+
|
53
|
+
end
|
54
|
+
|
55
|
+
|
56
|
+
end
|
57
|
+
end
|
Binary file
|
Binary file
|
Binary file
|
@@ -0,0 +1 @@
|
|
1
|
+
Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
|
File without changes
|
File without changes
|
data/spec/spec_helper.rb
ADDED
metadata
CHANGED
@@ -1,43 +1,57 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: doc_ripper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.4
|
4
|
+
version: 0.0.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Paul Zaich
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-12-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - ~>
|
17
|
+
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
19
|
version: '1.6'
|
20
20
|
type: :development
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- - ~>
|
24
|
+
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '1.6'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: rake
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
|
-
- - ~>
|
31
|
+
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
33
|
version: '10.0'
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
|
-
- - ~>
|
38
|
+
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '10.0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rspec
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
41
55
|
description: Provides a lean, convenient ruby wrapper to poppler, and antiword command
|
42
56
|
line tools to quickly rip out text from common text formats.
|
43
57
|
email:
|
@@ -46,7 +60,8 @@ executables: []
|
|
46
60
|
extensions: []
|
47
61
|
extra_rdoc_files: []
|
48
62
|
files:
|
49
|
-
- .gitignore
|
63
|
+
- ".gitignore"
|
64
|
+
- ".rspec"
|
50
65
|
- Gemfile
|
51
66
|
- LICENSE.txt
|
52
67
|
- README.md
|
@@ -54,11 +69,22 @@ files:
|
|
54
69
|
- doc_ripper.gemspec
|
55
70
|
- lib/doc_ripper.rb
|
56
71
|
- lib/doc_ripper/docx_ripper.rb
|
72
|
+
- lib/doc_ripper/exceptions.rb
|
57
73
|
- lib/doc_ripper/ms_doc_ripper.rb
|
58
74
|
- lib/doc_ripper/pdf_ripper.rb
|
59
75
|
- lib/doc_ripper/ripper/base.rb
|
60
76
|
- lib/doc_ripper/text_ripper.rb
|
61
77
|
- lib/doc_ripper/version.rb
|
78
|
+
- spec/doc_ripper/doc_ripper_spec.rb
|
79
|
+
- spec/doc_ripper/ripper/base_spec.rb
|
80
|
+
- spec/doc_ripper/text_ripper_spec.rb
|
81
|
+
- spec/fixtures/lorem.doc
|
82
|
+
- spec/fixtures/lorem.docx
|
83
|
+
- spec/fixtures/lorem.pdf
|
84
|
+
- spec/fixtures/lorem.txt
|
85
|
+
- spec/fixtures/missing_file.txt
|
86
|
+
- spec/fixtures/some_missing_path.txt
|
87
|
+
- spec/spec_helper.rb
|
62
88
|
homepage: https://github.com/pzaich/doc_ripper
|
63
89
|
licenses:
|
64
90
|
- MIT
|
@@ -69,20 +95,30 @@ require_paths:
|
|
69
95
|
- lib
|
70
96
|
required_ruby_version: !ruby/object:Gem::Requirement
|
71
97
|
requirements:
|
72
|
-
- -
|
98
|
+
- - ">="
|
73
99
|
- !ruby/object:Gem::Version
|
74
100
|
version: '0'
|
75
101
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
76
102
|
requirements:
|
77
|
-
- -
|
103
|
+
- - ">="
|
78
104
|
- !ruby/object:Gem::Version
|
79
105
|
version: '0'
|
80
106
|
requirements:
|
81
107
|
- Antiword
|
82
108
|
- pdftotext/poppler
|
83
109
|
rubyforge_project:
|
84
|
-
rubygems_version: 2.
|
110
|
+
rubygems_version: 2.2.2
|
85
111
|
signing_key:
|
86
112
|
specification_version: 4
|
87
113
|
summary: Rip out text from pdf, doc and docx formats
|
88
|
-
test_files:
|
114
|
+
test_files:
|
115
|
+
- spec/doc_ripper/doc_ripper_spec.rb
|
116
|
+
- spec/doc_ripper/ripper/base_spec.rb
|
117
|
+
- spec/doc_ripper/text_ripper_spec.rb
|
118
|
+
- spec/fixtures/lorem.doc
|
119
|
+
- spec/fixtures/lorem.docx
|
120
|
+
- spec/fixtures/lorem.pdf
|
121
|
+
- spec/fixtures/lorem.txt
|
122
|
+
- spec/fixtures/missing_file.txt
|
123
|
+
- spec/fixtures/some_missing_path.txt
|
124
|
+
- spec/spec_helper.rb
|