doc_ripper 0.0.4 → 0.0.5

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,15 +1,7 @@
1
1
  ---
2
- !binary "U0hBMQ==":
3
- metadata.gz: !binary |-
4
- OTJjZmJlZjQwMzQ2ZDFlMWQwMzIyY2UyNGJmYTA1NWQxNGJmODEyZA==
5
- data.tar.gz: !binary |-
6
- MWIxZWYzZmIwZWE5Yjk5MTg0N2RkNWVjNTA1ODYxYzg4NWM5NTkzYw==
7
- !binary "U0hBNTEy":
8
- metadata.gz: !binary |-
9
- MDczOTMxNDI1ZWFlYmQzM2JlMTU0YjJlYjEwNGQ5ZjM4M2I0ZmYxMWRhOTNl
10
- YTkxN2UyY2ZhZWVhYmE3ZWYyNmZlODA3MWJjN2M1ZDI1MjZjMjdhZmQ3ODE0
11
- NDIxYzA0ZTA4MWFhOTRhNjcxY2U0NmVhMDM3MGZkN2NiOWY0OTg=
12
- data.tar.gz: !binary |-
13
- OTA0NmU4ZDliOWY1MjVjYmYzMTJjNjJhZmI4YzUyZWQyYTg1ZDNhMzM0Y2Zm
14
- MTZhNGI2NDMzMjE0MGVjN2EwMDE2YWRjNjYzNTAxYWVlZmU3ZGNhZjYzNWE3
15
- Y2M5NzU2ZTliNjIwNDI4MzNlN2ZkNjYxM2I3YTRhMzA3Y2FlNjk=
2
+ SHA1:
3
+ metadata.gz: e356e467916b8452aeb2121a234b0011302286ee
4
+ data.tar.gz: 6d7d1bc5c12f8a7de5e8585fbf5f7ca9bffe6735
5
+ SHA512:
6
+ metadata.gz: c58f820acc305465e13c19e2e328de856a4e06d7e5c32a74827332002352a6e982286d0261b1e9cf99871084f65a9f2abb5c75227b7392f463325e476fc2df97
7
+ data.tar.gz: 812a7e6df98b6f247e0bd46e520611ad93180f43ac6e61306d3fe9b6bfc49ee75668609055ca5162fc595a223394b1bc7dc956b0eb189a2368f502a052b25528
data/.gitignore CHANGED
@@ -1,14 +1,10 @@
1
- /.bundle/
2
- /.yardoc
3
- /Gemfile.lock
4
- /_yardoc/
5
- /coverage/
6
- /doc/
7
- /pkg/
8
- /spec/reports/
1
+ .ruby-gemset
2
+ .ruby-version
3
+ Gemfile.lock
4
+ *.DS_Store
5
+
9
6
  /tmp/
10
- *.bundle
11
- *.so
12
- *.o
13
- *.a
14
- mkmf.log
7
+ /spec/tmp/
8
+ /doc/
9
+ /rdoc/
10
+ /coverage/
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --format documentation
data/Gemfile CHANGED
@@ -2,3 +2,4 @@ source 'https://rubygems.org'
2
2
 
3
3
  # Specify your gem's dependencies in doc_ripper.gemspec
4
4
  gemspec
5
+
data/README.md CHANGED
@@ -1,6 +1,8 @@
1
- DocRipper is an extremely lightweight Ruby wrapper that can be used to parse text contents from common file formats (currently .doc, .docx and .pdf) without the need for a large number of dependencies like an OCR library or OpenOffice/LibreOffice.
1
+ # DocRipper
2
2
 
3
- For simple parsing, you'll likely see a large performance improvement with DocRipper over solutions that rely on OpenOffice/LibreOffice for .doc/.docx conversion. I found
3
+ Grab the text from common document formats with 1 command. DocRipper is an extremely lightweight Ruby wrapper that can be used to parse text contents from common file formats (currently .doc, .docx and .pdf) without the need for a large number of dependencies like an OCR library or OpenOffice/LibreOffice.
4
+
5
+ For simple parsing, you'll likely see a large performance improvement with DocRipper over solutions that rely on OpenOffice/LibreOffice for .doc/.docx conversion.
4
6
 
5
7
  Need OCR support or in-image text parsing? Take a look at [Docsplit](https://github.com/documentcloud/docsplit).
6
8
 
@@ -9,28 +11,36 @@ Need OCR support or in-image text parsing? Take a look at [Docsplit](https://git
9
11
  ```
10
12
  gem install doc_ripper
11
13
  ```
12
- ### Specify a file to parse
14
+ ### Specify the path of a file to parse
13
15
 
14
16
  ```
15
- DocRipper::TextRipper.new('/path/to/file')
17
+ require 'doc_ripper'
18
+
19
+ DocRipper::rip('/path/to/file')
16
20
  ```
17
21
 
18
- ### Return the file's text
22
+ #### If the file cannot be read, nil will be returned.
23
+
19
24
  ```
20
- dr = DocRipper::TextRipper.new('/path/to/file')
21
- dr.text
22
- => "Document's text"
25
+ DocRipper::rip('/path/to/missing/file')
26
+ => nil
23
27
  ```
24
28
 
25
- If the file cannot be read, nil will be returned.
29
+ #### Want to raise an exception? Use #rip!
30
+ #rip! will raise an exception if rip returns nil or the file type isn't supported
26
31
 
27
32
  ```
28
- dr = DocRipper::TextRipper.new('/path/to/missing/file')
29
- dr.text
30
- => nil
33
+ # invalid file type
34
+ DocRipper::rip!('/path/to/invalid/file.type')
35
+ => DocRipper::UnsupportedFileType
36
+
37
+ # missing file
38
+ DocRipper::rip!('/path/to/missing/file.doc')
39
+ => DocRipper::FileNotFound
31
40
  ```
32
41
 
33
42
 
43
+
34
44
  ## Dependencies
35
45
  - Ruby version >= 1.9.2
36
46
  - [Poppler-utils/(pdftotext)](http://poppler.freedesktop.org/) (PDF)
data/Rakefile CHANGED
@@ -1,2 +1,8 @@
1
1
  require "bundler/gem_tasks"
2
+ require 'rspec/core/rake_task'
3
+
4
+ RSpec::Core::RakeTask.new
5
+
6
+ task test: :spec
7
+ task default: :spec
2
8
 
@@ -23,4 +23,5 @@ Gem::Specification.new do |spec|
23
23
 
24
24
  spec.add_development_dependency "bundler", "~> 1.6"
25
25
  spec.add_development_dependency "rake", "~> 10.0"
26
+ spec.add_development_dependency "rspec"
26
27
  end
@@ -5,7 +5,21 @@ require "doc_ripper/text_ripper"
5
5
  require "doc_ripper/pdf_ripper"
6
6
  require "doc_ripper/docx_ripper"
7
7
  require "doc_ripper/ms_doc_ripper"
8
+ require "doc_ripper/exceptions"
8
9
 
9
10
  module DocRipper
11
+ class << self
12
+ def rip(path, options = {})
13
+ TextRipper.new(path, options).text
14
+ end
10
15
 
16
+ def rip!(path)
17
+ text = rip(path, raise: true)
18
+ if text
19
+ text
20
+ else
21
+ raise FileNotFound
22
+ end
23
+ end
24
+ end
11
25
  end
@@ -0,0 +1,7 @@
1
+ module DocRipper
2
+ class FileNotFound < StandardError
3
+ end
4
+
5
+ class UnsupportedFileType < StandardError
6
+ end
7
+ end
@@ -4,9 +4,10 @@ module DocRipper
4
4
  class Base
5
5
  attr_reader :text
6
6
 
7
- def initialize(file_path)
7
+ def initialize(file_path, options = {})
8
8
  @file_path = file_path
9
9
  @text_file_path = "#{file_path.split('.').first}.txt"
10
+ @options = options
10
11
  end
11
12
 
12
13
  private
@@ -22,6 +22,8 @@ module DocRipper
22
22
  MsDocRipper.new(@file_path).rip
23
23
  when !!(@file_path[-4..-1] =~ /.pdf/i)
24
24
  PdfRipper.new(@file_path).rip
25
+ when @options[:raise]
26
+ raise UnsupportedFileType
25
27
  end
26
28
  end
27
29
 
@@ -1,5 +1,5 @@
1
1
  module DocRipper
2
- VERSION = "0.0.4"
2
+ VERSION = "0.0.5"
3
3
  end
4
4
 
5
5
 
@@ -0,0 +1,57 @@
1
+ require 'spec_helper'
2
+
3
+ module DocRipper
4
+ describe 'provide a clean api to return the text from a document' do
5
+ let(:doc_path) { "#{FIXTURE_PATH}lorem.doc" }
6
+ let(:docx_path) { "#{FIXTURE_PATH}lorem.docx" }
7
+ let(:pdf_path) { "#{FIXTURE_PATH}lorem.docx" }
8
+ let(:invalid_path) { "#{FIXTURE_PATH}missing_file.docx" }
9
+ let(:invalid_file_type) { "#{FIXTURE_PATH}lorem.jpg"}
10
+ let(:missing_path) { "#{
11
+ FIXTURE_PATH}some_missing_path.docx" }
12
+
13
+ context '#rip' do
14
+
15
+ it 'should respond to #rip' do
16
+ expect(DocRipper.respond_to? :rip).to eq(true)
17
+ end
18
+
19
+ it 'should respond with text to valid file extensions' do
20
+ expect(DocRipper.rip(doc_path)).not_to eq(nil)
21
+ expect(DocRipper.rip(docx_path)).not_to eq(nil)
22
+ expect(DocRipper.rip(pdf_path)).not_to eq(nil)
23
+ end
24
+
25
+ it 'should respond with nil if file is missing' do
26
+ expect(DocRipper.rip(missing_path)).to eq(nil)
27
+ end
28
+
29
+
30
+ it 'should respond with nil if the file is the wrong type' do
31
+ expect(DocRipper.rip(invalid_path)).to eq(nil)
32
+ end
33
+
34
+ it 'should remove the dumped text version of the file' do
35
+
36
+ end
37
+
38
+ end
39
+
40
+ context '#rip!' do
41
+
42
+ it 'should respond with an exception if the file is missing' do
43
+ expect{DocRipper.rip!(invalid_path)}.to raise_error(FileNotFound)
44
+ end
45
+
46
+ it 'should respond with an exception if the file is the wrong type of extension' do
47
+ expect{DocRipper.rip!(invalid_file_type)}.to raise_error(UnsupportedFileType)
48
+ end
49
+
50
+ it 'should respond with an exception if the text file is nil' do
51
+ end
52
+
53
+ end
54
+
55
+
56
+ end
57
+ end
@@ -0,0 +1,9 @@
1
+ require 'spec_helper'
2
+
3
+ module DocRipper
4
+ module Ripper
5
+ describe 'Base' do
6
+
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,7 @@
1
+ require 'spec_helper'
2
+
3
+ module DocRipper
4
+ describe 'TextRipper' do
5
+
6
+ end
7
+ end
Binary file
Binary file
Binary file
@@ -0,0 +1 @@
1
+ Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
File without changes
File without changes
@@ -0,0 +1,11 @@
1
+ require 'bundler/setup'
2
+ Bundler.setup
3
+
4
+ require 'doc_ripper'
5
+
6
+ FIXTURE_PATH = "#{File.expand_path '../',__FILE__}/fixtures/"
7
+
8
+
9
+ RSpec.configure do |config|
10
+ # some (optional) config here
11
+ end
metadata CHANGED
@@ -1,43 +1,57 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: doc_ripper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Paul Zaich
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-07-22 00:00:00.000000000 Z
11
+ date: 2014-12-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - ~>
17
+ - - "~>"
18
18
  - !ruby/object:Gem::Version
19
19
  version: '1.6'
20
20
  type: :development
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - ~>
24
+ - - "~>"
25
25
  - !ruby/object:Gem::Version
26
26
  version: '1.6'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: rake
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - ~>
31
+ - - "~>"
32
32
  - !ruby/object:Gem::Version
33
33
  version: '10.0'
34
34
  type: :development
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - ~>
38
+ - - "~>"
39
39
  - !ruby/object:Gem::Version
40
40
  version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
41
55
  description: Provides a lean, convenient ruby wrapper to poppler, and antiword command
42
56
  line tools to quickly rip out text from common text formats.
43
57
  email:
@@ -46,7 +60,8 @@ executables: []
46
60
  extensions: []
47
61
  extra_rdoc_files: []
48
62
  files:
49
- - .gitignore
63
+ - ".gitignore"
64
+ - ".rspec"
50
65
  - Gemfile
51
66
  - LICENSE.txt
52
67
  - README.md
@@ -54,11 +69,22 @@ files:
54
69
  - doc_ripper.gemspec
55
70
  - lib/doc_ripper.rb
56
71
  - lib/doc_ripper/docx_ripper.rb
72
+ - lib/doc_ripper/exceptions.rb
57
73
  - lib/doc_ripper/ms_doc_ripper.rb
58
74
  - lib/doc_ripper/pdf_ripper.rb
59
75
  - lib/doc_ripper/ripper/base.rb
60
76
  - lib/doc_ripper/text_ripper.rb
61
77
  - lib/doc_ripper/version.rb
78
+ - spec/doc_ripper/doc_ripper_spec.rb
79
+ - spec/doc_ripper/ripper/base_spec.rb
80
+ - spec/doc_ripper/text_ripper_spec.rb
81
+ - spec/fixtures/lorem.doc
82
+ - spec/fixtures/lorem.docx
83
+ - spec/fixtures/lorem.pdf
84
+ - spec/fixtures/lorem.txt
85
+ - spec/fixtures/missing_file.txt
86
+ - spec/fixtures/some_missing_path.txt
87
+ - spec/spec_helper.rb
62
88
  homepage: https://github.com/pzaich/doc_ripper
63
89
  licenses:
64
90
  - MIT
@@ -69,20 +95,30 @@ require_paths:
69
95
  - lib
70
96
  required_ruby_version: !ruby/object:Gem::Requirement
71
97
  requirements:
72
- - - ! '>='
98
+ - - ">="
73
99
  - !ruby/object:Gem::Version
74
100
  version: '0'
75
101
  required_rubygems_version: !ruby/object:Gem::Requirement
76
102
  requirements:
77
- - - ! '>='
103
+ - - ">="
78
104
  - !ruby/object:Gem::Version
79
105
  version: '0'
80
106
  requirements:
81
107
  - Antiword
82
108
  - pdftotext/poppler
83
109
  rubyforge_project:
84
- rubygems_version: 2.0.3
110
+ rubygems_version: 2.2.2
85
111
  signing_key:
86
112
  specification_version: 4
87
113
  summary: Rip out text from pdf, doc and docx formats
88
- test_files: []
114
+ test_files:
115
+ - spec/doc_ripper/doc_ripper_spec.rb
116
+ - spec/doc_ripper/ripper/base_spec.rb
117
+ - spec/doc_ripper/text_ripper_spec.rb
118
+ - spec/fixtures/lorem.doc
119
+ - spec/fixtures/lorem.docx
120
+ - spec/fixtures/lorem.pdf
121
+ - spec/fixtures/lorem.txt
122
+ - spec/fixtures/missing_file.txt
123
+ - spec/fixtures/some_missing_path.txt
124
+ - spec/spec_helper.rb