RubyGems - doc_ripper - Versions diffs - 0.0.4 → 0.0.5 - Mend

doc_ripper 0.0.4 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23)

checksums.yaml +6 -14
data/.gitignore +9 -13
data/.rspec +2 -0
data/Gemfile +1 -0
data/README.md +22 -12
data/Rakefile +6 -0
data/doc_ripper.gemspec +1 -0
data/lib/doc_ripper.rb +14 -0
data/lib/doc_ripper/exceptions.rb +7 -0
data/lib/doc_ripper/ripper/base.rb +2 -1
data/lib/doc_ripper/text_ripper.rb +2 -0
data/lib/doc_ripper/version.rb +1 -1
data/spec/doc_ripper/doc_ripper_spec.rb +57 -0
data/spec/doc_ripper/ripper/base_spec.rb +9 -0
data/spec/doc_ripper/text_ripper_spec.rb +7 -0
data/spec/fixtures/lorem.doc +0 -0
data/spec/fixtures/lorem.docx +0 -0
data/spec/fixtures/lorem.pdf +0 -0
data/spec/fixtures/lorem.txt +1 -0
data/spec/fixtures/missing_file.txt +0 -0
data/spec/fixtures/some_missing_path.txt +0 -0
data/spec/spec_helper.rb +11 -0
metadata +47 -11

checksums.yaml CHANGED

@@ -1,15 +1,7 @@
 ---
-!binary "U0hBMQ==":
-  metadata.gz: !binary |-
-    OTJjZmJlZjQwMzQ2ZDFlMWQwMzIyY2UyNGJmYTA1NWQxNGJmODEyZA==
-  data.tar.gz: !binary |-
-    MWIxZWYzZmIwZWE5Yjk5MTg0N2RkNWVjNTA1ODYxYzg4NWM5NTkzYw==
-!binary "U0hBNTEy":
-  metadata.gz: !binary |-
-    MDczOTMxNDI1ZWFlYmQzM2JlMTU0YjJlYjEwNGQ5ZjM4M2I0ZmYxMWRhOTNl
-    YTkxN2UyY2ZhZWVhYmE3ZWYyNmZlODA3MWJjN2M1ZDI1MjZjMjdhZmQ3ODE0
-    NDIxYzA0ZTA4MWFhOTRhNjcxY2U0NmVhMDM3MGZkN2NiOWY0OTg=
-  data.tar.gz: !binary |-
-    OTA0NmU4ZDliOWY1MjVjYmYzMTJjNjJhZmI4YzUyZWQyYTg1ZDNhMzM0Y2Zm
-    MTZhNGI2NDMzMjE0MGVjN2EwMDE2YWRjNjYzNTAxYWVlZmU3ZGNhZjYzNWE3
-    Y2M5NzU2ZTliNjIwNDI4MzNlN2ZkNjYxM2I3YTRhMzA3Y2FlNjk=
+SHA1:
+  metadata.gz: e356e467916b8452aeb2121a234b0011302286ee
+  data.tar.gz: 6d7d1bc5c12f8a7de5e8585fbf5f7ca9bffe6735
+SHA512:
+  metadata.gz: c58f820acc305465e13c19e2e328de856a4e06d7e5c32a74827332002352a6e982286d0261b1e9cf99871084f65a9f2abb5c75227b7392f463325e476fc2df97
+  data.tar.gz: 812a7e6df98b6f247e0bd46e520611ad93180f43ac6e61306d3fe9b6bfc49ee75668609055ca5162fc595a223394b1bc7dc956b0eb189a2368f502a052b25528

data/.gitignore CHANGED

@@ -1,14 +1,10 @@
-/.bundle/
-/.yardoc
-/Gemfile.lock
-/_yardoc/
-/coverage/
-/doc/
-/pkg/
-/spec/reports/
+.ruby-gemset
+.ruby-version
+Gemfile.lock
+*.DS_Store
 /tmp/
-*.bundle
-*.so
-*.o
-*.a
-mkmf.log
+/spec/tmp/
+/doc/
+/rdoc/
+/coverage/

data/.rspec ADDED

@@ -0,0 +1,2 @@
+--color
+--format documentation

data/Gemfile CHANGED

@@ -2,3 +2,4 @@ source 'https://rubygems.org'
 # Specify your gem's dependencies in doc_ripper.gemspec
 gemspec

data/README.md CHANGED

@@ -1,6 +1,8 @@
-DocRipper is an extremely lightweight Ruby wrapper that can be used to parse text contents from common file formats (currently .doc, .docx and .pdf) without the need for a large number of dependencies like an OCR library or OpenOffice/LibreOffice.
+# DocRipper
-For simple parsing, you'll likely see a large performance improvement with DocRipper over solutions that rely on OpenOffice/LibreOffice for .doc/.docx conversion. I found
+Grab the text from common document formats with 1 command. DocRipper is an extremely lightweight Ruby wrapper that can be used to parse text contents from common file formats (currently .doc, .docx and .pdf) without the need for a large number of dependencies like an OCR library or OpenOffice/LibreOffice.
+For simple parsing, you'll likely see a large performance improvement with DocRipper over solutions that rely on OpenOffice/LibreOffice for .doc/.docx conversion.
 Need OCR support or in-image text parsing? Take a look at [Docsplit](https://github.com/documentcloud/docsplit).
@@ -9,28 +11,36 @@ Need OCR support or in-image text parsing? Take a look at [Docsplit](https://git
 ```
   gem install doc_ripper
 ```
-### Specify a file to parse
+### Specify a file path of a file
 ```
-  DocRipper::TextRipper.new('/path/to/file')
+  require 'doc_ripper'
+  DocRipper::rip('/path/to/file')
 ```
-### Return the file's text
+#### If the file cannot be read, nil will be returned.
 ```
-  dr = DocRipper::TextRipper.new('/path/to/file')
-  dr.text
-  => "Document's text"
+  DocRipper::rip('/path/to/missing/file')
+  => nil
 ```
-If the file cannot be read, nil will be returned.
+#### Want to raise an exception? Use #rip!
+#rip! will raise an exception if rip returns nil or the file type isn't supported
 ```
-  dr = DocRipper::TextRipper.new('/path/to/missing/file')
-  dr.text
-  => nil
+  # invalid file type
+  DocRipper::rip!('/path/to/invalide/file.type')
+  => DocRipper::UnsupportedFileType
+  # missing file
+  DocRipper::rip!('/path/to/missing/file.doc')
+  => DocRipper::FileNotFound
 ```
 ## Dependencies
  - Ruby version >= 1.9.2
  - [Poppler-utils/(pdftotext)](http://poppler.freedesktop.org/) (PDF)

data/Rakefile CHANGED

@@ -1,2 +1,8 @@
 require "bundler/gem_tasks"
+require 'rspec/core/rake_task'
+RSpec::Core::RakeTask.new
+task test: :spec
+task default: :spec

data/doc_ripper.gemspec CHANGED

@@ -23,4 +23,5 @@ Gem::Specification.new do |spec|
   spec.add_development_dependency "bundler", "~> 1.6"
   spec.add_development_dependency "rake", "~> 10.0"
+  spec.add_development_dependency "rspec"
 end

data/lib/doc_ripper.rb CHANGED

@@ -5,7 +5,21 @@ require "doc_ripper/text_ripper"
 require "doc_ripper/pdf_ripper"
 require "doc_ripper/docx_ripper"
 require "doc_ripper/ms_doc_ripper"
+require "doc_ripper/exceptions"
 module DocRipper
+  class << self
+    def rip(path, options = {})
+      TextRipper.new(path, options).text
+    end
+    def rip!(path)
+      text = rip(path, raise: true)
+      if text
+        text
+      else
+        raise FileNotFound
+      end
+    end
+  end
 end

data/lib/doc_ripper/exceptions.rb ADDED

@@ -0,0 +1,7 @@
+module DocRipper
+  class FileNotFound < StandardError
+  end
+  class UnsupportedFileType < StandardError
+  end
+end

data/lib/doc_ripper/ripper/base.rb CHANGED

@@ -4,9 +4,10 @@ module DocRipper
     class Base
       attr_reader :text
-      def initialize(file_path)
+      def initialize(file_path, options = {})
         @file_path      = file_path
         @text_file_path = "#{file_path.split('.').first}.txt"
+        @options = options
       end
       private

data/lib/doc_ripper/text_ripper.rb CHANGED

@@ -22,6 +22,8 @@ module DocRipper
         MsDocRipper.new(@file_path).rip
       when !!(@file_path[-4..-1]  =~ /.pdf/i)
         PdfRipper.new(@file_path).rip
+      when @options[:raise]
+        raise UnsupportedFileType
       end
     end

data/lib/doc_ripper/version.rb CHANGED

@@ -1,5 +1,5 @@
 module DocRipper
-  VERSION = "0.0.4"
+  VERSION = "0.0.5"
 end

data/spec/doc_ripper/doc_ripper_spec.rb ADDED

@@ -0,0 +1,57 @@
+require 'spec_helper'
+module DocRipper
+  describe 'provide a clean api to return the text from a document' do
+    let(:doc_path)     { "#{FIXTURE_PATH}lorem.doc"  }
+    let(:docx_path)    { "#{FIXTURE_PATH}lorem.docx" }
+    let(:pdf_path)     { "#{FIXTURE_PATH}lorem.docx" }
+    let(:invalid_path) { "#{FIXTURE_PATH}missing_file.docx" }
+    let(:invalid_file_type) { "#{FIXTURE_PATH}lorem.jpg"}
+    let(:missing_path) { "#{
+      FIXTURE_PATH}some_missing_path.docx" }
+    context '#rip' do
+      it 'should respond to #rip' do
+        expect(DocRipper.respond_to? :rip).to eq(true)
+      end
+      it 'should respond with text to valid file extensions' do
+        expect(DocRipper.rip(doc_path)).not_to eq(nil)
+        expect(DocRipper.rip(docx_path)).not_to eq(nil)
+        expect(DocRipper.rip(pdf_path)).not_to eq(nil)
+      end
+      it 'should respond with nil if file is missing' do
+        expect(DocRipper.rip(missing_path)).to eq(nil)
+      end
+      it 'should respond with nil if the file is the wrong type' do
+        expect(DocRipper.rip(invalid_path)).to eq(nil)
+      end
+      it 'should remove the dumped text version of the file' do
+      end
+    end
+    context '#rip!' do
+      it 'should respond with an exception if the file is missing' do
+        expect{DocRipper.rip!(invalid_path)}.to raise_error(FileNotFound)
+      end
+      it 'should respond with an exception if the file is the wrong type of extension' do
+        expect{DocRipper.rip!(invalid_file_type)}.to raise_error(UnsupportedFileType)
+      end
+      it 'should respond with an exception if the text file is nil' do
+      end
+    end
+  end
+end

data/spec/doc_ripper/ripper/base_spec.rb ADDED

@@ -0,0 +1,9 @@
+require 'spec_helper'
+module DocRipper
+  module Ripper
+    describe 'Base' do
+    end
+  end
+end

data/spec/doc_ripper/text_ripper_spec.rb ADDED

@@ -0,0 +1,7 @@
+require 'spec_helper'
+module DocRipper
+  describe 'TextRipper' do
+  end
+end

data/spec/fixtures/lorem.doc ADDED

Binary file

data/spec/fixtures/lorem.docx ADDED

Binary file

data/spec/fixtures/lorem.pdf ADDED

Binary file

data/spec/fixtures/lorem.txt ADDED

@@ -0,0 +1 @@
+Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

data/spec/fixtures/missing_file.txt ADDED

File without changes

data/spec/fixtures/some_missing_path.txt ADDED

File without changes

data/spec/spec_helper.rb ADDED

@@ -0,0 +1,11 @@
+require 'bundler/setup'
+Bundler.setup
+require 'doc_ripper'
+FIXTURE_PATH = "#{File.expand_path '../',__FILE__}/fixtures/"
+RSpec.configure do |config|
+  # some (optional) config here
+end

metadata CHANGED

@@ -1,43 +1,57 @@
 --- !ruby/object:Gem::Specification
 name: doc_ripper
 version: !ruby/object:Gem::Version
-  version: 0.0.4
+  version: 0.0.5
 platform: ruby
 authors:
 - Paul Zaich
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-07-22 00:00:00.000000000 Z
+date: 2014-12-11 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: '1.6'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: '1.6'
 - !ruby/object:Gem::Dependency
   name: rake
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: '10.0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: '10.0'
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 description: Provides a lean, convenient ruby wrapper to poppler, and antiword command
   line tools to quickly rip out text from common text formats.
 email:
@@ -46,7 +60,8 @@ executables: []
 extensions: []
 extra_rdoc_files: []
 files:
-- .gitignore
+- ".gitignore"
+- ".rspec"
 - Gemfile
 - LICENSE.txt
 - README.md
@@ -54,11 +69,22 @@ files:
 - doc_ripper.gemspec
 - lib/doc_ripper.rb
 - lib/doc_ripper/docx_ripper.rb
+- lib/doc_ripper/exceptions.rb
 - lib/doc_ripper/ms_doc_ripper.rb
 - lib/doc_ripper/pdf_ripper.rb
 - lib/doc_ripper/ripper/base.rb
 - lib/doc_ripper/text_ripper.rb
 - lib/doc_ripper/version.rb
+- spec/doc_ripper/doc_ripper_spec.rb
+- spec/doc_ripper/ripper/base_spec.rb
+- spec/doc_ripper/text_ripper_spec.rb
+- spec/fixtures/lorem.doc
+- spec/fixtures/lorem.docx
+- spec/fixtures/lorem.pdf
+- spec/fixtures/lorem.txt
+- spec/fixtures/missing_file.txt
+- spec/fixtures/some_missing_path.txt
+- spec/spec_helper.rb
 homepage: https://github.com/pzaich/doc_ripper
 licenses:
 - MIT
@@ -69,20 +95,30 @@ require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
-  - - ! '>='
+  - - ">="
     - !ruby/object:Gem::Version
       version: '0'
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
-  - - ! '>='
+  - - ">="
     - !ruby/object:Gem::Version
       version: '0'
 requirements:
 - Antiword
 - pdftotext/poppler
 rubyforge_project:
-rubygems_version: 2.0.3
+rubygems_version: 2.2.2
 signing_key:
 specification_version: 4
 summary: Rip out text from pdf, doc and docx formats
-test_files: []
+test_files:
+- spec/doc_ripper/doc_ripper_spec.rb
+- spec/doc_ripper/ripper/base_spec.rb
+- spec/doc_ripper/text_ripper_spec.rb
+- spec/fixtures/lorem.doc
+- spec/fixtures/lorem.docx
+- spec/fixtures/lorem.pdf
+- spec/fixtures/lorem.txt
+- spec/fixtures/missing_file.txt
+- spec/fixtures/some_missing_path.txt
+- spec/spec_helper.rb