RubyGems - textractor - Versions diffs - 0.1.3 → 0.1.4 - Mend

textractor 0.1.3 → 0.1.4

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Files changed (10)

data/Gemfile.lock +1 -1
data/README.md +13 -1
data/lib/textractor.rb +13 -8
data/lib/textractor/content_type_detector.rb +7 -2
data/lib/textractor/{mimetype_fu_content_type_detector.rb → content_type_detector/mimetype_fu.rb} +2 -2
data/lib/textractor/{simple_content_type_detector.rb → content_type_detector/simple.rb} +2 -2
data/lib/textractor/version.rb +1 -1
data/spec/textractor_spec.rb +10 -0
metadata +5 -6
data/lib/textractor/extractors/word_extractor.rb +0 -29

data/Gemfile.lock CHANGED

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    textractor (0.1.3)
+    textractor (0.1.4)
 GEM
   remote: http://rubygems.org/

data/README.md CHANGED

@@ -20,7 +20,11 @@ I recommend using also passing +no_x11 to the install command, but this may not
     apt-get install wv xpdf-utils links
-### Optional mimetype-fu
+### Perl (*sigh*)
+Yes, this is slightly ridiculous, but a working perl installation is required in order to extract text from a docx file.
+### mimetype-fu (optional)
     gem install mimetype-fu
@@ -53,6 +57,14 @@ It's possible to define additional extractors for additional content types.  An
     Textractor.register_content_type("text/html", HTMLExtractor)
+It is also possible to use a block as a simple content type extractor:
+    Textractor.register_content_type("text/html") do |path|
+      data = File.read(path)
+      document = Nokogiri::HTML(data)
+      document.text
+    end
 You can also remove a content type extractor:
     Textractor.remove_content_type("text/html")

data/lib/textractor.rb CHANGED

@@ -5,17 +5,18 @@ module Textractor
   ContentTypeAlreadyRegistered = Class.new(StandardError)
   ContentTypeNotRegistered     = Class.new(StandardError)
-  autoload :Extractors,                    'textractor/extractors'
-  autoload :SimpleContentTypeDetector,     'textractor/simple_content_type_detector'
-  autoload :MimetypeFuContentTypeDetector, 'textractor/mimetype_fu_content_type_detector'
+  autoload :Extractors, 'textractor/extractors'
   def self.text_from_path(path, options = {})
     raise FileNotFound unless File.exists?(path)
     content_type    = options.fetch(:content_type) { content_type_for_path(path) }
-    extractor_class = extractor_for_content_type(content_type)
-    extractor       = extractor_class.new
+    extractor       = extractor_for_content_type(content_type)
-    extractor.text_from_path(path)
+    if extractor.is_a?(Proc)
+      extractor.call(path)
+    else
+      extractor.new.text_from_path(path)
+    end
   end
   class << self
@@ -26,9 +27,13 @@ module Textractor
     content_type_detector.content_type_for_path(path) or raise UnknownContentType, "unable to determine content type for #{path}"
   end
-  def self.register_content_type(content_type, extractor)
+  def self.register_content_type(content_type, extractor = nil, &block)
     raise ContentTypeAlreadyRegistered, "#{content_type} is already registered" if extractors[content_type]
-    extractors[content_type] = extractor
+    if extractor
+      extractors[content_type] = extractor
+    elsif block_given?
+      extractors[content_type] = block
+    end
   end
   def self.remove_content_type(content_type)

data/lib/textractor/content_type_detector.rb CHANGED

@@ -1,9 +1,14 @@
+module Textractor::ContentTypeDetector
+  autoload :Simple,     'textractor/content_type_detector/simple'
+  autoload :MimetypeFu, 'textractor/content_type_detector/mimetype_fu'
+end
 begin
   require 'rubygems'
   require 'yaml'
   require 'mimetype_fu'
-  Textractor.content_type_detector = Textractor::MimetypeFuContentTypeDetector
+  Textractor.content_type_detector = Textractor::ContentTypeDetector::MimetypeFu
 rescue LoadError => e
-  Textractor.content_type_detector = Textractor::SimpleContentTypeDetector
+  Textractor.content_type_detector = Textractor::ContentTypeDetector::Simple
 end

data/lib/textractor/{mimetype_fu_content_type_detector.rb → content_type_detector/mimetype_fu.rb} RENAMED

@@ -1,6 +1,6 @@
-module Textractor
+module Textractor::ContentTypeDetector
-  class MimetypeFuContentTypeDetector
+  class MimetypeFu
     def self.content_type_for_path(path)
       File.mime_type?(path)

data/lib/textractor/{simple_content_type_detector.rb → content_type_detector/simple.rb} RENAMED

@@ -1,6 +1,6 @@
-module Textractor
+module Textractor::ContentTypeDetector
-  class SimpleContentTypeDetector
+  class Simple
     def self.content_type_for_path(path)
       case File.extname(path)

data/lib/textractor/version.rb CHANGED

@@ -1,3 +1,3 @@
 module Textractor
-  VERSION = '0.1.3'
+  VERSION = '0.1.4'
 end

data/spec/textractor_spec.rb CHANGED

@@ -47,6 +47,16 @@ describe Textractor do
       }.to raise_error(Textractor::ContentTypeAlreadyRegistered)
     end
+    it 'takes a block for simple cases' do
+      File.stub(:exists?).and_return(true)
+      Textractor.stub(:content_type_for_path).and_return('test')
+      Textractor.register_content_type('test') do |path|
+        path
+      end
+      Textractor.text_from_path('document').should == 'document'
+    end
   end
   describe ".extractor_for_content_type" do

metadata CHANGED

@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: textractor
 version: !ruby/object:Gem::Version
-  hash: 29
+  hash: 19
   prerelease: false
   segments:
   - 0
   - 1
-  - 3
-  version: 0.1.3
+  - 4
+  version: 0.1.4
 platform: ruby
 authors:
 - Michael Guterl
@@ -71,14 +71,13 @@ files:
 - bin/textractor
 - lib/textractor.rb
 - lib/textractor/content_type_detector.rb
+- lib/textractor/content_type_detector/mimetype_fu.rb
+- lib/textractor/content_type_detector/simple.rb
 - lib/textractor/extractors.rb
 - lib/textractor/extractors/doc_extractor.rb
 - lib/textractor/extractors/docx_extractor.rb
 - lib/textractor/extractors/pdf_extractor.rb
 - lib/textractor/extractors/text_extractor.rb
-- lib/textractor/extractors/word_extractor.rb
-- lib/textractor/mimetype_fu_content_type_detector.rb
-- lib/textractor/simple_content_type_detector.rb
 - lib/textractor/version.rb
 - spec/fixtures/document.doc
 - spec/fixtures/document.docx

data/lib/textractor/extractors/word_extractor.rb DELETED

@@ -1,29 +0,0 @@
-module Textractor::Extractors
-  class WordExtractor
-    DEFAULT_WV_TEXT_PATH = File.expand_path(File.dirname(__FILE__) + "/../../../support/wvText.xml").freeze
-    class << self
-      attr_writer :wvText_path
-      def wvText_path
-        @wvText_path || DEFAULT_WV_TEXT_PATH
-      end
-    end
-    def text_from_path(path)
-      command = "wvWare -c utf-8 --nographics -x #{wvText_path} #{path}"
-      puts command if $DEBUG
-      `#{command}`.strip
-    end
-    private
-    def wvText_path
-      self.class.wvText_path
-    end
-  end
-end