RubyGems - textractor - Versions diffs - 0.1.3 → 0.1.4 - Mend

textractor 0.1.3 → 0.1.4

Files changed (10)

data/Gemfile.lock +1 -1
data/README.md +13 -1
data/lib/textractor.rb +13 -8
data/lib/textractor/content_type_detector.rb +7 -2
data/lib/textractor/{mimetype_fu_content_type_detector.rb → content_type_detector/mimetype_fu.rb} +2 -2
data/lib/textractor/{simple_content_type_detector.rb → content_type_detector/simple.rb} +2 -2
data/lib/textractor/version.rb +1 -1
data/spec/textractor_spec.rb +10 -0
metadata +5 -6
data/lib/textractor/extractors/word_extractor.rb +0 -29

data/Gemfile.lock CHANGED

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    textractor (0.1.3)
+    textractor (0.1.4)
 GEM
   remote: http://rubygems.org/

data/README.md CHANGED

@@ -20,7 +20,11 @@ I recommend also passing +no_x11 to the install command, but this may not
     apt-get install wv xpdf-utils links
-### Optional mimetype-fu
+### Perl (*sigh*)
+Yes, this is slightly ridiculous, but a working perl installation is required in order to extract text from a docx file.
+### mimetype-fu (optional)
     gem install mimetype-fu
@@ -53,6 +57,14 @@ It's possible to define additional extractors for additional content types.  An
     Textractor.register_content_type("text/html", HTMLExtractor)
+It is also possible to use a block as a simple content type extractor:
+    Textractor.register_content_type("text/html") do |path|
+      data = File.read(path)
+      document = Nokogiri::HTML(data)
+      document.text
+    end
 You can also remove a content type extractor:
     Textractor.remove_content_type("text/html")

data/lib/textractor.rb CHANGED

@@ -5,17 +5,18 @@ module Textractor
   ContentTypeAlreadyRegistered = Class.new(StandardError)
   ContentTypeNotRegistered     = Class.new(StandardError)
-  autoload :Extractors,                    'textractor/extractors'
-  autoload :SimpleContentTypeDetector,     'textractor/simple_content_type_detector'
-  autoload :MimetypeFuContentTypeDetector, 'textractor/mimetype_fu_content_type_detector'
+  autoload :Extractors, 'textractor/extractors'
   def self.text_from_path(path, options = {})
     raise FileNotFound unless File.exists?(path)
     content_type    = options.fetch(:content_type) { content_type_for_path(path) }
-    extractor_class = extractor_for_content_type(content_type)
-    extractor       = extractor_class.new
+    extractor       = extractor_for_content_type(content_type)
-    extractor.text_from_path(path)
+    if extractor.is_a?(Proc)
+      extractor.call(path)
+    else
+      extractor.new.text_from_path(path)
+    end
   end
   class << self
@@ -26,9 +27,13 @@ module Textractor
     content_type_detector.content_type_for_path(path) or raise UnknownContentType, "unable to determine content type for #{path}"
   end
-  def self.register_content_type(content_type, extractor)
+  def self.register_content_type(content_type, extractor = nil, &block)
     raise ContentTypeAlreadyRegistered, "#{content_type} is already registered" if extractors[content_type]
-    extractors[content_type] = extractor
+    if extractor
+      extractors[content_type] = extractor
+    elsif block_given?
+      extractors[content_type] = block
+    end
   end
   def self.remove_content_type(content_type)

data/lib/textractor/content_type_detector.rb CHANGED

@@ -1,9 +1,14 @@
+module Textractor::ContentTypeDetector
+  autoload :Simple,     'textractor/content_type_detector/simple'
+  autoload :MimetypeFu, 'textractor/content_type_detector/mimetype_fu'
+end
 begin
   require 'rubygems'
   require 'yaml'
   require 'mimetype_fu'
-  Textractor.content_type_detector = Textractor::MimetypeFuContentTypeDetector
+  Textractor.content_type_detector = Textractor::ContentTypeDetector::MimetypeFu
 rescue LoadError => e
-  Textractor.content_type_detector = Textractor::SimpleContentTypeDetector
+  Textractor.content_type_detector = Textractor::ContentTypeDetector::Simple
 end

data/lib/textractor/{mimetype_fu_content_type_detector.rb → content_type_detector/mimetype_fu.rb} RENAMED

@@ -1,6 +1,6 @@
-module Textractor
+module Textractor::ContentTypeDetector
-  class MimetypeFuContentTypeDetector
+  class MimetypeFu
     def self.content_type_for_path(path)
       File.mime_type?(path)

data/lib/textractor/{simple_content_type_detector.rb → content_type_detector/simple.rb} RENAMED

@@ -1,6 +1,6 @@
-module Textractor
+module Textractor::ContentTypeDetector
-  class SimpleContentTypeDetector
+  class Simple
     def self.content_type_for_path(path)
       case File.extname(path)

data/lib/textractor/version.rb CHANGED

@@ -1,3 +1,3 @@
 module Textractor
-  VERSION = '0.1.3'
+  VERSION = '0.1.4'
 end

data/spec/textractor_spec.rb CHANGED

@@ -47,6 +47,16 @@ describe Textractor do
       }.to raise_error(Textractor::ContentTypeAlreadyRegistered)
     end
+    it 'takes a block for simple cases' do
+      File.stub(:exists?).and_return(true)
+      Textractor.stub(:content_type_for_path).and_return('test')
+      Textractor.register_content_type('test') do |path|
+        path
+      end
+      Textractor.text_from_path('document').should == 'document'
+    end
   end
   describe ".extractor_for_content_type" do

metadata CHANGED

@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: textractor
 version: !ruby/object:Gem::Version
-  hash: 29
+  hash: 19
   prerelease: false
   segments:
   - 0
   - 1
-  - 3
-  version: 0.1.3
+  - 4
+  version: 0.1.4
 platform: ruby
 authors:
 - Michael Guterl
@@ -71,14 +71,13 @@ files:
 - bin/textractor
 - lib/textractor.rb
 - lib/textractor/content_type_detector.rb
+- lib/textractor/content_type_detector/mimetype_fu.rb
+- lib/textractor/content_type_detector/simple.rb
 - lib/textractor/extractors.rb
 - lib/textractor/extractors/doc_extractor.rb
 - lib/textractor/extractors/docx_extractor.rb
 - lib/textractor/extractors/pdf_extractor.rb
 - lib/textractor/extractors/text_extractor.rb
-- lib/textractor/extractors/word_extractor.rb
-- lib/textractor/mimetype_fu_content_type_detector.rb
-- lib/textractor/simple_content_type_detector.rb
 - lib/textractor/version.rb
 - spec/fixtures/document.doc
 - spec/fixtures/document.docx

data/lib/textractor/extractors/word_extractor.rb DELETED

@@ -1,29 +0,0 @@
-module Textractor::Extractors
-  class WordExtractor
-    DEFAULT_WV_TEXT_PATH = File.expand_path(File.dirname(__FILE__) + "/../../../support/wvText.xml").freeze
-    class << self
-      attr_writer :wvText_path
-      def wvText_path
-        @wvText_path || DEFAULT_WV_TEXT_PATH
-      end
-    end
-    def text_from_path(path)
-      command = "wvWare -c utf-8 --nographics -x #{wvText_path} #{path}"
-      puts command if $DEBUG
-      `#{command}`.strip
-    end
-    private
-    def wvText_path
-      self.class.wvText_path
-    end
-  end
-end