RubyGems - textractor - Versions diffs - 0.1.2 → 0.1.3 - Mend

textractor 0.1.2 → 0.1.3

Files changed (9) hide show

data/Gemfile.lock +1 -1
data/README.md +36 -11
data/bin/textractor +6 -0
data/lib/textractor.rb +10 -13
data/lib/textractor/content_type_detector.rb +9 -0
data/lib/textractor/mimetype_fu_content_type_detector.rb +11 -0
data/lib/textractor/simple_content_type_detector.rb +20 -0
data/lib/textractor/version.rb +1 -1
metadata +9 -5

data/Gemfile.lock CHANGED

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    textractor (0.1.2)
+    textractor (0.1.3)
 GEM
   remote: http://rubygems.org/

data/README.md CHANGED

@@ -1,21 +1,18 @@
 # textractor
-textractor is a ruby library that provides a simple wrapper around CLI
-tools for extracting text from PDF and Word documents.
+textractor is a ruby library that provides a simple wrapper around CLI tools for extracting text from PDF and Word documents.
 ## Setup
     gem install textractor
-In order to use textractor you have to install a few command line
-tools.
+In order to use textractor you have to install a few command line tools.
 ### OS X
     port install wv xpdf links
-I recommend using also passing +no_x11 to the install command, but
-this may not work on all systems due to dependency issues.
+I also recommend passing +no_x11 to the install command, but this may not work on all systems due to dependency issues.
     port install wv xpdf links +no_x11
@@ -23,19 +20,47 @@ this may not work on all systems due to dependency issues.
     apt-get install wv xpdf-utils links
+### Optional mimetype-fu
+    gem install mimetype-fu
+If you plan on using more than the default extractors it is a good idea to install mimetype-fu.  This will allow much more robust content type detection.
 ## Usage
-Due to textractor's reliance on command line tools all the methods in
-textractor work on paths not File objects.
+### Basics
+Due to textractor's reliance on command line tools, all the methods in textractor work on paths, not File objects.
     Textractor.text_from_path(path_to_document) # => "Ruby on rails developer"
-Textractor will attempt to guess what type of document you're trying
-to extract text from.  However, if you know the content type of your
-document, you can provide it and Textractor won't guess.
+Textractor will attempt to guess what type of document you're trying to extract text from.  However, if you know the content type of your document, you can provide it and Textractor won't guess.
     Textractor.text_from_path(path_to_document, :content_type => "application/doc")
+### Custom Extractors
+It's possible to define additional extractors for additional content types.  An extractor only has to respond to a single method, `text_from_path`.
+    class HTMLExtractor < Textractor::Extractors::TextExtractor
+      def text_from_path(path)
+        document = Nokogiri::HTML(super)
+        document.text
+      end
+    end
+    Textractor.register_content_type("text/html", HTMLExtractor)
+You can also remove a content type extractor:
+    Textractor.remove_content_type("text/html")
+Or clear out all known extractors:
+    Textractor.clear_registry
 ## TODO
 * Remove vendored docx2txt perl script

data/bin/textractor ADDED

@@ -0,0 +1,6 @@
+#!/usr/bin/env ruby
+require 'rubygems'
+require 'textractor'
+puts Textractor.text_from_path(File.expand_path(ARGV[0]))

data/lib/textractor.rb CHANGED

@@ -5,7 +5,9 @@ module Textractor
   ContentTypeAlreadyRegistered = Class.new(StandardError)
   ContentTypeNotRegistered     = Class.new(StandardError)
-  autoload :Extractors, "textractor/extractors"
+  autoload :Extractors,                    'textractor/extractors'
+  autoload :SimpleContentTypeDetector,     'textractor/simple_content_type_detector'
+  autoload :MimetypeFuContentTypeDetector, 'textractor/mimetype_fu_content_type_detector'
   def self.text_from_path(path, options = {})
     raise FileNotFound unless File.exists?(path)
@@ -16,19 +18,12 @@ module Textractor
     extractor.text_from_path(path)
   end
+  class << self
+    attr_accessor :content_type_detector
+  end
   def self.content_type_for_path(path)
-    case File.extname(path)
-    when /\.pdf$/
-      'application/pdf'
-    when /\.doc$/
-      'application/msword'
-    when /\.docx$/
-      'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
-    when /\.txt$/
-      'text/plain'
-    else
-      raise UnknownContentType, "unable to determine content type for #{path}"
-    end
+    content_type_detector.content_type_for_path(path) or raise UnknownContentType, "unable to determine content type for #{path}"
   end
   def self.register_content_type(content_type, extractor)
@@ -62,3 +57,5 @@ module Textractor
   register_basic_types
 end
+require 'textractor/content_type_detector'

data/lib/textractor/content_type_detector.rb ADDED

@@ -0,0 +1,9 @@
+begin
+  require 'rubygems'
+  require 'yaml'
+  require 'mimetype_fu'
+  Textractor.content_type_detector = Textractor::MimetypeFuContentTypeDetector
+rescue LoadError => e
+  Textractor.content_type_detector = Textractor::SimpleContentTypeDetector
+end

data/lib/textractor/mimetype_fu_content_type_detector.rb ADDED

@@ -0,0 +1,11 @@
+module Textractor
+  class MimetypeFuContentTypeDetector
+    def self.content_type_for_path(path)
+      File.mime_type?(path)
+    end
+  end
+end

data/lib/textractor/simple_content_type_detector.rb ADDED

@@ -0,0 +1,20 @@
+module Textractor
+  class SimpleContentTypeDetector
+    def self.content_type_for_path(path)
+      case File.extname(path)
+      when /\.pdf$/
+        'application/pdf'
+      when /\.doc$/
+        'application/msword'
+      when /\.docx$/
+        'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
+      when /\.txt$/
+        'text/plain'
+      end
+    end
+  end
+end

data/lib/textractor/version.rb CHANGED

@@ -1,3 +1,3 @@
 module Textractor
-  VERSION = '0.1.2'
+  VERSION = '0.1.3'
 end

metadata CHANGED

@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: textractor
 version: !ruby/object:Gem::Version
-  hash: 31
+  hash: 29
   prerelease: false
   segments:
   - 0
   - 1
-  - 2
-  version: 0.1.2
+  - 3
+  version: 0.1.3
 platform: ruby
 authors:
 - Michael Guterl
@@ -53,8 +53,8 @@ dependencies:
 description: simple wrapper around CLI for extracting text from PDF and Word documents
 email:
 - michael@diminishing.org
-executables: []
+executables:
+- textractor
 extensions: []
 extra_rdoc_files:
@@ -68,13 +68,17 @@ files:
 - LICENSE
 - README.md
 - Rakefile
+- bin/textractor
 - lib/textractor.rb
+- lib/textractor/content_type_detector.rb
 - lib/textractor/extractors.rb
 - lib/textractor/extractors/doc_extractor.rb
 - lib/textractor/extractors/docx_extractor.rb
 - lib/textractor/extractors/pdf_extractor.rb
 - lib/textractor/extractors/text_extractor.rb
 - lib/textractor/extractors/word_extractor.rb
+- lib/textractor/mimetype_fu_content_type_detector.rb
+- lib/textractor/simple_content_type_detector.rb
 - lib/textractor/version.rb
 - spec/fixtures/document.doc
 - spec/fixtures/document.docx