RubyGems - textractor - Versions diffs - 0.0.1 - Mend

textractor 0.0.1

Files changed (16) hide show

data/.document ADDED

@@ -0,0 +1,5 @@
+README.rdoc
+lib/**/*.rb
+bin/*
+features/**/*.feature
+LICENSE

data/.gitignore ADDED

@@ -0,0 +1,21 @@
+## MAC OS
+.DS_Store
+## TEXTMATE
+*.tmproj
+tmtags
+## EMACS
+*~
+\#*
+.\#*
+## VIM
+*.swp
+## PROJECT::GENERAL
+coverage
+rdoc
+pkg
+## PROJECT::SPECIFIC

data/LICENSE ADDED

@@ -0,0 +1,20 @@
+Copyright (c) 2010 Michael Guterl
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/README.md ADDED

@@ -0,0 +1,54 @@
+# textractor
+textractor is a ruby library that provides a simple wrapper for
+extracting text from PDF and Word documents.
+## Setup
+In order to use textractor you have to install a few command line
+tools.
+### OS X
+    port install wv pdftohtml links
+I also recommend passing +no_x11 to the install command, but
+this may not work on all systems due to dependency issues.
+    port install wv pdftohtml links +no_x11
+### Ubuntu 8.04
+    apt-get install wv xpdf-utils links
+## Usage
+Due to textractor's reliance on command line tools all the methods in
+textractor work on paths not File objects.
+    document = Textractor::Document.new(path_to_document)
+    document.text # => "Ruby on rails developer"
+There is also a convenience method on Textractor.
+    Textractor.text_from_file(path_to_document) # => "Ruby on rails developer"
+Textractor will attempt to guess what type of document you're trying
+to extract text from.  However, if you know the content type of your
+document, you can provide it and Textractor won't guess.
+    Textractor.text_from_file(path_to_document, :content_type => "application/doc")
+## Note on Patches/Pull Requests
+* Fork the project.
+* Make your feature addition or bug fix.
+* Add tests for it. This is important so I don't break it in a
+  future version unintentionally.
+* Commit, do not mess with rakefile, version, or history.
+  (if you want to have your own version, that is fine but bump version in a commit by itself I can ignore when I pull)
+* Send me a pull request. Bonus points for topic branches.
+## Copyright
+Copyright (c) 2010 Michael Guterl. See LICENSE for details.

data/Rakefile ADDED

@@ -0,0 +1,45 @@
+# Rake tasks for the textractor gem: gem packaging (via Jeweler), specs,
+# rcov coverage and RDoc generation.
+require 'rubygems'
+require 'rake'
+# Jeweler is optional: when it (or one of its dependencies) cannot be loaded,
+# the LoadError is rescued, a hint is printed and the Jeweler task
+# definitions below are skipped.
+begin
+  require 'jeweler'
+  Jeweler::Tasks.new do |gem|
+    gem.name = "textractor"
+    gem.summary = %Q{simple wrapper for extracting text from PDF and Word documents}
+    gem.description = %Q{simple wrapper for extracting text from PDF and Word documents}
+    gem.email = "mguterl@gmail.com"
+    gem.homepage = "http://github.com/mguterl/textractor"
+    gem.authors = ["Michael Guterl"]
+    gem.add_development_dependency "rspec", ">= 1.2.9"
+    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
+  end
+  Jeweler::GemcutterTasks.new
+rescue LoadError
+  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
+end
+# Spec tasks. This require sits outside the rescue above, so a missing
+# 'spec/rake/spectask' raises LoadError while the Rakefile is being loaded.
+require 'spec/rake/spectask'
+# `rake spec`: runs every spec/**/*_spec.rb with lib/ and spec/ added to the libs.
+Spec::Rake::SpecTask.new(:spec) do |spec|
+  spec.libs << 'lib' << 'spec'
+  spec.spec_files = FileList['spec/**/*_spec.rb']
+end
+# `rake rcov`: same spec files, with rcov enabled.
+Spec::Rake::SpecTask.new(:rcov) do |spec|
+  spec.libs << 'lib' << 'spec'
+  spec.pattern = 'spec/**/*_spec.rb'
+  spec.rcov = true
+end
+# NOTE(review): :check_dependencies is not defined in this file; presumably it
+# is provided by Jeweler::Tasks -- confirm `rake spec` still works when Jeweler
+# is not installed.
+task :spec => :check_dependencies
+task :default => :spec
+# RDoc task: writes to rdoc/ and covers README* plus everything under lib/.
+require 'rake/rdoctask'
+Rake::RDocTask.new do |rdoc|
+  # The title carries the contents of the VERSION file when that file exists.
+  version = File.exist?('VERSION') ? File.read('VERSION') : ""
+  rdoc.rdoc_dir = 'rdoc'
+  rdoc.title = "textractor #{version}"
+  rdoc.rdoc_files.include('README*')
+  rdoc.rdoc_files.include('lib/**/*.rb')
+end

data/VERSION ADDED

	@@ -0,0 +1 @@
1	+ 0.0.1

data/lib/textractor.rb ADDED

@@ -0,0 +1,18 @@
+module Textractor
+  autoload :Document, "textractor/document"
+  def self.text_from_file(filename, options = {})
+    Textractor::Document.new(filename, options).text
+  end
+  DEFAULT_WV_TEXT_PATH = File.expand_path(File.dirname(__FILE__) + "/../support/wvText.xml")
+  def self.wvText_path
+    @wvText_path || DEFAULT_WV_TEXT_PATH
+  end
+  def self.wvText_path=(path)
+    @wvText_path = path
+  end
+end

data/lib/textractor/document.rb ADDED

@@ -0,0 +1,51 @@
+module Textractor
+  class Document
+    CONTENT_TYPE_CONVERSIONS = {
+      'application/pdf'   => :pdf,
+      'application/x-pdf' => :pdf,
+      'application/doc'   => :word,
+      'application/x-doc' => :word,
+    }
+    attr_reader :filename
+    def initialize(filename, options = {})
+      @filename = File.expand_path(filename)
+      @content_type = options[:content_type]
+    end
+    def text
+      send("extract_from_#{type}")
+    end
+    def type
+      return CONTENT_TYPE_CONVERSIONS[content_type] if content_type
+      case File.extname(@filename)
+      when /pdf/
+        :pdf
+      when /doc/
+        :word
+      else
+        nil
+      end
+    end
+    private
+    def content_type
+      @content_type
+    end
+    def extract_from_pdf
+      `pdftotext #{filename} - 2>/dev/null`.strip
+    end
+    def extract_from_word
+      `wvWare -c utf-8 --nographics -x #{Textractor.wvText_path} #{filename} 2>/dev/null`.strip
+    end
+  end
+end

data/spec/document_spec.rb ADDED

@@ -0,0 +1,69 @@
+# Specs for Textractor::Document: construction, text extraction and type
+# detection.
+require 'spec/spec_helper'
+describe Textractor::Document do
+  # Absolute paths to the fixtures, resolved relative to this spec file.
+  PDF_DOCUMENT_FIXTURE  = File.expand_path(File.dirname(__FILE__) + "/fixtures/document.pdf")
+  WORD_DOCUMENT_FIXTURE = File.expand_path(File.dirname(__FILE__) + "/fixtures/document.doc")
+  it 'should require a filename to create' do
+    expect { Textractor::Document.new }.to raise_error(ArgumentError)
+    # The stored filename is the expanded (absolute) form of the argument.
+    Textractor::Document.new('filename').filename.should == File.expand_path('filename')
+  end
+  # Both fixtures are expected to yield the same text.
+  describe "#text" do
+    describe "with pdf document" do
+      it 'should extract the text from the document' do
+        @doc = Textractor::Document.new(PDF_DOCUMENT_FIXTURE)
+        @doc.text.should == "Ruby on rails developer"
+      end
+    end
+    describe "with word document" do
+      it 'should extract the text from the document' do
+        @doc = Textractor::Document.new(WORD_DOCUMENT_FIXTURE)
+        @doc.text.should == "Ruby on rails developer"
+      end
+    end
+  end
+  describe "#type" do
+    # Without a content type the result is derived from the file extension.
+    describe "with no content type provided" do
+      it 'should return :pdf for PDF documents' do
+        @doc = Textractor::Document.new(PDF_DOCUMENT_FIXTURE)
+        @doc.type.should == :pdf
+      end
+      it 'should return :word for Word documents' do
+        @doc = Textractor::Document.new(WORD_DOCUMENT_FIXTURE)
+        @doc.type.should == :word
+      end
+      it 'should return nil for unknown documents' do
+        @doc = Textractor::Document.new("foo.bar")
+        @doc.type.should == nil
+      end
+    end
+    describe "with a content type provided" do
+      # Every known content type is tried against both fixtures: the result
+      # must follow the content type, whatever the file's extension is.
+      it 'should ignore the extension of the file' do
+        [PDF_DOCUMENT_FIXTURE, WORD_DOCUMENT_FIXTURE].each do |filename|
+          Textractor::Document::CONTENT_TYPE_CONVERSIONS.each do |content_type, type|
+            @doc = Textractor::Document.new(filename, :content_type => content_type)
+            @doc.type.should == type
+          end
+        end
+      end
+    end
+  end
+end

data/spec/fixtures/document.doc ADDED

Binary file

data/spec/fixtures/document.pdf ADDED

Binary file

data/spec/spec.opts ADDED

	@@ -0,0 +1 @@
1	+ --color

data/spec/spec_helper.rb ADDED

@@ -0,0 +1,9 @@
+# Spec bootstrap: puts the spec directory and the gem's lib directory at the
+# front of the load path, then loads textractor and the spec runner.
+spec_dir = File.dirname(__FILE__)
+$LOAD_PATH.unshift(spec_dir)
+$LOAD_PATH.unshift(File.join(spec_dir, '..', 'lib'))
+%w[textractor spec spec/autorun].each { |feature| require feature }
+# No runner-wide configuration is set.
+Spec::Runner.configure { |config| }

data/spec/textractor_spec.rb ADDED

@@ -0,0 +1,32 @@
+# Specs for the module-level API: the wvText_path setting and the
+# text_from_file convenience method.
+require 'spec/spec_helper'
+describe Textractor do
+  describe ".wvText_path" do
+    it 'should default to the file provided with the gem' do
+      Textractor.wvText_path.should == Textractor::DEFAULT_WV_TEXT_PATH
+    end
+    # This assignment changes module-level state; it is undone by the
+    # after(:all) hook at the bottom of this group.
+    it 'should use the new wvText_path if provided' do
+      Textractor.wvText_path = "foo.bar"
+      Textractor.wvText_path.should == "foo.bar"
+    end
+  end
+  describe ".text_from_file" do
+    # Document.new is replaced by a mock expectation, so the real Document is
+    # never built; the example only checks that the arguments are forwarded
+    # and that the document's text is returned.
+    it 'should return the extracted text from the file' do
+      document_path = 'word.doc'
+      document = mock("Textractor::Document", :text => "Ruby on Rails developer")
+      Textractor::Document.should_receive(:new).with(document_path, :content_type => "application/doc").and_return(document)
+      Textractor.text_from_file(document_path, :content_type => "application/doc").should == "Ruby on Rails developer"
+    end
+  end
+  # Clears the override once the group has run, so Textractor.wvText_path
+  # falls back to DEFAULT_WV_TEXT_PATH again.
+  after(:all) do
+    Textractor.instance_variable_set(:"@wvText_path", nil)
+  end
+end

data/support/wvText.xml ADDED

@@ -0,0 +1,355 @@
+<main>
+<charentity>
+<begin>ABW</begin>
+</charentity>
+<document>
+<begin>
+</begin>
+<end>
+</end>
+</document>
+<section>
+<begin>
+</begin>
+<end>
+</end>
+</section>
+<justification>
+<left></left>
+<right></right>
+<center></center>
+<block></block>
+<asian></asian>
+</justification>
+<numbering>
+<Arabic>type=&quot;1&quot;</Arabic>
+<UpperRoman>type=&quot;I&quot;</UpperRoman>
+<LowerRoman>type=&quot;i&quot;</LowerRoman>
+<UpperCaseN>type=&quot;A&quot;</UpperCaseN>
+<LowerCaseN>type=&quot;a&quot;</LowerCaseN>
+</numbering>
+<border>
+<noned></noned>
+<singled></singled>
+<thickd></thickd>
+<doubled></doubled>
+<number4d></number4d>
+<hairlined></hairlined>
+<dotd></dotd>
+<dashlargegapd></dashlargegapd>
+<dotdashd></dotdashd>
+<dotdotdashd></dotdotdashd>
+<tripled></tripled>
+<thin-thicksmallgapd></thin-thicksmallgapd>
+<thick-thinsmallgapd></thick-thinsmallgapd>
+<thin-thick-thinsmallgapd></thin-thick-thinsmallgapd>
+<thin-thickmediumgapd></thin-thickmediumgapd>
+<thick-thinmediumgapd></thick-thinmediumgapd>
+<thin-thick-thinmediumgapd></thin-thick-thinmediumgapd>
+<thin-thicklargegapd></thin-thicklargegapd>
+<thick-thinlargegapd></thick-thinlargegapd>
+<thin-thick-thinlargegapd></thin-thick-thinlargegapd>
+<waved></waved>
+<doublewaved></doublewaved>
+<dashsmallgapd></dashsmallgapd>
+<dashdotstrokedd></dashdotstrokedd>
+<emboss3Dd></emboss3Dd>
+<engrave3Dd></engrave3Dd>
+<defaultd></defaultd>
+</border>
+<olist>
+<begin></begin>
+<end></end>
+</olist>
+<ulist>
+<begin></begin>
+<end></end>
+</ulist>
+<entry>
+<begin></begin>
+<end></end>
+</entry>
+<!-- the only thing of significance -->
+<text>
+<begin></begin>
+<end>
+</end>
+</text>
+<!--
+this tableoverride option can be used to turn off handling of
+these tags in tables, which I find is necessary for at least netscape
+-->
+<tableoverrides>
+<ParaBefore>0</ParaBefore>
+<ParaRight>0</ParaRight>
+<ParaAfter>0</ParaAfter>
+<ParaLeft>0</ParaLeft>
+<ParaLeft1>0</ParaLeft1>
+<VertMergedCells>0</VertMergedCells>
+</tableoverrides>
+<table>
+<begin></begin>
+<end></end>
+</table>
+<row>
+<begin></begin>
+<end></end>
+</row>
+<cell>
+<begin></begin>
+<end></end>
+</cell>
+<paragraph>
+<begin><text.begin/></begin>
+<end><text.end/></end>
+</paragraph>
+<!-- these are all the character properties that can show up in word -->
+<bold><begin></begin><end></end></bold>
+<italic><begin></begin><end></end></italic>
+<!--
+text that has been deleted and will be displayed with strikethrough when
+revision marked text is to be displayed
+use either this line...
+-->
+<RMarkDel><begin></begin>
+<end></end>
+</RMarkDel>
+<!--
+or uncomment below to make deleted text disappear (well, become commented out)
+-->
+<!--
+<RMarkDel><begin>&lt;!-&#45;</begin><end>-&#45;&gt;</end></RMarkDel>
+-->
+<!-- I don't even know what outline means -->
+<outline><begin></begin><end></end></outline>
+<smallcaps><begin></begin><end></end></smallcaps>
+<caps><begin></begin><end></end></caps>
+<vanish><begin></begin><end></end></vanish>
+<!--If you uncomment this then the annotation text links will become commented out by html tags-->
+<!--
+<vanish><begin>&lt;!-&#45;</begin><end>-&#45;&gt;</end></vanish>
+-->
+<!--
+text that has been newly typed since the last time revision marks have been accepted
+and will be displayed with underline when revision marked text is to be displayed
+use either this line...
+-->
+<RMark><begin></begin><end></end></RMark>
+<!--
+or uncomment below to make the underline disappear
+-->
+<!--
+<RMark><begin></begin><end></end></RMark>
+-->
+<strike><begin></begin><end></end></strike>
+<shadow><begin></begin><end></end></shadow>
+<lowercase><begin></begin><end></end></lowercase>
+<emboss><begin></begin><end></end></emboss>
+<imprint><begin></begin><end></end></imprint>
+<!--double strike-->
+<dstrike><begin></begin><end></end></dstrike>
+<!--
+ftc's
+&
+hps
+keep them for font face and do that later.
+-->
+<super><begin></begin><end></end></super>
+<sub><begin></begin><end></end></sub>
+<singleu><begin></begin><end></end></singleu>
+<wordu><begin></begin><end></end></wordu>
+<doubleu><begin></begin><end></end></doubleu>
+<dottedu><begin></begin><end></end></dottedu>
+<hiddenu><begin></begin><end></end></hiddenu>
+<thicku><begin></begin><end></end></thicku>
+<dashu><begin></begin><end></end></dashu>
+<dotu><begin></begin><end></end></dotu>
+<dotdashu><begin></begin><end></end></dotdashu>
+<dotdotdashu><begin></begin><end></end></dotdotdashu>
+<waveu><begin></begin><end></end></waveu>
+<!--
+text whose properties have been changed since the last time revision marks have been accepted
+and will be displayed with a note showing the change points.
+use either this line (which admit it a bit scary looking, but harmless)...
+-->
+<PropRMark><begin><ibstPropRMark/></begin><end></end></PropRMark>
+<!--
+or uncomment below to make the notes disappear
+-->
+<!--
+<PropRMark><begin></begin><end></end></PropRMark>
+-->
+<!--
+<color>
+-->
+<Black><begin></begin><end></end></Black>
+<Blue><begin></begin><end></end></Blue>
+<Cyan><begin></begin><end></end></Cyan>
+<Green><begin></begin><end></end></Green>
+<Magenta><begin></begin><end></end></Magenta>
+<Red><begin></begin><end></end></Red>
+<Yellow><begin></begin><end></end></Yellow>
+<White><begin></begin><end></end></White>
+<DkBlue><begin></begin><end></end></DkBlue>
+<DkCyan><begin></begin><end></end></DkCyan>
+<DkGreen><begin></begin><end></end></DkGreen>
+<DkMagenta><begin></begin><end></end></DkMagenta>
+<DkRed><begin></begin><end></end></DkRed>
+<DkYellow><begin></begin><end></end></DkYellow>
+<DkGray><begin></begin><end></end></DkGray>
+<LtGray><begin></begin><end></end></LtGray>
+<!--
+</color>
+-->
+<!--
+<animation>
+-->
+<LasVegas><begin></begin><end></end></LasVegas>
+<BackgroundBlink><begin></begin><end></end></BackgroundBlink>
+<SparkleText><begin></begin><end></end></SparkleText>
+<MarchingAnts><begin></begin><end></end></MarchingAnts>
+<MarchingRedAnts><begin></begin><end></end></MarchingRedAnts>
+<Shimmer><begin></begin><end></end></Shimmer>
+<!--
+</animation>
+-->
+<!--
+I dont understand what this one is, and ive never come across it
+use this sample line (which admit it a bit scary looking, but harmless)...
+-->
+<DispFldRMark><begin></begin><end></end></DispFldRMark>
+<!--
+or uncomment below to ignore it, the previous might even crash wv ?
+-->
+<!--
+<DispFldRMark><begin></begin><end></end></DispFldRMark>
+-->
+<animation>
+<begin><LasVegas.begin/><BackgroundBlink.begin/><SparkleText.begin/><MarchingAnts.begin/><MarchingRedAnts.begin/><Shimmer.begin/></begin>
+<end><Shimmer.end/><MarchingRedAnts.end/><MarchingAnts.end/><SparkleText.end/><BackgroundBlink.end/><LasVegas.end/></end>
+</animation>
+<fontstr>
+<begin></begin>
+<end></end>
+</fontstr>
+<comment>
+<begin>
+</begin>
+<end>
+</end>
+</comment>
+<style name="Normal">
+<character>
+<begin><PropRMark.begin/><DispFldRMark.begin/><animation.begin/><fontstr.begin/><bold.begin/><italic.begin/><strike.begin/><RMarkDel.begin/><outline.begin/><smallcaps.begin/><caps.begin/><vanish.begin/><RMark.begin/><shadow.begin/><lowercase.begin/><emboss.begin/><imprint.begin/><dstrike.begin/><super.begin/><sub.begin/><singleu.begin/><wordu.begin/><doubleu.begin/><dottedu.begin/><hiddenu.begin/><thicku.begin/><dashu.begin/><dotu.begin/><dotdashu.begin/><dotdotdashu.begin/><waveu.begin/></begin>
+<end><waveu.end/><dotdotdashu.end/><dotdashu.end/><dotu.end/><dashu.end/><thicku.end/><hiddenu.end/><dottedu.end/><doubleu.end/><wordu.end/><singleu.end/><sub.end/><super.end/><dstrike.end/><imprint.end/><emboss.end/><lowercase.end/><shadow.end/><RMark.end/><vanish.end/><caps.end/><smallcaps.end/><outline.end/><RMarkDel.end/><strike.end/><italic.end/><bold.end/><fontstr.end/><animation.end/><DispFldRMark.end/><PropRMark.end/></end>
+</character>
+<!-- Netscape does not handle this correctly yet; here is how each side of the border should work.
+border-top: thin <bordertopstyle/> <bordertopcolor/>;
+border-left: thin <borderleftstyle/> <borderleftcolor/>;
+border-right: thin <borderrightstyle/> <borderrightcolor/>;
+border-bottom: thin <borderbottomstyle/> <borderbottomcolor/>
+-->
+<pmargin>
+<begin><!-- <mmParaBefore/> <mmParaRight/> <mmParaAfter/> <mmParaLeft/>;--></begin>
+</pmargin>
+<pborder>
+<begin>
+<!--
+border: thin <borderleftstyle/> <borderleftcolor/>;
+border-top: thin <bordertopstyle/> <bordertopcolor/>;
+border-left: thin <borderleftstyle/> <borderleftcolor/>;
+border-right: thin <borderrightstyle/> <borderrightcolor/>;
+border-bottom: thin <borderbottomstyle/> <borderbottomcolor/>
+-->
+</begin>
+</pborder>
+<picture>
+<begin>
+</begin>
+<!-- images are lacking for now -->
+</picture>
+</style>
+<!--we need to be able to override the character properties-->
+<!--
+<style name="Normal">
+<character>
+<begin><PropRMark.begin/><DispFldRMark.begin/><animation.begin/><fontstr.begin/><bold.begin/><italic.begin/><strike.begin/><RMarkDel.begin/><outline.begin/><smallcaps.begin/><caps.begin/><vanish.begin/><RMark.begin/><shadow.begin/><lowercase.begin/><emboss.begin/><imprint.begin/><dstrike.begin/><super.begin/><sub.begin/><singleu.begin/><wordu.begin/><doubleu.begin/><dottedu.begin/><hiddenu.begin/><thicku.begin/><dashu.begin/><dotu.begin/><dotdashu.begin/><dotdotdashu.begin/><waveu.begin/></begin>
+<end><waveu.end/><dotdotdashu.end/><dotdashu.end/><dotu.end/><dashu.end/><thicku.end/><hiddenu.end/><dottedu.end/><doubleu.end/><wordu.end/><singleu.end/><sub.end/><super.end/><dstrike.end/><imprint.end/><emboss.end/><lowercase.end/><shadow.end/><RMark.end/><vanish.end/><caps.end/><smallcaps.end/><outline.end/><RMarkDel.end/><strike.end/><italic.end/><bold.end/><fontstr.end/><animation.end/><DispFldRMark.end/><PropRMark.end/></end>
+</character>
+<text>
+<begin></begin>
+<end>
+</end>
+</text>
+</style>
+<style name="Heading 1">
+<character>
+<begin></begin>
+<end></end>
+</character>
+<text>
+<begin></begin>
+<end>
+</end>
+</text>
+</style>
+-->
+</main>

metadata ADDED

@@ -0,0 +1,92 @@
+--- !ruby/object:Gem::Specification
+name: textractor
+version: !ruby/object:Gem::Version
+  prerelease: false
+  segments:
+  - 0
+  - 0
+  - 1
+  version: 0.0.1
+platform: ruby
+authors:
+- Michael Guterl
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2010-04-20 00:00:00 -04:00
+default_executable:
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: rspec
+  prerelease: false
+  requirement: &id001 !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        segments:
+        - 1
+        - 2
+        - 9
+        version: 1.2.9
+  type: :development
+  version_requirements: *id001
+description: simple wrapper for extracting text from PDF and Word documents
+email: mguterl@gmail.com
+executables: []
+extensions: []
+extra_rdoc_files:
+- LICENSE
+- README.md
+files:
+- .document
+- .gitignore
+- LICENSE
+- README.md
+- Rakefile
+- VERSION
+- lib/textractor.rb
+- lib/textractor/document.rb
+- spec/document_spec.rb
+- spec/fixtures/document.doc
+- spec/fixtures/document.pdf
+- spec/spec.opts
+- spec/spec_helper.rb
+- spec/textractor_spec.rb
+- support/wvText.xml
+has_rdoc: true
+homepage: http://github.com/mguterl/textractor
+licenses: []
+post_install_message:
+rdoc_options:
+- --charset=UTF-8
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      segments:
+      - 0
+      version: "0"
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      segments:
+      - 0
+      version: "0"
+requirements: []
+rubyforge_project:
+rubygems_version: 1.3.6
+signing_key:
+specification_version: 3
+summary: simple wrapper for extracting text from PDF and Word documents
+test_files:
+- spec/document_spec.rb
+- spec/spec_helper.rb
+- spec/textractor_spec.rb