RubyGems - xtotxt - Versions diffs - 0.1 - Mend

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

data/.rvmrc ADDED Viewed

	@@ -0,0 +1 @@
1	+ rvm ree@topprospect

data/Rakefile ADDED Viewed

@@ -0,0 +1,15 @@
# -*- encoding: utf-8 -*-
# Rake tasks for the gem: Bundler's gem tasks plus an RSpec 1.x spec task,
# which is also the default task.
require 'rubygems'
require 'bundler/setup'

# Requiring bundler/gem_tasks already calls Bundler::GemHelper.install_tasks.
# Calling it again would register every gem task action a second time, so the
# explicit call is intentionally omitted.
require 'bundler/gem_tasks'

require 'spec/rake/spectask'

desc 'Run the specs'
Spec::Rake::SpecTask.new(:spec) do |t|
  t.libs << 'lib'
  t.pattern = 'spec/*_spec.rb'
  t.verbose = false
end

task :default => :spec

data/lib/xtotxt.rb ADDED Viewed

@@ -0,0 +1,37 @@
require 'shellwords'

# Converts pdf, doc and docx documents to plain text by shelling out to an
# external command-line converter configured per format.
class Xtotxt
  # Extensions (without the dot, case-sensitive) that #convert accepts.
  SUPPORTED_EXTENSIONS = %w[pdf doc docx].freeze

  # Fallback converter commands used when no configuration is supplied.
  # NOTE(review): these are bare executable names resolved through PATH and
  # are assumed to behave like the tools referenced in the specs -- confirm.
  DEFAULT_CONVERTERS = {
    :pdf  => "pdftotext",
    :doc  => "antiword",
    :docx => "docx2txt.pl"
  }.freeze

  # @param ext [Hash, nil] maps :pdf, :doc and :docx to the command used to
  #   convert that format; nil selects DEFAULT_CONVERTERS.
  def initialize(ext)
    @ext = ext || DEFAULT_CONVERTERS
  end

  # Converts +input_file_name+ to text and returns the text.
  #
  # The text is read from a file named like the input with its extension
  # replaced by "txt". The pdf and docx converters are expected to create
  # that file themselves; the doc converter's standard output is redirected
  # into it.
  #
  # @param input_file_name [String] path ending in .pdf, .doc or .docx
  # @return [String] contents of the generated text file
  # @raise [RuntimeError] if the extension is unsupported, no converter is
  #   configured for it, or the converter exits unsuccessfully
  def convert(input_file_name)
    base, dot, ext = input_file_name.rpartition(".")
    unless dot == "." && SUPPORTED_EXTENSIONS.include?(ext)
      raise "not a supported document extension: #{ext}"
    end

    output_file = "#{base}.txt"
    command = command_line(ext, input_file_name, output_file)
    command_output = `#{command}`
    status = $? # capture right away; any later subprocess would overwrite it

    unless status.success?
      raise "Failed to convert #{input_file_name}. Exit status: #{status.exitstatus}.  Output: #{command_output}"
    end

    File.read(output_file)
  end

  private

  # Builds the shell command for one conversion. File names are shell-escaped
  # so paths containing spaces or metacharacters are passed through intact;
  # the converter string is left untouched so it may carry its own options.
  def command_line(ext, input_file_name, output_file)
    converter = @ext[ext.to_sym]
    if converter.nil? || converter.to_s.strip.empty?
      raise "have no way to convert #{ext} yet"
    end

    input = Shellwords.escape(input_file_name)
    if ext == "doc"
      # The doc converter writes to standard output, so redirect it.
      "#{converter} #{input} > #{Shellwords.escape(output_file)}"
    else
      "#{converter} #{input}"
    end
  end
end

data/spec/spec_helper.rb ADDED Viewed

@@ -0,0 +1,4 @@
+$:.unshift(File.dirname(__FILE__) + '/../lib')
+require 'rubygems'
+require 'spec'

data/spec/test.doc ADDED Viewed

Binary file

data/spec/test.docx ADDED Viewed

Binary file

data/spec/test.pdf ADDED Viewed

@@ -0,0 +1,78 @@
+%PDF-1.4
+%����
+1 0 obj
+<<
+/Type /Catalog
+/Version /1.4
+/Pages 2 0 R
+>>
+endobj
+2 0 obj
+<<
+/Type /Pages
+/Kids [3 0 R]
+/Count 1
+>>
+endobj
+3 0 obj
+<<
+/Type /Page
+/MediaBox [0 0 612 792]
+/Parent 2 0 R
+/Resources 4 0 R
+/Contents 5 0 R
+>>
+endobj
+4 0 obj
+<<
+/Font 6 0 R
+/XObject <<
+>>
+>>
+endobj
+5 0 obj
+<<
+/Filter [/FlateDecode]
+/Length 7 0 R
+>>
+stream
+x��A
+�0����/u�:1iL���`.ȴ�������f�aȗF!��{�م`!�/��7�?��皳�����)k~T�<PRF�Vӊ��$tk�k
+endstream
+endobj
+6 0 obj
+<<
+/F0 8 0 R
+>>
+endobj
+7 0 obj
+99
+endobj
+8 0 obj
+<<
+/Type /Font
+/Subtype /Type1
+/BaseFont /Helvetica
+/Encoding /WinAnsiEncoding
+>>
+endobj
+xref
+0 9
+0000000000 65535 f
+0000000015 00000 n
+0000000078 00000 n
+0000000135 00000 n
+0000000239 00000 n
+0000000287 00000 n
+0000000464 00000 n
+0000000495 00000 n
+0000000513 00000 n
+trailer
+<<
+/Root 1 0 R
+/ID [<3B410C1790C3CECC951696069F08E3BC> <3B410C1790C3CECC951696069F08E3BC>]
+/Size 9
+>>
+startxref
+610
+%%EOF

data/spec/xtotxt_spec.rb ADDED Viewed

@@ -0,0 +1,55 @@
require 'spec_helper'
require 'xtotxt'

# Specs for Xtotxt: construction, accepted input types, and the exact text
# produced for each fixture document.
describe Xtotxt do
  # Resolves a fixture relative to this spec file so the suite does not
  # depend on the directory it is launched from.
  def fixture(name)
    File.join(File.dirname(__FILE__), name)
  end

  before do
    @ext = { :pdf  => "/opt/local/bin/xpdf-pdftotext",
             :doc  => "/opt/local/bin/antiword",
             :docx => "/usr/local/bin/docx2txt.pl" }
    @x = Xtotxt.new(@ext)
  end

  describe "new" do
    it "is created with a single hash argument containing convertors" do
      # Pin ArgumentError: a bare raise_error would also pass on a NameError
      # caused by a misspelled class name and so would verify nothing.
      lambda { Xtotxt.new }.should raise_error(ArgumentError)
      lambda { Xtotxt.new(1, 2) }.should raise_error(ArgumentError)
    end
  end

  describe "convert" do
    context "input parameters and results" do
      %w{pdf doc docx}.each do |ext|
        it "accepts an #{ext} input" do
          lambda { @x.convert(fixture("test.#{ext}")) }.should_not raise_error
        end
      end

      it "does not accept one input file argument of the wrong type" do
        lambda { @x.convert("test.bat") }.should raise_error
      end
    end
  end

  it "converts a pdf document correctly" do
    text = @x.convert(fixture("test.pdf"))
    text.should == "three pigheaded piglets had a plan\n\n\f"
  end

  it "converts a doc document correctly" do
    text = @x.convert(fixture("test.doc"))
    text.should == "\nthree pigheaded piglets had a plan\n\n"
  end

  it "converts a docx document correctly" do
    text = @x.convert(fixture("test.docx"))
    text.should == "three pigheaded piglets had a plan\n\n"
  end
end

data/xtotxt/version.rb ADDED Viewed

@@ -0,0 +1,3 @@
# Version of the xtotxt gem, read by xtotxt.gemspec.
#
# Xtotxt is opened as a class because lib/xtotxt.rb defines it as one;
# declaring it as a module would raise TypeError once both files are loaded.
class Xtotxt
  # A String rather than a Float: a Float cannot tell 0.1 from 0.10 and is
  # not the form RubyGems documents for version numbers.
  VERSION = "0.1".freeze
end

data/xtotxt.gemspec ADDED Viewed

@@ -0,0 +1,20 @@
# -*- encoding: utf-8 -*-
# Gem specification for xtotxt.
$:.push File.expand_path("../lib", __FILE__)

# version.rb ships at xtotxt/version.rb next to this gemspec, not under lib/,
# so load it by path. A bare `require "xtotxt/version"` only resolves when the
# current directory happens to be on the load path.
require File.expand_path("../xtotxt/version", __FILE__)

Gem::Specification.new do |s|
  s.name        = "xtotxt"
  # to_s keeps this valid whether VERSION is declared as a String or a number.
  s.version     = Xtotxt::VERSION.to_s
  s.authors     = ["Alexy Khrabrov"]
  s.email       = ["alexy@topprospect.com"]
  s.homepage    = "http://www.topprospect.com"
  s.summary     = %q{Convert pdf, doc and docx to plain text}
  s.description = %q{A simple wrapper calling, for each supported input format, a given command-line tool}

  # rubyforge_project is deprecated in RubyGems; set it only where the
  # attribute still exists.
  s.rubyforge_project = "xtotxt" if s.respond_to?(:rubyforge_project=)

  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
  s.require_paths = ["lib"]
end

metadata ADDED Viewed

@@ -0,0 +1,80 @@
+--- !ruby/object:Gem::Specification
+name: xtotxt
+version: !ruby/object:Gem::Version
+  hash: 9
+  prerelease:
+  segments:
+  - 0
+  - 1
+  version: "0.1"
+platform: ruby
+authors:
+- Alexy Khrabrov
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2011-09-20 00:00:00 Z
+dependencies: []
+description: A simple wrapper calling, for each supported input format, a given command-line tool
+email:
+- alexy@topprospect.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- .rvmrc
+- Rakefile
+- lib/xtotxt.rb
+- spec/spec_helper.rb
+- spec/test.doc
+- spec/test.docx
+- spec/test.pdf
+- spec/xtotxt_spec.rb
+- xtotxt.gemspec
+- xtotxt/version.rb
+homepage: http://www.topprospect.com
+licenses: []
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      hash: 3
+      segments:
+      - 0
+      version: "0"
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      hash: 3
+      segments:
+      - 0
+      version: "0"
+requirements: []
+rubyforge_project: xtotxt
+rubygems_version: 1.8.10
+signing_key:
+specification_version: 4
+summary: Convert pdf, doc and docx to plain text
+test_files:
+- spec/spec_helper.rb
+- spec/test.doc
+- spec/test.docx
+- spec/test.pdf
+- spec/xtotxt_spec.rb

xtotxt 0.1