xtotxt 0.1

Sign up to get free protection for your applications and to get access to all the features.
data/.rvmrc ADDED
@@ -0,0 +1 @@
1
+ rvm ree@topprospect
data/Rakefile ADDED
@@ -0,0 +1,15 @@
1
# -*- encoding: utf-8 -*-
require 'rubygems'
require 'bundler/setup'

# Requiring bundler/gem_tasks already calls Bundler::GemHelper.install_tasks,
# so it must not be invoked a second time: Rake appends the actions of a task
# that is defined twice, and the gem tasks would then run twice.
require 'bundler/gem_tasks'

require 'spec/rake/spectask'

# `rake spec` runs every spec/*_spec.rb file with lib/ on the load path.
desc 'Run the specs'
Spec::Rake::SpecTask.new(:spec) do |t|
  t.libs << 'lib'
  t.pattern = 'spec/*_spec.rb'
  t.verbose = false
end

# A bare `rake` runs the specs.
task :default => :spec
data/lib/xtotxt.rb ADDED
@@ -0,0 +1,37 @@
1
require 'shellwords'

# Converts pdf, doc and docx documents to plain text by shelling out to an
# external command-line converter configured per format.
class Xtotxt

  # File extensions that #convert accepts.
  SUPPORTED_EXTENSIONS = %w{pdf doc docx}.freeze

  # Converters used when #initialize is given nil. These are bare command
  # names and are assumed to be resolvable through PATH; pass an explicit
  # hash to use other tools or absolute paths.
  DEFAULT_CONVERTERS = {
    :pdf  => "pdftotext",
    :doc  => "antiword",
    :docx => "docx2txt.pl"
  }.freeze

  # ext - Hash mapping :pdf, :doc and :docx to the command line of the
  #       converter for that format. The command may carry its own
  #       arguments; it is passed to the shell as given. nil selects
  #       DEFAULT_CONVERTERS.
  def initialize(ext)
    @ext = ext || DEFAULT_CONVERTERS
  end

  # Converts +input_file_name+ to text and returns the text.
  #
  # The text is read back from a file named like the input with its
  # extension replaced by "txt". The doc converter has its standard output
  # redirected into that file; the pdf and docx converters are expected to
  # create it themselves.
  #
  # Raises RuntimeError when the extension is unsupported, when no converter
  # is configured for it, or when the converter exits unsuccessfully.
  def convert(input_file_name)
    base, dot, ext = input_file_name.rpartition(".")
    # A name without any dot has no extension at all.
    ext = "" if dot.empty?

    raise("not a supported document extension: #{ext}") unless SUPPORTED_EXTENSIONS.member?(ext)

    converter = @ext[ext.to_sym]
    raise "have no way to convert #{ext} yet" if converter.nil? || converter.to_s.strip.empty?

    output_file = "#{base}.txt"
    command_output = `#{build_command(ext, converter, input_file_name, output_file)}`

    unless $?.success?
      raise "Failed to convert #{input_file_name}. Exit status: #{$?.exitstatus}. Output: #{command_output}"
    end

    File.read(output_file)
  end

  private

  # Builds the shell command for one conversion. File names are
  # shell-escaped so that spaces and metacharacters in paths cannot break
  # or hijack the command.
  def build_command(ext, converter, input_file_name, output_file)
    input = Shellwords.escape(input_file_name)

    if ext == "doc"
      "#{converter} > #{Shellwords.escape(output_file)} #{input}"
    else
      "#{converter} #{input}"
    end
  end

end
@@ -0,0 +1,4 @@
1
+ $:.unshift(File.dirname(__FILE__) + '/../lib')
2
+
3
+ require 'rubygems'
4
+ require 'spec'
data/spec/test.doc ADDED
Binary file
data/spec/test.docx ADDED
Binary file
data/spec/test.pdf ADDED
@@ -0,0 +1,78 @@
1
+ %PDF-1.4
2
+ %����
3
+ 1 0 obj
4
+ <<
5
+ /Type /Catalog
6
+ /Version /1.4
7
+ /Pages 2 0 R
8
+ >>
9
+ endobj
10
+ 2 0 obj
11
+ <<
12
+ /Type /Pages
13
+ /Kids [3 0 R]
14
+ /Count 1
15
+ >>
16
+ endobj
17
+ 3 0 obj
18
+ <<
19
+ /Type /Page
20
+ /MediaBox [0 0 612 792]
21
+ /Parent 2 0 R
22
+ /Resources 4 0 R
23
+ /Contents 5 0 R
24
+ >>
25
+ endobj
26
+ 4 0 obj
27
+ <<
28
+ /Font 6 0 R
29
+ /XObject <<
30
+ >>
31
+ >>
32
+ endobj
33
+ 5 0 obj
34
+ <<
35
+ /Filter [/FlateDecode]
36
+ /Length 7 0 R
37
+ >>
38
+ stream
39
+ x��A
40
+ �0����/u�:1iL���`.ȴ�������f�aȗF!��{�م`!�/��7�?��皳�����)k~T�<PRF�Vӊ��$tk�k
41
+ endstream
42
+ endobj
43
+ 6 0 obj
44
+ <<
45
+ /F0 8 0 R
46
+ >>
47
+ endobj
48
+ 7 0 obj
49
+ 99
50
+ endobj
51
+ 8 0 obj
52
+ <<
53
+ /Type /Font
54
+ /Subtype /Type1
55
+ /BaseFont /Helvetica
56
+ /Encoding /WinAnsiEncoding
57
+ >>
58
+ endobj
59
+ xref
60
+ 0 9
61
+ 0000000000 65535 f
62
+ 0000000015 00000 n
63
+ 0000000078 00000 n
64
+ 0000000135 00000 n
65
+ 0000000239 00000 n
66
+ 0000000287 00000 n
67
+ 0000000464 00000 n
68
+ 0000000495 00000 n
69
+ 0000000513 00000 n
70
+ trailer
71
+ <<
72
+ /Root 1 0 R
73
+ /ID [<3B410C1790C3CECC951696069F08E3BC> <3B410C1790C3CECC951696069F08E3BC>]
74
+ /Size 9
75
+ >>
76
+ startxref
77
+ 610
78
+ %%EOF
@@ -0,0 +1,55 @@
1
require 'spec_helper'
require 'xtotxt'

describe Xtotxt do

  # Resolves a fixture relative to this spec file, so the examples pass no
  # matter which directory the specs are run from.
  def fixture(name)
    File.join(File.dirname(__FILE__), name)
  end

  before do
    # Converter locations are machine-specific; adjust to the local install.
    @ext = { :pdf  => "/opt/local/bin/xpdf-pdftotext",
             :doc  => "/opt/local/bin/antiword",
             :docx => "/usr/local/bin/docx2txt.pl" }

    @x = Xtotxt.new(@ext)
  end

  describe "convert" do

    it "is created with a single hash argument containing convertors" do
      # Pin the error class: a bare raise_error also passes on a NameError
      # caused by a misspelled constant, which hides a broken example.
      lambda { Xtotxt.new }.should raise_error(ArgumentError)

      lambda { Xtotxt.new(1, 2) }.should raise_error(ArgumentError)
    end

    context "input parameters and results" do

      %w{pdf doc docx}.each do |ext|
        it "accepts an #{ext} input" do
          lambda { @x.convert(fixture("test.#{ext}")) }.should_not raise_error
        end
      end

      it "does not accept one input file argument of the wrong type" do
        lambda { @x.convert("test.bat") }.should raise_error
      end

    end
  end

  it "converts a pdf document correctly" do
    text = @x.convert(fixture("test.pdf"))

    text.should == "three pigheaded piglets had a plan\n\n\f"
  end

  it "converts a doc document correctly" do
    text = @x.convert(fixture("test.doc"))

    text.should == "\nthree pigheaded piglets had a plan\n\n"
  end

  it "converts a docx document correctly" do
    text = @x.convert(fixture("test.docx"))

    text.should == "three pigheaded piglets had a plan\n\n"
  end

end
data/xtotxt/version.rb ADDED
@@ -0,0 +1,3 @@
1
# Xtotxt is declared as a class in lib/xtotxt.rb, so it is reopened as a class
# here; declaring it as a module raises TypeError once both files are loaded.
class Xtotxt
  # Kept as a String: a Float cannot represent versions such as "0.10" or
  # "0.1.1".
  VERSION = "0.1"
end
data/xtotxt.gemspec ADDED
@@ -0,0 +1,20 @@
1
# -*- encoding: utf-8 -*-
$:.push File.expand_path("../lib", __FILE__)
# The version file ships at xtotxt/version.rb beside this gemspec rather than
# under lib/, so load it by path relative to this file. A plain
# require "xtotxt/version" only resolves when "." is on the load path, which
# stopped being the case in Ruby 1.9.
require File.expand_path("../xtotxt/version", __FILE__)

Gem::Specification.new do |s|
  s.name        = "xtotxt"
  s.version     = Xtotxt::VERSION
  s.authors     = ["Alexy Khrabrov"]
  s.email       = ["alexy@topprospect.com"]
  s.homepage    = "http://www.topprospect.com"
  s.summary     = %q{Convert pdf, doc and docx to plain text}
  s.description = %q{A simple wrapper calling, for each supported input format, a given command-line tool}

  s.rubyforge_project = "xtotxt"

  # The packaged file lists come from git, so the gem must be built from
  # inside a git checkout.
  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]
end
metadata ADDED
@@ -0,0 +1,80 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: xtotxt
3
+ version: !ruby/object:Gem::Version
4
+ hash: 9
5
+ prerelease:
6
+ segments:
7
+ - 0
8
+ - 1
9
+ version: "0.1"
10
+ platform: ruby
11
+ authors:
12
+ - Alexy Khrabrov
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2011-09-20 00:00:00 Z
18
+ dependencies: []
19
+
20
+ description: A simple wrapper calling, for each supported input format, a given command-line tool
21
+ email:
22
+ - alexy@topprospect.com
23
+ executables: []
24
+
25
+ extensions: []
26
+
27
+ extra_rdoc_files: []
28
+
29
+ files:
30
+ - .rvmrc
31
+ - Rakefile
32
+ - lib/xtotxt.rb
33
+ - spec/spec_helper.rb
34
+ - spec/test.doc
35
+ - spec/test.docx
36
+ - spec/test.pdf
37
+ - spec/xtotxt_spec.rb
38
+ - xtotxt.gemspec
39
+ - xtotxt/version.rb
40
+ homepage: http://www.topprospect.com
41
+ licenses: []
42
+
43
+ metadata: {}
44
+
45
+ post_install_message:
46
+ rdoc_options: []
47
+
48
+ require_paths:
49
+ - lib
50
+ required_ruby_version: !ruby/object:Gem::Requirement
51
+ none: false
52
+ requirements:
53
+ - - ">="
54
+ - !ruby/object:Gem::Version
55
+ hash: 3
56
+ segments:
57
+ - 0
58
+ version: "0"
59
+ required_rubygems_version: !ruby/object:Gem::Requirement
60
+ none: false
61
+ requirements:
62
+ - - ">="
63
+ - !ruby/object:Gem::Version
64
+ hash: 3
65
+ segments:
66
+ - 0
67
+ version: "0"
68
+ requirements: []
69
+
70
+ rubyforge_project: xtotxt
71
+ rubygems_version: 1.8.10
72
+ signing_key:
73
+ specification_version: 4
74
+ summary: Convert pdf, doc and docx to plain text
75
+ test_files:
76
+ - spec/spec_helper.rb
77
+ - spec/test.doc
78
+ - spec/test.docx
79
+ - spec/test.pdf
80
+ - spec/xtotxt_spec.rb