xtotxt 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.rvmrc ADDED
@@ -0,0 +1 @@
1
+ rvm ree@topprospect
data/Rakefile ADDED
@@ -0,0 +1,15 @@
1
# -*- encoding: utf-8 -*-
# Rake tasks for the gem: Bundler's gem tasks plus a `spec` task, which is
# also the default task.
require 'rubygems'
require 'bundler/setup'
# Requiring bundler/gem_tasks installs the gem tasks as a side effect, so
# Bundler::GemHelper.install_tasks must not be called again here: a second
# call would register every task action twice.
require 'bundler/gem_tasks'

require 'spec/rake/spectask'
desc 'Run the specs'
Spec::Rake::SpecTask.new(:spec) do |t|
  t.libs << 'lib'
  t.pattern = 'spec/*_spec.rb'
  t.verbose = false
end

task :default => :spec
data/lib/xtotxt.rb ADDED
@@ -0,0 +1,37 @@
1
require 'shellwords'

# Converts pdf, doc and docx documents to plain text by running an external
# command-line converter selected by the input file's extension.
class Xtotxt

  # Extensions #convert accepts (compared case-sensitively).
  SUPPORTED_EXTENSIONS = %w{pdf doc docx}.freeze

  # ext - Hash mapping :pdf, :doc and :docx to the command used to convert
  #       that format, e.g. { :doc => "/opt/local/bin/antiword" }.
  #
  # Raises ArgumentError when ext is nil.
  def initialize(ext)
    raise ArgumentError, "expected a hash of converter commands keyed by :pdf, :doc and :docx" if ext.nil?

    @ext = ext
  end

  # Converts the document at input_file_name and returns the extracted text.
  #
  # The text is read from a file with the same path as the input but with the
  # extension replaced by "txt"; that file is left on disk.
  #
  # Raises RuntimeError when the extension is unsupported, when no converter
  # is configured for it, or when the converter exits with a non-zero status.
  def convert(input_file_name)
    suffix = File.extname(input_file_name)
    ext = suffix.sub(/\A\./, "")

    raise("not a supported document extension: #{ext}") unless SUPPORTED_EXTENSIONS.member?(ext)

    output_file = input_file_name.chomp(suffix) + ".txt"
    command_line = command_line_for(ext, input_file_name, output_file)

    command_output = `#{command_line}`
    status = $?
    unless status.success?
      raise "Failed to convert #{input_file_name}. Exit status: #{status.exitstatus}. Output: #{command_output}"
    end

    File.read(output_file)
  end

  private

  # Builds the shell command line for one conversion.
  #
  # File names are shell-escaped so that spaces and metacharacters in paths
  # are passed through literally. The converter command itself is left
  # unescaped so that it may carry its own flags.
  def command_line_for(ext, input_file_name, output_file)
    converter = @ext[ext.to_sym]
    raise "no converter configured for #{ext}" if converter.to_s.strip.empty?

    input = Shellwords.escape(input_file_name)

    if ext == "doc"
      # The doc converter's standard output is redirected into the output file.
      "#{converter} #{input} > #{Shellwords.escape(output_file)}"
    else
      # The pdf and docx converters receive only the input path; this relies
      # on them writing the text file next to the input themselves.
      "#{converter} #{input}"
    end
  end

end
@@ -0,0 +1,4 @@
1
# Spec bootstrap: put the gem's lib directory at the front of the load path,
# then load RubyGems and the spec framework.
lib_dir = File.dirname(__FILE__) + '/../lib'
$LOAD_PATH.unshift(lib_dir)

require 'rubygems'
require 'spec'
data/spec/test.doc ADDED
Binary file
data/spec/test.docx ADDED
Binary file
data/spec/test.pdf ADDED
@@ -0,0 +1,78 @@
1
+ %PDF-1.4
2
+ %����
3
+ 1 0 obj
4
+ <<
5
+ /Type /Catalog
6
+ /Version /1.4
7
+ /Pages 2 0 R
8
+ >>
9
+ endobj
10
+ 2 0 obj
11
+ <<
12
+ /Type /Pages
13
+ /Kids [3 0 R]
14
+ /Count 1
15
+ >>
16
+ endobj
17
+ 3 0 obj
18
+ <<
19
+ /Type /Page
20
+ /MediaBox [0 0 612 792]
21
+ /Parent 2 0 R
22
+ /Resources 4 0 R
23
+ /Contents 5 0 R
24
+ >>
25
+ endobj
26
+ 4 0 obj
27
+ <<
28
+ /Font 6 0 R
29
+ /XObject <<
30
+ >>
31
+ >>
32
+ endobj
33
+ 5 0 obj
34
+ <<
35
+ /Filter [/FlateDecode]
36
+ /Length 7 0 R
37
+ >>
38
+ stream
39
+ x��A
40
+ �0����/u�:1iL���`.ȴ�������f�aȗF!��{�م`!�/��7�?��皳�����)k~T�<PRF�Vӊ��$tk�k
41
+ endstream
42
+ endobj
43
+ 6 0 obj
44
+ <<
45
+ /F0 8 0 R
46
+ >>
47
+ endobj
48
+ 7 0 obj
49
+ 99
50
+ endobj
51
+ 8 0 obj
52
+ <<
53
+ /Type /Font
54
+ /Subtype /Type1
55
+ /BaseFont /Helvetica
56
+ /Encoding /WinAnsiEncoding
57
+ >>
58
+ endobj
59
+ xref
60
+ 0 9
61
+ 0000000000 65535 f
62
+ 0000000015 00000 n
63
+ 0000000078 00000 n
64
+ 0000000135 00000 n
65
+ 0000000239 00000 n
66
+ 0000000287 00000 n
67
+ 0000000464 00000 n
68
+ 0000000495 00000 n
69
+ 0000000513 00000 n
70
+ trailer
71
+ <<
72
+ /Root 1 0 R
73
+ /ID [<3B410C1790C3CECC951696069F08E3BC> <3B410C1790C3CECC951696069F08E3BC>]
74
+ /Size 9
75
+ >>
76
+ startxref
77
+ 610
78
+ %%EOF
@@ -0,0 +1,55 @@
1
require 'spec_helper'
require 'xtotxt'

# Specs for Xtotxt. The conversion examples run the external converters
# configured in the `before` block against the fixtures stored next to this
# file.
describe Xtotxt do
  # Resolves a fixture relative to this spec file so the examples do not
  # depend on the directory the specs are launched from.
  def fixture(name)
    File.join(File.dirname(__FILE__), name)
  end

  before do
    @ext = { :pdf  => "/opt/local/bin/xpdf-pdftotext",
             :doc  => "/opt/local/bin/antiword",
             :docx => "/usr/local/bin/docx2txt.pl" }

    @x = Xtotxt.new(@ext)
  end

  describe "convert" do

    it "is created with a single hash argument containing convertors" do
      # The constant must be spelled Xtotxt: with a misspelled name these
      # expectations pass on NameError without ever calling the constructor.
      lambda { Xtotxt.new }.should raise_error(ArgumentError)

      lambda { Xtotxt.new(1, 2) }.should raise_error(ArgumentError)
    end

    context "input parameters and results" do

      %w{pdf doc docx}.each do |ext|
        it "accepts an #{ext} input" do
          lambda { @x.convert(fixture("test.#{ext}")) }.should_not raise_error
        end
      end

      it "does not accept one input file argument of the wrong type" do
        lambda { @x.convert(fixture("test.bat")) }.should raise_error
      end

    end
  end

  it "converts a pdf document correctly" do
    text = @x.convert(fixture("test.pdf"))

    text.should == "three pigheaded piglets had a plan\n\n\f"
  end

  it "converts a doc document correctly" do
    text = @x.convert(fixture("test.doc"))

    text.should == "\nthree pigheaded piglets had a plan\n\n"
  end

  it "converts a docx document correctly" do
    text = @x.convert(fixture("test.docx"))

    text.should == "three pigheaded piglets had a plan\n\n"
  end

end
data/xtotxt/version.rb ADDED
@@ -0,0 +1,3 @@
1
# Version of the xtotxt gem.
#
# Xtotxt is opened as a class, not a module, because lib/xtotxt.rb defines
# `class Xtotxt`; declaring it as a module here would raise TypeError as soon
# as both files are loaded. The version is a String rather than a Float so
# that values such as "0.10" or "0.1.1" can be represented exactly.
class Xtotxt
  VERSION = "0.1".freeze
end
data/xtotxt.gemspec ADDED
@@ -0,0 +1,20 @@
1
# -*- encoding: utf-8 -*-
# Gem specification for xtotxt. The packaged file lists are taken from what
# git tracks in the current checkout.
lib_path = File.expand_path("../lib", __FILE__)
$LOAD_PATH.push(lib_path)
require "xtotxt/version"

Gem::Specification.new do |spec|
  # Identity
  spec.name     = "xtotxt"
  spec.version  = Xtotxt::VERSION
  spec.authors  = ["Alexy Khrabrov"]
  spec.email    = ["alexy@topprospect.com"]
  spec.homepage = "http://www.topprospect.com"

  # Descriptions
  spec.summary     = "Convert pdf, doc and docx to plain text"
  spec.description = "A simple wrapper calling, for each supported input format, a given command-line tool"

  spec.rubyforge_project = "xtotxt"

  # Contents
  spec.files         = `git ls-files`.split("\n")
  spec.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  spec.executables   = `git ls-files -- bin/*`.split("\n").map { |path| File.basename(path) }
  spec.require_paths = ["lib"]
end
metadata ADDED
@@ -0,0 +1,80 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: xtotxt
3
+ version: !ruby/object:Gem::Version
4
+ hash: 9
5
+ prerelease:
6
+ segments:
7
+ - 0
8
+ - 1
9
+ version: "0.1"
10
+ platform: ruby
11
+ authors:
12
+ - Alexy Khrabrov
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2011-09-20 00:00:00 Z
18
+ dependencies: []
19
+
20
+ description: A simple wrapper calling, for each supported input format, a given command-line tool
21
+ email:
22
+ - alexy@topprospect.com
23
+ executables: []
24
+
25
+ extensions: []
26
+
27
+ extra_rdoc_files: []
28
+
29
+ files:
30
+ - .rvmrc
31
+ - Rakefile
32
+ - lib/xtotxt.rb
33
+ - spec/spec_helper.rb
34
+ - spec/test.doc
35
+ - spec/test.docx
36
+ - spec/test.pdf
37
+ - spec/xtotxt_spec.rb
38
+ - xtotxt.gemspec
39
+ - xtotxt/version.rb
40
+ homepage: http://www.topprospect.com
41
+ licenses: []
42
+
43
+ metadata: {}
44
+
45
+ post_install_message:
46
+ rdoc_options: []
47
+
48
+ require_paths:
49
+ - lib
50
+ required_ruby_version: !ruby/object:Gem::Requirement
51
+ none: false
52
+ requirements:
53
+ - - ">="
54
+ - !ruby/object:Gem::Version
55
+ hash: 3
56
+ segments:
57
+ - 0
58
+ version: "0"
59
+ required_rubygems_version: !ruby/object:Gem::Requirement
60
+ none: false
61
+ requirements:
62
+ - - ">="
63
+ - !ruby/object:Gem::Version
64
+ hash: 3
65
+ segments:
66
+ - 0
67
+ version: "0"
68
+ requirements: []
69
+
70
+ rubyforge_project: xtotxt
71
+ rubygems_version: 1.8.10
72
+ signing_key:
73
+ specification_version: 4
74
+ summary: Convert pdf, doc and docx to plain text
75
+ test_files:
76
+ - spec/spec_helper.rb
77
+ - spec/test.doc
78
+ - spec/test.docx
79
+ - spec/test.pdf
80
+ - spec/xtotxt_spec.rb