xtotxt 0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.rvmrc +1 -0
- data/Rakefile +15 -0
- data/lib/xtotxt.rb +37 -0
- data/spec/spec_helper.rb +4 -0
- data/spec/test.doc +0 -0
- data/spec/test.docx +0 -0
- data/spec/test.pdf +78 -0
- data/spec/xtotxt_spec.rb +55 -0
- data/xtotxt/version.rb +3 -0
- data/xtotxt.gemspec +20 -0
- metadata +80 -0
data/.rvmrc
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
# Project RVM configuration: switch to the "ree" interpreter with the
# "topprospect" gemset (RVM's interpreter@gemset form).
# NOTE(review): "ree" is presumably Ruby Enterprise Edition - confirm it is still the intended runtime.
rvm ree@topprospect
|
data/Rakefile
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
require 'rubygems'
require 'bundler/setup'

# Requiring bundler/gem_tasks already calls Bundler::GemHelper.install_tasks,
# so the gem packaging tasks must not be installed a second time here:
# doing so would append every task action twice.
require 'bundler/gem_tasks'

require 'spec/rake/spectask'

# `rake spec` runs every spec file that sits directly under spec/.
desc 'Run the specs'
Spec::Rake::SpecTask.new(:spec) do |t|
  t.libs << 'lib'
  t.pattern = 'spec/*_spec.rb'
  t.verbose = false
end

# Plain `rake` runs the specs.
task :default => :spec
|
data/lib/xtotxt.rb
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
require 'shellwords'

# Converts pdf, doc and docx documents to plain text by shelling out to an
# external command-line converter configured per format.
#
# The converters are given as a hash keyed by :pdf, :doc and :docx whose
# values are the commands to run, e.g.
#
#   Xtotxt.new(:pdf => "/usr/bin/pdftotext", :doc => "/usr/bin/antiword")
class Xtotxt

  # File extensions that #convert accepts.
  SUPPORTED_EXTENSIONS = %w{pdf doc docx}.freeze

  class << self
    # Optional class-wide converter hash used when +new+ is given nil.
    # Replaces the never-initialised @@ext class variable, which made
    # Xtotxt.new(nil) fail with a NameError.
    attr_accessor :default_converters
  end

  # ext:: hash mapping :pdf / :doc / :docx to a converter command; when nil,
  #       Xtotxt.default_converters is used instead.
  #
  # Raises ArgumentError when no converter hash is available at all.
  def initialize(ext)
    @ext = ext || self.class.default_converters
    raise ArgumentError, "a hash of converters is required" if @ext.nil?
  end

  # Converts +input_file_name+ to text and returns the text as a String.
  #
  # The text is read back from a file named like the input with its
  # extension replaced by "txt"; that file is left on disk.
  #
  # Raises RuntimeError when the extension is not supported, when no
  # converter is configured for it, or when the converter exits non-zero.
  def convert(input_file_name)
    # File.extname only looks at the last path component, so dots in
    # directory names cannot be mistaken for the extension.
    dotted_ext = File.extname(input_file_name)
    ext = dotted_ext.delete(".")

    raise("not a supported document extension: #{ext}") unless SUPPORTED_EXTENSIONS.member?(ext)

    converter = @ext[ext.to_sym]
    raise "have no way to convert #{ext} yet" if converter.nil?

    output_file = input_file_name.chomp(dotted_ext) + ".txt"

    # Quote the file names so that spaces and shell metacharacters in a path
    # cannot split or alter the command line.
    quoted_input = Shellwords.escape(input_file_name)

    command_line = if ext == "doc"
      # The doc converter's stdout is the text, so redirect it to the output file.
      "#{converter} #{quoted_input} > #{Shellwords.escape(output_file)}"
    else
      # The pdf and docx converters are only handed the input; the text is
      # then read from output_file, so they must write it there themselves.
      "#{converter} #{quoted_input}"
    end

    command_output = `#{command_line}`
    status = $? # capture immediately; any later subprocess would overwrite $?

    unless status.success?
      raise "Failed to convert #{input_file_name}. Exit status: #{status.exitstatus}. Output: #{command_output}"
    end

    File.read(output_file)
  end

end
|
data/spec/spec_helper.rb
ADDED
data/spec/test.doc
ADDED
Binary file
|
data/spec/test.docx
ADDED
Binary file
|
data/spec/test.pdf
ADDED
@@ -0,0 +1,78 @@
|
|
1
|
+
%PDF-1.4
|
2
|
+
%����
|
3
|
+
1 0 obj
|
4
|
+
<<
|
5
|
+
/Type /Catalog
|
6
|
+
/Version /1.4
|
7
|
+
/Pages 2 0 R
|
8
|
+
>>
|
9
|
+
endobj
|
10
|
+
2 0 obj
|
11
|
+
<<
|
12
|
+
/Type /Pages
|
13
|
+
/Kids [3 0 R]
|
14
|
+
/Count 1
|
15
|
+
>>
|
16
|
+
endobj
|
17
|
+
3 0 obj
|
18
|
+
<<
|
19
|
+
/Type /Page
|
20
|
+
/MediaBox [0 0 612 792]
|
21
|
+
/Parent 2 0 R
|
22
|
+
/Resources 4 0 R
|
23
|
+
/Contents 5 0 R
|
24
|
+
>>
|
25
|
+
endobj
|
26
|
+
4 0 obj
|
27
|
+
<<
|
28
|
+
/Font 6 0 R
|
29
|
+
/XObject <<
|
30
|
+
>>
|
31
|
+
>>
|
32
|
+
endobj
|
33
|
+
5 0 obj
|
34
|
+
<<
|
35
|
+
/Filter [/FlateDecode]
|
36
|
+
/Length 7 0 R
|
37
|
+
>>
|
38
|
+
stream
|
39
|
+
x��A
|
40
|
+
�0����/u�:1iL���`.ȴ�������f�aȗF!��{�م`!�/��7�?��皳�����)k~T�<PRF�Vӊ��$tk�k
|
41
|
+
endstream
|
42
|
+
endobj
|
43
|
+
6 0 obj
|
44
|
+
<<
|
45
|
+
/F0 8 0 R
|
46
|
+
>>
|
47
|
+
endobj
|
48
|
+
7 0 obj
|
49
|
+
99
|
50
|
+
endobj
|
51
|
+
8 0 obj
|
52
|
+
<<
|
53
|
+
/Type /Font
|
54
|
+
/Subtype /Type1
|
55
|
+
/BaseFont /Helvetica
|
56
|
+
/Encoding /WinAnsiEncoding
|
57
|
+
>>
|
58
|
+
endobj
|
59
|
+
xref
|
60
|
+
0 9
|
61
|
+
0000000000 65535 f
|
62
|
+
0000000015 00000 n
|
63
|
+
0000000078 00000 n
|
64
|
+
0000000135 00000 n
|
65
|
+
0000000239 00000 n
|
66
|
+
0000000287 00000 n
|
67
|
+
0000000464 00000 n
|
68
|
+
0000000495 00000 n
|
69
|
+
0000000513 00000 n
|
70
|
+
trailer
|
71
|
+
<<
|
72
|
+
/Root 1 0 R
|
73
|
+
/ID [<3B410C1790C3CECC951696069F08E3BC> <3B410C1790C3CECC951696069F08E3BC>]
|
74
|
+
/Size 9
|
75
|
+
>>
|
76
|
+
startxref
|
77
|
+
610
|
78
|
+
%%EOF
|
data/spec/xtotxt_spec.rb
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
require 'spec_helper'
require 'xtotxt'

describe Xtotxt do
  # The fixture documents sit next to this spec file. Resolve them relative
  # to it so the specs do not depend on the directory they are run from
  # (the Rakefile runs spec/*_spec.rb from the project root).
  fixtures = File.dirname(__FILE__)

  before do
    # Converter locations are machine-specific; adjust them to the local install.
    @ext = { :pdf  => "/opt/local/bin/xpdf-pdftotext",
             :doc  => "/opt/local/bin/antiword",
             :docx => "/usr/local/bin/docx2txt.pl" }

    @x = Xtotxt.new(@ext)
  end

  describe "convert" do

    it "is created with a single hash argument containing convertors" do
      # The constant was previously misspelt "Xtoxt", so these expectations
      # passed on a NameError without ever exercising the constructor.
      lambda { Xtotxt.new }.should raise_error(ArgumentError)

      lambda { Xtotxt.new(1, 2) }.should raise_error(ArgumentError)
    end

    context "input parameters and results" do

      %w{pdf doc docx}.each do |ext|
        it "accepts an #{ext} input" do
          lambda { @x.convert(File.join(fixtures, "test.#{ext}")) }.should_not raise_error
        end
      end

      it "does not accept one input file argument of the wrong type" do
        lambda { @x.convert("test.bat") }.should raise_error
      end

    end
  end

  it "converts a pdf document correctly" do
    text = @x.convert(File.join(fixtures, "test.pdf"))

    text.should == "three pigheaded piglets had a plan\n\n\f"
  end

  it "converts a doc document correctly" do
    text = @x.convert(File.join(fixtures, "test.doc"))

    text.should == "\nthree pigheaded piglets had a plan\n\n"
  end

  it "converts a docx document correctly" do
    text = @x.convert(File.join(fixtures, "test.docx"))

    text.should == "three pigheaded piglets had a plan\n\n"
  end

end
|
data/xtotxt/version.rb
ADDED
data/xtotxt.gemspec
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
$:.push File.expand_path("../lib", __FILE__)

# This gem ships version.rb at xtotxt/version.rb under the gem root, not
# under lib/. Load it by a path relative to this file so the require does
# not depend on the current directory being on the load path.
require File.expand_path("../xtotxt/version", __FILE__)

Gem::Specification.new do |s|
  s.name        = "xtotxt"
  s.version     = Xtotxt::VERSION
  s.authors     = ["Alexy Khrabrov"]
  s.email       = ["alexy@topprospect.com"]
  s.homepage    = "http://www.topprospect.com"
  s.summary     = %q{Convert pdf, doc and docx to plain text}
  s.description = %q{A simple wrapper calling, for each supported input format, a given command-line tool}

  s.rubyforge_project = "xtotxt"

  # The packaged file lists come from git, so the gem must be built from a
  # git checkout.
  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]
end
|
metadata
ADDED
@@ -0,0 +1,80 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: xtotxt
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 9
|
5
|
+
prerelease:
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 1
|
9
|
+
version: "0.1"
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- Alexy Khrabrov
|
13
|
+
autorequire:
|
14
|
+
bindir: bin
|
15
|
+
cert_chain: []
|
16
|
+
|
17
|
+
date: 2011-09-20 00:00:00 Z
|
18
|
+
dependencies: []
|
19
|
+
|
20
|
+
description: A simple wrapper calling, for each supported input format, a given command-line tool
|
21
|
+
email:
|
22
|
+
- alexy@topprospect.com
|
23
|
+
executables: []
|
24
|
+
|
25
|
+
extensions: []
|
26
|
+
|
27
|
+
extra_rdoc_files: []
|
28
|
+
|
29
|
+
files:
|
30
|
+
- .rvmrc
|
31
|
+
- Rakefile
|
32
|
+
- lib/xtotxt.rb
|
33
|
+
- spec/spec_helper.rb
|
34
|
+
- spec/test.doc
|
35
|
+
- spec/test.docx
|
36
|
+
- spec/test.pdf
|
37
|
+
- spec/xtotxt_spec.rb
|
38
|
+
- xtotxt.gemspec
|
39
|
+
- xtotxt/version.rb
|
40
|
+
homepage: http://www.topprospect.com
|
41
|
+
licenses: []
|
42
|
+
|
43
|
+
metadata: {}
|
44
|
+
|
45
|
+
post_install_message:
|
46
|
+
rdoc_options: []
|
47
|
+
|
48
|
+
require_paths:
|
49
|
+
- lib
|
50
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
51
|
+
none: false
|
52
|
+
requirements:
|
53
|
+
- - ">="
|
54
|
+
- !ruby/object:Gem::Version
|
55
|
+
hash: 3
|
56
|
+
segments:
|
57
|
+
- 0
|
58
|
+
version: "0"
|
59
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
60
|
+
none: false
|
61
|
+
requirements:
|
62
|
+
- - ">="
|
63
|
+
- !ruby/object:Gem::Version
|
64
|
+
hash: 3
|
65
|
+
segments:
|
66
|
+
- 0
|
67
|
+
version: "0"
|
68
|
+
requirements: []
|
69
|
+
|
70
|
+
rubyforge_project: xtotxt
|
71
|
+
rubygems_version: 1.8.10
|
72
|
+
signing_key:
|
73
|
+
specification_version: 4
|
74
|
+
summary: Convert pdf, doc and docx to plain text
|
75
|
+
test_files:
|
76
|
+
- spec/spec_helper.rb
|
77
|
+
- spec/test.doc
|
78
|
+
- spec/test.docx
|
79
|
+
- spec/test.pdf
|
80
|
+
- spec/xtotxt_spec.rb
|