xtotxt 0.3 → 0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +3 -0
- data/lib/xtotxt.rb +31 -4
- data/spec/fixtures/test.html +6 -0
- data/spec/fixtures/test.odt +0 -0
- data/spec/fixtures/test.rtf +8 -0
- data/spec/xtotxt.yml +3 -0
- data/spec/xtotxt_spec.rb +23 -2
- data/xtotxt/version.rb +1 -1
- metadata +11 -4
data/.gitignore
ADDED
data/lib/xtotxt.rb
CHANGED
@@ -22,7 +22,7 @@ class Xtotxt
|
|
22
22
|
|
23
23
|
ext = path_list.pop
|
24
24
|
|
25
|
-
raise("not a supported document extension: #{ext}") unless %w{pdf doc docx}.member?(ext)
|
25
|
+
raise("not a supported document extension: #{ext}") unless %w{pdf doc docx odt rtf html}.member?(ext)
|
26
26
|
|
27
27
|
output_file = (path_list << "txt").join(".")
|
28
28
|
|
@@ -33,17 +33,31 @@ class Xtotxt
|
|
33
33
|
"#{@ext[:doc]} > #{output_file} #{input_file_name}"
|
34
34
|
when "docx"
|
35
35
|
"#{@ext[:docx]} #{input_file_name}"
|
36
|
+
when "odt":
|
37
|
+
"#{@ext[:odt]} #{input_file_name} --output=#{output_file}"
|
38
|
+
when "rtf":
|
39
|
+
"#{@ext[:rtf]} --text #{input_file_name} > #{output_file}"
|
40
|
+
when "html":
|
41
|
+
"#{@ext[:html]} -o #{output_file} #{input_file_name}"
|
36
42
|
else
|
37
43
|
raise "have no way to convert #{ext} yet"
|
38
44
|
end
|
39
45
|
|
40
|
-
|
46
|
+
#puts "executing: #{command_line}"
|
47
|
+
|
48
|
+
command_output = `#{command_line} 2>/dev/null`
|
41
49
|
text = if $? == 0
|
42
50
|
File.read(output_file)
|
43
51
|
else
|
44
52
|
raise "Failed to convert #{input_file_name}. Exit status: #{$?.exitstatus}. Output: #{command_output}"
|
45
53
|
end
|
46
|
-
|
54
|
+
|
55
|
+
case ext
|
56
|
+
when "rtf"
|
57
|
+
skip_unrtf_header(text)
|
58
|
+
else
|
59
|
+
text
|
60
|
+
end
|
47
61
|
end
|
48
62
|
|
49
63
|
def initialize(ext=nil)
|
@@ -57,8 +71,21 @@ class Xtotxt
|
|
57
71
|
Xtotxt.read_config
|
58
72
|
@@ext
|
59
73
|
end
|
74
|
+
end
|
60
75
|
|
61
|
-
|
76
|
+
private
|
77
|
+
|
78
|
+
def skip_unrtf_header(text)
|
79
|
+
a = text.lines.to_a
|
80
|
+
while true
|
81
|
+
unless a.shift =~ /^###/
|
82
|
+
unless a.shift == "-----------------\n"
|
83
|
+
raise "cannot parse rtf"
|
84
|
+
end
|
85
|
+
break
|
86
|
+
end
|
87
|
+
end
|
88
|
+
a.join
|
62
89
|
end
|
63
90
|
|
64
91
|
end
|
Binary file
|
@@ -0,0 +1,8 @@
|
|
1
|
+
{\rtf1\ansi\ansicpg1252\cocoartf1138
|
2
|
+
{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
|
3
|
+
{\colortbl;\red255\green255\blue255;}
|
4
|
+
\margl1440\margr1440\vieww10800\viewh8400\viewkind0
|
5
|
+
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural
|
6
|
+
|
7
|
+
\f0\fs24 \cf0 three pigheaded piglets had a plan\
|
8
|
+
}
|
data/spec/xtotxt.yml
CHANGED
data/spec/xtotxt_spec.rb
CHANGED
@@ -2,10 +2,12 @@ require 'spec_helper'
|
|
2
2
|
require 'xtotxt'
|
3
3
|
|
4
4
|
describe Xtotxt do
|
5
|
+
before(:all) do
|
6
|
+
@input_prefix = "#{Pathname.new(__FILE__).dirname}/fixtures/test"
|
7
|
+
end
|
5
8
|
|
6
9
|
before do
|
7
10
|
@x = Xtotxt.new
|
8
|
-
@input_prefix = "fixtures/test"
|
9
11
|
end
|
10
12
|
|
11
13
|
describe "convert" do
|
@@ -18,7 +20,7 @@ describe Xtotxt do
|
|
18
20
|
|
19
21
|
context "input parameters and results" do
|
20
22
|
|
21
|
-
%w{pdf doc docx}.each do |ext|
|
23
|
+
%w{pdf doc docx odt rtf}.each do |ext|
|
22
24
|
it "accepts an #{ext} input" do
|
23
25
|
lambda { @x.convert("#{@input_prefix}.#{ext}") }.should_not raise_error
|
24
26
|
end
|
@@ -49,4 +51,23 @@ describe Xtotxt do
|
|
49
51
|
text.should == "three pigheaded piglets had a plan\n\n"
|
50
52
|
end
|
51
53
|
|
54
|
+
it "converts an odt document correctly" do
|
55
|
+
text = @x.convert("#{@input_prefix}.odt")
|
56
|
+
|
57
|
+
text.should == "\nthree pigheaded piglets had a plan\n\n"
|
58
|
+
end
|
59
|
+
|
60
|
+
it "converts an rtf document correctly" do
|
61
|
+
text = @x.convert("#{@input_prefix}.rtf")
|
62
|
+
|
63
|
+
text.should == "three pigheaded piglets had a plan\n"
|
64
|
+
end
|
65
|
+
|
66
|
+
it "converts an html document correctly" do
|
67
|
+
text = @x.convert("#{@input_prefix}.html")
|
68
|
+
|
69
|
+
text.should == "three pigheaded piglets had a plan\n"
|
70
|
+
end
|
71
|
+
|
72
|
+
|
52
73
|
end
|
data/xtotxt/version.rb
CHANGED
metadata
CHANGED
@@ -1,12 +1,12 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: xtotxt
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 3
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
-
- 3
|
9
|
-
version: "0.3"
|
8
|
+
- 4
|
9
|
+
version: "0.4"
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Alexy Khrabrov
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-09-
|
17
|
+
date: 2011-09-23 00:00:00 Z
|
18
18
|
dependencies: []
|
19
19
|
|
20
20
|
description: A simple wrapper calling, for each supported input format, a given command-line tool
|
@@ -27,12 +27,16 @@ extensions: []
|
|
27
27
|
extra_rdoc_files: []
|
28
28
|
|
29
29
|
files:
|
30
|
+
- .gitignore
|
30
31
|
- .rvmrc
|
31
32
|
- Rakefile
|
32
33
|
- lib/xtotxt.rb
|
33
34
|
- spec/fixtures/test.doc
|
34
35
|
- spec/fixtures/test.docx
|
36
|
+
- spec/fixtures/test.html
|
37
|
+
- spec/fixtures/test.odt
|
35
38
|
- spec/fixtures/test.pdf
|
39
|
+
- spec/fixtures/test.rtf
|
36
40
|
- spec/spec_helper.rb
|
37
41
|
- spec/xtotxt.yml
|
38
42
|
- spec/xtotxt_spec.rb
|
@@ -76,7 +80,10 @@ summary: Convert pdf, doc and docx to plain text
|
|
76
80
|
test_files:
|
77
81
|
- spec/fixtures/test.doc
|
78
82
|
- spec/fixtures/test.docx
|
83
|
+
- spec/fixtures/test.html
|
84
|
+
- spec/fixtures/test.odt
|
79
85
|
- spec/fixtures/test.pdf
|
86
|
+
- spec/fixtures/test.rtf
|
80
87
|
- spec/spec_helper.rb
|
81
88
|
- spec/xtotxt.yml
|
82
89
|
- spec/xtotxt_spec.rb
|