xtotxt 0.3 → 0.4

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,3 @@
1
+ spec/fixtures/test.txt
2
+ *.gem
3
+ .idea
data/lib/xtotxt.rb CHANGED
@@ -22,7 +22,7 @@ class Xtotxt
22
22
 
23
23
  ext = path_list.pop
24
24
 
25
- raise("not a supported document extension: #{ext}") unless %w{pdf doc docx}.member?(ext)
25
+ raise("not a supported document extension: #{ext}") unless %w{pdf doc docx odt rtf html}.member?(ext)
26
26
 
27
27
  output_file = (path_list << "txt").join(".")
28
28
 
@@ -33,17 +33,31 @@ class Xtotxt
33
33
  "#{@ext[:doc]} > #{output_file} #{input_file_name}"
34
34
  when "docx"
35
35
  "#{@ext[:docx]} #{input_file_name}"
36
+ when "odt":
37
+ "#{@ext[:odt]} #{input_file_name} --output=#{output_file}"
38
+ when "rtf":
39
+ "#{@ext[:rtf]} --text #{input_file_name} > #{output_file}"
40
+ when "html":
41
+ "#{@ext[:html]} -o #{output_file} #{input_file_name}"
36
42
  else
37
43
  raise "have no way to convert #{ext} yet"
38
44
  end
39
45
 
40
- command_output = `#{command_line}`
46
+ #puts "executing: #{command_line}"
47
+
48
+ command_output = `#{command_line} 2>/dev/null`
41
49
  text = if $? == 0
42
50
  File.read(output_file)
43
51
  else
44
52
  raise "Failed to convert #{input_file_name}. Exit status: #{$?.exitstatus}. Output: #{command_output}"
45
53
  end
46
- text
54
+
55
+ case ext
56
+ when "rtf"
57
+ skip_unrtf_header(text)
58
+ else
59
+ text
60
+ end
47
61
  end
48
62
 
49
63
  def initialize(ext=nil)
@@ -57,8 +71,21 @@ class Xtotxt
57
71
  Xtotxt.read_config
58
72
  @@ext
59
73
  end
74
+ end
60
75
 
61
- puts "@ext: #{@ext}, @@ext: #{@@ext}"
76
+ private
77
+
78
+ def skip_unrtf_header(text)
79
+ a = text.lines.to_a
80
+ while true
81
+ unless a.shift =~ /^###/
82
+ unless a.shift == "-----------------\n"
83
+ raise "cannot parse rtf"
84
+ end
85
+ break
86
+ end
87
+ end
88
+ a.join
62
89
  end
63
90
 
64
91
  end
@@ -0,0 +1,6 @@
1
+ <head>
2
+ <title>piglet plan</title>
3
+ </head>
4
+ <body>
5
+ <p>three pigheaded piglets had a plan</p>
6
+ </body>
Binary file
@@ -0,0 +1,8 @@
1
+ {\rtf1\ansi\ansicpg1252\cocoartf1138
2
+ {\fonttbl\f0\fswiss\fcharset0 Helvetica;}
3
+ {\colortbl;\red255\green255\blue255;}
4
+ \margl1440\margr1440\vieww10800\viewh8400\viewkind0
5
+ \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural
6
+
7
+ \f0\fs24 \cf0 three pigheaded piglets had a plan\
8
+ }
data/spec/xtotxt.yml CHANGED
@@ -1,3 +1,6 @@
1
1
  :pdf: "/opt/local/bin/xpdf-pdftotext"
2
2
  :doc: "/opt/local/bin/antiword"
3
3
  :docx: "/usr/local/bin/docx2txt.pl"
4
+ :odt: "/usr/local/bin/odt2txt"
5
+ :rtf: "/opt/local/bin/unrtf -P /opt/local/lib/unrtf"
6
+ :html: "/opt/local/bin/html2text"
data/spec/xtotxt_spec.rb CHANGED
@@ -2,10 +2,12 @@ require 'spec_helper'
2
2
  require 'xtotxt'
3
3
 
4
4
  describe Xtotxt do
5
+ before(:all) do
6
+ @input_prefix = "#{Pathname.new(__FILE__).dirname}/fixtures/test"
7
+ end
5
8
 
6
9
  before do
7
10
  @x = Xtotxt.new
8
- @input_prefix = "fixtures/test"
9
11
  end
10
12
 
11
13
  describe "convert" do
@@ -18,7 +20,7 @@ describe Xtotxt do
18
20
 
19
21
  context "input parameters and results" do
20
22
 
21
- %w{pdf doc docx}.each do |ext|
23
+ %w{pdf doc docx odt rtf}.each do |ext|
22
24
  it "accepts an #{ext} input" do
23
25
  lambda { @x.convert("#{@input_prefix}.#{ext}") }.should_not raise_error
24
26
  end
@@ -49,4 +51,23 @@ describe Xtotxt do
49
51
  text.should == "three pigheaded piglets had a plan\n\n"
50
52
  end
51
53
 
54
+ it "converts an odt document correctly" do
55
+ text = @x.convert("#{@input_prefix}.odt")
56
+
57
+ text.should == "\nthree pigheaded piglets had a plan\n\n"
58
+ end
59
+
60
+ it "converts an rtf document correctly" do
61
+ text = @x.convert("#{@input_prefix}.rtf")
62
+
63
+ text.should == "three pigheaded piglets had a plan\n"
64
+ end
65
+
66
+ it "converts an html document correctly" do
67
+ text = @x.convert("#{@input_prefix}.html")
68
+
69
+ text.should == "three pigheaded piglets had a plan\n"
70
+ end
71
+
72
+
52
73
  end
data/xtotxt/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Xtotxt
2
- VERSION = 0.3
2
+ VERSION = 0.4
3
3
  end
metadata CHANGED
@@ -1,12 +1,12 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: xtotxt
3
3
  version: !ruby/object:Gem::Version
4
- hash: 13
4
+ hash: 3
5
5
  prerelease:
6
6
  segments:
7
7
  - 0
8
- - 3
9
- version: "0.3"
8
+ - 4
9
+ version: "0.4"
10
10
  platform: ruby
11
11
  authors:
12
12
  - Alexy Khrabrov
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2011-09-21 00:00:00 Z
17
+ date: 2011-09-23 00:00:00 Z
18
18
  dependencies: []
19
19
 
20
20
  description: A simple wrapper calling, for each supported input format, a given command-line tool
@@ -27,12 +27,16 @@ extensions: []
27
27
  extra_rdoc_files: []
28
28
 
29
29
  files:
30
+ - .gitignore
30
31
  - .rvmrc
31
32
  - Rakefile
32
33
  - lib/xtotxt.rb
33
34
  - spec/fixtures/test.doc
34
35
  - spec/fixtures/test.docx
36
+ - spec/fixtures/test.html
37
+ - spec/fixtures/test.odt
35
38
  - spec/fixtures/test.pdf
39
+ - spec/fixtures/test.rtf
36
40
  - spec/spec_helper.rb
37
41
  - spec/xtotxt.yml
38
42
  - spec/xtotxt_spec.rb
@@ -76,7 +80,10 @@ summary: Convert pdf, doc and docx to plain text
76
80
  test_files:
77
81
  - spec/fixtures/test.doc
78
82
  - spec/fixtures/test.docx
83
+ - spec/fixtures/test.html
84
+ - spec/fixtures/test.odt
79
85
  - spec/fixtures/test.pdf
86
+ - spec/fixtures/test.rtf
80
87
  - spec/spec_helper.rb
81
88
  - spec/xtotxt.yml
82
89
  - spec/xtotxt_spec.rb