xtotxt 0.3 → 0.4

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,3 @@
1
+ spec/fixtures/test.txt
2
+ *.gem
3
+ .idea
data/lib/xtotxt.rb CHANGED
@@ -22,7 +22,7 @@ class Xtotxt
22
22
 
23
23
  ext = path_list.pop
24
24
 
25
- raise("not a supported document extension: #{ext}") unless %w{pdf doc docx}.member?(ext)
25
+ raise("not a supported document extension: #{ext}") unless %w{pdf doc docx odt rtf html}.member?(ext)
26
26
 
27
27
  output_file = (path_list << "txt").join(".")
28
28
 
@@ -33,17 +33,31 @@ class Xtotxt
33
33
  "#{@ext[:doc]} > #{output_file} #{input_file_name}"
34
34
  when "docx"
35
35
  "#{@ext[:docx]} #{input_file_name}"
36
+ when "odt":
37
+ "#{@ext[:odt]} #{input_file_name} --output=#{output_file}"
38
+ when "rtf":
39
+ "#{@ext[:rtf]} --text #{input_file_name} > #{output_file}"
40
+ when "html":
41
+ "#{@ext[:html]} -o #{output_file} #{input_file_name}"
36
42
  else
37
43
  raise "have no way to convert #{ext} yet"
38
44
  end
39
45
 
40
- command_output = `#{command_line}`
46
+ #puts "executing: #{command_line}"
47
+
48
+ command_output = `#{command_line} 2>/dev/null`
41
49
  text = if $? == 0
42
50
  File.read(output_file)
43
51
  else
44
52
  raise "Failed to convert #{input_file_name}. Exit status: #{$?.exitstatus}. Output: #{command_output}"
45
53
  end
46
- text
54
+
55
+ case ext
56
+ when "rtf"
57
+ skip_unrtf_header(text)
58
+ else
59
+ text
60
+ end
47
61
  end
48
62
 
49
63
  def initialize(ext=nil)
@@ -57,8 +71,21 @@ class Xtotxt
57
71
  Xtotxt.read_config
58
72
  @@ext
59
73
  end
74
+ end
60
75
 
61
- puts "@ext: #{@ext}, @@ext: #{@@ext}"
76
+ private
77
+
78
+ def skip_unrtf_header(text)
79
+ a = text.lines.to_a
80
+ while true
81
+ unless a.shift =~ /^###/
82
+ unless a.shift == "-----------------\n"
83
+ raise "cannot parse rtf"
84
+ end
85
+ break
86
+ end
87
+ end
88
+ a.join
62
89
  end
63
90
 
64
91
  end
data/spec/fixtures/test.html ADDED
@@ -0,0 +1,6 @@
1
+ <head>
2
+ <title>piglet plan</title>
3
+ </head>
4
+ <body>
5
+ <p>three pigheaded piglets had a plan</p>
6
+ </body>
data/spec/fixtures/test.odt ADDED
Binary file
data/spec/fixtures/test.rtf ADDED
@@ -0,0 +1,8 @@
1
+ {\rtf1\ansi\ansicpg1252\cocoartf1138
2
+ {\fonttbl\f0\fswiss\fcharset0 Helvetica;}
3
+ {\colortbl;\red255\green255\blue255;}
4
+ \margl1440\margr1440\vieww10800\viewh8400\viewkind0
5
+ \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural
6
+
7
+ \f0\fs24 \cf0 three pigheaded piglets had a plan\
8
+ }
data/spec/xtotxt.yml CHANGED
@@ -1,3 +1,6 @@
1
1
  :pdf: "/opt/local/bin/xpdf-pdftotext"
2
2
  :doc: "/opt/local/bin/antiword"
3
3
  :docx: "/usr/local/bin/docx2txt.pl"
4
+ :odt: "/usr/local/bin/odt2txt"
5
+ :rtf: "/opt/local/bin/unrtf -P /opt/local/lib/unrtf"
6
+ :html: "/opt/local/bin/html2text"
data/spec/xtotxt_spec.rb CHANGED
@@ -2,10 +2,12 @@ require 'spec_helper'
2
2
  require 'xtotxt'
3
3
 
4
4
  describe Xtotxt do
5
+ before(:all) do
6
+ @input_prefix = "#{Pathname.new(__FILE__).dirname}/fixtures/test"
7
+ end
5
8
 
6
9
  before do
7
10
  @x = Xtotxt.new
8
- @input_prefix = "fixtures/test"
9
11
  end
10
12
 
11
13
  describe "convert" do
@@ -18,7 +20,7 @@ describe Xtotxt do
18
20
 
19
21
  context "input parameters and results" do
20
22
 
21
- %w{pdf doc docx}.each do |ext|
23
+ %w{pdf doc docx odt rtf}.each do |ext|
22
24
  it "accepts an #{ext} input" do
23
25
  lambda { @x.convert("#{@input_prefix}.#{ext}") }.should_not raise_error
24
26
  end
@@ -49,4 +51,23 @@ describe Xtotxt do
49
51
  text.should == "three pigheaded piglets had a plan\n\n"
50
52
  end
51
53
 
54
+ it "converts an odt document correctly" do
55
+ text = @x.convert("#{@input_prefix}.odt")
56
+
57
+ text.should == "\nthree pigheaded piglets had a plan\n\n"
58
+ end
59
+
60
+ it "converts an rtf document correctly" do
61
+ text = @x.convert("#{@input_prefix}.rtf")
62
+
63
+ text.should == "three pigheaded piglets had a plan\n"
64
+ end
65
+
66
+ it "converts an html document correctly" do
67
+ text = @x.convert("#{@input_prefix}.html")
68
+
69
+ text.should == "three pigheaded piglets had a plan\n"
70
+ end
71
+
72
+
52
73
  end
data/xtotxt/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Xtotxt
2
- VERSION = 0.3
2
+ VERSION = 0.4
3
3
  end
metadata CHANGED
@@ -1,12 +1,12 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: xtotxt
3
3
  version: !ruby/object:Gem::Version
4
- hash: 13
4
+ hash: 3
5
5
  prerelease:
6
6
  segments:
7
7
  - 0
8
- - 3
9
- version: "0.3"
8
+ - 4
9
+ version: "0.4"
10
10
  platform: ruby
11
11
  authors:
12
12
  - Alexy Khrabrov
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2011-09-21 00:00:00 Z
17
+ date: 2011-09-23 00:00:00 Z
18
18
  dependencies: []
19
19
 
20
20
  description: A simple wrapper calling, for each supported input format, a given command-line tool
@@ -27,12 +27,16 @@ extensions: []
27
27
  extra_rdoc_files: []
28
28
 
29
29
  files:
30
+ - .gitignore
30
31
  - .rvmrc
31
32
  - Rakefile
32
33
  - lib/xtotxt.rb
33
34
  - spec/fixtures/test.doc
34
35
  - spec/fixtures/test.docx
36
+ - spec/fixtures/test.html
37
+ - spec/fixtures/test.odt
35
38
  - spec/fixtures/test.pdf
39
+ - spec/fixtures/test.rtf
36
40
  - spec/spec_helper.rb
37
41
  - spec/xtotxt.yml
38
42
  - spec/xtotxt_spec.rb
@@ -76,7 +80,10 @@ summary: Convert pdf, doc and docx to plain text
76
80
  test_files:
77
81
  - spec/fixtures/test.doc
78
82
  - spec/fixtures/test.docx
83
+ - spec/fixtures/test.html
84
+ - spec/fixtures/test.odt
79
85
  - spec/fixtures/test.pdf
86
+ - spec/fixtures/test.rtf
80
87
  - spec/spec_helper.rb
81
88
  - spec/xtotxt.yml
82
89
  - spec/xtotxt_spec.rb