odt2html 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,238 @@
1
+ module ODT2HTML
2
+ class Base
3
+ include AnalyzeContent
4
+ include AnalyzeGraphics
5
+ include AnalyzeStyles
6
+
7
def initialize
  # Debug level, stored as a class variable (shared by every instance).
  @@debug = 0

  # Source side: the document currently being analyzed and the
  # path of the file it was read from.
  @doc = nil
  @input_filename = nil

  # Destination side: where the result is written, the document
  # being built, and its <head> / <body> elements.
  @output_filename = nil
  @output_doc = nil
  @head = nil
  @body = nil

  # Optional external stylesheet file and image directory.
  @css_filename = nil
  @image_dir = nil

  # Maps a namespace URN to the canonical prefix used by this code.
  @namespace_urn = {
    "urn:oasis:names:tc:opendocument:xmlns:office:1.0" => "office",
    "urn:oasis:names:tc:opendocument:xmlns:style:1.0" => "style",
    "urn:oasis:names:tc:opendocument:xmlns:text:1.0" => "text",
    "urn:oasis:names:tc:opendocument:xmlns:table:1.0" => "table",
    "urn:oasis:names:tc:opendocument:xmlns:drawing:1.0" => "draw",
    "urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0" => "fo",
    "http://www.w3.org/1999/xlink" => "xlink",
    "http://purl.org/dc/elements/1.1/" => "dc",
    "urn:oasis:names:tc:opendocument:xmlns:meta:1.0" => "meta",
    "urn:oasis:names:tc:opendocument:xmlns:datastyle:1.0" => "number",
    "urn:oasis:names:tc:opendocument:xmlns:svg-compatible:1.0" => "svg",
    "urn:oasis:names:tc:opendocument:xmlns:chart:1.0" => "chart",
    "urn:oasis:names:tc:opendocument:xmlns:dr3d:1.0" => "dr3d",
    "http://www.w3.org/1998/Math/MathML" => "math",
    "urn:oasis:names:tc:opendocument:xmlns:form:1.0" => "form",
    "urn:oasis:names:tc:opendocument:xmlns:script:1.0" => "script",
    "http://openoffice.org/2004/office" => "ooo",
    "http://openoffice.org/2004/writer" => "ooow",
    "http://openoffice.org/2004/calc" => "oooc",
    "http://www.w3.org/2001/xml-events" => "dom"
  }

  # Canonical names of the style attributes to process. Once the
  # document's namespaces are known they are pushed into
  # @style_dispatch. A name ending in "*" is followed by the name of
  # the method that handles it; every other name is handled by
  # process_normal_style_attr.
  @valid_style = %w[
    style:font-name* process_font_name
    fo:color
    fo:background-color
    fo:font-size
    fo:font-style
    fo:font-weight
    fo:margin-top
    fo:margin-right
    fo:margin-bottom
    fo:margin-left
    fo:margin
    fo:padding-top
    fo:padding-right
    fo:padding-bottom
    fo:padding-left
    fo:padding
    fo:border-top
    fo:border-right
    fo:border-bottom
    fo:border-left
    fo:border
    fo:text-align* process_text_align
    fo:text-indent
    style:column-width* process_column_width
    style:text-underline-style* process_underline_style
    style:text-position* process_style_text_position
  ]

  # Style name => name of the method that processes that style.
  @style_dispatch = {}

  # Canonical namespace name => the prefix actually used in the
  # document being processed.
  @nshash = {}

  # Style name => DeclarationBlock. A block's block_used flag is set
  # to true once the style is actually used in the document.
  @style_info = {}

  # Paragraphs merge borders by default, so the last paragraph style
  # emitted and a reference to that paragraph are remembered.
  @previous_para_style = nil
  @previous_para = nil
end
101
+
102
+ #
103
+ # Establish a mapping between "standard" namespaces (in @namespace_urn)
104
+ # and namespace prefixes used in the document at hand.
105
+ #
106
+ # This code dynamically creates instance variables for the namespaces
107
+ # with "_ns" added to the variable name to avoid collisions.
108
+ # It is also added to the namespace hash <tt>@nshash</tt>
109
+ #
110
+ # The technique comes from a post to comp.lang.ruby by Guy Decoux
111
+ #
112
def get_namespaces
  # Start from a clean mapping each time a new document is inspected.
  @nshash.clear

  @doc.root.attributes.each_attribute do |attribute|
    urn = attribute.value
    # Only attributes whose value is a known namespace URN matter.
    next unless @namespace_urn.has_key?( urn )

    canonical = @namespace_urn[urn]
    accessor = canonical + "_ns"

    @nshash[canonical] = attribute.name
    # Define <canonical>_ns / <canonical>_ns= on the class, then store
    # the prefix the document really uses through the new writer.
    self.class.send( :attr_accessor, accessor )
    send( "#{accessor}=", attribute.name )
  end
end
123
+
124
def get_options
  parser = GetoptLong.new(
    ["--in", GetoptLong::REQUIRED_ARGUMENT],
    ["--out", GetoptLong::OPTIONAL_ARGUMENT],
    ["--css", GetoptLong::REQUIRED_ARGUMENT],
    ["--images", GetoptLong::REQUIRED_ARGUMENT]
  )

  # Each recognized option simply stores its argument in one
  # instance variable.
  targets = {
    "--in" => :@input_filename,
    "--out" => :@output_filename,
    "--css" => :@css_filename,
    "--images" => :@image_dir
  }

  parser.each do |flag, value|
    ivar = targets[flag]
    instance_variable_set( ivar, value ) if ivar
  end
end
144
+
145
# Reads one member of the input archive and parses it as XML.
#
# member_name:: name of the entry inside the file at @input_filename
#               (convert passes "styles.xml" and "content.xml").
#
# Returns the parsed REXML::Document. Any error raised while opening
# the archive, looking up the entry or parsing propagates to the caller.
def get_xml( member_name )
  zipfile = Zip::ZipFile::open( @input_filename )
  begin
    stream = zipfile.get_entry( member_name ).get_input_stream
    REXML::Document.new( stream.read )
  ensure
    # Close the archive even when the entry is missing or the XML is
    # malformed; previously the handle leaked on any such error.
    zipfile.close
  end
end
152
+
153
def add_xhtml_head_info
  # <meta> declaring the content type and UTF-8 charset.
  meta_attributes = {
    "http-equiv" => "content-type",
    "content" => "text/html; charset=utf-8"
  }
  @head.add_element( "meta", meta_attributes )

  # The page title is the input file name.
  title = @head.add_element( "title" )
  title.add_text( @input_filename )
end
158
+
159
# Builds one string from every non-empty entry of @style_info that the
# caller's block accepts. Style names are visited in sorted order and
# each accepted style contributes style_to_s(name) followed by a newline.
def collect_styles
  css = ""
  @style_info.keys.sort.each do |name|
    block = @style_info[name]
    # Empty blocks are skipped without consulting the caller's block.
    next unless block.length > 0 && yield( block )

    css << style_to_s( name ) << "\n"
  end
  css
end
168
+
169
# Runs the whole conversion: reads the command-line options, builds an
# XHTML skeleton, analyzes styles.xml and content.xml from the input
# archive, emits the styles that were used, and writes the result.
#
# Output goes to @output_filename when it is set, otherwise to $stdout.
# Styles go to @css_filename (referenced through a <link>) when it is
# set, otherwise into an inline <style> element.
#
# Returns nil. Errors of class StandardError are reported on $stderr
# rather than raised, so diagnostics never mix with HTML written to
# $stdout. SystemExit and Interrupt are no longer swallowed.
def convert
  get_options

  if @input_filename.nil?
    usage
    raise ArgumentError, "No input file name given"
  end

  # Create the image directory on first use.
  if @image_dir && !File.exist?( @image_dir )
    Dir.mkdir( @image_dir )
  end

  str = <<HDR
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html>
</html>
HDR
  @output_doc = REXML::Document.new str
  @head = @output_doc.root.add_element("head")
  @body = @output_doc.root.add_element("body")
  add_xhtml_head_info

  # Styles are analyzed before content so that content analysis can
  # flag the styles it uses.
  @doc = get_xml("styles.xml")
  analyze_styles_xml

  @doc = get_xml("content.xml")
  analyze_content_xml

  all_styles = collect_styles { |item| item.block_used }

  if @css_filename
    @head.add_element("link",
      {"rel" => "stylesheet", "type" => "text/css",
       "href" => @css_filename} )
    # Block form guarantees the stylesheet is flushed and closed.
    File.open( @css_filename, "w" ) { |css_file| css_file.puts( all_styles ) }
  else
    style_el = @head.add_element("style", {"type" => "text/css"} )
    style_el.add_text( all_styles )
  end

  if @output_filename
    File.open( @output_filename, "w" ) do |output_file|
      @output_doc.write( output_file, 2 )
    end
  else
    # $stdout belongs to the process; write to it but never close it.
    @output_doc.write( $stdout, 2 )
  end

  nil
rescue StandardError => e
  $stderr.puts "Cannot convert file: #{e}"
  $stderr.puts e.backtrace.join("\n")
  nil
end
232
+
233
# Prints a one-line command-line synopsis to standard output.
# Only --in is mandatory: convert writes to $stdout when --out is
# omitted, so --out is shown as optional.
def usage
  puts "Usage: #{$0} --in inputfile [--out outputfile] [--css cssfile] [--images imagedir]"
end
236
+
237
+ end
238
+ end
@@ -0,0 +1,15 @@
1
module ODT2HTML
  # A single CSS declaration: one property paired with its value.
  class Declaration
    attr_accessor :property, :value

    # Both parts are optional and default to nil.
    def initialize( property = nil, value = nil )
      @property, @value = property, value
    end

    # Renders the declaration as "property: value" with no trailing
    # semicolon.
    def to_s
      "#{property}: #{value}"
    end
  end
end
@@ -0,0 +1,35 @@
1
module ODT2HTML
  # Represents a CSS declaration block; a sequence of zero
  # or more +Declaration+s.
  class DeclarationBlock < Array
    # Flag that starts out false for every new block, including copies.
    attr_accessor( :block_used )

    # With a DeclarationBlock as first argument, builds a copy holding a
    # fresh Declaration for every item of the source block. Any other
    # argument list is passed straight to Array#initialize.
    def initialize(*arglist)
      if (arglist[0].kind_of? DeclarationBlock) then
        source = arglist[0]
        super( 0 )
        source.each do |item|
          push Declaration.new( item.property, item.value )
        end
      else
        super
      end
      @block_used = false
    end

    # Returns true when the block declares +border+ or +border-top+,
    # and nil otherwise (nil rather than false, as before).
    #
    # The pattern is anchored: the previous unanchored /border(-top)?/
    # also matched border-bottom, border-left and border-right, so
    # blocks with only those borders were reported as having a top one.
    def has_top_border?
      true if any? { |item| item.property =~ /\Aborder(-top)?\z/ }
    end

    # Renders the block as CSS: an opening brace line, one
    # tab-indented "property: value;" line per declaration, and a
    # closing brace line.
    def to_s
      result = "{\n"
      each { |item|
        result << "\t#{item.property}: #{item.value};\n"
      }
      result << "}\n"
      return result
    end

  end
end
data/lib/odt2html.rb ADDED
@@ -0,0 +1,26 @@
1
+ require 'rexml/document'
2
+ require 'rexml/xpath'
3
+ require 'zip/zip'
4
+ require 'stringio'
5
+ require 'getoptlong'
6
+
7
module ODT2HTML

  VERSION = "0.1.0"

  # Absolute path of the directory that contains this file.
  ROOT_PATH = File.dirname(File.expand_path(__FILE__))

  # Constant name => file under odt2html/ that defines it. Each file
  # is loaded lazily, the first time its constant is referenced.
  {
    :Base => "base",
    :AnalyzeContent => "analyze_content",
    :AnalyzeGraphics => "analyze_graphics",
    :AnalyzeStyles => "analyze_styles",
    :Declaration => "declaration",
    :DeclarationBlock => "declaration_block"
  }.each do |const_name, file_name|
    autoload const_name, "#{ROOT_PATH}/odt2html/#{file_name}"
  end

end
20
+
21
class REXML::Element
  # Convenience lookup: returns the value of the named attribute
  # (optionally qualified by +namespace+), or nil when the element
  # has no such attribute.
  def attribute_value( name, namespace=nil )
    found = attribute( name, namespace )
    found.nil? ? nil : found.value
  end
end
data/odt2html.gemspec ADDED
@@ -0,0 +1,22 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "odt2html"
4
+
5
Gem::Specification.new do |s|
  s.name        = "odt2html"
  s.version     = ODT2HTML::VERSION
  s.platform    = Gem::Platform::RUBY
  s.authors     = ["Bernard Potocki"]
  s.email       = ["bernard.potocki@imanel.org"]
  s.homepage    = "http://github.com/imanel/odt2html"
  s.summary     = %q{OpenDocument text to HTML converter}
  s.description = %q{OpenDocument text to HTML converter}

  # The library requires 'zip/zip' and uses Zip::ZipFile. rubyzip 1.0
  # renamed both (require 'zip', Zip::File), so an unconstrained
  # dependency resolves to a version this code cannot load.
  s.add_dependency 'rubyzip', '< 1.0'
  s.add_development_dependency 'rspec', '~> 2.4.0'

  # File lists come from git, so the gem must be built from a checkout.
  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]
end
@@ -0,0 +1,117 @@
1
+ <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
2
+ <html>
3
+ <head>
4
+ <meta content='text/html; charset=utf-8' http-equiv='content-type'/>
5
+ <title>
6
+ [TITLE_HERE]
7
+ </title>
8
+ <style type='text/css'>
9
+ .P1{ font-size: 18pt; color: #000000; font-family: Times New Roman;
10
+ text-align: center; font-weight: bold; } .P2{ font-size: 12pt; color:
11
+ #000000; font-family: Luxi Sans; margin-bottom: 0.0835in; margin-top: 0in;
12
+ text-align: center; } .P3{ font-size: 12pt; color: #000000; font-family:
13
+ Luxi Sans; margin-bottom: 0.0835in; margin-top: 0in; text-align: left; }
14
+ .Quotations{ font-size: 12pt; color: #000000; font-family: Luxi Sans;
15
+ margin-bottom: 0.1965in; text-indent: 0in; margin-top: 0in; margin-left:
16
+ 0.3937in; margin-right: 0.3937in; } .T1{ font-weight: bold; }
17
+ .Text_20_body{ font-size: 12pt; color: #000000; font-family: Luxi Sans;
18
+ margin-bottom: 0.0835in; margin-top: 0in; }
19
+ </style>
20
+ </head>
21
+ <body>
22
+ <p class='P1'>
23
+ &quot;ODF Alliance&quot; formed to support OpenDocument format
24
+ </p>
25
+ <p class='P2'/>
26
+ <p class='P3'>
27
+ 3/3/2006 8:25:36 PM, by
28
+ <a href='mailto:jeremy@arstechnica.com'>
29
+ Jeremy Reimer
30
+ </a>
31
+ </p>
32
+ <p class='Text_20_body'>
33
+ A consortium of companies and organizations have banded together to form
34
+ the
35
+ <a href='http://www.odfalliance.org/news.asp'>
36
+ &quot;ODF Alliance,&quot;
37
+ </a>
38
+ a group dedicated to promoting the office software file format first
39
+ implemented by OpenOffice.org. The alliance consists of more than 35
40
+ members from various countries around the world. It includes companies
41
+ such as Red Hat, IBM, Novell, Sun Microsystems, and Corel, and
42
+ governmental organizations such as the American Library Association and
43
+ the Information and Communications Technology council for the city of
44
+ Vienna.
45
+ </p>
46
+ <p class='Text_20_body'>
47
+ The
48
+ <span class='T1'>
49
+ OpenDocument
50
+ </span>
51
+ file format was formed by the industry consortium OASIS, a group headed
52
+ by Sun Microsystems, and was based on OpenOffice.org&apos;s native file
53
+ format. OpenOffice.org is itself an open-sourced version of Star Office,
54
+ the proprietary office suite that Sun purchased when it acquired the
55
+ German company Star Division in 1999. The idea behind OpenDocument was to
56
+ use a text-based XML format (compressed in a zip file to conserve disk
57
+ space) in order to make it easy for other products to interoperate with
58
+ it. The specification was finalized in 2005 and OpenOffice.org was the
59
+ first software suite to support it. Other projects, such as KOffice,
60
+ AbiWord, and IBM Workplace are adding support for the ODF format, either
61
+ natively or through plug-in format translators.
62
+ </p>
63
+ <p class='Text_20_body'>
64
+ In today&apos;s highly networked world, it turns out that operability is a
65
+ very useful thing to have, which is why Microsoft decided that they would
66
+ also jump on the XML bandwagon, introducing a new XML-based file format
67
+ (.docx) for Office 2003. Not only that, but the company is planning to
68
+ make the next version of the Office XML format the default for Office 2007
69
+ (formerly known as Office 12). This means that when users of Office 2007
70
+ go to save a file, they will automatically save in .docx, not .doc.
71
+ </p>
72
+ <p class='Text_20_body'>
73
+ While most people don&apos;t consider file formats to be terribly
74
+ exciting, the question of which format to adopt led to an increasingly
75
+ dramatic series of announcements from the government of Massachusetts. In
76
+ January 2005, the government
77
+ <a href='http://www.crn.com/sections/breakingnews/breakingnews.jhtml;jsessionid=GCCZBBT3QQVBGQSNDBOCKHSCJUMEKJVN?articleId=57701551'>
78
+ approved
79
+ </a>
80
+ Office XML 2003 as an appropriate file format, then in September of that
81
+ year
82
+ <a href='http://arstechnica.com/news.ars/post/20050906-5279.html'>
83
+ reversed
84
+ </a>
85
+ their decision, stating that Office XML was unacceptable and that only
86
+ OpenDocument and PDF files would be allowed. They went
87
+ <a href='http://arstechnica.com/news.ars/post/20051128-5637.html'>
88
+ back again
89
+ </a>
90
+ in November, stating that they were &quot;very pleased&quot; with
91
+ Microsoft&apos;s submission of Office XML to the ECMA standards body, and
92
+ that they were &quot;optimistic that Office Open XML will meet our new
93
+ standards for acceptable open formats.&quot; The champion of ODF in
94
+ Massachusetts, CIO Peter Quinn, then suddenly announced his
95
+ <a href='http://arstechnica.com/news.ars/post/20060104-5895.html'>
96
+ resignation
97
+ </a>
98
+ in January. This announcement was followed with an assurance from
99
+ Quinn&apos;s former boss that their position on ODF &quot;remained
100
+ unchanged&quot; and that they were still committed to supporting that
101
+ format.
102
+ </p>
103
+ <p class='Text_20_body'>
104
+ Confused yet? The so-called &quot;controversy&quot; over the ODF switch
105
+ has generated an unbelievable amount of press, and various groups are now
106
+ busy pushing governments worldwide to switch over to the ODF format. This
107
+ new &quot;alliance&quot; joins the groups SpreadOpenDocument.org, the
108
+ OpenDocument Fellowship, and the &quot;Friends of OpenDocument&quot; in
109
+ their quest to promote the format. My own inbox is now slowly filling up
110
+ with pro-ODF announcements and press releases. The push seems overtly
111
+ political, as evidenced by the rhetoric used by the ODF Alliance:
112
+ </p>
113
+ <p class='Quotations'>
114
+ [...]
115
+ </p>
116
+ </body>
117
+ </html>
Binary file
@@ -0,0 +1,24 @@
1
require 'spec_helper'
# Tempfile is used below; load it explicitly instead of relying on
# some other library having required it already.
require 'tempfile'

# End-to-end check: converts the example ODT fixture and compares the
# result byte-for-byte with the stored HTML fixture.
describe "files" do
  it "should read ODT file and generate matching HTML file" do
    odt_path = File.join(File.dirname(__FILE__), *%w[.. fixtures example.odt])
    html_path = File.join(File.dirname(__FILE__), *%w[.. fixtures example.html])
    # File.read opens and closes the fixture in one step; the previous
    # File.open handle was never closed. The fixture's [TITLE_HERE]
    # marker is replaced by the ODT path before comparing.
    html_content = File.read(html_path).gsub("[TITLE_HERE]", odt_path)
    tempfile = Tempfile.new('html')

    begin
      parser = ODT2HTML::Base.new
      parser.instance_variable_set('@input_filename', odt_path)
      parser.instance_variable_set('@output_filename', tempfile.path)

      parser.convert

      tempfile.rewind
      tempfile.read.should eql(html_content)
    ensure
      # Close and delete the temporary file even when the example fails.
      tempfile.close!
    end
  end
end
@@ -0,0 +1,4 @@
1
+ require 'rubygems'
2
+ require 'rspec'
3
+
4
+ require 'odt2html'
metadata ADDED
@@ -0,0 +1,118 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: odt2html
3
+ version: !ruby/object:Gem::Version
4
+ hash: 27
5
+ prerelease:
6
+ segments:
7
+ - 0
8
+ - 1
9
+ - 0
10
+ version: 0.1.0
11
+ platform: ruby
12
+ authors:
13
+ - Bernard Potocki
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2011-03-02 00:00:00 +01:00
19
+ default_executable:
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ name: rubyzip
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ">="
28
+ - !ruby/object:Gem::Version
29
+ hash: 3
30
+ segments:
31
+ - 0
32
+ version: "0"
33
+ type: :runtime
34
+ version_requirements: *id001
35
+ - !ruby/object:Gem::Dependency
36
+ name: rspec
37
+ prerelease: false
38
+ requirement: &id002 !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ~>
42
+ - !ruby/object:Gem::Version
43
+ hash: 31
44
+ segments:
45
+ - 2
46
+ - 4
47
+ - 0
48
+ version: 2.4.0
49
+ type: :development
50
+ version_requirements: *id002
51
+ description: OpenDocument text to HTML converter
52
+ email:
53
+ - bernard.potocki@imanel.org
54
+ executables:
55
+ - odt2html
56
+ extensions: []
57
+
58
+ extra_rdoc_files: []
59
+
60
+ files:
61
+ - .gitignore
62
+ - CHANGELOG.md
63
+ - Gemfile
64
+ - LICENSE.txt
65
+ - README.md
66
+ - Rakefile
67
+ - bin/odt2html
68
+ - lib/odt2html.rb
69
+ - lib/odt2html/analyze_content.rb
70
+ - lib/odt2html/analyze_graphics.rb
71
+ - lib/odt2html/analyze_styles.rb
72
+ - lib/odt2html/base.rb
73
+ - lib/odt2html/declaration.rb
74
+ - lib/odt2html/declaration_block.rb
75
+ - odt2html.gemspec
76
+ - spec/fixtures/example.html
77
+ - spec/fixtures/example.odt
78
+ - spec/integration/files_spec.rb
79
+ - spec/spec_helper.rb
80
+ has_rdoc: true
81
+ homepage: http://github.com/imanel/odt2html
82
+ licenses: []
83
+
84
+ post_install_message:
85
+ rdoc_options: []
86
+
87
+ require_paths:
88
+ - lib
89
+ required_ruby_version: !ruby/object:Gem::Requirement
90
+ none: false
91
+ requirements:
92
+ - - ">="
93
+ - !ruby/object:Gem::Version
94
+ hash: 3
95
+ segments:
96
+ - 0
97
+ version: "0"
98
+ required_rubygems_version: !ruby/object:Gem::Requirement
99
+ none: false
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ hash: 3
104
+ segments:
105
+ - 0
106
+ version: "0"
107
+ requirements: []
108
+
109
+ rubyforge_project:
110
+ rubygems_version: 1.4.2
111
+ signing_key:
112
+ specification_version: 3
113
+ summary: OpenDocument text to HTML converter
114
+ test_files:
115
+ - spec/fixtures/example.html
116
+ - spec/fixtures/example.odt
117
+ - spec/integration/files_spec.rb
118
+ - spec/spec_helper.rb