html2tex 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/COPYING ADDED
@@ -0,0 +1,21 @@
1
+ Licence (MIT)
2
+
3
+ Copyright (c) 2010 Paul Battley
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,41 @@
1
+ HTML2TeX
2
+ ========
3
+
4
+ This is a simple library and command-line tool to convert HTML documents into
5
+ LaTeX. The intended use is for preparing ebooks, but it might also be useful
6
+ for other purposes.
7
+
8
+ Library use
9
+ -----------
10
+
11
+ require "html2tex"
12
+
13
+ # Return output as a string
14
+ tex = HTML2TeX.new(html).to_tex
15
+
16
+ # Write directly to a stream
17
+ File.open("output.tex", "w") do |f|
18
+ HTML2TeX.new(html).to_tex(f)
19
+ end
20
+
21
+ Command-line use
22
+ ----------------
23
+
24
+ html2tex source.html destination.tex
25
+
26
+ Dependencies
27
+ ------------
28
+
29
+ * htmlentities, to decode character entities
30
+ * rubypants, to convert `'` and `"` into something more appealing
31
+
32
+ Limitations
33
+ -----------
34
+
35
+ The HTML recognised is currently limited to headings, paragraphs, line breaks,
36
+ and bold and italic mark-up. This covers the majority of novels, but it's far
37
+ from comprehensive.
38
+
39
+ StringScanner is used to process the HTML, but cannot read from a stream
40
+ directly, so the entire input document must be read into memory as a string
41
+ first.
data/bin/html2tex ADDED
@@ -0,0 +1,6 @@
#!/usr/bin/env ruby
# Command-line entry point: converts an HTML file into a LaTeX file.
#
# Usage: html2tex source.html destination.tex
require "html2tex"

# Fail with a readable message instead of a TypeError from File.open(nil).
if ARGV.size < 2
  abort "Usage: #{File.basename($PROGRAM_NAME)} source.html destination.tex"
end

source, destination = ARGV

# Read the whole input before opening the output: opening with "w" truncates
# the destination immediately, which would otherwise destroy it when the
# source is missing or when source and destination are the same file.
html = File.read(source)

File.open(destination, "w") do |output|
  HTML2TeX.new(html).to_tex(output)
end
@@ -0,0 +1,80 @@
class HTML2TeX
  # Converts HTML read through a StringScanner into LaTeX.
  #
  # Paragraph ends, emphasis, bold, line breaks and headings are translated;
  # headings are delegated to HeadingProcessor. A run of three to six
  # asterisks becomes a centred scene break. Any other tag is dropped, while
  # the text around it is kept.
  class BasicProcessor
    # Replacements for characters that a plain backslash prefix would turn
    # into a different LaTeX command (\\ is a line break, \^ and \~ are
    # accents) rather than the literal character.
    TEX_ESCAPES = {
      "\\" => "\\textbackslash{}",
      "^"  => "\\^{}",
      "~"  => "\\~{}"
    }.freeze

    attr_reader :scanner

    # scanner - the StringScanner holding the HTML to convert.
    def initialize(scanner)
      @decoder = HTMLEntities.new
      @scanner = scanner
    end

    # Appends the LaTeX for everything up to the next stopping point (end of
    # input, or a nil returned by #tag) to +buffer+ and returns +buffer+.
    #
    # buffer - any object responding to <<, e.g. a String or an IO.
    def to_tex(buffer="")
      while (element = next_element)
        buffer << element
      end
      buffer
    end

    private

    # Consumes one token from the scanner and returns its LaTeX, or nil when
    # processing should stop.
    def next_element
      if scanner.eos?
        nil
      elsif s = scanner.scan(/<[^>]+>/)
        # "<br/>" yields the name "br/"; drop the trailing slash so that
        # self-closing tags are recognised like their plain form.
        tag(s[/<([^>\s]+)/, 1].downcase.chomp("/"))
      elsif scanner.scan(/\s+/)
        whitespace
      elsif scanner.scan(/\*(\s*\*){2,5}/)
        dinkus
      elsif s = scanner.scan(/[^\s<]+/)
        text(s)
      else
        # Only reached for a "<" that does not begin a complete tag (e.g.
        # "1 < 2" or "<>"). Consume it so the scanner always advances;
        # returning "" without consuming would loop forever.
        tex_escape(scanner.getch)
      end
    end

    # Runs a text token through RubyPants, decodes HTML entities, and escapes
    # the result for LaTeX.
    def text(s)
      tex_escape(@decoder.decode(RubyPants.new(s).to_html))
    end

    # Escapes the characters \ { } $ & # % ^ _ ~ for LaTeX. Returns nil for
    # nil input.
    def tex_escape(s)
      return nil if s.nil?
      s.gsub(/[\\{}$&#%^_~]/) { |char| TEX_ESCAPES.fetch(char) { "\\#{char}" } }
    end

    # Collapses a run of whitespace to a single space, or to nothing when the
    # previous tag asked for the following whitespace to be squashed.
    def whitespace
      if @squash_next
        @squash_next = false
        ""
      else
        " "
      end
    end

    # Returns the LaTeX for the tag named +t+ (lower-cased, with a leading
    # "/" for closing tags). Unrecognised tags produce an empty string.
    def tag(t)
      @squash_next = false
      case t
      when "/p"
        # The paragraph break already separates the text, so whitespace that
        # directly follows is suppressed.
        @squash_next = true
        "\n\n"
      when "em", "i"
        "\\textit{"
      when "strong", "b"
        "\\textbf{"
      when "/em", "/i", "/strong", "/b"
        "}"
      when "br"
        "\\\\"
      when /\Ah\d/
        @squash_next = true
        # The heading processor consumes the scanner up to the closing tag.
        HeadingProcessor.new(scanner, t).to_tex + "\n\n"
      else
        ""
      end
    end

    # LaTeX for a scene break: three centred asterisks.
    def dinkus
      "\\begin{center}\n***\n\\end{center}"
    end
  end
end
79
+
80
+ require "html2tex/heading_processor"
@@ -0,0 +1,32 @@
1
+ require "html2tex/basic_processor"
2
+
class HTML2TeX
  # Collects the content of a single HTML heading and wraps it in the
  # corresponding unnumbered (starred) LaTeX sectioning command.
  class HeadingProcessor < BasicProcessor
    # Sectioning commands for heading levels 1 to 5, in order.
    HEADINGS = %w[part chapter section subsection subsubsection].freeze

    # scanner - the StringScanner positioned just after the opening tag.
    # label   - the opening tag's name, e.g. "h2".
    def initialize(scanner, label)
      @label = label
      super(scanner)
    end

    # Converts everything up to the matching closing tag (or end of input)
    # and returns it wrapped in the sectioning command.
    def to_tex(*args)
      wrap(super(*args))
    end

    private

    # Wraps +content+ in the sectioning command, or returns "" when the
    # heading contains nothing but whitespace.
    def wrap(content)
      return "" if content.strip.empty?
      "\\#{HEADINGS[heading_index]}*{#{content}}"
    end

    # Index into HEADINGS for this heading's level. The index is clamped so
    # that levels with no entry of their own (h6 and beyond, or h0) reuse the
    # nearest command instead of producing an invalid "\*{...}".
    def heading_index
      level = @label[/\d/].to_i
      [[level - 1, 0].max, HEADINGS.size - 1].min
    end

    # Returns nil at this heading's closing tag, which stops processing;
    # every other tag inside the heading is dropped.
    def tag(t)
      case t
      when "/#{@label}"
        nil
      else
        ""
      end
    end
  end
end
@@ -0,0 +1,3 @@
class HTML2TeX
  # Version string of the html2tex library. Frozen so it cannot be mutated
  # in place by accident.
  VERSION = "0.1.0".freeze
end
data/lib/html2tex.rb ADDED
@@ -0,0 +1,14 @@
1
+ require "htmlentities"
2
+ require "rubypants"
3
+ require "html2tex/version"
4
+ require "html2tex/basic_processor"
5
+
# StringScanner lives in the strscan standard library; require it here rather
# than relying on another library having loaded it already.
require "strscan"

# Entry point of the library: converts an HTML document, held in memory as a
# string, into LaTeX.
class HTML2TeX
  # html - the complete HTML document as a String.
  def initialize(html)
    @html = html
  end

  # Converts the document and appends the LaTeX to +buffer+.
  #
  # buffer - any object responding to <<, e.g. a String (the default) or an
  #          open File.
  #
  # Returns whatever BasicProcessor#to_tex returns for +buffer+.
  def to_tex(buffer="")
    BasicProcessor.new(StringScanner.new(@html)).to_tex(buffer)
  end
end
metadata ADDED
@@ -0,0 +1,91 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: html2tex
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Paul Battley
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2010-05-09 00:00:00 +01:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: htmlentities
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: 4.2.0
24
+ version:
25
+ - !ruby/object:Gem::Dependency
26
+ name: rubypants
27
+ type: :runtime
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: 0.2.0
34
+ version:
35
+ - !ruby/object:Gem::Dependency
36
+ name: shoulda
37
+ type: :development
38
+ version_requirement:
39
+ version_requirements: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ version: "0"
44
+ version:
45
+ description: Converts an HTML document to a LaTeX version with reasonably sensible markup.
46
+ email:
47
+ - pbattley@gmail.com
48
+ executables:
49
+ - html2tex
50
+ extensions: []
51
+
52
+ extra_rdoc_files: []
53
+
54
+ files:
55
+ - bin/html2tex
56
+ - lib/html2tex.rb
57
+ - lib/html2tex/version.rb
58
+ - lib/html2tex/basic_processor.rb
59
+ - lib/html2tex/heading_processor.rb
60
+ - README.md
61
+ - COPYING
62
+ has_rdoc: true
63
+ homepage: http://github.com/threedaymonk/html2tex
64
+ licenses: []
65
+
66
+ post_install_message:
67
+ rdoc_options: []
68
+
69
+ require_paths:
70
+ - lib
71
+ required_ruby_version: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: "0"
76
+ version:
77
+ required_rubygems_version: !ruby/object:Gem::Requirement
78
+ requirements:
79
+ - - ">="
80
+ - !ruby/object:Gem::Version
81
+ version: "0"
82
+ version:
83
+ requirements: []
84
+
85
+ rubyforge_project:
86
+ rubygems_version: 1.3.5
87
+ signing_key:
88
+ specification_version: 3
89
+ summary: HTML to LaTeX converter
90
+ test_files: []
91
+