html2tex 0.1.1 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +23 -3
- data/bin/html2tex +14 -2
- data/lib/html2tex/basic_processor.rb +8 -8
- data/lib/html2tex/cli.rb +54 -0
- data/lib/html2tex/heading_processor.rb +5 -4
- data/lib/html2tex/preamble_processor.rb +78 -0
- data/lib/html2tex/tex.rb +14 -0
- data/lib/html2tex/version.rb +1 -1
- data/lib/html2tex.rb +26 -3
- metadata +4 -1
data/README.md
CHANGED
@@ -5,6 +5,22 @@ This is a simple library and command-line tool to convert HTML documents into
|
|
5
5
|
LaTeX. The intended use is for preparing ebooks, but it might also be useful
|
6
6
|
for other purposes.
|
7
7
|
|
8
|
+
Command-line use
|
9
|
+
----------------
|
10
|
+
|
11
|
+
Usage: html2tex [options] input.html [output.tex]
|
12
|
+
-t, --title TITLE Set (or override) the title of the document
|
13
|
+
-a, --author AUTHOR Set (or override) the author of the document
|
14
|
+
-c, --document-class CLASS Set the LaTeX document class to be used; default is book
|
15
|
+
-h, --help Display this screen
|
16
|
+
|
17
|
+
If the output file is not specified, the program will create an output file
|
18
|
+
based on the input filename, replacing the extension with `.tex` – or, if there
|
19
|
+
is no extension, by appending `.tex`.
|
20
|
+
|
21
|
+
The title and author will, by default, be extracted from the HTML document,
|
22
|
+
using either Dublin Core metadata or the HTML title and an author META element.
|
23
|
+
|
8
24
|
Library use
|
9
25
|
-----------
|
10
26
|
|
@@ -18,10 +34,14 @@ Library use
|
|
18
34
|
HTML2TeX.new(html).to_tex(f)
|
19
35
|
end
|
20
36
|
|
21
|
-
|
22
|
-
|
37
|
+
A hash of options can be supplied as the second argument to `to_tex`; the
|
38
|
+
available keys are:
|
39
|
+
|
40
|
+
* `:author`
|
41
|
+
* `:title`
|
42
|
+
* `:class`
|
23
43
|
|
24
|
-
|
44
|
+
See the command-line section above for an explanation of these.
|
25
45
|
|
26
46
|
Dependencies
|
27
47
|
------------
|
data/bin/html2tex
CHANGED
@@ -1,6 +1,18 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
require "html2tex"
|
3
|
+
require "html2tex/cli"
|
3
4
|
|
4
|
-
|
5
|
-
|
5
|
+
cli = HTML2TeX::CLI.new(ARGV)
|
6
|
+
begin
|
7
|
+
cli.parse
|
8
|
+
rescue HTML2TeX::CLI::CleanExit
|
9
|
+
puts cli.usage
|
10
|
+
exit
|
11
|
+
rescue HTML2TeX::CLI::DirtyExit
|
12
|
+
puts cli.usage
|
13
|
+
exit 1
|
14
|
+
end
|
15
|
+
|
16
|
+
File.open(cli.output_file, "w") do |io|
|
17
|
+
HTML2TeX.new(File.read(cli.input_file), cli.options).to_tex(io)
|
6
18
|
end
|
@@ -1,10 +1,15 @@
|
|
1
|
+
require "html2tex/tex"
|
2
|
+
|
1
3
|
class HTML2TeX
|
2
4
|
class BasicProcessor
|
3
|
-
|
5
|
+
include TeX
|
6
|
+
|
7
|
+
attr_reader :scanner, :options
|
4
8
|
|
5
|
-
def initialize(scanner)
|
9
|
+
def initialize(scanner, options)
|
6
10
|
@decoder = HTMLEntities.new
|
7
11
|
@scanner = scanner
|
12
|
+
@options = options
|
8
13
|
end
|
9
14
|
|
10
15
|
def to_tex(buffer="")
|
@@ -35,11 +40,6 @@ class HTML2TeX
|
|
35
40
|
tex_escape(@decoder.decode(RubyPants.new(@decoder.decode(s)).to_html))
|
36
41
|
end
|
37
42
|
|
38
|
-
def tex_escape(s)
|
39
|
-
return nil if s.nil?
|
40
|
-
s.gsub(/[\\{}$&#%^_~]/, '\\\\\\0')
|
41
|
-
end
|
42
|
-
|
43
43
|
def whitespace
|
44
44
|
if @squash_next
|
45
45
|
@squash_next = false
|
@@ -65,7 +65,7 @@ class HTML2TeX
|
|
65
65
|
"\\\\"
|
66
66
|
when /^h\d/
|
67
67
|
@squash_next = true
|
68
|
-
HeadingProcessor.new(scanner, t).to_tex + "\n\n"
|
68
|
+
HeadingProcessor.new(scanner, t, @options).to_tex + "\n\n"
|
69
69
|
else
|
70
70
|
""
|
71
71
|
end
|
data/lib/html2tex/cli.rb
ADDED
@@ -0,0 +1,54 @@
|
|
1
|
+
require "optparse"
|
2
|
+
|
3
|
+
class HTML2TeX
|
4
|
+
class CLI
|
5
|
+
CleanExit = Class.new(RuntimeError)
|
6
|
+
DirtyExit = Class.new(RuntimeError)
|
7
|
+
|
8
|
+
attr_reader :options, :input_file, :output_file
|
9
|
+
|
10
|
+
def initialize(argv)
|
11
|
+
@argv = argv
|
12
|
+
@options = {}
|
13
|
+
end
|
14
|
+
|
15
|
+
def usage
|
16
|
+
parser.to_s
|
17
|
+
end
|
18
|
+
|
19
|
+
def parse
|
20
|
+
parser.parse!(@argv)
|
21
|
+
|
22
|
+
unless @input_file = @argv[0]
|
23
|
+
raise DirtyExit
|
24
|
+
end
|
25
|
+
|
26
|
+
@output_file = @argv[1] || @input_file.sub(/\.[^\.]+$|$/, ".tex")
|
27
|
+
end
|
28
|
+
|
29
|
+
private
|
30
|
+
def parser
|
31
|
+
@parser ||= OptionParser.new{ |opts|
|
32
|
+
opts.banner = "Usage: #{File.basename($0)} [options] input.html [output.tex]"
|
33
|
+
|
34
|
+
opts.on "-t", "--title TITLE",
|
35
|
+
"Set (or override) the title of the document" do |title|
|
36
|
+
options[:title] = title
|
37
|
+
end
|
38
|
+
opts.on "-a", "--author AUTHOR",
|
39
|
+
"Set (or override) the author of the document" do |author|
|
40
|
+
options[:author] = author
|
41
|
+
end
|
42
|
+
opts.on "-c", "--document-class CLASS",
|
43
|
+
"Set the LaTeX document class to be used; default is book" do |klass|
|
44
|
+
options[:class] = klass
|
45
|
+
end
|
46
|
+
|
47
|
+
opts.on "-h", "--help",
|
48
|
+
"Display this screen" do
|
49
|
+
raise CleanExit
|
50
|
+
end
|
51
|
+
}
|
52
|
+
end
|
53
|
+
end
|
54
|
+
end
|
@@ -4,13 +4,14 @@ class HTML2TeX
|
|
4
4
|
class HeadingProcessor < BasicProcessor
|
5
5
|
HEADINGS = %w[part chapter section subsection subsubsection]
|
6
6
|
|
7
|
-
def initialize(scanner, label)
|
7
|
+
def initialize(scanner, label, options)
|
8
8
|
@label = label
|
9
|
-
super(scanner)
|
9
|
+
super(scanner, options)
|
10
10
|
end
|
11
11
|
|
12
|
-
def to_tex(
|
13
|
-
wrap(super(
|
12
|
+
def to_tex(buffer="")
|
13
|
+
buffer << wrap(super(""))
|
14
|
+
buffer
|
14
15
|
end
|
15
16
|
|
16
17
|
private
|
@@ -0,0 +1,78 @@
|
|
1
|
+
require "html2tex/basic_processor"
|
2
|
+
require "nokogiri"
|
3
|
+
|
4
|
+
class HTML2TeX
|
5
|
+
class PreambleProcessor < BasicProcessor
|
6
|
+
|
7
|
+
def to_tex(buffer="")
|
8
|
+
read_html_head
|
9
|
+
[ tex(:documentclass, options[:class]),
|
10
|
+
tex(:title, title),
|
11
|
+
tex(:author, author),
|
12
|
+
tex(:begin, "document"),
|
13
|
+
tex(:maketitle),
|
14
|
+
""
|
15
|
+
].each do |s|
|
16
|
+
buffer << s << "\n"
|
17
|
+
end
|
18
|
+
buffer
|
19
|
+
end
|
20
|
+
|
21
|
+
private
|
22
|
+
def read_html_head
|
23
|
+
scanner.scan %r{\s*}
|
24
|
+
scanner.scan %r{<!doctype[^>]*>\s*}i
|
25
|
+
scanner.scan %r{<html[^>]*>\s*}i
|
26
|
+
if head = scanner.scan(%r{<head[^>]*>.*?</head>}im)
|
27
|
+
h = metadata_hash(head)
|
28
|
+
@author = h["dc.creator"] || h["author"]
|
29
|
+
@title = h["dc.title"] || head[%r{<title[^>]*>\s*(.*?)\s*</title>}im, 1]
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
def author
|
34
|
+
@options[:author] || @author || "AUTHOR"
|
35
|
+
end
|
36
|
+
|
37
|
+
def title
|
38
|
+
@options[:title] || @title || "TITLE"
|
39
|
+
end
|
40
|
+
|
41
|
+
def parse_meta_element(meta)
|
42
|
+
data = {}
|
43
|
+
s = StringScanner.new(meta)
|
44
|
+
s.scan(/<meta\s+/i)
|
45
|
+
until s.eos? do
|
46
|
+
if s.scan(/\s*\/?>/)
|
47
|
+
return data
|
48
|
+
else
|
49
|
+
key = s.scan(/[^=]+/).downcase
|
50
|
+
s.scan(/=/)
|
51
|
+
end
|
52
|
+
|
53
|
+
if s.scan(/"/)
|
54
|
+
data[key] = s.scan(/[^"]+/)
|
55
|
+
s.scan(/"/)
|
56
|
+
elsif s.scan(/'/)
|
57
|
+
data[key] = s.scan(/[^']+/)
|
58
|
+
s.scan(/'/)
|
59
|
+
else
|
60
|
+
data[key] = s.scan(/\S+/)
|
61
|
+
end
|
62
|
+
s.scan(/\s+/)
|
63
|
+
end
|
64
|
+
end
|
65
|
+
|
66
|
+
def metadata_hash(head)
|
67
|
+
hash = {}
|
68
|
+
head.scan(%r{<meta\s[^>]+>}im).each do |meta|
|
69
|
+
m = parse_meta_element(meta)
|
70
|
+
if (name = m["name"]) && (content = m["content"])
|
71
|
+
hash[name.downcase] = content
|
72
|
+
end
|
73
|
+
end
|
74
|
+
hash
|
75
|
+
end
|
76
|
+
end
|
77
|
+
end
|
78
|
+
|
data/lib/html2tex/tex.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
class HTML2TeX
|
2
|
+
module TeX
|
3
|
+
def tex(name, param=nil)
|
4
|
+
directive = "\\" + name.to_s
|
5
|
+
directive << "{" << tex_escape(param) << "}" if param
|
6
|
+
directive
|
7
|
+
end
|
8
|
+
|
9
|
+
def tex_escape(s)
|
10
|
+
return nil if s.nil?
|
11
|
+
s.gsub(/[\\{}$&#%^_~]/, '\\\\\\0')
|
12
|
+
end
|
13
|
+
end
|
14
|
+
end
|
data/lib/html2tex/version.rb
CHANGED
data/lib/html2tex.rb
CHANGED
@@ -2,13 +2,36 @@ require "htmlentities"
|
|
2
2
|
require "rubypants"
|
3
3
|
require "html2tex/version"
|
4
4
|
require "html2tex/basic_processor"
|
5
|
+
require "html2tex/preamble_processor"
|
6
|
+
require "html2tex/tex"
|
5
7
|
|
6
8
|
class HTML2TeX
|
7
|
-
|
8
|
-
|
9
|
+
include TeX
|
10
|
+
|
11
|
+
DEFAULT_OPTIONS = {
|
12
|
+
:document => true,
|
13
|
+
:class => "book"
|
14
|
+
}
|
15
|
+
|
16
|
+
def initialize(html, options={})
|
17
|
+
@html = html
|
18
|
+
@options = DEFAULT_OPTIONS.merge(options)
|
9
19
|
end
|
10
20
|
|
11
21
|
def to_tex(buffer="")
|
12
|
-
|
22
|
+
scanner = StringScanner.new(@html)
|
23
|
+
|
24
|
+
if @options[:document]
|
25
|
+
PreambleProcessor.new(scanner, @options).to_tex(buffer)
|
26
|
+
end
|
27
|
+
|
28
|
+
BasicProcessor.new(scanner, @options).to_tex(buffer)
|
29
|
+
|
30
|
+
if @options[:document]
|
31
|
+
buffer << "\n\n" << tex(:end, "document")
|
32
|
+
end
|
33
|
+
|
34
|
+
buffer << "\n"
|
35
|
+
buffer
|
13
36
|
end
|
14
37
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: html2tex
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Paul Battley
|
@@ -56,7 +56,10 @@ files:
|
|
56
56
|
- lib/html2tex.rb
|
57
57
|
- lib/html2tex/version.rb
|
58
58
|
- lib/html2tex/basic_processor.rb
|
59
|
+
- lib/html2tex/cli.rb
|
60
|
+
- lib/html2tex/preamble_processor.rb
|
59
61
|
- lib/html2tex/heading_processor.rb
|
62
|
+
- lib/html2tex/tex.rb
|
60
63
|
- README.md
|
61
64
|
- COPYING
|
62
65
|
has_rdoc: true
|