html2tex 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +23 -3
- data/bin/html2tex +14 -2
- data/lib/html2tex/basic_processor.rb +8 -8
- data/lib/html2tex/cli.rb +54 -0
- data/lib/html2tex/heading_processor.rb +5 -4
- data/lib/html2tex/preamble_processor.rb +78 -0
- data/lib/html2tex/tex.rb +14 -0
- data/lib/html2tex/version.rb +1 -1
- data/lib/html2tex.rb +26 -3
- metadata +4 -1
data/README.md
CHANGED
@@ -5,6 +5,22 @@ This is a simple library and command-line tool to convert HTML documents into
|
|
5
5
|
LaTeX. The intended use is for preparing ebooks, but it might also be useful
|
6
6
|
for other purposes.
|
7
7
|
|
8
|
+
Command-line use
|
9
|
+
----------------
|
10
|
+
|
11
|
+
Usage: html2tex [options] input.html [output.tex]
|
12
|
+
-t, --title TITLE Set (or override) the title of the document
|
13
|
+
-a, --author AUTHOR Set (or override) the author of the document
|
14
|
+
-c, --document-class CLASS Set the LaTeX document class to be used; default is book
|
15
|
+
-h, --help Display this screen
|
16
|
+
|
17
|
+
If the output file is not specified, the program will create an output file
|
18
|
+
based on the input filename, replacing the extension with `.tex` – or, if there
|
19
|
+
is no extension, by appending `.tex`.
|
20
|
+
|
21
|
+
The title and author will, by default, be extracted from the HTML document,
|
22
|
+
using either Dublin Core metadata or the HTML title and an author META element.
|
23
|
+
|
8
24
|
Library use
|
9
25
|
-----------
|
10
26
|
|
@@ -18,10 +34,14 @@ Library use
|
|
18
34
|
HTML2TeX.new(html).to_tex(f)
|
19
35
|
end
|
20
36
|
|
21
|
-
|
22
|
-
|
37
|
+
A hash of options can be supplied as the second argument to `to_tex`; the
|
38
|
+
available keys are:
|
39
|
+
|
40
|
+
* `:author`
|
41
|
+
* `:title`
|
42
|
+
* `:class`
|
23
43
|
|
24
|
-
|
44
|
+
See the command-line section above for an explanation of these.
|
25
45
|
|
26
46
|
Dependencies
|
27
47
|
------------
|
data/bin/html2tex
CHANGED
@@ -1,6 +1,18 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
require "html2tex"
|
3
|
+
require "html2tex/cli"
|
3
4
|
|
4
|
-
|
5
|
-
|
5
|
+
cli = HTML2TeX::CLI.new(ARGV)
|
6
|
+
begin
|
7
|
+
cli.parse
|
8
|
+
rescue HTML2TeX::CLI::CleanExit
|
9
|
+
puts cli.usage
|
10
|
+
exit
|
11
|
+
rescue HTML2TeX::CLI::DirtyExit
|
12
|
+
puts cli.usage
|
13
|
+
exit 1
|
14
|
+
end
|
15
|
+
|
16
|
+
File.open(cli.output_file, "w") do |io|
|
17
|
+
HTML2TeX.new(File.read(cli.input_file), cli.options).to_tex(io)
|
6
18
|
end
|
data/lib/html2tex/basic_processor.rb
CHANGED
@@ -1,10 +1,15 @@
|
|
1
|
+
require "html2tex/tex"
|
2
|
+
|
1
3
|
class HTML2TeX
|
2
4
|
class BasicProcessor
|
3
|
-
|
5
|
+
include TeX
|
6
|
+
|
7
|
+
attr_reader :scanner, :options
|
4
8
|
|
5
|
-
def initialize(scanner)
|
9
|
+
def initialize(scanner, options)
|
6
10
|
@decoder = HTMLEntities.new
|
7
11
|
@scanner = scanner
|
12
|
+
@options = options
|
8
13
|
end
|
9
14
|
|
10
15
|
def to_tex(buffer="")
|
@@ -35,11 +40,6 @@ class HTML2TeX
|
|
35
40
|
tex_escape(@decoder.decode(RubyPants.new(@decoder.decode(s)).to_html))
|
36
41
|
end
|
37
42
|
|
38
|
-
def tex_escape(s)
|
39
|
-
return nil if s.nil?
|
40
|
-
s.gsub(/[\\{}$&#%^_~]/, '\\\\\\0')
|
41
|
-
end
|
42
|
-
|
43
43
|
def whitespace
|
44
44
|
if @squash_next
|
45
45
|
@squash_next = false
|
@@ -65,7 +65,7 @@ class HTML2TeX
|
|
65
65
|
"\\\\"
|
66
66
|
when /^h\d/
|
67
67
|
@squash_next = true
|
68
|
-
HeadingProcessor.new(scanner, t).to_tex + "\n\n"
|
68
|
+
HeadingProcessor.new(scanner, t, @options).to_tex + "\n\n"
|
69
69
|
else
|
70
70
|
""
|
71
71
|
end
|
data/lib/html2tex/cli.rb
ADDED
@@ -0,0 +1,54 @@
|
|
1
|
+
require "optparse"
|
2
|
+
|
3
|
+
class HTML2TeX
|
4
|
+
class CLI
|
5
|
+
CleanExit = Class.new(RuntimeError)
|
6
|
+
DirtyExit = Class.new(RuntimeError)
|
7
|
+
|
8
|
+
attr_reader :options, :input_file, :output_file
|
9
|
+
|
10
|
+
def initialize(argv)
|
11
|
+
@argv = argv
|
12
|
+
@options = {}
|
13
|
+
end
|
14
|
+
|
15
|
+
def usage
|
16
|
+
parser.to_s
|
17
|
+
end
|
18
|
+
|
19
|
+
def parse
|
20
|
+
parser.parse!(@argv)
|
21
|
+
|
22
|
+
unless @input_file = @argv[0]
|
23
|
+
raise DirtyExit
|
24
|
+
end
|
25
|
+
|
26
|
+
@output_file = @argv[1] || @input_file.sub(/\.[^\.]+$|$/, ".tex")
|
27
|
+
end
|
28
|
+
|
29
|
+
private
|
30
|
+
def parser
|
31
|
+
@parser ||= OptionParser.new{ |opts|
|
32
|
+
opts.banner = "Usage: #{File.basename($0)} [options] input.html [output.tex]"
|
33
|
+
|
34
|
+
opts.on "-t", "--title TITLE",
|
35
|
+
"Set (or override) the title of the document" do |title|
|
36
|
+
options[:title] = title
|
37
|
+
end
|
38
|
+
opts.on "-a", "--author AUTHOR",
|
39
|
+
"Set (or override) the author of the document" do |author|
|
40
|
+
options[:author] = author
|
41
|
+
end
|
42
|
+
opts.on "-c", "--document-class CLASS",
|
43
|
+
"Set the LaTeX document class to be used; default is book" do |klass|
|
44
|
+
options[:class] = klass
|
45
|
+
end
|
46
|
+
|
47
|
+
opts.on "-h", "--help",
|
48
|
+
"Display this screen" do
|
49
|
+
raise CleanExit
|
50
|
+
end
|
51
|
+
}
|
52
|
+
end
|
53
|
+
end
|
54
|
+
end
|
data/lib/html2tex/heading_processor.rb
CHANGED
@@ -4,13 +4,14 @@ class HTML2TeX
|
|
4
4
|
class HeadingProcessor < BasicProcessor
|
5
5
|
HEADINGS = %w[part chapter section subsection subsubsection]
|
6
6
|
|
7
|
-
def initialize(scanner, label)
|
7
|
+
def initialize(scanner, label, options)
|
8
8
|
@label = label
|
9
|
-
super(scanner)
|
9
|
+
super(scanner, options)
|
10
10
|
end
|
11
11
|
|
12
|
-
def to_tex(
|
13
|
-
wrap(super(
|
12
|
+
def to_tex(buffer="")
|
13
|
+
buffer << wrap(super(""))
|
14
|
+
buffer
|
14
15
|
end
|
15
16
|
|
16
17
|
private
|
data/lib/html2tex/preamble_processor.rb
ADDED
@@ -0,0 +1,78 @@
|
|
1
|
+
require "html2tex/basic_processor"
|
2
|
+
require "nokogiri"
|
3
|
+
|
4
|
+
class HTML2TeX
|
5
|
+
class PreambleProcessor < BasicProcessor
|
6
|
+
|
7
|
+
def to_tex(buffer="")
|
8
|
+
read_html_head
|
9
|
+
[ tex(:documentclass, options[:class]),
|
10
|
+
tex(:title, title),
|
11
|
+
tex(:author, author),
|
12
|
+
tex(:begin, "document"),
|
13
|
+
tex(:maketitle),
|
14
|
+
""
|
15
|
+
].each do |s|
|
16
|
+
buffer << s << "\n"
|
17
|
+
end
|
18
|
+
buffer
|
19
|
+
end
|
20
|
+
|
21
|
+
private
|
22
|
+
def read_html_head
|
23
|
+
scanner.scan %r{\s*}
|
24
|
+
scanner.scan %r{<!doctype[^>]*>\s*}i
|
25
|
+
scanner.scan %r{<html[^>]*>\s*}i
|
26
|
+
if head = scanner.scan(%r{<head[^>]*>.*?</head>}im)
|
27
|
+
h = metadata_hash(head)
|
28
|
+
@author = h["dc.creator"] || h["author"]
|
29
|
+
@title = h["dc.title"] || head[%r{<title[^>]*>\s*(.*?)\s*</title>}im, 1]
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
def author
|
34
|
+
@options[:author] || @author || "AUTHOR"
|
35
|
+
end
|
36
|
+
|
37
|
+
def title
|
38
|
+
@options[:title] || @title || "TITLE"
|
39
|
+
end
|
40
|
+
|
41
|
+
def parse_meta_element(meta)
|
42
|
+
data = {}
|
43
|
+
s = StringScanner.new(meta)
|
44
|
+
s.scan(/<meta\s+/i)
|
45
|
+
until s.eos? do
|
46
|
+
if s.scan(/\s*\/?>/)
|
47
|
+
return data
|
48
|
+
else
|
49
|
+
key = s.scan(/[^=]+/).downcase
|
50
|
+
s.scan(/=/)
|
51
|
+
end
|
52
|
+
|
53
|
+
if s.scan(/"/)
|
54
|
+
data[key] = s.scan(/[^"]+/)
|
55
|
+
s.scan(/"/)
|
56
|
+
elsif s.scan(/'/)
|
57
|
+
data[key] = s.scan(/[^']+/)
|
58
|
+
s.scan(/'/)
|
59
|
+
else
|
60
|
+
data[key] = s.scan(/\S+/)
|
61
|
+
end
|
62
|
+
s.scan(/\s+/)
|
63
|
+
end
|
64
|
+
end
|
65
|
+
|
66
|
+
def metadata_hash(head)
|
67
|
+
hash = {}
|
68
|
+
head.scan(%r{<meta\s[^>]+>}im).each do |meta|
|
69
|
+
m = parse_meta_element(meta)
|
70
|
+
if (name = m["name"]) && (content = m["content"])
|
71
|
+
hash[name.downcase] = content
|
72
|
+
end
|
73
|
+
end
|
74
|
+
hash
|
75
|
+
end
|
76
|
+
end
|
77
|
+
end
|
78
|
+
|
data/lib/html2tex/tex.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
class HTML2TeX
|
2
|
+
module TeX
|
3
|
+
def tex(name, param=nil)
|
4
|
+
directive = "\\" + name.to_s
|
5
|
+
directive << "{" << tex_escape(param) << "}" if param
|
6
|
+
directive
|
7
|
+
end
|
8
|
+
|
9
|
+
def tex_escape(s)
|
10
|
+
return nil if s.nil?
|
11
|
+
s.gsub(/[\\{}$&#%^_~]/, '\\\\\\0')
|
12
|
+
end
|
13
|
+
end
|
14
|
+
end
|
data/lib/html2tex/version.rb
CHANGED
data/lib/html2tex.rb
CHANGED
@@ -2,13 +2,36 @@ require "htmlentities"
|
|
2
2
|
require "rubypants"
|
3
3
|
require "html2tex/version"
|
4
4
|
require "html2tex/basic_processor"
|
5
|
+
require "html2tex/preamble_processor"
|
6
|
+
require "html2tex/tex"
|
5
7
|
|
6
8
|
class HTML2TeX
|
7
|
-
|
8
|
-
|
9
|
+
include TeX
|
10
|
+
|
11
|
+
DEFAULT_OPTIONS = {
|
12
|
+
:document => true,
|
13
|
+
:class => "book"
|
14
|
+
}
|
15
|
+
|
16
|
+
def initialize(html, options={})
|
17
|
+
@html = html
|
18
|
+
@options = DEFAULT_OPTIONS.merge(options)
|
9
19
|
end
|
10
20
|
|
11
21
|
def to_tex(buffer="")
|
12
|
-
|
22
|
+
scanner = StringScanner.new(@html)
|
23
|
+
|
24
|
+
if @options[:document]
|
25
|
+
PreambleProcessor.new(scanner, @options).to_tex(buffer)
|
26
|
+
end
|
27
|
+
|
28
|
+
BasicProcessor.new(scanner, @options).to_tex(buffer)
|
29
|
+
|
30
|
+
if @options[:document]
|
31
|
+
buffer << "\n\n" << tex(:end, "document")
|
32
|
+
end
|
33
|
+
|
34
|
+
buffer << "\n"
|
35
|
+
buffer
|
13
36
|
end
|
14
37
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: html2tex
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Paul Battley
|
@@ -56,7 +56,10 @@ files:
|
|
56
56
|
- lib/html2tex.rb
|
57
57
|
- lib/html2tex/version.rb
|
58
58
|
- lib/html2tex/basic_processor.rb
|
59
|
+
- lib/html2tex/cli.rb
|
60
|
+
- lib/html2tex/preamble_processor.rb
|
59
61
|
- lib/html2tex/heading_processor.rb
|
62
|
+
- lib/html2tex/tex.rb
|
60
63
|
- README.md
|
61
64
|
- COPYING
|
62
65
|
has_rdoc: true
|