html2tex 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -5,6 +5,22 @@ This is a simple library and command-line tool to convert HTML documents into
5
5
  LaTeX. The intended use is for preparing ebooks, but it might also be useful
6
6
  for other purposes.
7
7
 
8
+ Command-line use
9
+ ----------------
10
+
11
+ Usage: html2tex [options] input.html [output.tex]
12
+ -t, --title TITLE Set (or override) the title of the document
13
+ -a, --author AUTHOR Set (or override) the author of the document
14
+ -c, --document-class CLASS Set the LaTeX document class to be used; default is book
15
+ -h, --help Display this screen
16
+
17
+ If the output file is not specified, the program will create an output file
18
+ based on the input filename, replacing the extension with `.tex` – or, if there
19
+ is no extension, by appending `.tex`.
20
+
21
+ The title and author will, by default, be extracted from the HTML document,
22
+ using either Dublin Core metadata or the HTML title and an author META element.
23
+
8
24
  Library use
9
25
  -----------
10
26
 
@@ -18,10 +34,14 @@ Library use
18
34
  HTML2TeX.new(html).to_tex(f)
19
35
  end
20
36
 
21
- Command-line use
22
- ----------------
37
+ A hash of options can be supplied as the second argument to `HTML2TeX.new`; the
38
+ available keys are:
39
+
40
+ * `:author`
41
+ * `:title`
42
+ * `:class`
23
43
 
24
- html2tex source.html destination.tex
44
+ See the command-line section above for an explanation of these.
25
45
 
26
46
  Dependencies
27
47
  ------------
data/bin/html2tex CHANGED
@@ -1,6 +1,18 @@
1
1
  #!/usr/bin/env ruby
2
2
  require "html2tex"
3
+ require "html2tex/cli"
3
4
 
4
# Parse the command line. A help request or a usage error prints the usage
# text and exits without converting anything.
cli = HTML2TeX::CLI.new(ARGV)
begin
  cli.parse
rescue HTML2TeX::CLI::CleanExit
  puts cli.usage
  exit
rescue HTML2TeX::CLI::DirtyExit
  puts cli.usage
  exit 1
rescue OptionParser::ParseError => e
  # Unknown option or missing option argument: report it instead of letting
  # the exception escape as a backtrace.
  warn e.message
  puts cli.usage
  exit 1
end

# Read the input before opening the output for writing, so that a missing
# input does not leave an empty output file behind and an output path equal
# to the input path is not truncated before it has been read.
html = File.read(cli.input_file)

File.open(cli.output_file, "w") do |io|
  HTML2TeX.new(html, cli.options).to_tex(io)
end
@@ -1,10 +1,15 @@
1
+ require "html2tex/tex"
2
+
1
3
  class HTML2TeX
2
4
  class BasicProcessor
3
- attr_reader :scanner
5
+ include TeX
6
+
7
+ attr_reader :scanner, :options
4
8
 
5
- def initialize(scanner)
9
+ def initialize(scanner, options)
6
10
  @decoder = HTMLEntities.new
7
11
  @scanner = scanner
12
+ @options = options
8
13
  end
9
14
 
10
15
  def to_tex(buffer="")
@@ -35,11 +40,6 @@ class HTML2TeX
35
40
  tex_escape(@decoder.decode(RubyPants.new(@decoder.decode(s)).to_html))
36
41
  end
37
42
 
38
- def tex_escape(s)
39
- return nil if s.nil?
40
- s.gsub(/[\\{}$&#%^_~]/, '\\\\\\0')
41
- end
42
-
43
43
  def whitespace
44
44
  if @squash_next
45
45
  @squash_next = false
@@ -65,7 +65,7 @@ class HTML2TeX
65
65
  "\\\\"
66
66
  when /^h\d/
67
67
  @squash_next = true
68
- HeadingProcessor.new(scanner, t).to_tex + "\n\n"
68
+ HeadingProcessor.new(scanner, t, @options).to_tex + "\n\n"
69
69
  else
70
70
  ""
71
71
  end
@@ -0,0 +1,54 @@
1
+ require "optparse"
2
+
3
class HTML2TeX
  # Command-line front end: parses the argument list into conversion options
  # plus an input and an output filename.
  class CLI
    # Raised when the user asks for help (-h / --help).
    CleanExit = Class.new(RuntimeError)
    # Raised when the arguments are unusable (no input file given).
    DirtyExit = Class.new(RuntimeError)

    attr_reader :options, :input_file, :output_file

    # argv is the argument array to parse; #parse removes the recognised
    # options from it in place.
    def initialize(argv)
      @argv = argv
      @options = {}
    end

    # Returns the usage banner and option summary as a string.
    def usage
      parser.to_s
    end

    # Parses the arguments, filling in #options, #input_file and #output_file.
    # Raises DirtyExit when no input file is given and CleanExit when help is
    # requested. Returns the output filename.
    def parse
      parser.parse!(@argv)

      @input_file = @argv[0]
      raise DirtyExit unless @input_file

      @output_file = @argv[1] || default_output_file(@input_file)
    end

  private
    # Derives the output filename from the input filename: the extension is
    # replaced with ".tex", or ".tex" is appended when there is none.
    # The extension may not contain a path separator, so a dot in a directory
    # name ("drafts.v2/notes") is not mistaken for the start of an extension.
    def default_output_file(input)
      input.sub(%r{\.[^./\\]+\z|\z}, ".tex")
    end

    # Builds (once) the OptionParser that records options into #options.
    def parser
      @parser ||= OptionParser.new{ |opts|
        opts.banner = "Usage: #{File.basename($0)} [options] input.html [output.tex]"

        opts.on "-t", "--title TITLE",
                "Set (or override) the title of the document" do |title|
          options[:title] = title
        end
        opts.on "-a", "--author AUTHOR",
                "Set (or override) the author of the document" do |author|
          options[:author] = author
        end
        opts.on "-c", "--document-class CLASS",
                "Set the LaTeX document class to be used; default is book" do |klass|
          options[:class] = klass
        end

        opts.on "-h", "--help",
                "Display this screen" do
          raise CleanExit
        end
      }
    end
  end
end
@@ -4,13 +4,14 @@ class HTML2TeX
4
4
  class HeadingProcessor < BasicProcessor
5
5
  HEADINGS = %w[part chapter section subsection subsubsection]
6
6
 
7
- def initialize(scanner, label)
7
+ def initialize(scanner, label, options)
8
8
  @label = label
9
- super(scanner)
9
+ super(scanner, options)
10
10
  end
11
11
 
12
- def to_tex(*args)
13
- wrap(super(*args))
12
+ def to_tex(buffer="")
13
+ buffer << wrap(super(""))
14
+ buffer
14
15
  end
15
16
 
16
17
  private
@@ -0,0 +1,78 @@
1
+ require "html2tex/basic_processor"
2
+ require "nokogiri"
3
+
4
class HTML2TeX
  # Writes the LaTeX preamble (document class, title, author,
  # \begin{document}, \maketitle) and, while doing so, consumes the leading
  # doctype, <html> tag and <head> element from the scanner.
  class PreambleProcessor < BasicProcessor

    # Appends the preamble lines to +buffer+ and returns +buffer+.
    def to_tex(buffer="")
      read_html_head
      [ tex(:documentclass, options[:class]),
        tex(:title, title),
        tex(:author, author),
        tex(:begin, "document"),
        tex(:maketitle),
        ""
      ].each do |s|
        buffer << s << "\n"
      end
      buffer
    end

  private
    # Skips the doctype and <html> tag, then, if a <head> element follows,
    # records the author and title found in it.
    def read_html_head
      scanner.scan %r{\s*}
      scanner.scan %r{<!doctype[^>]*>\s*}i
      scanner.scan %r{<html[^>]*>\s*}i
      if head = scanner.scan(%r{<head[^>]*>.*?</head>}im)
        h = metadata_hash(head)
        @author = h["dc.creator"] || h["author"]
        @title  = h["dc.title"]   || head[%r{<title[^>]*>\s*(.*?)\s*</title>}im, 1]
      end
    end

    # Author from the options, else from the HTML head, else a placeholder.
    def author
      @options[:author] || @author || "AUTHOR"
    end

    # Title from the options, else from the HTML head, else a placeholder.
    def title
      @options[:title] || @title || "TITLE"
    end

    # Parses one <meta ...> tag into a hash of lower-cased attribute name =>
    # value. Always returns a hash, even for malformed or unterminated tags.
    # Attributes without a value, and empty quoted values, map to nil.
    def parse_meta_element(meta)
      data = {}
      s = StringScanner.new(meta)
      s.scan(/<meta\s+/i)
      until s.eos?
        break if s.scan(%r{\s*/?>})

        key = s.scan(%r{[^\s=/>]+})
        unless key
          # Stray "=", "/" or similar: skip it so the loop always advances.
          s.getch
          next
        end

        data[key.downcase] = s.scan(/\s*=\s*/) ? scan_attribute_value(s) : nil
        s.scan(/\s+/)
      end
      data
    end

    # Reads a double-quoted, single-quoted or unquoted attribute value at the
    # scanner position. An unquoted value stops at whitespace or ">" so that
    # it cannot swallow the end of the tag. Returns nil when there is no value.
    def scan_attribute_value(s)
      if s.scan(/"/)
        value = s.scan(/[^"]+/)
        s.scan(/"/)
      elsif s.scan(/'/)
        value = s.scan(/[^']+/)
        s.scan(/'/)
      else
        value = s.scan(/[^\s>]+/)
      end
      value
    end

    # Collects every <meta> element in +head+ that has both a name and a
    # content attribute into a hash of lower-cased name => content.
    def metadata_hash(head)
      hash = {}
      head.scan(%r{<meta\s[^>]+>}im).each do |meta|
        m = parse_meta_element(meta)
        if (name = m["name"]) && (content = m["content"])
          hash[name.downcase] = content
        end
      end
      hash
    end
  end
end
78
+
@@ -0,0 +1,14 @@
1
class HTML2TeX
  # Helpers for emitting LaTeX markup; mixed into classes that generate TeX.
  module TeX
    # Replacement text for each character that is special in LaTeX running
    # text. Most are escaped with a leading backslash, but three cannot be:
    # "\\" is a line break, and "\^" / "\~" are accent commands that act on
    # whatever follows, so those use the dedicated text commands instead.
    TEX_ESCAPES = {
      "\\" => "\\textbackslash{}",
      "^"  => "\\textasciicircum{}",
      "~"  => "\\textasciitilde{}",
      "{"  => "\\{",
      "}"  => "\\}",
      "$"  => "\\$",
      "&"  => "\\&",
      "#"  => "\\#",
      "%"  => "\\%",
      "_"  => "\\_"
    }.freeze

    # Builds a LaTeX command: "\name", or "\name{param}" with +param+ escaped
    # when a (non-nil, non-false) param is given.
    def tex(name, param=nil)
      directive = "\\" + name.to_s
      directive << "{" << tex_escape(param) << "}" if param
      directive
    end

    # Returns +s+ with LaTeX special characters escaped, or nil for nil.
    # The substitution is a single pass, so the braces introduced by a
    # replacement are never escaped again.
    def tex_escape(s)
      return nil if s.nil?
      s.to_s.gsub(/[\\{}$&#%^_~]/) { |char| TEX_ESCAPES[char] }
    end
  end
end
@@ -1,3 +1,3 @@
1
1
  class HTML2TeX
2
- VERSION = "0.1.1"
2
+ VERSION = "0.1.2"
3
3
  end
data/lib/html2tex.rb CHANGED
@@ -2,13 +2,36 @@ require "htmlentities"
2
2
  require "rubypants"
3
3
  require "html2tex/version"
4
4
  require "html2tex/basic_processor"
5
+ require "html2tex/preamble_processor"
6
+ require "html2tex/tex"
5
7
 
6
8
# Converts an HTML document, given as a string, into LaTeX.
class HTML2TeX
  include TeX

  # Options used unless overridden by the caller: wrap the output in a
  # complete document (preamble and \end{document}) using the "book" class.
  # Frozen so the shared defaults cannot be modified by accident.
  DEFAULT_OPTIONS = {
    :document => true,
    :class    => "book"
  }.freeze

  # html is the source document; options are merged over DEFAULT_OPTIONS.
  def initialize(html, options={})
    @html    = html
    @options = DEFAULT_OPTIONS.merge(options)
  end

  # Appends the LaTeX rendering of the document to +buffer+ (anything that
  # responds to <<) and returns +buffer+.
  def to_tex(buffer="")
    # One scanner is shared so the body processor resumes where the preamble
    # processor stopped.
    scanner = StringScanner.new(@html)

    if @options[:document]
      PreambleProcessor.new(scanner, @options).to_tex(buffer)
    end

    BasicProcessor.new(scanner, @options).to_tex(buffer)

    if @options[:document]
      buffer << "\n\n" << tex(:end, "document")
    end

    buffer << "\n"
    buffer
  end
end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html2tex
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Paul Battley
@@ -56,7 +56,10 @@ files:
56
56
  - lib/html2tex.rb
57
57
  - lib/html2tex/version.rb
58
58
  - lib/html2tex/basic_processor.rb
59
+ - lib/html2tex/cli.rb
60
+ - lib/html2tex/preamble_processor.rb
59
61
  - lib/html2tex/heading_processor.rb
62
+ - lib/html2tex/tex.rb
60
63
  - README.md
61
64
  - COPYING
62
65
  has_rdoc: true