html2tex 0.1.1 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -5,6 +5,22 @@ This is a simple library and command-line tool to convert HTML documents into
5
5
  LaTeX. The intended use is for preparing ebooks, but it might also be useful
6
6
  for other purposes.
7
7
 
8
+ Command-line use
9
+ ----------------
10
+
11
+ Usage: html2tex [options] input.html [output.tex]
12
+ -t, --title TITLE Set (or override) the title of the document
13
+ -a, --author AUTHOR Set (or override) the author of the document
14
+ -c, --document-class CLASS Set the LaTeX document class to be used; default is book
15
+ -h, --help Display this screen
16
+
17
+ If the output file is not specified, the program will create an output file
18
+ based on the input filename, replacing the extension with `.tex` – or, if there
19
+ is no extension, appending `.tex`.
20
+
21
+ The title and author will, by default, be extracted from the HTML document,
22
+ using either Dublin Core metadata or the HTML title and an author META element.
23
+
8
24
  Library use
9
25
  -----------
10
26
 
@@ -18,10 +34,14 @@ Library use
18
34
  HTML2TeX.new(html).to_tex(f)
19
35
  end
20
36
 
21
- Command-line use
22
- ----------------
37
+ A hash of options can be supplied as the second argument to `HTML2TeX.new`; the
38
+ available keys are:
39
+
40
+ * `:author`
41
+ * `:title`
42
+ * `:class`
23
43
 
24
- html2tex source.html destination.tex
44
+ See the command-line section above for an explanation of these.
25
45
 
26
46
  Dependencies
27
47
  ------------
data/bin/html2tex CHANGED
@@ -1,6 +1,18 @@
1
1
  #!/usr/bin/env ruby
2
2
  require "html2tex"
3
+ require "html2tex/cli"
3
4
 
4
- File.open(ARGV[1], "w") do |output|
5
- HTML2TeX.new(File.read(ARGV[0])).to_tex(output)
5
+ cli = HTML2TeX::CLI.new(ARGV)
6
+ begin
7
+ cli.parse
8
+ rescue HTML2TeX::CLI::CleanExit
9
+ puts cli.usage
10
+ exit
11
+ rescue HTML2TeX::CLI::DirtyExit
12
+ puts cli.usage
13
+ exit 1
14
+ end
15
+
16
+ File.open(cli.output_file, "w") do |io|
17
+ HTML2TeX.new(File.read(cli.input_file), cli.options).to_tex(io)
6
18
  end
@@ -1,10 +1,15 @@
1
+ require "html2tex/tex"
2
+
1
3
  class HTML2TeX
2
4
  class BasicProcessor
3
- attr_reader :scanner
5
+ include TeX
6
+
7
+ attr_reader :scanner, :options
4
8
 
5
- def initialize(scanner)
9
+ def initialize(scanner, options)
6
10
  @decoder = HTMLEntities.new
7
11
  @scanner = scanner
12
+ @options = options
8
13
  end
9
14
 
10
15
  def to_tex(buffer="")
@@ -35,11 +40,6 @@ class HTML2TeX
35
40
  tex_escape(@decoder.decode(RubyPants.new(@decoder.decode(s)).to_html))
36
41
  end
37
42
 
38
- def tex_escape(s)
39
- return nil if s.nil?
40
- s.gsub(/[\\{}$&#%^_~]/, '\\\\\\0')
41
- end
42
-
43
43
  def whitespace
44
44
  if @squash_next
45
45
  @squash_next = false
@@ -65,7 +65,7 @@ class HTML2TeX
65
65
  "\\\\"
66
66
  when /^h\d/
67
67
  @squash_next = true
68
- HeadingProcessor.new(scanner, t).to_tex + "\n\n"
68
+ HeadingProcessor.new(scanner, t, @options).to_tex + "\n\n"
69
69
  else
70
70
  ""
71
71
  end
@@ -0,0 +1,54 @@
1
+ require "optparse"
2
+
3
class HTML2TeX
  # Command-line front end: parses ARGV-style arguments into an options hash,
  # an input filename and an output filename.
  class CLI
    # Raised when the user asked for help; the caller should print the usage
    # text and exit successfully.
    CleanExit = Class.new(RuntimeError)

    # Raised when the arguments are unusable (unknown or malformed option, or
    # no input file); the caller should print the usage text and exit non-zero.
    DirtyExit = Class.new(RuntimeError)

    attr_reader :options, :input_file, :output_file

    # argv:: array of command-line arguments. Recognised options are removed
    #        from it in place when #parse runs.
    def initialize(argv)
      @argv    = argv
      @options = {}
    end

    # Returns the usage/help text generated by the option parser.
    def usage
      parser.to_s
    end

    # Parses the arguments, filling in #options, #input_file and #output_file.
    #
    # Raises CleanExit when -h/--help is given, and DirtyExit when an option
    # cannot be parsed or no input file is supplied.
    def parse
      begin
        parser.parse!(@argv)
      rescue OptionParser::ParseError => e
        # Unknown or malformed options are user errors, not crashes: report
        # them through the same channel as a missing input file.
        raise DirtyExit, e.message
      end

      @input_file = @argv[0]
      raise DirtyExit, "no input file given" if @input_file.nil?

      @output_file = @argv[1] || default_output_file(@input_file)
    end

  private

    # Derives the output filename from the input filename: the extension of
    # the final path component is replaced with ".tex", or ".tex" is appended
    # when there is none. Dots in directory names are left alone.
    def default_output_file(input)
      extension = File.extname(input)
      stem = extension.empty? ? input : input[0...-extension.length]
      stem + ".tex"
    end

    # Lazily builds the OptionParser. The option handlers store their values
    # in #options.
    def parser
      @parser ||= OptionParser.new{ |opts|
        opts.banner = "Usage: #{File.basename($0)} [options] input.html [output.tex]"

        opts.on "-t", "--title TITLE",
                "Set (or override) the title of the document" do |title|
          options[:title] = title
        end
        opts.on "-a", "--author AUTHOR",
                "Set (or override) the author of the document" do |author|
          options[:author] = author
        end
        opts.on "-c", "--document-class CLASS",
                "Set the LaTeX document class to be used; default is book" do |klass|
          options[:class] = klass
        end

        opts.on "-h", "--help",
                "Display this screen" do
          raise CleanExit
        end
      }
    end
  end
end
@@ -4,13 +4,14 @@ class HTML2TeX
4
4
  class HeadingProcessor < BasicProcessor
5
5
  HEADINGS = %w[part chapter section subsection subsubsection]
6
6
 
7
- def initialize(scanner, label)
7
+ def initialize(scanner, label, options)
8
8
  @label = label
9
- super(scanner)
9
+ super(scanner, options)
10
10
  end
11
11
 
12
- def to_tex(*args)
13
- wrap(super(*args))
12
+ def to_tex(buffer="")
13
+ buffer << wrap(super(""))
14
+ buffer
14
15
  end
15
16
 
16
17
  private
@@ -0,0 +1,78 @@
1
+ require "html2tex/basic_processor"
2
+ require "nokogiri"
3
+
4
class HTML2TeX
  # Consumes the start of the HTML document (doctype, <html> tag and <head>
  # section) from the shared scanner and writes the LaTeX preamble: document
  # class, title, author, \begin{document} and \maketitle.
  class PreambleProcessor < BasicProcessor

    # Appends the preamble to +buffer+ (anything responding to <<) and
    # returns the buffer.
    def to_tex(buffer="")
      read_html_head
      [ tex(:documentclass, options[:class]),
        tex(:title, title),
        tex(:author, author),
        tex(:begin, "document"),
        tex(:maketitle),
        ""
      ].each do |line|
        buffer << line << "\n"
      end
      buffer
    end

  private

    # Advances the scanner past the doctype, <html> tag and <head> section,
    # remembering the author and title found in the head, if any.
    def read_html_head
      scanner.scan %r{\s*}
      scanner.scan %r{<!doctype[^>]*>\s*}i
      scanner.scan %r{<html[^>]*>\s*}i
      head = scanner.scan(%r{<head[^>]*>.*?</head>}im)
      return unless head

      meta = metadata_hash(head)
      @author = meta["dc.creator"] || meta["author"]
      @title  = meta["dc.title"] ||
                decode_entities(head[%r{<title[^>]*>\s*(.*?)\s*</title>}im, 1])
    end

    # Author in order of precedence: explicit option, HTML metadata,
    # placeholder.
    def author
      options[:author] || @author || "AUTHOR"
    end

    # Title in order of precedence: explicit option, HTML metadata or
    # <title> element, placeholder.
    def title
      options[:title] || @title || "TITLE"
    end

    # Turns HTML entities in metadata text into plain characters, so that
    # "&amp;" is later TeX-escaped as "\&" rather than "\&amp;". Uses the
    # decoder set up by BasicProcessor#initialize. Returns nil for nil.
    def decode_entities(text)
      text && @decoder.decode(text)
    end

    # Parses one <meta ...> tag into a hash of attribute name (lower-cased)
    # to raw attribute value. Always returns a hash, even for malformed tags
    # or tags with valueless attributes (which are ignored).
    def parse_meta_element(meta)
      data = {}
      s = StringScanner.new(meta)
      s.scan(/<meta\s+/i)
      until s.eos?
        s.scan(/\s+/)
        break if s.scan(/\/?>/)

        key = s.scan(/[^\s=\/>]+/)
        if key.nil?
          s.getch # skip a stray character so the loop always advances
          next
        end

        # An attribute without "=" carries no value we can use.
        next unless s.scan(/\s*=\s*/)

        data[key.downcase] = scan_attribute_value(s)
      end
      data
    end

    # Reads a double-quoted, single-quoted or unquoted attribute value from
    # the scanner. Returns nil for an empty value.
    def scan_attribute_value(s)
      if s.scan(/"/)
        value = s.scan(/[^"]+/)
        s.scan(/"/)
      elsif s.scan(/'/)
        value = s.scan(/[^']+/)
        s.scan(/'/)
      else
        # Stop at ">" so an unquoted value does not swallow the tag end.
        value = s.scan(/[^\s>]+/)
      end
      value
    end

    # Collects every <meta name="..." content="..."> in the head into a hash
    # of lower-cased name to entity-decoded content.
    def metadata_hash(head)
      hash = {}
      head.scan(%r{<meta\s[^>]+>}im).each do |meta|
        attributes = parse_meta_element(meta)
        name, content = attributes["name"], attributes["content"]
        hash[name.downcase] = decode_entities(content) if name && content
      end
      hash
    end
  end
end
78
+
@@ -0,0 +1,14 @@
1
class HTML2TeX
  # Helpers for emitting LaTeX source; mixed into HTML2TeX and the processor
  # classes.
  module TeX
    # Each character that LaTeX treats specially in running text, mapped to
    # markup that typesets it literally. A plain backslash prefix is wrong for
    # three of them: "\\" is a line break, and "\^" and "\~" are accent
    # commands, so those use the dedicated text commands instead.
    TEX_ESCAPES = {
      "\\" => "\\textbackslash{}",
      "{"  => "\\{",
      "}"  => "\\}",
      "$"  => "\\$",
      "&"  => "\\&",
      "#"  => "\\#",
      "%"  => "\\%",
      "^"  => "\\textasciicircum{}",
      "_"  => "\\_",
      "~"  => "\\textasciitilde{}"
    }.freeze

    # Matches any single character that needs escaping.
    TEX_SPECIAL = Regexp.union(TEX_ESCAPES.keys)

    # Builds a LaTeX command such as \title{...}.
    #
    # name::  command name (String or Symbol), without the backslash.
    # param:: optional argument; it is TeX-escaped and wrapped in braces.
    #         When nil or false the bare command is returned.
    def tex(name, param=nil)
      directive = "\\#{name}"
      directive << "{" << tex_escape(param) << "}" if param
      directive
    end

    # Returns a copy of +s+ (converted with to_s) in which every LaTeX
    # special character is replaced by markup that prints it literally.
    # Returns nil when +s+ is nil.
    def tex_escape(s)
      return nil if s.nil?
      # Single pass, so backslashes introduced by a replacement are never
      # escaped a second time.
      s.to_s.gsub(TEX_SPECIAL) { |char| TEX_ESCAPES[char] }
    end
  end
end
@@ -1,3 +1,3 @@
1
1
  class HTML2TeX
2
- VERSION = "0.1.1"
2
+ VERSION = "0.1.2"
3
3
  end
data/lib/html2tex.rb CHANGED
@@ -2,13 +2,36 @@ require "htmlentities"
2
2
  require "rubypants"
3
3
  require "html2tex/version"
4
4
  require "html2tex/basic_processor"
5
+ require "html2tex/preamble_processor"
6
+ require "html2tex/tex"
5
7
 
6
8
  class HTML2TeX
7
- def initialize(html)
8
- @html = html
9
+ include TeX
10
+
11
+ DEFAULT_OPTIONS = {
12
+ :document => true,
13
+ :class => "book"
14
+ }
15
+
16
+ def initialize(html, options={})
17
+ @html = html
18
+ @options = DEFAULT_OPTIONS.merge(options)
9
19
  end
10
20
 
11
21
  def to_tex(buffer="")
12
- BasicProcessor.new(StringScanner.new(@html)).to_tex(buffer)
22
+ scanner = StringScanner.new(@html)
23
+
24
+ if @options[:document]
25
+ PreambleProcessor.new(scanner, @options).to_tex(buffer)
26
+ end
27
+
28
+ BasicProcessor.new(scanner, @options).to_tex(buffer)
29
+
30
+ if @options[:document]
31
+ buffer << "\n\n" << tex(:end, "document")
32
+ end
33
+
34
+ buffer << "\n"
35
+ buffer
13
36
  end
14
37
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html2tex
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Paul Battley
@@ -56,7 +56,10 @@ files:
56
56
  - lib/html2tex.rb
57
57
  - lib/html2tex/version.rb
58
58
  - lib/html2tex/basic_processor.rb
59
+ - lib/html2tex/cli.rb
60
+ - lib/html2tex/preamble_processor.rb
59
61
  - lib/html2tex/heading_processor.rb
62
+ - lib/html2tex/tex.rb
60
63
  - README.md
61
64
  - COPYING
62
65
  has_rdoc: true