RubyGems - html2tex - Versions diffs - 0.1.1 → 0.1.2 - Mend

html2tex 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10)

data/README.md +23 -3
data/bin/html2tex +14 -2
data/lib/html2tex/basic_processor.rb +8 -8
data/lib/html2tex/cli.rb +54 -0
data/lib/html2tex/heading_processor.rb +5 -4
data/lib/html2tex/preamble_processor.rb +78 -0
data/lib/html2tex/tex.rb +14 -0
data/lib/html2tex/version.rb +1 -1
data/lib/html2tex.rb +26 -3
metadata +4 -1

data/README.md CHANGED Viewed

@@ -5,6 +5,22 @@ This is a simple library and command-line tool to convert HTML documents into
 LaTeX. The intended use is for preparing ebooks, but it might also be useful
 for other purposes.
+Command-line use
+----------------
+    Usage: html2tex [options] input.html [output.tex]
+        -t, --title TITLE                Set (or override) the title of the document
+        -a, --author AUTHOR              Set (or override) the author of the document
+        -c, --document-class CLASS       Set the LaTeX document class to be used; default is book
+        -h, --help                       Display this screen
+If the output file is not specified, the program will create an output file
+based on the input filename, replacing the extension with `.tex` – or, if there
+is no extension, by appending `.tex`.
+The title and author will, by default, be extracted from the HTML document,
+using either Dublin Core metadata or the HTML title and an author META element.
 Library use
 -----------
@@ -18,10 +34,14 @@ Library use
       HTML2TeX.new(html).to_tex(f)
     end
-Command-line use
-----------------
+A hash of options can be supplied as the second argument to `to_tex`; the
+available keys are:
+* `:author`
+* `:title`
+* `:class`
-    html2tex source.html destination.tex
+See the command-line section above for an explanation of these.
 Dependencies
 ------------

data/bin/html2tex CHANGED Viewed

@@ -1,6 +1,18 @@
 #!/usr/bin/env ruby
 require "html2tex"
+require "html2tex/cli"
-File.open(ARGV[1], "w") do |output|
-  HTML2TeX.new(File.read(ARGV[0])).to_tex(output)
+cli = HTML2TeX::CLI.new(ARGV)
+begin
+  cli.parse
+rescue HTML2TeX::CLI::CleanExit
+  puts cli.usage
+  exit
+rescue HTML2TeX::CLI::DirtyExit
+  puts cli.usage
+  exit 1
+end
+File.open(cli.output_file, "w") do |io|
+  HTML2TeX.new(File.read(cli.input_file), cli.options).to_tex(io)
 end

data/lib/html2tex/basic_processor.rb CHANGED Viewed

@@ -1,10 +1,15 @@
+require "html2tex/tex"
 class HTML2TeX
   class BasicProcessor
-    attr_reader :scanner
+    include TeX
+    attr_reader :scanner, :options
-    def initialize(scanner)
+    def initialize(scanner, options)
       @decoder = HTMLEntities.new
       @scanner = scanner
+      @options = options
     end
     def to_tex(buffer="")
@@ -35,11 +40,6 @@ class HTML2TeX
       tex_escape(@decoder.decode(RubyPants.new(@decoder.decode(s)).to_html))
     end
-    def tex_escape(s)
-      return nil if s.nil?
-      s.gsub(/[\\{}$&#%^_~]/, '\\\\\\0')
-    end
     def whitespace
       if @squash_next
         @squash_next = false
@@ -65,7 +65,7 @@ class HTML2TeX
         "\\\\"
       when /^h\d/
         @squash_next = true
-        HeadingProcessor.new(scanner, t).to_tex + "\n\n"
+        HeadingProcessor.new(scanner, t, @options).to_tex + "\n\n"
       else
         ""
       end

data/lib/html2tex/cli.rb ADDED Viewed

@@ -0,0 +1,54 @@
+require "optparse"
class HTML2TeX
  # Command-line front end: turns an argument vector into conversion options
  # plus an input and an output file name.
  #
  #   cli = HTML2TeX::CLI.new(ARGV)
  #   cli.parse
  #   cli.input_file   # => "book.html"
  #   cli.output_file  # => "book.tex"
  #   cli.options      # => { :title => "...", ... }
  class CLI
    # Raised by #parse when help was requested with -h/--help.
    CleanExit = Class.new(RuntimeError)

    # Raised by #parse when the arguments cannot be used: no input file was
    # given, or the output file would be the input file itself.
    DirtyExit = Class.new(RuntimeError)

    attr_reader :options, :input_file, :output_file

    # argv:: the argument array to parse. It is modified in place by #parse,
    #        which removes the recognised switches from it.
    def initialize(argv)
      @argv    = argv
      @options = {}
    end

    # Returns the help text (banner plus the option summary) as a String.
    def usage
      parser.to_s
    end

    # Parses the arguments, filling in #options, #input_file and #output_file.
    # Returns the output file name.
    #
    # Raises CleanExit when help was requested and DirtyExit when the
    # arguments are unusable. Errors raised by OptionParser itself (for
    # example an unknown switch) are not converted and propagate unchanged.
    def parse
      parser.parse!(@argv)

      @input_file = @argv[0]
      raise DirtyExit, "no input file given" unless @input_file

      @output_file = @argv[1] || default_output_file(@input_file)

      # The launcher opens the output for writing before it reads the input,
      # so writing onto the input would truncate it before it is read.
      if File.expand_path(@output_file) == File.expand_path(@input_file)
        raise DirtyExit, "output file would overwrite the input file"
      end

      @output_file
    end

    private

    # Derives the output name from the input name: the final extension is
    # replaced with ".tex", or ".tex" is appended when there is none. The
    # extension may not contain a path separator, so a dot in a directory name
    # ("my.dir/input") is left alone.
    def default_output_file(input)
      input.sub(%r{\.[^./\\]+\z|\z}, ".tex")
    end

    # Builds (once) the OptionParser describing the supported switches.
    def parser
      @parser ||= OptionParser.new do |opts|
        opts.banner = "Usage: #{File.basename($0)} [options] input.html [output.tex]"

        opts.on("-t", "--title TITLE",
                "Set (or override) the title of the document") do |title|
          options[:title] = title
        end

        opts.on("-a", "--author AUTHOR",
                "Set (or override) the author of the document") do |author|
          options[:author] = author
        end

        opts.on("-c", "--document-class CLASS",
                "Set the LaTeX document class to be used; default is book") do |klass|
          options[:class] = klass
        end

        opts.on("-h", "--help", "Display this screen") do
          raise CleanExit
        end
      end
    end
  end
end

data/lib/html2tex/heading_processor.rb CHANGED Viewed

@@ -4,13 +4,14 @@ class HTML2TeX
   class HeadingProcessor < BasicProcessor
     HEADINGS = %w[part chapter section subsection subsubsection]
-    def initialize(scanner, label)
+    def initialize(scanner, label, options)
       @label = label
-      super(scanner)
+      super(scanner, options)
     end
-    def to_tex(*args)
-      wrap(super(*args))
+    def to_tex(buffer="")
+      buffer << wrap(super(""))
+      buffer
     end
   private

data/lib/html2tex/preamble_processor.rb ADDED Viewed

@@ -0,0 +1,78 @@
+require "html2tex/basic_processor"
+require "nokogiri"
class HTML2TeX
  # Writes the opening of the LaTeX document: document class, title, author,
  # \begin{document} and \maketitle. While doing so it consumes the leading
  # doctype, <html> tag and <head> element from the scanner, using the head's
  # metadata as the default title and author.
  class PreambleProcessor < BasicProcessor
    # Appends the preamble lines to +buffer+ (each followed by a newline, with
    # a blank line at the end) and returns the buffer.
    def to_tex(buffer="")
      read_html_head
      [ tex(:documentclass, options[:class]),
        tex(:title, title),
        tex(:author, author),
        tex(:begin, "document"),
        tex(:maketitle),
        ""
      ].each do |line|
        buffer << line << "\n"
      end
      buffer
    end

    private

    # Advances the scanner over leading whitespace, the doctype, the <html>
    # tag and the <head> element (each only if present at the current
    # position) and records the author and title found in the head.
    def read_html_head
      scanner.scan %r{\s*}
      scanner.scan %r{<!doctype[^>]*>\s*}i
      scanner.scan %r{<html[^>]*>\s*}i

      head = scanner.scan(%r{<head[^>]*>.*?</head>}im)
      return unless head

      meta = metadata_hash(head)
      # Dublin Core metadata wins over the plain HTML equivalents.
      @author = meta["dc.creator"] || meta["author"]
      @title  = meta["dc.title"] || head[%r{<title[^>]*>\s*(.*?)\s*</title>}im, 1]
    end

    # Author in order of preference: explicit option, HTML metadata, then the
    # placeholder "AUTHOR".
    def author
      @options[:author] || @author || "AUTHOR"
    end

    # Title in order of preference: explicit option, HTML metadata or <title>,
    # then the placeholder "TITLE".
    def title
      @options[:title] || @title || "TITLE"
    end

    # Parses one <meta ...> tag into a Hash of lower-cased attribute name to
    # value. Always returns a Hash, even for malformed tags: attributes
    # without a value are omitted, and an empty quoted value is stored as nil.
    def parse_meta_element(meta)
      attributes = {}
      s = StringScanner.new(meta)
      s.scan(/<meta\s+/i)

      until s.eos?
        break if s.scan(%r{\s*/?>})

        name = s.scan(%r{[^\s=/>]+})
        if name.nil?
          # Stray character (e.g. a lone "=" or "/"): skip it so that the loop
          # is guaranteed to advance.
          s.getch
          next
        end

        attributes[name.downcase] = scan_attribute_value(s) if s.scan(/\s*=\s*/)
        s.scan(/\s+/)
      end

      attributes
    end

    # Reads a double-quoted, single-quoted or unquoted attribute value at the
    # scanner's position. An unquoted value stops at whitespace or ">" so that
    # it cannot swallow the end of the tag. Returns nil when there is no value.
    def scan_attribute_value(s)
      if s.scan(/"/)
        value = s.scan(/[^"]+/)
        s.scan(/"/)
      elsif s.scan(/'/)
        value = s.scan(/[^']+/)
        s.scan(/'/)
      else
        value = s.scan(/[^\s>]+/)
      end
      value
    end

    # Collects every <meta> element in +head+ that has both a name and a
    # content attribute into a Hash of lower-cased name to content.
    def metadata_hash(head)
      head.scan(%r{<meta\s[^>]+>}im).each_with_object({}) do |meta, hash|
        attributes = parse_meta_element(meta)
        name       = attributes["name"]
        content    = attributes["content"]
        hash[name.downcase] = content if name && content
      end
    end
  end
end

data/lib/html2tex/tex.rb ADDED Viewed

@@ -0,0 +1,14 @@
class HTML2TeX
  # Helpers for producing LaTeX source text.
  module TeX
    # Characters whose backslash-prefixed form is a different LaTeX command
    # rather than the literal character: "\\" is a line break, and "\^" and
    # "\~" are accent commands that act on whatever follows them.
    SPECIAL_ESCAPES = {
      "\\" => "\\textbackslash{}",
      "^"  => "\\textasciicircum{}",
      "~"  => "\\textasciitilde{}"
    }.freeze

    # Builds a LaTeX command.
    #
    # name::  command name without the leading backslash (String or Symbol).
    # param:: optional argument; when given it is converted to a String,
    #         escaped, and wrapped in braces.
    #
    #   tex(:maketitle)          # => "\\maketitle"
    #   tex(:begin, "document")  # => "\\begin{document}"
    def tex(name, param=nil)
      return "\\#{name}" unless param
      "\\#{name}{#{tex_escape(param.to_s)}}"
    end

    # Escapes the characters LaTeX treats specially ( \ { } $ & # % ^ _ ~ )
    # so that +s+ is typeset literally. Returns nil when +s+ is nil.
    def tex_escape(s)
      return nil if s.nil?
      # Block form: the replacement is inserted verbatim, with no
      # backreference processing of the backslashes it contains.
      s.gsub(/[\\{}$&#%^_~]/) { |char| SPECIAL_ESCAPES.fetch(char) { "\\" + char } }
    end
  end
end

data/lib/html2tex/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 class HTML2TeX
-  VERSION = "0.1.1"
+  VERSION = "0.1.2"
 end

data/lib/html2tex.rb CHANGED Viewed

@@ -2,13 +2,36 @@ require "htmlentities"
 require "rubypants"
 require "html2tex/version"
 require "html2tex/basic_processor"
+require "html2tex/preamble_processor"
+require "html2tex/tex"
 class HTML2TeX
-  def initialize(html)
-    @html = html
+  include TeX
+  DEFAULT_OPTIONS = {
+    :document => true,
+    :class    => "book"
+  }
+  def initialize(html, options={})
+    @html    = html
+    @options = DEFAULT_OPTIONS.merge(options)
   end
   def to_tex(buffer="")
-    BasicProcessor.new(StringScanner.new(@html)).to_tex(buffer)
+    scanner = StringScanner.new(@html)
+    if @options[:document]
+      PreambleProcessor.new(scanner, @options).to_tex(buffer)
+    end
+    BasicProcessor.new(scanner, @options).to_tex(buffer)
+    if @options[:document]
+      buffer << "\n\n" << tex(:end, "document")
+    end
+    buffer << "\n"
+    buffer
   end
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: html2tex
 version: !ruby/object:Gem::Version
-  version: 0.1.1
+  version: 0.1.2
 platform: ruby
 authors:
 - Paul Battley
@@ -56,7 +56,10 @@ files:
 - lib/html2tex.rb
 - lib/html2tex/version.rb
 - lib/html2tex/basic_processor.rb
+- lib/html2tex/cli.rb
+- lib/html2tex/preamble_processor.rb
 - lib/html2tex/heading_processor.rb
+- lib/html2tex/tex.rb
 - README.md
 - COPYING
 has_rdoc: true