RubyGems - swordfish - Versions diffs - 0.0.6 → 0.0.7 - Mend

swordfish 0.0.6 → 0.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

checksums.yaml +8 -8
data/lib/swordfish/document.rb +64 -6
data/lib/swordfish/formats/docx/document.rb +215 -0
data/lib/swordfish/formats/docx/parser.rb +232 -0
data/lib/swordfish/nodes/base.rb +18 -2
data/lib/swordfish/nodes/footnote.rb +21 -0
data/lib/swordfish/nodes/header.rb +18 -0
data/lib/swordfish/nodes/linebreak.rb +13 -0
data/lib/swordfish/nodes/list_item.rb +1 -1
data/lib/swordfish/nodes/paragraph.rb +1 -1
data/lib/swordfish/nodes/raw.rb +19 -0
data/lib/swordfish/stylesheet.rb +1 -0
data/lib/swordfish.rb +2 -2
metadata +28 -11
data/lib/swordfish/formats/docx.rb +0 -362

checksums.yaml CHANGED Viewed

@@ -1,15 +1,15 @@
 ---
 !binary "U0hBMQ==":
   metadata.gz: !binary |-
-    OGRlYTY4Yjg5NmY5N2QzMWU2Yzg0ZjAxMDAwM2VmZjUwNThhNDMzOA==
+    MjdhYWJlNWU2YmQzY2U1MzY5ODFjYWY2M2FiODczMjZhMDZiYTA1Mg==
   data.tar.gz: !binary |-
-    NGUzNTViOGMwMTJmZmFjOGE4YTA1NzU3MGQwYTMyY2I3YmYzZjhlNQ==
+    ZTMwMDFkODM5NzlmMjQ3NGQ4YTljMDA0NzE2YTVjNmIxZDkzZGIyOQ==
 SHA512:
   metadata.gz: !binary |-
-    YWI3YmFiZGJiYzk3NTRiNTAwNjljZmQ5ODA2NjY0MjMzNjljNTVmYWU3ZmUx
-    OGYwMTljZjU3YjY3MmYzYjcwYmE3NGYzZDc3ZThlNmNmOWJlZDA2YTg2NGFm
-    YzQwMmY0ZmZkMDM4NmQ1OTE4OGZmMjdkODEwZjE3NTUxYWFhZmI=
+    ZTAxMWQ0MDY3Y2FmZjdjOTQwNzMzMzI1MmNlM2VlMWIyZDU2ZTZlMjcyYjNl
+    MmFiZDlmMWUxMGVlYWFiOTBlZGI3ZDE5OWZiN2M5MjllZGQ1MjUwOGExNTYy
+    ZjkzNjRmMTcyOTkzNjMzYjdmYTk3MDE4YTIyMjJhNjBmMzk2ZjE=
   data.tar.gz: !binary |-
-    YTAxYmU1NDBjNzIwNWNhZWRjY2UzMjIwMGQ5ZTk0ZjgwMmVhMzQ5YjA0Yjhh
-    NWEyYzQxZDMzYzEyOWViYjQ2Y2RjZjg3OTkzMjRkYTc0NWZmOGIyMzVlNWZj
-    M2ZlN2U3MDE0YjZhYzkwMjA0MDQ2Y2FmNDViNDMwMmU1NDM5Mzk=
+    NDA2MzliMGI4ZTMwZDE4MDUyZDI3YzNjMTZmOTUwY2YxNzc3NjU2NWZkNzgw
+    NjNkMjc4YTIzNzY4NTVmZDk2YzllZmJkOTZmZWM3Mjg3ZWE2ZDMwMjY4ZDE0
+    ZDQ1NGQ1ZWJhYzdmMWYxMTYzNjc3MmIyZDdkZWIxY2RkZTkzM2E=

data/lib/swordfish/document.rb CHANGED Viewed

@@ -11,6 +11,10 @@ require 'swordfish/nodes/table'
 require 'swordfish/nodes/table_row'
 require 'swordfish/nodes/table_cell'
 require 'swordfish/nodes/image'
+require 'swordfish/nodes/header'
+require 'swordfish/nodes/footnote'
+require 'swordfish/nodes/linebreak'
+require 'swordfish/nodes/raw'
 # Swordfish::Document is the internal representation of a parsed document.
@@ -56,13 +60,21 @@ module Swordfish
       end
     end
-    def to_html(opts = {})
-      html = @nodes.map(&:to_html).join
-      if opts[:pretty]
-        Nokogiri::HTML(html).to_html
+    # Configure output options, applying any requested in-place transformations.
+    # opts[:guess_headers] - when truthy, run find_headers! on this document
+    # opts[:footnotes]     - when truthy, run find_footnotes! on this document
+    # opts[:full_document] - stored (coerced to a boolean) in @generate_full_document
+    # Returns self so that calls can be chained.
+    def settings(opts = {})
+      if opts[:guess_headers]
+        find_headers!
+      end
+      if opts[:footnotes]
+        find_footnotes!
+      end
+      @generate_full_document = opts[:full_document] ? true : false
+      self
+    end
+    def to_html
+      if @generate_full_document
+        prefix = "<!DOCTYPE html><html><head><title></title></head><body>"
+        suffix = "</body></html>"
+        prefix + @nodes.map(&:to_html).join + suffix
       else
-        html
+        @nodes.map(&:to_html).join
       end
     end
@@ -72,5 +84,51 @@ module Swordfish
     def find_nodes_by_type(klass)
       @nodes.collect{|n| n.find_nodes_by_type(klass)}.flatten
     end
+    # Attempt to identify header nodes.
+    # A top-level paragraph is a candidate when exactly one font size can be
+    # determined for it: either exactly one child reports a size, or no child
+    # does and the paragraph style itself has one. A candidate that is larger
+    # than its neighboring candidates contributes a "header size"; every
+    # candidate with a header size is then replaced by a Header node whose level
+    # is the rank of its size (largest size => level 1).
+    def find_headers!
+      font_sizes = []
+      # Record the node index and size of every candidate paragraph.
+      @nodes.each_with_index do |node, idx|
+        next unless node.is_a?(Swordfish::Node::Paragraph)
+        para_size = node.style.font_size
+        run_sizes = node.children.collect{ |n| n.style.font_size }.compact
+        if (run_sizes.length == 1) || (run_sizes.length == 0 && para_size)
+          font_sizes << {:idx => idx, :size => run_sizes.first || para_size}
+        end
+      end
+      # A candidate defines a header size when it is larger than the following
+      # candidate and (unless it is the first one) the preceding candidate.
+      # The final candidate has no following neighbor and never defines a size;
+      # checking for the missing neighbor also covers a document with a single
+      # candidate, which would otherwise raise NoMethodError on nil.
+      header_sizes = []
+      font_sizes.each_with_index do |f, idx|
+        following = font_sizes[idx + 1]
+        next if following.nil?
+        exceeds_previous = idx == 0 || f[:size] > font_sizes[idx - 1][:size]
+        header_sizes << f[:size] if exceeds_previous && f[:size] > following[:size]
+      end
+      # Largest size first, so that its position in the array is its header rank
+      header_sizes = header_sizes.uniq.sort.reverse
+      font_sizes.each do |f|
+        level = header_sizes.find_index(f[:size])
+        if level
+          header = @nodes[f[:idx]].replace_with(Swordfish::Node::Header)
+          header.inform! :level => (level + 1)
+          @nodes[f[:idx]] = header
+        end
+      end
+    end
+    # Number every foot/endnote in the document and append its rendered content.
+    # Each Footnote node is informed of its position in the list returned by
+    # find_nodes_by_type (the first note receives index 0), and a Raw node
+    # holding the note's content_to_html output is appended to the top-level nodes.
+    def find_footnotes!
+      notes = find_nodes_by_type(Swordfish::Node::Footnote)
+      notes.each_with_index do |note, position|
+        note.inform!({:index => position})
+        rendered = Swordfish::Node::Raw.new
+        rendered.content = note.content_to_html
+        @nodes << rendered
+      end
+    end
   end
 end

data/lib/swordfish/formats/docx/document.rb ADDED Viewed

@@ -0,0 +1,215 @@
+require 'tempfile'
+require 'zip'
+require 'nokogiri'
+require 'swordfish/document'
+require_relative 'parser'
+# Swordfish::DOCX defines a parser for .docx (Office OpenXML) formats
+module Swordfish
+  module DOCX
+    class Document
+      include Swordfish::DOCX::Parser
+      attr_reader :swordfish_doc   # The Swordfish::Document corresponding to the parsed document
+      attr_reader :docx_archive    # The source archive
+      # Parse a document and return a Swordfish::Document object.
+      # filepath - path of the .docx file to read
+      # The archive is opened in block form so that it is closed again as soon as
+      # parsing has finished (all reads from it happen while the parser object
+      # is being constructed).
+      def self.open(filepath)
+        # .docx is a zipped file format consisting of several XML files.
+        # Read in the content of each needed file.
+        Zip::File.open(filepath) do |docx_archive|
+          # Many parts are optional. Look the entry up before reading it instead of
+          # rescuing every error, so that genuine read failures are not silenced.
+          optional = lambda do |path|
+            docx_archive.find_entry(path) ? docx_archive.read(path) : nil
+          end
+          xml_docs = {
+            :document      => docx_archive.read('word/document.xml'),
+            :styles        => docx_archive.read('word/styles.xml'),
+            :numbering     => optional.call('word/numbering.xml'),
+            :relationships => optional.call('word/_rels/document.xml.rels'),
+            :footnotes     => optional.call('word/footnotes.xml'),
+            :footnote_rels => optional.call('word/_rels/footnotes.xml.rels'),
+            :endnotes      => optional.call('word/endnotes.xml'),
+            :endnote_rels  => optional.call('word/_rels/endnotes.xml.rels')
+          }
+          # Parse the XML files and generate the Swordfish::Document
+          swordfish_docx = new docx_archive, xml_docs
+          swordfish_docx.swordfish_doc
+        end
+      end
+      # Build the Swordfish::Document from the archive's XML parts.
+      # archive  - the opened source archive (kept for reading image resources)
+      # xml_docs - hash of raw XML strings keyed by part name; optional parts may be nil
+      # Styles, numbering and relationships are loaded before the notes and the
+      # document body, since run parsing looks up @relationships.
+      def initialize(archive, xml_docs)
+        @docx_archive = archive
+        @swordfish_doc = Swordfish::Document.new
+        parse_styles(xml_docs[:styles])
+        numbering_xml = xml_docs[:numbering]
+        parse_numbering(numbering_xml) if numbering_xml
+        # Each relationships part is stored under an optional nesting key
+        [[:relationships, nil], [:footnote_rels, :footnotes], [:endnote_rels, :endnotes]].each do |part, rel_type|
+          rels_xml = xml_docs[part]
+          parse_relationships(rels_xml, rel_type) if rels_xml
+        end
+        footnotes_xml = xml_docs[:footnotes]
+        parse_footnotes(footnotes_xml) if footnotes_xml
+        endnotes_xml = xml_docs[:endnotes]
+        parse_endnotes(endnotes_xml) if endnotes_xml
+        parse(xml_docs[:document])
+      end
+      private
+      # Move whatever is in the build buffer into the Swordfish::Document and empty the buffer.
+      # Some docx constructs (lists, for example) are spread over several top-level
+      # elements yet map to one Swordfish::Node, so they are assembled in the buffer first.
+      def flush
+        pending = @buffer
+        @swordfish_doc.append(pending) if pending
+        @buffer = nil
+      end
+      # Parse the document structure XML.
+      # Walks the children of the body element and hands paragraphs, list
+      # paragraphs and tables to their specialized parsers.
+      def parse(document_xml)
+        @xml = Nokogiri::XML(document_xml)
+        @xml.xpath('//w:body').children.each do |element|
+          case element.name
+            when 'p'
+              unnumbered = element.xpath('.//w:numPr').length.zero?
+              # An indented paragraph directly after a list item is most likely a
+              # further paragraph of that (multi-paragraph) list item
+              continues_list = @buffer.is_a?(Swordfish::Node::List) && !element.xpath('.//w:ind').length.zero?
+              if unnumbered && !continues_list
+                # Regular paragraph
+                flush
+                @swordfish_doc.append _node_parse_paragraph(element)
+              elsif element.xpath('.//w:numPr/ancestor::w:pPrChange').length.zero?
+                # List paragraph. A numPr below a pPrChange only records historical
+                # changes, so such paragraphs are skipped. The buffer is not flushed
+                # here because the list may continue with the next element.
+                _node_parse_list(element)
+              end
+            when 'tbl'
+              flush
+              @swordfish_doc.append _node_parse_table(element)
+          end
+        end
+        # Emit a list that was still being assembled at the end of the body
+        flush
+      end
+      # Parse styles out of a docx element property nodeset (*Pr) and stylize the Swordfish::Node.
+      # xml_nodeset    - property element(s) whose children describe styles; nil returns nil
+      # swordfish_node - the node to stylize; if omitted, a temporary Base node is used
+      # Returns the stylesheet of the stylized node.
+      def get_styles_for_node(xml_nodeset, swordfish_node = nil)
+        return unless xml_nodeset
+        swordfish_node = Swordfish::Node::Base.new if swordfish_node.nil?
+        xml_nodeset.children.each do |style_node|
+          case style_node.name
+            when 'i'
+              swordfish_node.stylize :italic
+            when 'b'
+              swordfish_node.stylize :bold
+            when 'u'
+              swordfish_node.stylize :underline
+            when 'strike'
+              swordfish_node.stylize :strikethrough
+            when 'sz'
+              # w:val is halved to obtain the stored font size
+              swordfish_node.stylize :font_size => (style_node['w:val'].to_i / 2)
+            when 'szCs'
+              # Only use complex script size node if there is no standard size node.
+              # The check must live inside the branch: written as part of the `when`
+              # expression it collapses to a boolean that never equals the element
+              # name, so the branch would never run. A later 'sz' sibling still
+              # overrides the value set here.
+              unless swordfish_node.style.font_size
+                swordfish_node.stylize :font_size => (style_node['w:val'].to_i / 2)
+              end
+            when 'vertAlign'
+              if style_node['w:val'] == 'superscript'
+                swordfish_node.stylize :superscript
+              elsif style_node['w:val'] == 'subscript'
+                swordfish_node.stylize :subscript
+              end
+            when 'rStyle'
+              if style_node['w:val'] == 'Strong'
+                swordfish_node.stylize :strong
+              elsif style_node['w:val'] == 'Emphasis'
+                swordfish_node.stylize :emphasis
+              end
+          end
+        end
+        swordfish_node.style
+      end
+      # Parse the document styles XML.
+      # The styles part defines named styles that the document XML refers to by ID,
+      # so that repeated formatting need not be restated for every run. The run
+      # properties of each style are converted to a stylesheet and stored in the
+      # @styles hash, keyed by the style ID as a symbol.
+      def parse_styles(styles_xml)
+        @styles = {}
+        Nokogiri::XML(styles_xml).xpath("//w:style").each do |style_xml|
+          key = style_xml['w:styleId'].to_sym
+          @styles[key] = get_styles_for_node(style_xml.xpath(".//w:rPr"))
+        end
+      end
+      # Parse the abstract numbering XML (defining things such as list numbering).
+      # Each numbering ID (numId) points at an abstract numbering schema
+      # (abstractNumId), which in turn gives a number format per indentation
+      # level (lvl). The result is stored in the @numbering instance variable as a
+      # nested hash: @numbering[numbering ID][indentation level] = number format.
+      def parse_numbering(numbering_xml)
+        @numbering = {}
+        doc = Nokogiri::XML(numbering_xml)
+        doc.xpath("//w:num").each do |num_xml|
+          abstract_id = num_xml.xpath("./w:abstractNumId")[0]['w:val'].to_i
+          formats = {}
+          doc.xpath("//w:abstractNum[@w:abstractNumId='#{abstract_id}']/w:lvl").each do |lvl_xml|
+            formats[lvl_xml['w:ilvl'].to_i] = lvl_xml.xpath("./w:numFmt")[0]['w:val']
+          end
+          @numbering[num_xml['w:numId'].to_i] = formats
+        end
+      end
+      # Parse the relationships XML (defining things such as internal references and external links).
+      # The XML lists relationships, each with an Id and a Target attribute. This
+      # function records them in the @relationships instance variable as
+      # @relationships[relationship ID] = target. When a type is given (for
+      # example :footnotes), the entries are stored one level down, in
+      # @relationships[type], so that IDs from different parts do not collide.
+      def parse_relationships(relationships_xml, type = nil)
+        @relationships ||= {}
+        table = type ? (@relationships[type] ||= {}) : @relationships
+        # Nokogiri doesn't seem to like XPath here for some reason, so use CSS
+        Nokogiri::XML(relationships_xml).css("Relationship").each do |rel_xml|
+          table[rel_xml['Id']] = rel_xml['Target']
+        end
+      end
+      # Parse the footnotes XML.
+      # Builds one Footnote node per w:footnote element with an id greater than 0
+      # and stores it in @footnotes keyed by that integer id.
+      def parse_footnotes(footnotes_xml)
+        @footnotes = {}
+        xml = Nokogiri::XML(footnotes_xml)
+        xml.xpath("//w:footnote[@w:id > 0]").each do |footnote|
+          id = footnote['w:id'].to_i
+          f = Swordfish::Node::Footnote.new
+          footnote.xpath(".//w:p").each do |p|
+            # Append the runs directly, exactly as parse_endnotes does. The paragraph
+            # parser accepts a single argument, so passing it a context raised
+            # ArgumentError; the :footnotes context is what lets hyperlinks inside
+            # the note resolve against the footnote relationships.
+            f.append _node_parse_runs(p, :footnotes)
+          end
+          @footnotes[id] = f
+        end
+      end
+      # Parse the endnotes XML.
+      # Builds one Footnote node per w:endnote element with an id greater than 0,
+      # filled with the runs of its paragraphs, and stores it in @endnotes keyed
+      # by that integer id.
+      def parse_endnotes(endnotes_xml)
+        @endnotes = {}
+        Nokogiri::XML(endnotes_xml).xpath("//w:endnote[@w:id > 0]").each do |endnote_xml|
+          note = Swordfish::Node::Footnote.new
+          endnote_xml.xpath(".//w:p").each do |para_xml|
+            note.append _node_parse_runs(para_xml, :endnotes)
+          end
+          @endnotes[endnote_xml['w:id'].to_i] = note
+        end
+      end
+      # Extract an image resource as a tempfile.
+      # image_name - file name of the image inside the archive's word/media/ folder
+      # Returns the closed Tempfile containing the image bytes.
+      def read_image(image_name)
+        tempfile = Tempfile.new(image_name)
+        # Image data is binary: avoid newline or encoding conversion on write
+        tempfile.binmode
+        # Zip::File#read returns the entry's content without leaving a stream open
+        tempfile.write @docx_archive.read("word/media/#{image_name}")
+        tempfile.close
+        tempfile
+      end
+    end
+  end
+end

data/lib/swordfish/formats/docx/parser.rb ADDED Viewed

@@ -0,0 +1,232 @@
+module Swordfish
+  module DOCX
+    module Parser
+      # NODE PARSERS
+      # Each of the methods below (beginning with '_node') are specialized parsers for handling
+      # a particular type of XML element.
+      # Parse one or more runs
+      # node    - an XML element whose children are runs, or an Array of run elements
+      # context - optional key (such as :footnotes or :endnotes) selecting the nested
+      #           relationship table used to resolve hyperlink element targets
+      # Returns an Array of Swordfish nodes.
+      def _node_parse_runs(node, context = nil)
+        # The 'run' is the basic unit of text in Office OpenXML. A paragraph, table cell, or other
+        # block element may contain one or more runs, and each run has an associated set of styles.
+        texts = []
+        # A complex field is a special type of node spanning multiple runs, where most of the runs
+        # designate a special control flow rather than normal text.
+        # It is nil outside a field, true once a field has begun, and a node once
+        # the field's instruction has been read.
+        complex_field = nil
+        nodes = node.is_a?(Array) ? node : node.children
+        nodes.each_with_index do |run_xml, idx|
+          case run_xml.name
+            when 'r'
+              if run_xml.xpath('./w:t').length > 0 && complex_field.nil?
+                # A True run node
+                # Only examine the run if it includes text codes. The run may also include
+                # things like comment nodes, which should be ignored.
+                text = Swordfish::Node::Text.new
+                text.content = run_xml.xpath('./w:t')[0].content
+                get_styles_for_node(run_xml.xpath('./w:rPr')[0], text)
+                texts << text
+              elsif run_xml.xpath('.//*[name()="pic:pic"]').length > 0
+                # An image run
+                image = Swordfish::Node::Image.new
+                relationship_id = run_xml.xpath('.//*[name()="pic:pic"]/*[name()="pic:blipFill"]/*[name()="a:blip"]')[0]['r:embed'] rescue nil
+                if relationship_id
+                  image.original_name = @relationships[relationship_id].split('/').last
+                  @swordfish_doc.images[image.original_name] = read_image(image.original_name)
+                  texts << image
+                end
+              elsif run_xml.xpath('./w:fldChar').length > 0 || complex_field
+                # A complex field
+                case
+                  when run_xml.xpath('./w:fldChar').length > 0 && run_xml.xpath('./w:fldChar')[0]['w:fldCharType'] == 'begin'
+                    # Start the complex field
+                    complex_field = true
+                  when run_xml.xpath('./w:instrText').length > 0
+                    # An instruction run, defining the complex field's behavior
+                    instruction = run_xml.xpath('./w:instrText')[0].content
+                    href = instruction[/^\s*HYPERLINK "([^"]+)"/, 1]
+                    if href
+                      # A hyperlink
+                      complex_field = Swordfish::Node::Hyperlink.new
+                      complex_field.href = href
+                    else
+                      # Anything else. This includes a HYPERLINK instruction that does not
+                      # start with a quoted target, which has no capture to read and
+                      # would otherwise raise NoMethodError on the failed match.
+                      complex_field = Swordfish::Node::Text.new
+                    end
+                  when run_xml.xpath('./w:t').length > 0 && complex_field.respond_to?(:children) && complex_field.children.length.zero?
+                    # The textual content
+                    # (respond_to? skips text that shows up before any instruction run,
+                    # while complex_field is still just the `true` marker)
+                    complex_field.append(_node_parse_runs(nodes.to_a[idx..-1]))
+                  when run_xml.xpath('./w:fldChar').length > 0 && run_xml.xpath('./w:fldChar')[0]['w:fldCharType'] == 'end'
+                    # End the complex field
+                    if complex_field
+                      texts << complex_field
+                      complex_field = nil
+                    else
+                      # Handle the case where _node_parse_runs gets called from within a complex field
+                      return texts
+                    end
+                end
+              elsif run_xml.xpath('./w:footnoteReference').length > 0
+                # A footnote reference
+                id = run_xml.xpath('./w:footnoteReference')[0]['w:id'].to_i
+                texts << @footnotes[id] if @footnotes[id]
+              elsif run_xml.xpath('./w:endnoteReference').length > 0
+                # An endnote reference
+                id = run_xml.xpath('./w:endnoteReference')[0]['w:id'].to_i
+                texts << @endnotes[id] if @endnotes[id]
+              elsif run_xml.xpath('./w:br').length > 0
+                # A linebreak run
+                texts << Swordfish::Node::Linebreak.new
+              end
+            when 'hyperlink'
+              # Hyperlink nodes are placed amongst other run nodes, but
+              # they themselves also contain runs. Hyperlinks include
+              # a relationship ID attribute defining their reference.
+              link = Swordfish::Node::Hyperlink.new
+              link.href = context ? @relationships[context][run_xml['r:id']] : @relationships[run_xml['r:id']]
+              _node_parse_runs(run_xml).each {|r| link.append(r)}
+              texts << link
+          end
+        end
+        # Clean up runs by merging them if they have identical styles
+        texts = texts.reduce([]) do |memo, run|
+          if memo.length > 0 && memo.last.is_a?(Swordfish::Node::Text) && run.is_a?(Swordfish::Node::Text) && memo.last.style == run.style
+            memo.last.content += run.content
+          else
+            memo << run
+          end
+          memo
+        end
+        texts
+      end
+      # Parse a paragraph
+      # node    - the paragraph XML element
+      # context - optional relationship context (such as :footnotes) that is passed
+      #           on to the run parser; callers parsing note content supply it,
+      #           and omitting it keeps the previous single-argument behavior
+      # Returns a Swordfish::Node::Paragraph holding the parsed runs.
+      def _node_parse_paragraph(node, context = nil)
+        paragraph = Swordfish::Node::Paragraph.new
+        _node_parse_runs(node, context).each {|r| paragraph.append(r)}
+        # Apply the referenced paragraph style, if the styles part defined it
+        if node.xpath("./w:pPr/w:pStyle").length > 0
+          style_id = node.xpath("./w:pPr/w:pStyle")[0]['w:val'].to_sym
+          paragraph.style = @styles[style_id] if @styles[style_id]
+        end
+        paragraph
+      end
+      # Parse a list
+      # Office OpenXML has no list element: a list item is a paragraph that
+      # references an abstract numbering scheme and carries an indentation level.
+      # The Swordfish::Node::List is therefore assembled in the build buffer, and
+      # is only known to be complete once a non-list element is encountered.
+      def _node_parse_list(node)
+        level_nodes = node.xpath(".//w:numPr/w:ilvl")
+        # A paragraph without a level is a further paragraph of the current list item
+        if level_nodes.length.zero?
+          extra_paragraph = Swordfish::Node::Paragraph.new
+          _node_parse_runs(node).each {|r| extra_paragraph.append(r)}
+          @buffer.last_list_item(:recurse => true).wrap_children(Swordfish::Node::Text, Swordfish::Node::Paragraph)
+          @buffer.last_list_item(:recurse => true).append extra_paragraph
+          return
+        end
+        # Build the list item and read its level and numbering scheme
+        item = Swordfish::Node::ListItem.new
+        _node_parse_runs(node).each {|r| item.append(r)}
+        level = level_nodes[0]['w:val'].to_i
+        scheme_id = node.xpath(".//w:numPr/w:numId")[0]['w:val'].to_i
+        # An empty build buffer means this item starts a new list
+        unless @buffer
+          @buffer = Swordfish::Node::List.new
+          @buffer.stylize @numbering[scheme_id][level].to_sym
+          # Lists may have an arbitrary initial level
+          @buffer_initial_value = level
+        end
+        # Compare this item's level with the depth of the bottommost node in the
+        # buffer to decide whether it is a sibling or opens a nested list
+        if @buffer.depth_of_final_node >= level || @buffer.children.empty?
+          # Sibling: descend to the list at this level and append
+          parent_list = @buffer
+          (level - @buffer_initial_value).times do
+            parent_list = parent_list.last_list_item.nested_list
+          end
+          parent_list.append item
+        elsif @buffer.depth_of_final_node < level
+          # Nested: descend to the list one level up and hang a new list off its last item
+          parent_list = @buffer
+          (level - @buffer_initial_value - 1).times do
+            parent_list = parent_list.last_list_item.nested_list
+          end
+          nested = Swordfish::Node::List.new
+          nested.append item
+          nested.stylize @numbering[scheme_id][level].to_sym
+          parent_list.last_list_item.append nested
+        end
+      end
+      # Parse a table
+      # Returns a Swordfish::Node::Table with one parsed row per direct w:tr child.
+      def _node_parse_table(node)
+        node.xpath("./w:tr").each_with_object(Swordfish::Node::Table.new) do |row_xml, table|
+          table.append _node_parse_table_row(row_xml)
+        end
+      end
+      # Parse a table row
+      # Returns a Swordfish::Node::TableRow with the parsed result of each direct w:tc child.
+      def _node_parse_table_row(node)
+        node.xpath('./w:tc').each_with_object(Swordfish::Node::TableRow.new) do |cell_xml, table_row|
+          table_row.append _node_parse_table_cell(cell_xml)
+        end
+      end
+      # Parse a table cell
+      # A Swordfish::Node::Table needs rows * columns cells, so merged cells are
+      # still represented individually and flagged with "merge_up" / "merge_left".
+      # Returns a single TableCell, or an Array of cells when the cell spans
+      # several columns.
+      def _node_parse_table_cell(node)
+        cell = Swordfish::Node::TableCell.new
+        # Inner content of the cell
+        node.xpath("./w:p").each { |para_xml| cell.append _node_parse_paragraph(para_xml) }
+        # Vertical merges: Office OpenXML repeats the cell in every row it spans and
+        # gives each a vMerge property. Only the topmost one has the value "restart";
+        # a vMerge without a value therefore continues the cell above.
+        v_merge = node.xpath("./w:tcPr/w:vMerge")[0]
+        cell.merge_up = true if v_merge && v_merge['w:val'].nil?
+        # Horizontal merges: the cell is defined once with a gridSpan giving the
+        # number of columns covered, so create the missing cells and flag them.
+        grid_span = node.xpath("./w:tcPr/w:gridSpan")[0]
+        spanned_cells = []
+        if grid_span
+          (grid_span['w:val'].to_i - 1).times do
+            filler = Swordfish::Node::TableCell.new
+            filler.merge_left = true
+            spanned_cells << filler
+          end
+        end
+        spanned_cells.empty? ? cell : [cell] + spanned_cells
+      end
+    end
+  end
+end

data/lib/swordfish/nodes/base.rb CHANGED Viewed

@@ -6,7 +6,7 @@ module Swordfish
       attr_accessor :content
       attr_accessor :children
-      attr_reader :style
+      attr_accessor :style
       # Initialize with a blank stylesheet and no children
       def initialize
@@ -28,7 +28,15 @@ module Swordfish
       # Take a style or styles and add them to this node's stylesheet
       def stylize(styles)
-        @style.merge styles
+        if styles.is_a? Hash
+          # Key/value pairs
+          styles.each do |k, v|
+            @style.send "#{k}=".to_sym, v
+          end
+        else
+          # Boolean values
+          @style.merge styles
+        end
       end
       # Every subclass must implement to_html in order to be converted to HTML
@@ -67,6 +75,14 @@ module Swordfish
         nodes.compact
       end
+      # Build a node of another class carrying this node's style, children and content.
+      # klass - the class to instantiate; must be Swordfish::Node::Base or a subclass
+      # Returns the new node, or nil when klass is not a Base (sub)class.
+      # NOTE(review): the values are handed to inform! as-is; whether they end up
+      # shared with or copied from this node depends on inform! - confirm.
+      def replace_with(klass)
+        return unless klass <= Swordfish::Node::Base
+        klass.new.tap do |replacement|
+          replacement.inform!({:style => @style, :children => @children, :content => @content })
+        end
+      end
     end
     class BadContentError < Exception

data/lib/swordfish/nodes/footnote.rb ADDED Viewed

@@ -0,0 +1,21 @@
+# A footnote node
+# Renders as a bracketed reference link in the text; the note's own content is
+# rendered separately through content_to_html. Both renderings are empty
+# strings until an index has been assigned.
+module Swordfish
+  module Node
+    class Footnote < Base
+      attr_accessor :index
+      # The in-text reference, linking to the note content
+      def to_html
+        if @index
+          "<a id='footnote-ref-#{@index}' href='#footnote-#{@index}'>[#{@index}]</a>"
+        else
+          ""
+        end
+      end
+      # The note content, prefixed with a link back to the in-text reference
+      def content_to_html
+        return "" unless @index
+        body = @children.map(&:to_html).join
+        "<p><a id='footnote-#{@index}' href='#footnote-ref-#{@index}'>[#{@index}]</a> #{body}</p>"
+      end
+    end
+  end
+end

data/lib/swordfish/nodes/header.rb ADDED Viewed

@@ -0,0 +1,18 @@
+# A header node
+# Renders its children inside an HTML heading tag chosen from its level.
+module Swordfish
+  module Node
+    class Header < Base
+      attr_accessor :level
+      # Raises a RuntimeError when no level has been set. Levels above 6 are
+      # rendered as h6.
+      def to_html
+        raise "Missing header level" unless @level
+        tag = "h#{@level > 6 ? 6 : @level}"
+        inner = @children.map(&:to_html).join
+        "<#{tag}>#{inner}</#{tag}>"
+      end
+    end
+  end
+end

data/lib/swordfish/nodes/linebreak.rb ADDED Viewed

@@ -0,0 +1,13 @@
+# A linebreak node
+# Always renders as a self-closing HTML line break, whatever content it holds.
+module Swordfish
+  module Node
+    class Linebreak < Base
+      def to_html
+        '<br/>'
+      end
+    end
+  end
+end

data/lib/swordfish/nodes/list_item.rb CHANGED Viewed

@@ -5,7 +5,7 @@ module Swordfish
     class ListItem < Base
       def to_html
-        "<li>#{@children.map(&:to_html).join}</li>"
+        "<li>#{@children.map(&:to_html).join.strip}</li>"
       end
       # Return the nested list, or nil if this list item has no nested lists

data/lib/swordfish/nodes/paragraph.rb CHANGED Viewed

@@ -11,7 +11,7 @@ module Swordfish
           # If the only child is an image, don't bother putting it in a P tag
           @children.map(&:to_html).join
         else
-          text = @children.map(&:to_html).join
+          text = @children.map(&:to_html).join.strip
           "<p>#{text}</p>" unless text =~ /^[[:space:]]*$/
         end
       end

data/lib/swordfish/nodes/raw.rb ADDED Viewed

@@ -0,0 +1,19 @@
+# A raw content node
+# Its content is emitted verbatim: nothing is reformatted and nothing is escaped,
+# so whoever sets the content is responsible for it being valid output.
+module Swordfish
+  module Node
+    class Raw < Base
+      # A raw node never has children, so appending to it always raises BadContentError
+      def append(node)
+        raise BadContentError
+      end
+      # Returns the stored content unchanged
+      def to_html
+        @content
+      end
+    end
+  end
+end

data/lib/swordfish/stylesheet.rb CHANGED Viewed

@@ -4,6 +4,7 @@ module Swordfish
   class Stylesheet
     attr_reader :styles
+    attr_accessor :font_size
     # Define all supported values here
     SUPPORTED_STYLES = [

data/lib/swordfish.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 require 'swordfish/document'
-require 'swordfish/formats/docx'
+require 'swordfish/formats/docx/document'
 module Swordfish
@@ -8,7 +8,7 @@ module Swordfish
     extension = (opts[:extension] || filepath.split('.').last).downcase.to_sym
     case extension
       when :docx
-        Swordfish::DOCX.open(filepath)
+        Swordfish::DOCX::Document.open(filepath)
       else
         raise UnsupportedFormatError, "'#{extension}' is not a recognized file format"
     end

metadata CHANGED Viewed

@@ -1,57 +1,69 @@
 --- !ruby/object:Gem::Specification
 name: swordfish
 version: !ruby/object:Gem::Version
-  version: 0.0.6
+  version: 0.0.7
 platform: ruby
 authors:
 - Martin Posthumus
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-06-05 00:00:00.000000000 Z
+date: 2014-07-08 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ! '>='
+    - - ~>
       - !ruby/object:Gem::Version
-        version: '0'
+        version: '1'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ! '>='
+    - - ~>
       - !ruby/object:Gem::Version
-        version: '0'
+        version: '1'
 - !ruby/object:Gem::Dependency
   name: nokogiri
   requirement: !ruby/object:Gem::Requirement
     requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '1.6'
     - - ! '>='
       - !ruby/object:Gem::Version
-        version: '0'
+        version: 1.6.0
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '1.6'
     - - ! '>='
       - !ruby/object:Gem::Version
-        version: '0'
+        version: 1.6.0
 - !ruby/object:Gem::Dependency
   name: rubyzip
   requirement: !ruby/object:Gem::Requirement
     requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '1.1'
     - - ! '>='
       - !ruby/object:Gem::Version
-        version: '0'
+        version: 1.1.0
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '1.1'
     - - ! '>='
       - !ruby/object:Gem::Version
-        version: '0'
+        version: 1.1.0
 description: A simple library for various word processor formats focusing primarily
   around conversion to HTML
 email: martin.posthumus@gmail.com
@@ -62,13 +74,18 @@ files:
 - README.md
 - lib/swordfish.rb
 - lib/swordfish/document.rb
-- lib/swordfish/formats/docx.rb
+- lib/swordfish/formats/docx/document.rb
+- lib/swordfish/formats/docx/parser.rb
 - lib/swordfish/nodes/base.rb
+- lib/swordfish/nodes/footnote.rb
+- lib/swordfish/nodes/header.rb
 - lib/swordfish/nodes/hyperlink.rb
 - lib/swordfish/nodes/image.rb
+- lib/swordfish/nodes/linebreak.rb
 - lib/swordfish/nodes/list.rb
 - lib/swordfish/nodes/list_item.rb
 - lib/swordfish/nodes/paragraph.rb
+- lib/swordfish/nodes/raw.rb
 - lib/swordfish/nodes/table.rb
 - lib/swordfish/nodes/table_cell.rb
 - lib/swordfish/nodes/table_row.rb

data/lib/swordfish/formats/docx.rb DELETED Viewed

@@ -1,362 +0,0 @@
-require 'zip'
-require 'nokogiri'
-require 'swordfish/document'
-# Swordfish::DOCX defines a parser for .docx (Office OpenXML) formats
-module Swordfish
-  class DOCX
-    attr_reader :swordfish_doc   # The Swordfish::Document corresponding to the parsed document
-    attr_reader :docx_archive    # The source archive
-    # Parse a document and return a Swordfish::Document object
-    def self.open(filepath)
-      # .docx is a zipped file format consisting of several XML files.
-      # Read in the content of each needed file.
-      docx_archive = Zip::File.open(filepath)
-      document = docx_archive.read 'word/document.xml'
-      styles = docx_archive.read 'word/styles.xml'
-      numbering = docx_archive.read('word/numbering.xml') rescue nil
-      relationships = docx_archive.read('word/_rels/document.xml.rels') rescue nil
-      # Parse the XML files and generate the Swordfish::Document
-      swordfish_docx = new docx_archive, document, styles, numbering, relationships
-      swordfish_docx.swordfish_doc
-    end
-    def initialize(archive, document_xml, styles_xml, numbering_xml, relationships_xml)
-      @docx_archive = archive
-      @swordfish_doc = Swordfish::Document.new
-      parse_styles styles_xml
-      parse_numbering(numbering_xml) if numbering_xml
-      parse_relationships(relationships_xml) if relationships_xml
-      parse document_xml
-    end
-    private
-    # Take the contents of the build buffer and flush them into the Swordfish::Document object.
-    # This buffer is needed for certain docx constructs that consist of multiple top-level
-    # elements but correspond to a single Swordfish::Node, such as lists.
-    def flush
-      @swordfish_doc.append(@buffer) if @buffer
-      @buffer = nil
-    end
-    # Parse the document structure XML
-    def parse(document_xml)
-      @xml = Nokogiri::XML(document_xml)
-      # Iterate over each element node and dispatch it to the appropriate parser
-      @xml.xpath('//w:body').children.each do |node|
-        case node.name
-          when 'p'
-            if node.xpath('.//w:numPr').length == 0 && (@buffer.is_a?(Swordfish::Node::List) ? node.xpath('.//w:ind').length.zero? : true)
-              # Regular paragraph
-              # (The buffer check makes sure that this isn't an indented paragraph immediately after a list item,
-              # which means we're most likely dealing with a multi-paragraph list item)
-              flush
-              @swordfish_doc.append _node_parse_paragraph(node)
-            elsif node.xpath('.//w:numPr/ancestor::w:pPrChange').length.zero?
-              # List paragraph
-              # (must have a numPr node, but cannot have a pPrChange ancestor, since that means
-              # we are just looking at historical changes)
-              # (Don't flush because we need to first ensure the list is fully parsed)
-              _node_parse_list(node)
-            end
-          when 'tbl'
-            flush
-            @swordfish_doc.append _node_parse_table(node)
-        end
-      end
-      flush
-    end
-    # Parse styles out of a docx element property nodeset (*Pr) and stylize the Swordfish::Node
-    def get_styles_for_node(swordfish_node, xml_nodeset)
-      return unless xml_nodeset
-      xml_nodeset.children.each do |style_node|
-        case style_node.name
-          when 'i'
-            swordfish_node.stylize :italic
-          when 'b'
-            swordfish_node.stylize :bold
-          when 'u'
-            swordfish_node.stylize :underline
-          when 'strike'
-            swordfish_node.stylize :strikethrough
-          when 'vertAlign'
-            if style_node['w:val'] == 'superscript'
-              swordfish_node.stylize :superscript
-            elsif style_node['w:val'] == 'subscript'
-              swordfish_node.stylize :subscript
-            end
-          when 'rStyle'
-            if style_node['w:val'] == 'Strong'
-              swordfish_node.stylize :strong
-            elsif style_node['w:val'] == 'Emphasis'
-              swordfish_node.stylize :emphasis
-            end
-        end
-      end
-    end
-    # Parse the document styles XML
-    def parse_styles(styles_xml)
-    end
-    # Parse the abstract numbering XML (defining things such as list numbering)
-    def parse_numbering(numbering_xml)
-      # The XML maps a numbering ID (numId) to an abstract numbering schema ID (abstractNumId).
-      # The abstract numbering schema defines display formats for each level of indentation (lvl).
-      # This function will load up the relevant data into the @numbering instance variable in the form
-      # of a nested hash: @numbering[numbering ID][indentation level] = number format.
-      @numbering = {}
-      xml = Nokogiri::XML(numbering_xml)
-      xml.xpath("//w:num").each do |num|
-        numId = num['w:numId'].to_i
-        abstractNumId = num.xpath("./w:abstractNumId")[0]['w:val'].to_i
-        abstract_numbering = {}
-        xml.xpath("//w:abstractNum[@w:abstractNumId='#{abstractNumId}']/w:lvl").each do |level_format|
-          level = level_format['w:ilvl'].to_i
-          format = level_format.xpath("./w:numFmt")[0]['w:val']
-          abstract_numbering[level] = format
-        end
-        @numbering[numId] = abstract_numbering
-      end
-    end
-    # Parse the relationships XML (defining things such as internal references and external links)
-    def parse_relationships(relationships_xml)
-      # The XML contains a list of relationships identified by an id. Each relationship includes
-      # a target attribute designating the reference. This function will load up the relevant
-      # data into the @relationships instance variable in the form of a hash:
-      # @relationships[relationship ID] = target URI.
-      @relationships = {}
-      xml = Nokogiri::XML(relationships_xml)
-      xml.css("Relationship").each do |rel| # Nokogiri doesn't seem to like XPath here for some reason
-        @relationships[rel['Id']] = rel['Target']
-      end
-    end
-    # Extract an image resource as a tempfile
-    def read_image(image_name)
-      tempfile = Tempfile.new(image_name)
-      tempfile.write @docx_archive.get_input_stream("word/media/#{image_name}").read
-      tempfile.close
-      tempfile
-    end
-    # NODE PARSERS
-    # Each of the methods below (beginning with '_node') are specialized parsers for handling
-    # a particular type of XML element.
-    # Parse one or more runs
-    def _node_parse_runs(node)
-      # The 'run' is the basic unit of text in Office OpenXML. A paragraph, table cell, or other
-      # block element may contain one or more runs, and each run has an associated set of styles.
-      texts = []
-      # A complex field is a special type of node spanning multiple runs, where most of the runs
-      # designate a special control flow rather than normal text.
-      complex_field = nil
-      nodes = node.is_a?(Array) ? node : node.children
-      nodes.each_with_index do |run_xml, idx|
-        case run_xml.name
-          when 'r'
-            if run_xml.xpath('./w:t').length > 0 && complex_field.nil?
-              # A true run node
-              # Only examine the run if it includes text codes. The run may also include
-              # things like comment nodes, which should be ignored.
-              text = Swordfish::Node::Text.new
-              text.content = run_xml.xpath('./w:t')[0].content
-              get_styles_for_node(text, run_xml.xpath('./w:rPr')[0])
-              texts << text
-            elsif run_xml.xpath('.//*[name()="pic:pic"]').length > 0
-              # An image run
-              image = Swordfish::Node::Image.new
-              relationship_id = run_xml.xpath('.//*[name()="pic:pic"]/*[name()="pic:blipFill"]/*[name()="a:blip"]')[0]['r:embed'] rescue nil
-              if relationship_id
-                image.original_name = @relationships[relationship_id].split('/').last
-                @swordfish_doc.images[image.original_name] = read_image(image.original_name)
-                texts << image
-              end
-            elsif run_xml.xpath('./w:fldChar').length > 0 || complex_field
-              # A complex field
-              case
-                when run_xml.xpath('./w:fldChar').length > 0 && run_xml.xpath('./w:fldChar')[0]['w:fldCharType'] == 'begin'
-                  # Start the complex field
-                  complex_field = true
-                when run_xml.xpath('./w:instrText').length > 0
-                  # An instruction run, defining the complex field's behavior
-                  instruction = run_xml.xpath('./w:instrText')[0].content
-                  if instruction =~ /^\s*HYPERLINK/
-                    # A hyperlink
-                    complex_field = Swordfish::Node::Hyperlink.new
-                    complex_field.href = instruction.match(/^\s*HYPERLINK "([^"]+)"/).captures[0]
-                  else
-                    # Anything else
-                    complex_field = Swordfish::Node::Text.new
-                  end
-                when run_xml.xpath('./w:t').length > 0 && complex_field.children.length.zero?
-                  # The textual content
-                  complex_field.append(_node_parse_runs(nodes.to_a[idx..-1]))
-                when run_xml.xpath('./w:fldChar').length > 0 && run_xml.xpath('./w:fldChar')[0]['w:fldCharType'] == 'end'
-                  # End the complex field
-                  if complex_field
-                    texts << complex_field
-                    complex_field = nil
-                  else
-                    # Handle the case where _node_parse_runs gets called from within a complex field
-                    return texts
-                  end
-              end
-            end
-          when 'hyperlink'
-            # Hyperlink nodes are placed amongst other run nodes, but
-            # they themselves also contain runs. Hyperlinks include
-            # a relationship ID attribute defining their reference.
-            link = Swordfish::Node::Hyperlink.new
-            link.href = @relationships[run_xml['r:id']]
-            _node_parse_runs(run_xml).each {|r| link.append(r)}
-            texts << link
-        end
-      end
-      # Clean up runs by merging them if they have identical styles
-      texts = texts.reduce([]) do |memo, run|
-        if memo.length > 0 && memo.last.is_a?(Swordfish::Node::Text) && run.is_a?(Swordfish::Node::Text) && memo.last.style == run.style
-          memo.last.content += run.content
-        else
-          memo << run
-        end
-        memo
-      end
-      texts
-    end
-    # Parse a paragraph
-    def _node_parse_paragraph(node)
-      paragraph = Swordfish::Node::Paragraph.new
-      _node_parse_runs(node).each {|r| paragraph.append(r)}
-      paragraph
-    end
-    # Parse a list
-    def _node_parse_list(node)
-      # In Office OpenXML, a list is not a distinct element type, but rather a
-      # specialized paragraph that references an abstract numbering scheme
-      # and includes an indentation level. As a result, the build buffer
-      # must be used to assemble the Swordfish::Node representation of the list,
-      # since the only way to tell the list has been fully parsed is to encounter
-      # a non-list element.
-      # Handle paragraphs with no level, which represent multi-paragraph list items
-      if node.xpath(".//w:numPr/w:ilvl").length.zero?
-        para = Swordfish::Node::Paragraph.new
-        _node_parse_runs(node).each {|r| para.append(r)}
-        @buffer.last_list_item(:recurse => true).wrap_children(Swordfish::Node::Text, Swordfish::Node::Paragraph)
-        @buffer.last_list_item(:recurse => true).append para
-        return
-      end
-      # Get the list item's abstract numbering and level
-      list_item = Swordfish::Node::ListItem.new
-      _node_parse_runs(node).each {|r| list_item.append(r)}
-      level = node.xpath(".//w:numPr/w:ilvl")[0]['w:val'].to_i
-      numbering_scheme = node.xpath(".//w:numPr/w:numId")[0]['w:val'].to_i
-      # If the build buffer is empty, this is a new list
-      unless @buffer
-        @buffer = Swordfish::Node::List.new
-        @buffer.stylize @numbering[numbering_scheme][level].to_sym
-        @buffer_initial_value = level # Lists may have an arbitrary initial level
-      end
-      # Compare the level of this list item to the bottommost node in
-      # the build buffer to determine where in the hierarchy to add
-      # this node (i.e., are we dealing with list nesting or not?)
-      if @buffer.depth_of_final_node >= level || @buffer.children.empty?
-        # Add sibling to existing list
-        target = @buffer
-        (level - @buffer_initial_value).times do
-          target = target.last_list_item.nested_list
-        end
-        target.append list_item
-      elsif @buffer.depth_of_final_node < level
-        # Add new nested list
-        target = @buffer
-        (level - @buffer_initial_value- 1).times do
-          target = target.last_list_item.nested_list
-        end
-        list = Swordfish::Node::List.new
-        list.append list_item
-        list.stylize @numbering[numbering_scheme][level].to_sym
-        target.last_list_item.append list
-      end
-    end
-    # Parse a table
-    def _node_parse_table(node)
-      table = Swordfish::Node::Table.new
-      node.xpath("./w:tr").each do |row|
-        table.append _node_parse_table_row(row)
-      end
-      table
-    end
-    # Parse a table row
-    def _node_parse_table_row(node)
-      row = Swordfish::Node::TableRow.new
-      node.xpath('./w:tc').each do |cell|
-        row.append _node_parse_table_cell(cell)
-      end
-      row
-    end
-    # Parse a table cell
-    def _node_parse_table_cell(node)
-      # In a Swordfish::Node::Table object, the number of table cells must equal the
-      # total number of rows times the total number of columns; that is, even if
-      # two cells are merged together, there must be a Swordfish::Node::TableCell for
-      # each one. Merges are defined using the "merge_up" and "merge_left" properties.
-      cell = Swordfish::Node::TableCell.new
-      extra_cells = []
-      # Get the inner content of the cell
-      node.xpath("./w:p").each do |paragraph|
-        cell.append _node_parse_paragraph(paragraph)
-      end
-      # Determine whether this cell spans multiple rows. In Office OpenXML,
-      # a table cell is defined in every row, even if the cell is vertically-merged. The representation
-      # of the merged cell within each row is given a vMerge property, with the topmost one also
-      # having a vMerge value of "restart", and the others having no vMerge value.
-      if node.xpath("./w:tcPr/w:vMerge").length > 0 && node.xpath("./w:tcPr/w:vMerge")[0]['w:val'].nil?
-        cell.merge_up = true
-      end
-      # Determine whether this cell spans multiple columns. Unlike with vertical merges,
-      # a horizontally-merged Office OpenXML cell is only defined once, but is given a gridSpan
-      # property defining the number of columns it spans. Since Swordfish requires a cell for each
-      # column, loop to generate the additional cells, and set their merge_left values appropriately.
-      if node.xpath("./w:tcPr/w:gridSpan").length > 0
-        node.xpath("./w:tcPr/w:gridSpan")[0]['w:val'].to_i.-(1).times do
-          c = Swordfish::Node::TableCell.new
-          c.merge_left = true
-          extra_cells << c
-        end
-      end
-      # Return the generated cell or cells
-      if extra_cells.empty?
-        return cell
-      else
-        return [cell] + extra_cells
-      end
-    end
-  end
-end