swordfish 0.0.6 → 0.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- OGRlYTY4Yjg5NmY5N2QzMWU2Yzg0ZjAxMDAwM2VmZjUwNThhNDMzOA==
4
+ MjdhYWJlNWU2YmQzY2U1MzY5ODFjYWY2M2FiODczMjZhMDZiYTA1Mg==
5
5
  data.tar.gz: !binary |-
6
- NGUzNTViOGMwMTJmZmFjOGE4YTA1NzU3MGQwYTMyY2I3YmYzZjhlNQ==
6
+ ZTMwMDFkODM5NzlmMjQ3NGQ4YTljMDA0NzE2YTVjNmIxZDkzZGIyOQ==
7
7
  SHA512:
8
8
  metadata.gz: !binary |-
9
- YWI3YmFiZGJiYzk3NTRiNTAwNjljZmQ5ODA2NjY0MjMzNjljNTVmYWU3ZmUx
10
- OGYwMTljZjU3YjY3MmYzYjcwYmE3NGYzZDc3ZThlNmNmOWJlZDA2YTg2NGFm
11
- YzQwMmY0ZmZkMDM4NmQ1OTE4OGZmMjdkODEwZjE3NTUxYWFhZmI=
9
+ ZTAxMWQ0MDY3Y2FmZjdjOTQwNzMzMzI1MmNlM2VlMWIyZDU2ZTZlMjcyYjNl
10
+ MmFiZDlmMWUxMGVlYWFiOTBlZGI3ZDE5OWZiN2M5MjllZGQ1MjUwOGExNTYy
11
+ ZjkzNjRmMTcyOTkzNjMzYjdmYTk3MDE4YTIyMjJhNjBmMzk2ZjE=
12
12
  data.tar.gz: !binary |-
13
- YTAxYmU1NDBjNzIwNWNhZWRjY2UzMjIwMGQ5ZTk0ZjgwMmVhMzQ5YjA0Yjhh
14
- NWEyYzQxZDMzYzEyOWViYjQ2Y2RjZjg3OTkzMjRkYTc0NWZmOGIyMzVlNWZj
15
- M2ZlN2U3MDE0YjZhYzkwMjA0MDQ2Y2FmNDViNDMwMmU1NDM5Mzk=
13
+ NDA2MzliMGI4ZTMwZDE4MDUyZDI3YzNjMTZmOTUwY2YxNzc3NjU2NWZkNzgw
14
+ NjNkMjc4YTIzNzY4NTVmZDk2YzllZmJkOTZmZWM3Mjg3ZWE2ZDMwMjY4ZDE0
15
+ ZDQ1NGQ1ZWJhYzdmMWYxMTYzNjc3MmIyZDdkZWIxY2RkZTkzM2E=
@@ -11,6 +11,10 @@ require 'swordfish/nodes/table'
11
11
  require 'swordfish/nodes/table_row'
12
12
  require 'swordfish/nodes/table_cell'
13
13
  require 'swordfish/nodes/image'
14
+ require 'swordfish/nodes/header'
15
+ require 'swordfish/nodes/footnote'
16
+ require 'swordfish/nodes/linebreak'
17
+ require 'swordfish/nodes/raw'
14
18
 
15
19
  # Swordfish::Document is the internal representation of a parsed document.
16
20
 
@@ -56,13 +60,21 @@ module Swordfish
56
60
  end
57
61
  end
58
62
 
59
- def to_html(opts = {})
60
- html = @nodes.map(&:to_html).join
61
-
62
- if opts[:pretty]
63
- Nokogiri::HTML(html).to_html
63
+ # Perform various destructive operations that may result in improved output
64
+ def settings(opts = {})
65
+ find_headers! if opts[:guess_headers]
66
+ find_footnotes! if opts[:footnotes]
67
+ @generate_full_document = !!opts[:full_document]
68
+ self
69
+ end
70
+
71
+ def to_html
72
+ if @generate_full_document
73
+ prefix = "<!DOCTYPE html><html><head><title></title></head><body>"
74
+ suffix = "</body></html>"
75
+ prefix + @nodes.map(&:to_html).join + suffix
64
76
  else
65
- html
77
+ @nodes.map(&:to_html).join
66
78
  end
67
79
  end
68
80
 
@@ -72,5 +84,51 @@ module Swordfish
72
84
  def find_nodes_by_type(klass)
73
85
  @nodes.collect{|n| n.find_nodes_by_type(klass)}.flatten
74
86
  end
87
+
88
+ # Attempt to identify header nodes
89
+ def find_headers!
90
+ font_sizes = []
91
+ # If a paragraph has a single font size throughout, mark it in the array.
92
+ @nodes.each_with_index do |node, idx|
93
+ if node.is_a?(Swordfish::Node::Paragraph)
94
+ para_size = node.style.font_size
95
+ run_sizes = node.children.collect{ |n| n.style.font_size }.compact
96
+ if (run_sizes.length == 1) || (run_sizes.length == 0 && para_size)
97
+ font_sizes << {:idx => idx, :size => run_sizes.first || para_size}
98
+ end
99
+ end
100
+ end
101
+
102
+ # For each node with a consistent size, if it is larger than both of
103
+ # its neighbors, flag it as a header
104
+ header_sizes = []
105
+ font_sizes.each_with_index do |f, idx|
106
+ if idx == 0
107
+ header_sizes << f[:size] if f[:size] > font_sizes[idx+1][:size]
108
+ elsif idx != font_sizes.length - 1
109
+ header_sizes << f[:size] if (f[:size] > font_sizes[idx-1][:size] && f[:size] > font_sizes[idx+1][:size])
110
+ end
111
+ end
112
+ header_sizes = header_sizes.uniq.sort.reverse
113
+ font_sizes.each do |f|
114
+ level = header_sizes.find_index(f[:size])
115
+ if level
116
+ header = @nodes[f[:idx]].replace_with(Swordfish::Node::Header)
117
+ header.inform! :level => (level + 1)
118
+ @nodes[f[:idx]] = header
119
+ end
120
+ end
121
+ end
122
+
123
+ # Find all foot/endnotes and number them
124
+ def find_footnotes!
125
+ find_nodes_by_type(Swordfish::Node::Footnote).each_with_index do |footnote, idx|
126
+ footnote.inform!({:index => idx})
127
+ footnote_content = Swordfish::Node::Raw.new
128
+ footnote_content.content = footnote.content_to_html
129
+ @nodes << footnote_content
130
+ end
131
+ end
132
+
75
133
  end
76
134
  end
@@ -0,0 +1,215 @@
1
+ require 'zip'
2
+ require 'nokogiri'
3
+ require 'swordfish/document'
4
+ require_relative 'parser'
5
+
6
+ # Swordfish::DOCX defines a parser for .docx (Office OpenXML) formats
7
+
8
+ module Swordfish
9
+ module DOCX
10
+ class Document
11
+
12
+ include Swordfish::DOCX::Parser
13
+
14
+ attr_reader :swordfish_doc # The Swordfish::Document corresponding to the parsed document
15
+ attr_reader :docx_archive # The source archive
16
+
17
+ # Parse a document and return a Swordfish::Document object
18
+ def self.open(filepath)
19
+ # .docx is a zipped file format consisting of several XML files.
20
+ # Read in the content of each needed file.
21
+ docx_archive = Zip::File.open(filepath)
22
+
23
+ xml_docs = {
24
+ :document => docx_archive.read('word/document.xml'),
25
+ :styles => docx_archive.read('word/styles.xml'),
26
+ :numbering => (docx_archive.read('word/numbering.xml') rescue nil),
27
+ :relationships => (docx_archive.read('word/_rels/document.xml.rels') rescue nil),
28
+ :footnotes => (docx_archive.read('word/footnotes.xml') rescue nil),
29
+ :footnote_rels => (docx_archive.read('word/_rels/footnotes.xml.rels') rescue nil),
30
+ :endnotes => (docx_archive.read('word/endnotes.xml') rescue nil),
31
+ :endnote_rels => (docx_archive.read('word/_rels/endnotes.xml.rels') rescue nil)
32
+ }
33
+
34
+ # Parse the XML files and generate the Swordfish::Document
35
+ swordfish_docx = new docx_archive, xml_docs
36
+ swordfish_docx.swordfish_doc
37
+ end
38
+
39
+ def initialize(archive, xml_docs)
40
+ @docx_archive = archive
41
+ @swordfish_doc = Swordfish::Document.new
42
+ parse_styles xml_docs[:styles]
43
+ parse_numbering(xml_docs[:numbering]) if xml_docs[:numbering]
44
+ parse_relationships(xml_docs[:relationships]) if xml_docs[:relationships]
45
+ parse_relationships(xml_docs[:footnote_rels], :footnotes) if xml_docs[:footnote_rels]
46
+ parse_relationships(xml_docs[:endnote_rels], :endnotes) if xml_docs[:endnote_rels]
47
+ parse_footnotes(xml_docs[:footnotes]) if xml_docs[:footnotes]
48
+ parse_endnotes(xml_docs[:endnotes]) if xml_docs[:endnotes]
49
+ parse xml_docs[:document]
50
+ end
51
+
52
+ private
53
+
54
+ # Take the contents of the build buffer and flush them into the Swordfish::Document object.
55
+ # This buffer is needed for certain docx constructs that consist of multiple top-level
56
+ # elements but correspond to a single Swordfish::Node, such as lists.
57
+ def flush
58
+ @swordfish_doc.append(@buffer) if @buffer
59
+ @buffer = nil
60
+ end
61
+
62
+ # Parse the document structure XML
63
+ def parse(document_xml)
64
+ @xml = Nokogiri::XML(document_xml)
65
+
66
+ # Iterate over each element node and dispatch it to the appropriate parser
67
+ @xml.xpath('//w:body').children.each do |node|
68
+ case node.name
69
+ when 'p'
70
+ if node.xpath('.//w:numPr').length == 0 && (@buffer.is_a?(Swordfish::Node::List) ? node.xpath('.//w:ind').length.zero? : true)
71
+ # Regular paragraph
72
+ # (The buffer check makes sure that this isn't an indented paragraph immediately after a list item,
73
+ # which means we're most likely dealing with a multi-paragraph list item)
74
+ flush
75
+ @swordfish_doc.append _node_parse_paragraph(node)
76
+ elsif node.xpath('.//w:numPr/ancestor::w:pPrChange').length.zero?
77
+ # List paragraph
78
+ # (must have a numPr node, but cannot have a pPrChange ancestor, since that means
79
+ # we are just looking at historical changes)
80
+ # (Don't flush because we need to first ensure the list is fully parsed)
81
+ _node_parse_list(node)
82
+ end
83
+ when 'tbl'
84
+ flush
85
+ @swordfish_doc.append _node_parse_table(node)
86
+ end
87
+ end
88
+ flush
89
+ end
90
+
91
+ # Parse styles out of a docx element property nodeset (*Pr) and stylize the Swordfish::Node
92
+ # If the Swordfish::Node is not provided, return a stylesheet instead
93
+ def get_styles_for_node(xml_nodeset, swordfish_node = nil)
94
+ return unless xml_nodeset
95
+ swordfish_node = Swordfish::Node::Base.new if swordfish_node.nil?
96
+ xml_nodeset.children.each do |style_node|
97
+ case style_node.name
98
+ when 'i'
99
+ swordfish_node.stylize :italic
100
+ when 'b'
101
+ swordfish_node.stylize :bold
102
+ when 'u'
103
+ swordfish_node.stylize :underline
104
+ when 'strike'
105
+ swordfish_node.stylize :strikethrough
106
+ when 'sz'
107
+ swordfish_node.stylize :font_size => (style_node['w:val'].to_i / 2)
108
+ when 'szCs' && !swordfish_node.style.font_size
109
+ # Only use complex script size node if there is no standard size node
110
+ swordfish_node.stylize :font_size => (style_node['w:val'].to_i / 2)
111
+ when 'vertAlign'
112
+ if style_node['w:val'] == 'superscript'
113
+ swordfish_node.stylize :superscript
114
+ elsif style_node['w:val'] == 'subscript'
115
+ swordfish_node.stylize :subscript
116
+ end
117
+ when 'rStyle'
118
+ if style_node['w:val'] == 'Strong'
119
+ swordfish_node.stylize :strong
120
+ elsif style_node['w:val'] == 'Emphasis'
121
+ swordfish_node.stylize :emphasis
122
+ end
123
+ end
124
+ end
125
+ swordfish_node.style
126
+ end
127
+
128
+ # Parse the document styles XML
129
+ def parse_styles(styles_xml)
130
+ # This XML document defines a number of styles, which can be referenced by the document
131
+ # XML in order to quickly reference repeated styles without having to redefine them for
132
+ # every run. This function will load needed styles into a hash keyed by the style ID.
133
+ @styles = {}
134
+ xml = Nokogiri::XML(styles_xml)
135
+ xml.xpath("//w:style").each do |style|
136
+ style_id = style['w:styleId']
137
+ stylesheet = get_styles_for_node(style.xpath(".//w:rPr"))
138
+ @styles[style_id.to_sym] = stylesheet
139
+ end
140
+ end
141
+
142
+ # Parse the abstract numbering XML (defining things such as list numbering)
143
+ def parse_numbering(numbering_xml)
144
+ # The XML maps a numbering ID (numId) to an abstract numbering schema ID (abstractNumId).
145
+ # The abstract numbering schema defines display formats for each level of indentation (lvl).
146
+ # This function will load up the relevant data into the @numbering instance variable in the form
147
+ # of a nested hash: @numbering[numbering ID][indentation level] = number format.
148
+ @numbering = {}
149
+ xml = Nokogiri::XML(numbering_xml)
150
+ xml.xpath("//w:num").each do |num|
151
+ numId = num['w:numId'].to_i
152
+ abstractNumId = num.xpath("./w:abstractNumId")[0]['w:val'].to_i
153
+ abstract_numbering = {}
154
+ xml.xpath("//w:abstractNum[@w:abstractNumId='#{abstractNumId}']/w:lvl").each do |level_format|
155
+ level = level_format['w:ilvl'].to_i
156
+ format = level_format.xpath("./w:numFmt")[0]['w:val']
157
+ abstract_numbering[level] = format
158
+ end
159
+ @numbering[numId] = abstract_numbering
160
+ end
161
+ end
162
+
163
+ # Parse the relationships XML (defining things such as internal references and external links)
164
+ def parse_relationships(relationships_xml, type = nil)
165
+ # The XML contains a list of relationships identified by an id. Each relationship includes
166
+ # a target attribute designating the reference. This function will load up the relevant
167
+ # data into the @relationships instance variable in the form of a hash:
168
+ # @relationships[relationship ID] = target URI.
169
+ rels = @relationships ||= {}
170
+ rels = (@relationships[type] ||= {}) if type
171
+ xml = Nokogiri::XML(relationships_xml)
172
+ xml.css("Relationship").each do |rel| # Nokogiri doesn't seem to like XPath here for some reason
173
+ rels[rel['Id']] = rel['Target']
174
+ end
175
+ end
176
+
177
+ # Parse the footnotes XML
178
+ def parse_footnotes(footnotes_xml)
179
+ @footnotes = {}
180
+ xml = Nokogiri::XML(footnotes_xml)
181
+ xml.xpath("//w:footnote[@w:id > 0]").each do |footnote|
182
+ id = footnote['w:id'].to_i
183
+ f = Swordfish::Node::Footnote.new
184
+ footnote.xpath(".//w:p").each do |p|
185
+ f.append _node_parse_paragraph(p, :footnotes)
186
+ end
187
+ @footnotes[id] = f
188
+ end
189
+ end
190
+
191
+ # Parse the endnotes XML
192
+ def parse_endnotes(endnotes_xml)
193
+ @endnotes = {}
194
+ xml = Nokogiri::XML(endnotes_xml)
195
+ xml.xpath("//w:endnote[@w:id > 0]").each do |endnote|
196
+ id = endnote['w:id'].to_i
197
+ f = Swordfish::Node::Footnote.new
198
+ endnote.xpath(".//w:p").each do |p|
199
+ f.append _node_parse_runs(p, :endnotes)
200
+ end
201
+ @endnotes[id] = f
202
+ end
203
+ end
204
+
205
+ # Extract an image resource as a tempfile
206
+ def read_image(image_name)
207
+ tempfile = Tempfile.new(image_name)
208
+ tempfile.write @docx_archive.get_input_stream("word/media/#{image_name}").read
209
+ tempfile.close
210
+ tempfile
211
+ end
212
+
213
+ end
214
+ end
215
+ end
@@ -0,0 +1,232 @@
1
+ module Swordfish
2
+ module DOCX
3
+ module Parser
4
+
5
+ # NODE PARSERS
6
+ # Each of the methods below (beginning with '_node') is a specialized parser for handling
7
+ # a particular type of XML element.
8
+
9
+ # Parse one or more runs
10
+ def _node_parse_runs(node, context = nil)
11
+ # The 'run' is the basic unit of text in Office OpenXML. A paragraph, table cell, or other
12
+ # block element may contain one or more runs, and each run has an associated set of styles.
13
+ texts = []
14
+ # A complex field is a special type of node spanning multiple runs, where most of the runs
15
+ # designate a special control flow rather than normal text.
16
+ complex_field = nil
17
+
18
+ nodes = node.is_a?(Array) ? node : node.children
19
+ nodes.each_with_index do |run_xml, idx|
20
+ case run_xml.name
21
+ when 'r'
22
+ if run_xml.xpath('./w:t').length > 0 && complex_field.nil?
23
+ # A True run node
24
+ # Only examine the run if it includes text codes. The run may also include
25
+ # things like comment nodes, which should be ignored.
26
+ text = Swordfish::Node::Text.new
27
+ text.content = run_xml.xpath('./w:t')[0].content
28
+ get_styles_for_node(run_xml.xpath('./w:rPr')[0], text)
29
+ texts << text
30
+ elsif run_xml.xpath('.//*[name()="pic:pic"]').length > 0
31
+ # An image run
32
+ image = Swordfish::Node::Image.new
33
+ relationship_id = run_xml.xpath('.//*[name()="pic:pic"]/*[name()="pic:blipFill"]/*[name()="a:blip"]')[0]['r:embed'] rescue nil
34
+ if relationship_id
35
+ image.original_name = @relationships[relationship_id].split('/').last
36
+ @swordfish_doc.images[image.original_name] = read_image(image.original_name)
37
+ texts << image
38
+ end
39
+ elsif run_xml.xpath('./w:fldChar').length > 0 || complex_field
40
+ # A complex field
41
+ case
42
+ when run_xml.xpath('./w:fldChar').length > 0 && run_xml.xpath('./w:fldChar')[0]['w:fldCharType'] == 'begin'
43
+ # Start the complex field
44
+ complex_field = true
45
+ when run_xml.xpath('./w:instrText').length > 0
46
+ # An instruction run, defining the complex field's behavior
47
+ instruction = run_xml.xpath('./w:instrText')[0].content
48
+ if instruction =~ /^\s*HYPERLINK/
49
+ # A hyperlink
50
+ complex_field = Swordfish::Node::Hyperlink.new
51
+ complex_field.href = instruction.match(/^\s*HYPERLINK "([^"]+)"/).captures[0]
52
+ else
53
+ # Anything else
54
+ complex_field = Swordfish::Node::Text.new
55
+ end
56
+ when run_xml.xpath('./w:t').length > 0 && complex_field.children.length.zero?
57
+ # The textual content
58
+ complex_field.append(_node_parse_runs(nodes.to_a[idx..-1]))
59
+ when run_xml.xpath('./w:fldChar').length > 0 && run_xml.xpath('./w:fldChar')[0]['w:fldCharType'] == 'end'
60
+ # End the complex field
61
+ if complex_field
62
+ texts << complex_field
63
+ complex_field = nil
64
+ else
65
+ # Handle the case where _node_parse_runs gets called from within a complex field
66
+ return texts
67
+ end
68
+ end
69
+ elsif run_xml.xpath('./w:footnoteReference').length > 0
70
+ # A footnote reference
71
+ id = run_xml.xpath('./w:footnoteReference')[0]['w:id'].to_i
72
+ texts << @footnotes[id] if @footnotes[id]
73
+ elsif run_xml.xpath('./w:endnoteReference').length > 0
74
+ # An endnote reference
75
+ id = run_xml.xpath('./w:endnoteReference')[0]['w:id'].to_i
76
+ texts << @endnotes[id] if @endnotes[id]
77
+ elsif run_xml.xpath('./w:br').length > 0
78
+ # A linebreak run
79
+ texts << Swordfish::Node::Linebreak.new
80
+ end
81
+ when 'hyperlink'
82
+ # Hyperlink nodes are placed amongst other run nodes, but
83
+ # they themselves also contain runs. Hyperlinks include
84
+ # a relationship ID attribute defining their reference.
85
+ link = Swordfish::Node::Hyperlink.new
86
+ link.href = context ? @relationships[context][run_xml['r:id']] : @relationships[run_xml['r:id']]
87
+ _node_parse_runs(run_xml).each {|r| link.append(r)}
88
+ texts << link
89
+ end
90
+ end
91
+ # Clean up runs by merging them if they have identical styles
92
+ texts = texts.reduce([]) do |memo, run|
93
+ if memo.length > 0 && memo.last.is_a?(Swordfish::Node::Text) && run.is_a?(Swordfish::Node::Text) && memo.last.style == run.style
94
+ memo.last.content += run.content
95
+ else
96
+ memo << run
97
+ end
98
+ memo
99
+ end
100
+
101
+ texts
102
+ end
103
+
104
+ # Parse a paragraph
105
+ def _node_parse_paragraph(node)
106
+ paragraph = Swordfish::Node::Paragraph.new
107
+ _node_parse_runs(node).each {|r| paragraph.append(r)}
108
+ if node.xpath("./w:pPr/w:pStyle").length > 0
109
+ style_id = node.xpath("./w:pPr/w:pStyle")[0]['w:val'].to_sym
110
+ paragraph.style = @styles[style_id] if @styles[style_id]
111
+ end
112
+ paragraph
113
+ end
114
+
115
+ # Parse a list
116
+ def _node_parse_list(node)
117
+ # In Office OpenXML, a list is not a distinct element type, but rather a
118
+ # specialized paragraph that references an abstract numbering scheme
119
+ # and includes an indentation level. As a result, the build buffer
120
+ # must be used to assemble the Swordfish::Node representation of the list,
121
+ # since the only way to tell the list has been fully parsed is to encounter
122
+ # a non-list element.
123
+
124
+ # Handle paragraphs with no level, which represent multi-paragraph list items
125
+ if node.xpath(".//w:numPr/w:ilvl").length.zero?
126
+ para = Swordfish::Node::Paragraph.new
127
+ _node_parse_runs(node).each {|r| para.append(r)}
128
+ @buffer.last_list_item(:recurse => true).wrap_children(Swordfish::Node::Text, Swordfish::Node::Paragraph)
129
+ @buffer.last_list_item(:recurse => true).append para
130
+ return
131
+ end
132
+
133
+ # Get the list item's abstract numbering and level
134
+ list_item = Swordfish::Node::ListItem.new
135
+ _node_parse_runs(node).each {|r| list_item.append(r)}
136
+ level = node.xpath(".//w:numPr/w:ilvl")[0]['w:val'].to_i
137
+ numbering_scheme = node.xpath(".//w:numPr/w:numId")[0]['w:val'].to_i
138
+
139
+ # If the build buffer is empty, this is a new list
140
+ unless @buffer
141
+ @buffer = Swordfish::Node::List.new
142
+ @buffer.stylize @numbering[numbering_scheme][level].to_sym
143
+ @buffer_initial_value = level # Lists may have an arbitrary initial level
144
+ end
145
+
146
+ # Compare the level of this list item to the bottommost node in
147
+ # the build buffer to determine where in the hierarchy to add
148
+ # this node (i.e., are we dealing with list nesting or not?)
149
+ if @buffer.depth_of_final_node >= level || @buffer.children.empty?
150
+ # Add sibling to existing list
151
+ target = @buffer
152
+ (level - @buffer_initial_value).times do
153
+ target = target.last_list_item.nested_list
154
+ end
155
+ target.append list_item
156
+ elsif @buffer.depth_of_final_node < level
157
+ # Add new nested list
158
+ target = @buffer
159
+ (level - @buffer_initial_value- 1).times do
160
+ target = target.last_list_item.nested_list
161
+ end
162
+ list = Swordfish::Node::List.new
163
+ list.append list_item
164
+ list.stylize @numbering[numbering_scheme][level].to_sym
165
+ target.last_list_item.append list
166
+ end
167
+ end
168
+
169
+ # Parse a table
170
+ def _node_parse_table(node)
171
+ table = Swordfish::Node::Table.new
172
+ node.xpath("./w:tr").each do |row|
173
+ table.append _node_parse_table_row(row)
174
+ end
175
+ table
176
+ end
177
+
178
+ # Parse a table row
179
+ def _node_parse_table_row(node)
180
+ row = Swordfish::Node::TableRow.new
181
+ node.xpath('./w:tc').each do |cell|
182
+ row.append _node_parse_table_cell(cell)
183
+ end
184
+ row
185
+ end
186
+
187
+ # Parse a table cell
188
+ def _node_parse_table_cell(node)
189
+ # In a Swordfish::Node::Table object, the number of table cells must equal the
190
+ # total number of rows times the total number of columns; that is, even if
191
+ # two cells are merged together, there must be a Swordfish::Node::TableCell for
192
+ # each one. Merges are defined using the "merge_up" and "merge_left" properties.
193
+
194
+ cell = Swordfish::Node::TableCell.new
195
+ extra_cells = []
196
+
197
+ # Get the inner content of the cell
198
+ node.xpath("./w:p").each do |paragraph|
199
+ cell.append _node_parse_paragraph(paragraph)
200
+ end
201
+
202
+ # Determine whether this cell spans multiple rows. In Office OpenXML,
203
+ # a table cell is defined in every row, even if the cell is vertically-merged. The representation
204
+ # of the merged cell within each row is given a vMerge property, with the topmost one also
205
+ # having a vMerge value of "restart", and the others having no vMerge value.
206
+ if node.xpath("./w:tcPr/w:vMerge").length > 0 && node.xpath("./w:tcPr/w:vMerge")[0]['w:val'].nil?
207
+ cell.merge_up = true
208
+ end
209
+
210
+ # Determine whether this cell spans multiple columns. Unlike with vertical merges,
211
+ # a horizontally-merged Office OpenXML cell is only defined once, but is given a gridSpan
212
+ # property defining the number of columns it spans. Since Swordfish requires a cell for each
213
+ # column, loop to generate the additional cells, and set their merge_left values appropriately.
214
+ if node.xpath("./w:tcPr/w:gridSpan").length > 0
215
+ node.xpath("./w:tcPr/w:gridSpan")[0]['w:val'].to_i.-(1).times do
216
+ c = Swordfish::Node::TableCell.new
217
+ c.merge_left = true
218
+ extra_cells << c
219
+ end
220
+ end
221
+
222
+ # Return the generated cell or cells
223
+ if extra_cells.empty?
224
+ return cell
225
+ else
226
+ return [cell] + extra_cells
227
+ end
228
+ end
229
+
230
+ end
231
+ end
232
+ end
@@ -6,7 +6,7 @@ module Swordfish
6
6
 
7
7
  attr_accessor :content
8
8
  attr_accessor :children
9
- attr_reader :style
9
+ attr_accessor :style
10
10
 
11
11
  # Initialize with a blank stylesheet and no children
12
12
  def initialize
@@ -28,7 +28,15 @@ module Swordfish
28
28
 
29
29
  # Take a style or styles and add them to this node's stylesheet
30
30
  def stylize(styles)
31
- @style.merge styles
31
+ if styles.is_a? Hash
32
+ # Key/value pairs
33
+ styles.each do |k, v|
34
+ @style.send "#{k}=".to_sym, v
35
+ end
36
+ else
37
+ # Boolean values
38
+ @style.merge styles
39
+ end
32
40
  end
33
41
 
34
42
  # Every subclass must implement to_html in order to be converted to HTML
@@ -67,6 +75,14 @@ module Swordfish
67
75
  nodes.compact
68
76
  end
69
77
 
78
+ # Return a clone of this node with a different class
79
+ def replace_with(klass)
80
+ if klass <= Swordfish::Node::Base
81
+ new_node = klass.new
82
+ new_node.inform!({:style => @style, :children => @children, :content => @content })
83
+ new_node
84
+ end
85
+ end
70
86
  end
71
87
 
72
88
  class BadContentError < Exception
@@ -0,0 +1,21 @@
1
+ # A footnote node
2
+
3
+ module Swordfish
4
+ module Node
5
+ class Footnote < Base
6
+
7
+ attr_accessor :index
8
+
9
+ def to_html
10
+ return "" unless @index
11
+ "<a id='footnote-ref-#{@index}' href='#footnote-#{@index}'>[#{@index}]</a>"
12
+ end
13
+
14
+ def content_to_html
15
+ return "" unless @index
16
+ "<p><a id='footnote-#{@index}' href='#footnote-ref-#{@index}'>[#{@index}]</a> #{@children.map(&:to_html).join}</p>"
17
+ end
18
+
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,18 @@
1
+ # A header node
2
+
3
+ module Swordfish
4
+ module Node
5
+ class Header < Base
6
+
7
+ attr_accessor :level
8
+
9
+ def to_html
10
+ raise "Missing header level" unless @level
11
+ tag = @level <= 6 ? "h#{@level}" : "h6"
12
+ text = @children.map(&:to_html).join
13
+ "<#{tag}>#{text}</#{tag}>"
14
+ end
15
+
16
+ end
17
+ end
18
+ end
@@ -0,0 +1,13 @@
1
+ # A linebreak node
2
+
3
+ module Swordfish
4
+ module Node
5
+ class Linebreak < Base
6
+
7
+ def to_html
8
+ "<br/>"
9
+ end
10
+
11
+ end
12
+ end
13
+ end
@@ -5,7 +5,7 @@ module Swordfish
5
5
  class ListItem < Base
6
6
 
7
7
  def to_html
8
- "<li>#{@children.map(&:to_html).join}</li>"
8
+ "<li>#{@children.map(&:to_html).join.strip}</li>"
9
9
  end
10
10
 
11
11
  # Return the nested list, or nil if this list item has no nested lists
@@ -11,7 +11,7 @@ module Swordfish
11
11
  # If the only child is an image, don't bother putting it in a P tag
12
12
  @children.map(&:to_html).join
13
13
  else
14
- text = @children.map(&:to_html).join
14
+ text = @children.map(&:to_html).join.strip
15
15
  "<p>#{text}</p>" unless text =~ /^[[:space:]]*$/
16
16
  end
17
17
  end
@@ -0,0 +1,19 @@
1
+ # A raw content node
2
+ # This node simply outputs its content as-is, with no attempts to reformat or escape text
3
+
4
+ module Swordfish
5
+ module Node
6
+ class Raw < Base
7
+
8
+ # Override Base append because a raw node should never have children
9
+ def append(node)
10
+ raise BadContentError
11
+ end
12
+
13
+ def to_html
14
+ @content
15
+ end
16
+
17
+ end
18
+ end
19
+ end
@@ -4,6 +4,7 @@ module Swordfish
4
4
  class Stylesheet
5
5
 
6
6
  attr_reader :styles
7
+ attr_accessor :font_size
7
8
 
8
9
  # Define all supported values here
9
10
  SUPPORTED_STYLES = [
data/lib/swordfish.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  require 'swordfish/document'
2
- require 'swordfish/formats/docx'
2
+ require 'swordfish/formats/docx/document'
3
3
 
4
4
  module Swordfish
5
5
 
@@ -8,7 +8,7 @@ module Swordfish
8
8
  extension = (opts[:extension] || filepath.split('.').last).downcase.to_sym
9
9
  case extension
10
10
  when :docx
11
- Swordfish::DOCX.open(filepath)
11
+ Swordfish::DOCX::Document.open(filepath)
12
12
  else
13
13
  raise UnsupportedFormatError, "'#{extension}' is not a recognized file format"
14
14
  end
metadata CHANGED
@@ -1,57 +1,69 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: swordfish
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.6
4
+ version: 0.0.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Martin Posthumus
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-06-05 00:00:00.000000000 Z
11
+ date: 2014-07-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - ! '>='
17
+ - - ~>
18
18
  - !ruby/object:Gem::Version
19
- version: '0'
19
+ version: '1'
20
20
  type: :development
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - ! '>='
24
+ - - ~>
25
25
  - !ruby/object:Gem::Version
26
- version: '0'
26
+ version: '1'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: nokogiri
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
+ - - ~>
32
+ - !ruby/object:Gem::Version
33
+ version: '1.6'
31
34
  - - ! '>='
32
35
  - !ruby/object:Gem::Version
33
- version: '0'
36
+ version: 1.6.0
34
37
  type: :runtime
35
38
  prerelease: false
36
39
  version_requirements: !ruby/object:Gem::Requirement
37
40
  requirements:
41
+ - - ~>
42
+ - !ruby/object:Gem::Version
43
+ version: '1.6'
38
44
  - - ! '>='
39
45
  - !ruby/object:Gem::Version
40
- version: '0'
46
+ version: 1.6.0
41
47
  - !ruby/object:Gem::Dependency
42
48
  name: rubyzip
43
49
  requirement: !ruby/object:Gem::Requirement
44
50
  requirements:
51
+ - - ~>
52
+ - !ruby/object:Gem::Version
53
+ version: '1.1'
45
54
  - - ! '>='
46
55
  - !ruby/object:Gem::Version
47
- version: '0'
56
+ version: 1.1.0
48
57
  type: :runtime
49
58
  prerelease: false
50
59
  version_requirements: !ruby/object:Gem::Requirement
51
60
  requirements:
61
+ - - ~>
62
+ - !ruby/object:Gem::Version
63
+ version: '1.1'
52
64
  - - ! '>='
53
65
  - !ruby/object:Gem::Version
54
- version: '0'
66
+ version: 1.1.0
55
67
  description: A simple library for various word processor formats focusing primarily
56
68
  around conversion to HTML
57
69
  email: martin.posthumus@gmail.com
@@ -62,13 +74,18 @@ files:
62
74
  - README.md
63
75
  - lib/swordfish.rb
64
76
  - lib/swordfish/document.rb
65
- - lib/swordfish/formats/docx.rb
77
+ - lib/swordfish/formats/docx/document.rb
78
+ - lib/swordfish/formats/docx/parser.rb
66
79
  - lib/swordfish/nodes/base.rb
80
+ - lib/swordfish/nodes/footnote.rb
81
+ - lib/swordfish/nodes/header.rb
67
82
  - lib/swordfish/nodes/hyperlink.rb
68
83
  - lib/swordfish/nodes/image.rb
84
+ - lib/swordfish/nodes/linebreak.rb
69
85
  - lib/swordfish/nodes/list.rb
70
86
  - lib/swordfish/nodes/list_item.rb
71
87
  - lib/swordfish/nodes/paragraph.rb
88
+ - lib/swordfish/nodes/raw.rb
72
89
  - lib/swordfish/nodes/table.rb
73
90
  - lib/swordfish/nodes/table_cell.rb
74
91
  - lib/swordfish/nodes/table_row.rb
@@ -1,362 +0,0 @@
1
- require 'zip'
2
- require 'nokogiri'
3
- require 'swordfish/document'
4
-
5
- # Swordfish::DOCX defines a parser for .docx (Office OpenXML) formats
6
-
7
- module Swordfish
8
- class DOCX
9
-
10
- attr_reader :swordfish_doc # The Swordfish::Document corresponding to the parsed document
11
- attr_reader :docx_archive # The source archive
12
-
13
- # Parse a document and return a Swordfish::Document object
14
- def self.open(filepath)
15
- # .docx is a zipped file format consisting of several XML files.
16
- # Read in the content of each needed file.
17
- docx_archive = Zip::File.open(filepath)
18
- document = docx_archive.read 'word/document.xml'
19
- styles = docx_archive.read 'word/styles.xml'
20
- numbering = docx_archive.read('word/numbering.xml') rescue nil
21
- relationships = docx_archive.read('word/_rels/document.xml.rels') rescue nil
22
-
23
- # Parse the XML files and generate the Swordfish::Document
24
- swordfish_docx = new docx_archive, document, styles, numbering, relationships
25
- swordfish_docx.swordfish_doc
26
- end
27
-
28
- def initialize(archive, document_xml, styles_xml, numbering_xml, relationships_xml)
29
- @docx_archive = archive
30
- @swordfish_doc = Swordfish::Document.new
31
- parse_styles styles_xml
32
- parse_numbering(numbering_xml) if numbering_xml
33
- parse_relationships(relationships_xml) if relationships_xml
34
- parse document_xml
35
- end
36
-
37
- private
38
-
39
- # Take the contents of the build buffer and flush them into the Swordfish::Document object.
40
- # This buffer is needed for certain docx constructs that consist of multiple top-level
41
- # elements but correspond to a single Swordfish::Node, such as lists.
42
- def flush
43
- @swordfish_doc.append(@buffer) if @buffer
44
- @buffer = nil
45
- end
46
-
47
- # Parse the document structure XML
48
- def parse(document_xml)
49
- @xml = Nokogiri::XML(document_xml)
50
-
51
- # Iterate over each element node and dispatch it to the appropriate parser
52
- @xml.xpath('//w:body').children.each do |node|
53
- case node.name
54
- when 'p'
55
- if node.xpath('.//w:numPr').length == 0 && (@buffer.is_a?(Swordfish::Node::List) ? node.xpath('.//w:ind').length.zero? : true)
56
- # Regular paragraph
57
- # (The buffer check makes sure that this isn't an indented paragraph immediately after a list item,
58
- # which means we're most likely dealing with a multi-paragraph list item)
59
- flush
60
- @swordfish_doc.append _node_parse_paragraph(node)
61
- elsif node.xpath('.//w:numPr/ancestor::w:pPrChange').length.zero?
62
- # List paragraph
63
- # (must have a numPr node, but cannot have a pPrChange ancestor, since that means
64
- # we are just looking at historical changes)
65
- # (Don't flush because we need to first ensure the list is fully parsed)
66
- _node_parse_list(node)
67
- end
68
- when 'tbl'
69
- flush
70
- @swordfish_doc.append _node_parse_table(node)
71
- end
72
- end
73
- flush
74
- end
75
-
76
- # Parse styles out of a docx element property nodeset (*Pr) and stylize the Swordfish::Node
77
- def get_styles_for_node(swordfish_node, xml_nodeset)
78
- return unless xml_nodeset
79
- xml_nodeset.children.each do |style_node|
80
- case style_node.name
81
- when 'i'
82
- swordfish_node.stylize :italic
83
- when 'b'
84
- swordfish_node.stylize :bold
85
- when 'u'
86
- swordfish_node.stylize :underline
87
- when 'strike'
88
- swordfish_node.stylize :strikethrough
89
- when 'vertAlign'
90
- if style_node['w:val'] == 'superscript'
91
- swordfish_node.stylize :superscript
92
- elsif style_node['w:val'] == 'subscript'
93
- swordfish_node.stylize :subscript
94
- end
95
- when 'rStyle'
96
- if style_node['w:val'] == 'Strong'
97
- swordfish_node.stylize :strong
98
- elsif style_node['w:val'] == 'Emphasis'
99
- swordfish_node.stylize :emphasis
100
- end
101
- end
102
- end
103
- end
104
-
105
- # Parse the document styles XML
106
- def parse_styles(styles_xml)
107
- end
108
-
109
- # Parse the abstract numbering XML (defining things such as list numbering)
110
- def parse_numbering(numbering_xml)
111
- # The XML maps a numbering ID (numId) to an abstract numbering schema ID (abstractNumId).
112
- # The abstract numbering schema defines display formats for each level of indentation (lvl).
113
- # This function will load up the relevant data into the @numbering class variable in the form
114
- # of a nested hash: @numbering[numbering ID][indentation level] = number format.
115
- @numbering = {}
116
- xml = Nokogiri::XML(numbering_xml)
117
- xml.xpath("//w:num").each do |num|
118
- numId = num['w:numId'].to_i
119
- abstractNumId = num.xpath("./w:abstractNumId")[0]['w:val'].to_i
120
- abstract_numbering = {}
121
- xml.xpath("//w:abstractNum[@w:abstractNumId='#{abstractNumId}']/w:lvl").each do |level_format|
122
- level = level_format['w:ilvl'].to_i
123
- format = level_format.xpath("./w:numFmt")[0]['w:val']
124
- abstract_numbering[level] = format
125
- end
126
- @numbering[numId] = abstract_numbering
127
- end
128
- end
129
-
130
- # Parse the relationships XML (defining things such as internal references and external links)
131
- def parse_relationships(relationships_xml)
132
- # The XML contains a list of relationships identified by an id. Each relationship includes
133
- # a target attribute designating the reference. THis function will load up the relevant
134
- # data into the @relationships class variable in the form of a hash:
135
- # @relationships[relationship ID] = target URI.
136
- @relationships = {}
137
- xml = Nokogiri::XML(relationships_xml)
138
- xml.css("Relationship").each do |rel| # Nokogiri doesn't seem to like XPath here for some reason
139
- @relationships[rel['Id']] = rel['Target']
140
- end
141
- end
142
-
143
- # Extract an image resource as a tempfile
144
- def read_image(image_name)
145
- tempfile = Tempfile.new(image_name)
146
- tempfile.write @docx_archive.get_input_stream("word/media/#{image_name}").read
147
- tempfile.close
148
- tempfile
149
- end
150
-
151
- # NODE PARSERS
152
- # Each of the methods below (beginning with '_node') are specialized parsers for handling
153
- # a particular type of XML element.
154
-
155
- # Parse one or more runs
156
- def _node_parse_runs(node)
157
- # The 'run' is the basic unit of text in Office OpenXML. A paragraph, table cell, or other
158
- # block element may contain one or more runs, and each run has an associated set of styles.
159
- texts = []
160
- # A complex field is a special type of node spanning multiple runs, where most of the runs
161
- # designate a special control flow rather than normal text.
162
- complex_field = nil
163
-
164
- nodes = node.is_a?(Array) ? node : node.children
165
- nodes.each_with_index do |run_xml, idx|
166
- case run_xml.name
167
- when 'r'
168
- if run_xml.xpath('./w:t').length > 0 && complex_field.nil?
169
- # A True run node
170
- # Only examine the run if it includes text codes. The run may also include
171
- # things like comment nodes, which should be ignored.
172
- text = Swordfish::Node::Text.new
173
- text.content = run_xml.xpath('./w:t')[0].content
174
- get_styles_for_node(text, run_xml.xpath('./w:rPr')[0])
175
- texts << text
176
- elsif run_xml.xpath('.//*[name()="pic:pic"]').length > 0
177
- # An image run
178
- image = Swordfish::Node::Image.new
179
- relationship_id = run_xml.xpath('.//*[name()="pic:pic"]/*[name()="pic:blipFill"]/*[name()="a:blip"]')[0]['r:embed'] rescue nil
180
- if relationship_id
181
- image.original_name = @relationships[relationship_id].split('/').last
182
- @swordfish_doc.images[image.original_name] = read_image(image.original_name)
183
- texts << image
184
- end
185
- elsif run_xml.xpath('./w:fldChar').length > 0 || complex_field
186
- # A complex field
187
- case
188
- when run_xml.xpath('./w:fldChar').length > 0 && run_xml.xpath('./w:fldChar')[0]['w:fldCharType'] == 'begin'
189
- # Start the complex field
190
- complex_field = true
191
- when run_xml.xpath('./w:instrText').length > 0
192
- # An instruction run, defining the complex field's behavior
193
- instruction = run_xml.xpath('./w:instrText')[0].content
194
- if instruction =~ /^\s*HYPERLINK/
195
- # A hyperlink
196
- complex_field = Swordfish::Node::Hyperlink.new
197
- complex_field.href = instruction.match(/^\s*HYPERLINK "([^"]+)"/).captures[0]
198
- else
199
- # Anything else
200
- complex_field = Swordfish::Node::Text.new
201
- end
202
- when run_xml.xpath('./w:t').length > 0 && complex_field.children.length.zero?
203
- # The textual content
204
- complex_field.append(_node_parse_runs(nodes.to_a[idx..-1]))
205
- when run_xml.xpath('./w:fldChar').length > 0 && run_xml.xpath('./w:fldChar')[0]['w:fldCharType'] == 'end'
206
- # End the complex field
207
- if complex_field
208
- texts << complex_field
209
- complex_field = nil
210
- else
211
- # Handle the case where _node_parse_runs gets called from within a complex field
212
- return texts
213
- end
214
- end
215
- end
216
- when 'hyperlink'
217
- # Hyperlink nodes are placed amongst other run nodes, but
218
- # they themselves also contain runs. Hyperlinks include
219
- # a relationship ID attribute defining their reference.
220
- link = Swordfish::Node::Hyperlink.new
221
- link.href = @relationships[run_xml['r:id']]
222
- _node_parse_runs(run_xml).each {|r| link.append(r)}
223
- texts << link
224
- end
225
- end
226
- # Clean up runs by merging them if they have identical styles
227
- texts = texts.reduce([]) do |memo, run|
228
- if memo.length > 0 && memo.last.is_a?(Swordfish::Node::Text) && run.is_a?(Swordfish::Node::Text) && memo.last.style == run.style
229
- memo.last.content += run.content
230
- else
231
- memo << run
232
- end
233
- memo
234
- end
235
-
236
- texts
237
- end
238
-
239
- # Parse a paragraph
240
- def _node_parse_paragraph(node)
241
- paragraph = Swordfish::Node::Paragraph.new
242
- _node_parse_runs(node).each {|r| paragraph.append(r)}
243
- paragraph
244
- end
245
-
246
- # Parse a list
247
- def _node_parse_list(node)
248
- # In Office OpenXML, a list is not a distinct element type, but rather a
249
- # specialized paragraph that references an abstract numbering scheme
250
- # and includes an indentation level. As a result, the build buffer
251
- # must be used to assemble the Swordfish::Node representation of the list,
252
- # since the only way to tell the list has been fully parsed is to encounter
253
- # a non-list element.
254
-
255
- # Handle paragraphs with no level, which represent multi-paragraph list items
256
- if node.xpath(".//w:numPr/w:ilvl").length.zero?
257
- para = Swordfish::Node::Paragraph.new
258
- _node_parse_runs(node).each {|r| para.append(r)}
259
- @buffer.last_list_item(:recurse => true).wrap_children(Swordfish::Node::Text, Swordfish::Node::Paragraph)
260
- @buffer.last_list_item(:recurse => true).append para
261
- return
262
- end
263
-
264
- # Get the list item's abstract numbering and level
265
- list_item = Swordfish::Node::ListItem.new
266
- _node_parse_runs(node).each {|r| list_item.append(r)}
267
- level = node.xpath(".//w:numPr/w:ilvl")[0]['w:val'].to_i
268
- numbering_scheme = node.xpath(".//w:numPr/w:numId")[0]['w:val'].to_i
269
-
270
- # If the build buffer is empty, this is a new list
271
- unless @buffer
272
- @buffer = Swordfish::Node::List.new
273
- @buffer.stylize @numbering[numbering_scheme][level].to_sym
274
- @buffer_initial_value = level # Lists may have an arbitrary initial level
275
- end
276
-
277
- # Compare the level of this list item to the bottommost node in
278
- # the build buffer to determine where in the hierarchy to add
279
- # this node (i.e., are we dealing with list nesting or not?)
280
- if @buffer.depth_of_final_node >= level || @buffer.children.empty?
281
- # Add sibling to existing list
282
- target = @buffer
283
- (level - @buffer_initial_value).times do
284
- target = target.last_list_item.nested_list
285
- end
286
- target.append list_item
287
- elsif @buffer.depth_of_final_node < level
288
- # Add new nested list
289
- target = @buffer
290
- (level - @buffer_initial_value- 1).times do
291
- target = target.last_list_item.nested_list
292
- end
293
- list = Swordfish::Node::List.new
294
- list.append list_item
295
- list.stylize @numbering[numbering_scheme][level].to_sym
296
- target.last_list_item.append list
297
- end
298
- end
299
-
300
- # Parse a table
301
- def _node_parse_table(node)
302
- table = Swordfish::Node::Table.new
303
- node.xpath("./w:tr").each do |row|
304
- table.append _node_parse_table_row(row)
305
- end
306
- table
307
- end
308
-
309
- # Parse a table row
310
- def _node_parse_table_row(node)
311
- row = Swordfish::Node::TableRow.new
312
- node.xpath('./w:tc').each do |cell|
313
- row.append _node_parse_table_cell(cell)
314
- end
315
- row
316
- end
317
-
318
- # Parse a table cell
319
- def _node_parse_table_cell(node)
320
- # In a Swordfish::Node::Table object, the number of table cells must equal the
321
- # total number of rows times the total number of columns; that is, even if
322
- # two cells are merged together, there must be a Swordfish::Node::TableCell for
323
- # each one. Merges are defined using the "merge_up" and "merge_left" properties.
324
-
325
- cell = Swordfish::Node::TableCell.new
326
- extra_cells = []
327
-
328
- # Get the inner content of the cell
329
- node.xpath("./w:p").each do |paragraph|
330
- cell.append _node_parse_paragraph(paragraph)
331
- end
332
-
333
- # Determine whether this cell spans multiple rows. In Office OpenXML,
334
- # a table cell is defined in every row, even if the cell is vertically-merged. The representation
335
- # of the merged cell within each row is given a vMerge property, with the topmost one also
336
- # having a vMerge value of "restart", and the others having no vMerge value.
337
- if node.xpath("./w:tcPr/w:vMerge").length > 0 && node.xpath("./w:tcPr/w:vMerge")[0]['w:val'].nil?
338
- cell.merge_up = true
339
- end
340
-
341
- # Determine whether this cell spans multiple columns. Unlike with vertical merges,
342
- # a horizontally-merged Office OpenXML cell is only defined once, but is given a gridSpan
343
- # property defining the number of columns it spans. Since Swordfish requires a cell for each
344
- # column, loop to generate the additional cells, and set their merge_left values appropriately.
345
- if node.xpath("./w:tcPr/w:gridSpan").length > 0
346
- node.xpath("./w:tcPr/w:gridSpan")[0]['w:val'].to_i.-(1).times do
347
- c = Swordfish::Node::TableCell.new
348
- c.merge_left = true
349
- extra_cells << c
350
- end
351
- end
352
-
353
- # Return the generated cell or cells
354
- if extra_cells.empty?
355
- return cell
356
- else
357
- return [cell] + extra_cells
358
- end
359
- end
360
-
361
- end
362
- end