swordfish 0.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/README.md +4 -0
- data/lib/swordfish/document.rb +37 -0
- data/lib/swordfish/formats/docx.rb +274 -0
- data/lib/swordfish/nodes/base.rb +47 -0
- data/lib/swordfish/nodes/hyperlink.rb +15 -0
- data/lib/swordfish/nodes/list.rb +33 -0
- data/lib/swordfish/nodes/list_item.rb +18 -0
- data/lib/swordfish/nodes/paragraph.rb +17 -0
- data/lib/swordfish/nodes/table.rb +85 -0
- data/lib/swordfish/nodes/table_cell.rb +37 -0
- data/lib/swordfish/nodes/table_row.rb +13 -0
- data/lib/swordfish/nodes/text.rb +25 -0
- data/lib/swordfish/stylesheet.rb +42 -0
- data/lib/swordfish.rb +19 -0
- metadata +101 -0
checksums.yaml
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
---
|
2
|
+
!binary "U0hBMQ==":
|
3
|
+
metadata.gz: !binary |-
|
4
|
+
NzVjN2FlM2EwNzY2ZWQxOTJiZjdiZmVlZTZiZDdhMTE2MjYyNWQ1Ng==
|
5
|
+
data.tar.gz: !binary |-
|
6
|
+
MTU0ZjQzMTRiOTkzMGU5NDdkMDk0MzAyZTc5NTkyNTBiNzQwOGNiMA==
|
7
|
+
SHA512:
|
8
|
+
metadata.gz: !binary |-
|
9
|
+
ZWM2M2IzZjJmMmExMGYxNGU3MTllNmE2ZjQ2YTdlZDhiOTE0YWU3YjYwYzBl
|
10
|
+
MjBiY2ZiYTE0MWQ2OWRlYTkyYzE3ZTg1Y2I4NDIwOWYxNDY3MDk1ZWM0NjYw
|
11
|
+
YzcxYmVjZjEzNWEyNjI5NmU5OWM1Y2IyZTg3YzBhNWFhNjliNjg=
|
12
|
+
data.tar.gz: !binary |-
|
13
|
+
NmI2NGQ4ZjIxOWI2YzEzNWI5OTEwMzE4NzEyYjRmZjg2MmJjOTcwMDQ1OWYx
|
14
|
+
NzZjZjBjZmEyMzhhMTcwNjEyMzE1M2RkNzM2ZWQwZTE1YzhiMjVhNWRmOTQx
|
15
|
+
NWJlZTdkMTY1MDg0YTY5NDcwMGUzODdkM2I4ODFmOWEzMWM1MjE=
|
data/README.md
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
require 'swordfish/stylesheet'
|
2
|
+
require 'swordfish/nodes/base'
|
3
|
+
require 'swordfish/nodes/text'
|
4
|
+
require 'swordfish/nodes/paragraph'
|
5
|
+
require 'swordfish/nodes/list'
|
6
|
+
require 'swordfish/nodes/list_item'
|
7
|
+
require 'swordfish/nodes/hyperlink'
|
8
|
+
require 'swordfish/nodes/table'
|
9
|
+
require 'swordfish/nodes/table_row'
|
10
|
+
require 'swordfish/nodes/table_cell'
|
11
|
+
|
12
|
+
# Swordfish::Document is the internal representation of a parsed document.
|
13
|
+
|
14
|
+
module Swordfish
  # In-memory representation of a parsed document: an ordered collection of
  # top-level nodes that can be rendered to HTML.
  class Document

    attr_reader :nodes # An array of all top-level elements in the document

    # On initialization, set the nodes list to an empty array
    def initialize
      @nodes = []
    end

    # Append a top-level node to the document.
    #
    # The node must descend from Swordfish::Node::Base. (Checking the actual
    # ancestry, rather than comparing the class's bare name against the
    # constants of Swordfish::Node, keeps unrelated classes that merely share
    # a name with a node type from slipping through.)
    #
    # node    - the node to add
    # Returns the updated nodes array.
    # Raises ArgumentError if the object is not a node.
    def append(node)
      raise ArgumentError, "Object is not a node" unless node.is_a?(Swordfish::Node::Base)

      @nodes << node
    end

    # Render the document by concatenating the HTML of every top-level node.
    def to_html
      @nodes.map(&:to_html).join
    end
  end
end
|
@@ -0,0 +1,274 @@
|
|
1
|
+
require 'zip'
|
2
|
+
require 'nokogiri'
|
3
|
+
require 'swordfish/document'
|
4
|
+
|
5
|
+
# Swordfish::DOCX defines a parser for .docx (Office OpenXML) formats
|
6
|
+
|
7
|
+
module Swordfish
  # Parser for .docx (Office OpenXML) files. It reads the XML parts of the
  # archive and builds a Swordfish::Document out of paragraphs, lists and tables.
  class DOCX

    # w:val values that switch a formatting property off: "0"/"false"/"off" for
    # on/off properties such as w:b or w:i, and "none" for w:u.
    DISABLED_STYLE_VALUES = %w[0 false off none].freeze

    attr_reader :swordfish_doc # The Swordfish::Document corresponding to the parsed document

    # Parse a document and return a Swordfish::Document object
    #
    # filepath - path to the .docx file
    def self.open(filepath)
      document = styles = numbering = relationships = nil

      # .docx is a zipped file format consisting of several XML files.
      # The block form guarantees the archive is closed again, even on error.
      Zip::File.open(filepath) do |archive|
        document = archive.read 'word/document.xml'
        styles = archive.read 'word/styles.xml'
        relationships = archive.read 'word/_rels/document.xml.rels'
        # The numbering part only exists when the document contains lists
        if archive.find_entry('word/numbering.xml')
          numbering = archive.read 'word/numbering.xml'
        end
      end

      # Parse the XML files and generate the Swordfish::Document
      new(document, styles, numbering, relationships).swordfish_doc
    end

    # Build the Swordfish::Document from the raw XML of each part.
    # numbering_xml may be nil when the archive has no numbering part.
    def initialize(document_xml, styles_xml, numbering_xml, relationships_xml)
      @swordfish_doc = Swordfish::Document.new
      parse_styles styles_xml
      parse_numbering numbering_xml
      parse_relationships relationships_xml
      parse document_xml
    end

    private

    # Take the contents of the build buffer and flush them into the Swordfish::Document object.
    # This buffer is needed for certain docx constructs that consist of multiple top-level
    # elements but correspond to a single Swordfish::Node, such as lists.
    def flush
      @swordfish_doc.append(@buffer) if @buffer
      @buffer = nil
    end

    # Parse the document structure XML
    def parse(document_xml)
      @xml = Nokogiri::XML(document_xml)

      # Iterate over each element node and dispatch it to the appropriate parser
      @xml.xpath('//w:body').children.each do |node|
        case node.name
        when 'p'
          if node.xpath('.//w:numPr').length == 0
            # Regular paragraph
            flush
            @swordfish_doc.append _node_parse_paragraph(node)
          else
            # List paragraph
            # (Don't flush because we need to first ensure the list is fully parsed)
            _node_parse_list(node)
          end
        when 'tbl'
          flush
          @swordfish_doc.append _node_parse_table(node)
        end
      end
      flush
    end

    # Parse styles out of a docx element property nodeset (*Pr) and stylize the Swordfish::Node
    def get_styles_for_node(swordfish_node, xml_nodeset)
      return unless xml_nodeset
      xml_nodeset.children.each do |style_node|
        case style_node.name
        when 'i'
          swordfish_node.stylize :italic if style_enabled?(style_node)
        when 'b'
          swordfish_node.stylize :bold if style_enabled?(style_node)
        when 'u'
          swordfish_node.stylize :underline if style_enabled?(style_node)
        when 'strike'
          swordfish_node.stylize :strikethrough if style_enabled?(style_node)
        when 'vertAlign'
          if style_node['w:val'] == 'superscript'
            swordfish_node.stylize :superscript
          elsif style_node['w:val'] == 'subscript'
            swordfish_node.stylize :subscript
          end
        end
      end
    end

    # True unless the property element explicitly turns the style off.
    # An element without a w:val attribute counts as enabled.
    def style_enabled?(style_node)
      !DISABLED_STYLE_VALUES.include?(style_node['w:val'])
    end

    # Parse the document styles XML (not implemented yet)
    def parse_styles(styles_xml)
    end

    # Parse the abstract numbering XML (defining things such as list numbering)
    def parse_numbering(numbering_xml)
      # The XML maps a numbering ID (numId) to an abstract numbering schema ID (abstractNumId).
      # The abstract numbering schema defines display formats for each level of indentation (lvl).
      # This function will load up the relevant data into the @numbering instance variable in the
      # form of a nested hash: @numbering[numbering ID][indentation level] = number format.
      @numbering = {}
      return if numbering_xml.nil? || numbering_xml.empty?

      xml = Nokogiri::XML(numbering_xml)
      xml.xpath("//w:num").each do |num|
        abstract_ref = num.xpath("./w:abstractNumId")[0]
        next unless abstract_ref

        abstract_num_id = abstract_ref['w:val'].to_i
        formats = {}
        xml.xpath("//w:abstractNum[@w:abstractNumId='#{abstract_num_id}']/w:lvl").each do |level_format|
          num_fmt = level_format.xpath("./w:numFmt")[0]
          next unless num_fmt
          formats[level_format['w:ilvl'].to_i] = num_fmt['w:val']
        end
        @numbering[num['w:numId'].to_i] = formats
      end
    end

    # Parse the relationships XML (defining things such as internal references and external links)
    def parse_relationships(relationships_xml)
      # The XML contains a list of relationships identified by an id. Each relationship includes
      # a target attribute designating the reference. This function will load up the relevant
      # data into the @relationships instance variable in the form of a hash:
      # @relationships[relationship ID] = target URI.
      @relationships = {}
      xml = Nokogiri::XML(relationships_xml)
      xml.css("Relationship").each do |rel| # Nokogiri doesn't seem to like XPath here for some reason
        @relationships[rel['Id']] = rel['Target']
      end
    end

    # NODE PARSERS
    # Each of the methods below (beginning with '_node') are specialized parsers for handling
    # a particular type of XML element.

    # Parse one or more runs
    #
    # Returns an array of Swordfish::Node::Text and Swordfish::Node::Hyperlink objects.
    def _node_parse_runs(node)
      # The 'run' is the basic unit of text in Office OpenXML. A paragraph, table cell, or other
      # block element may contain one or more runs, and each run has an associated set of styles.
      texts = []
      node.children.each do |run_xml|
        case run_xml.name
        when 'r'
          # A true run node. A run may hold no text at all (only a break, a tab,
          # a drawing...) or several text elements, so skip the former and
          # concatenate the latter.
          text_elements = run_xml.xpath('./w:t')
          next if text_elements.empty?

          text = Swordfish::Node::Text.new
          text.content = text_elements.map(&:content).join
          get_styles_for_node(text, run_xml.xpath('./w:rPr')[0])
          texts << text
        when 'hyperlink'
          # Hyperlink nodes are placed amongst other run nodes, but
          # they themselves also contain runs. Hyperlinks include
          # a relationship ID attribute defining their reference.
          link = Swordfish::Node::Hyperlink.new
          link.href = @relationships[run_xml['r:id']]
          _node_parse_runs(run_xml).each { |r| link.append(r) }
          texts << link
        end
      end
      texts
    end

    # Parse a paragraph
    def _node_parse_paragraph(node)
      paragraph = Swordfish::Node::Paragraph.new
      _node_parse_runs(node).each { |r| paragraph.append(r) }
      paragraph
    end

    # Parse a list
    def _node_parse_list(node)
      # In Office OpenXML, a list is not a distinct element type, but rather a
      # specialized paragraph that references an abstract numbering scheme
      # and includes an indentation level. As a result, the build buffer
      # must be used to assemble the Swordfish::Node representation of the list,
      # since the only way to tell the list has been fully parsed is to encounter
      # a non-list element.

      # Get the list item's abstract numbering and level.
      # A missing ilvl is treated as the top level.
      list_item = Swordfish::Node::ListItem.new
      _node_parse_runs(node).each { |r| list_item.append(r) }
      ilvl = node.xpath(".//w:numPr/w:ilvl")[0]
      num_id = node.xpath(".//w:numPr/w:numId")[0]
      declared_level = ilvl ? ilvl['w:val'].to_i : 0
      numbering_scheme = num_id ? num_id['w:val'].to_i : nil
      style = list_style_for(numbering_scheme, declared_level)

      # If the build buffer is empty, this is a new list
      unless @buffer
        @buffer = Swordfish::Node::List.new
        @buffer.stylize style if style
      end

      # An item can sit at most one level below the current bottommost item
      # (and the very first item has to go into the root list), so clamp the
      # declared level instead of walking off the end of the hierarchy.
      first_item = @buffer.children.empty?
      deepest_allowed = first_item ? 0 : @buffer.depth_of_final_node + 1
      level = [declared_level, deepest_allowed].min

      # Compare the level of this list item to the bottommost node in
      # the build buffer to determine where in the hierarchy to add
      # this node (i.e., are we dealing with list nesting or not?)
      target = @buffer
      if first_item || level < deepest_allowed
        # Add sibling to existing list
        level.times do
          target = target.last_list_item.nested_list
        end
        target.append list_item
      else
        # Add new nested list
        (level - 1).times do
          target = target.last_list_item.nested_list
        end
        list = Swordfish::Node::List.new
        list.append list_item
        list.stylize style if style
        target.last_list_item.append list
      end
    end

    # Look up the number format for a numbering ID and indentation level.
    # Returns the format as a symbol, or nil when the scheme or level is unknown.
    def list_style_for(numbering_scheme, level)
      format = (@numbering[numbering_scheme] || {})[level]
      format && format.to_sym
    end

    # Parse a table
    def _node_parse_table(node)
      table = Swordfish::Node::Table.new
      node.xpath("./w:tr").each do |row|
        table.append _node_parse_table_row(row)
      end
      table
    end

    # Parse a table row
    def _node_parse_table_row(node)
      row = Swordfish::Node::TableRow.new
      node.xpath('./w:tc').each do |cell|
        row.append _node_parse_table_cell(cell)
      end
      row
    end

    # Parse a table cell
    #
    # Returns a single TableCell, or an array of cells when the cell spans
    # several columns.
    def _node_parse_table_cell(node)
      # In a Swordfish::Node::Table object, the number of table cells must equal the
      # total number of rows times the total number of columns; that is, even if
      # two cells are merged together, there must be a Swordfish::Node::TableCell for
      # each one. Merges are defined using the "merge_up" and "merge_left" properties.

      cell = Swordfish::Node::TableCell.new

      # Get the inner content of the cell
      node.xpath("./w:p").each do |paragraph|
        cell.append _node_parse_paragraph(paragraph)
      end

      # Determine whether this cell spans multiple rows. In Office OpenXML,
      # a table cell is defined in every row, even if the cell is vertically-merged. The representation
      # of the merged cell within each row is given a vMerge property, with the topmost one also
      # having a vMerge value of "restart", and the others having no vMerge value.
      v_merge = node.xpath("./w:tcPr/w:vMerge")[0]
      cell.merge_up = true if v_merge && v_merge['w:val'].nil?

      # Determine whether this cell spans multiple columns. Unlike with vertical merges,
      # a horizontally-merged Office OpenXML cell is only defined once, but is given a gridSpan
      # property defining the number of columns it spans. Since Swordfish requires a cell for each
      # column, generate the additional cells, and set their merge_left values appropriately.
      grid_span = node.xpath("./w:tcPr/w:gridSpan")[0]
      span = grid_span ? grid_span['w:val'].to_i : 1
      extra_cells = Array.new([span - 1, 0].max) do
        filler = Swordfish::Node::TableCell.new
        filler.merge_left = true
        filler
      end

      # Return the generated cell or cells
      extra_cells.empty? ? cell : [cell] + extra_cells
    end

  end
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
# Superclass for all Swordfish::Node objects
|
2
|
+
|
3
|
+
module Swordfish
  module Node
    # Common behaviour shared by node classes: optional content, an ordered
    # list of children, and a stylesheet.
    class Base

      attr_accessor :content
      attr_reader :children
      attr_reader :style

      # Initialize with a blank stylesheet and no children
      def initialize
        @style = Swordfish::Stylesheet.new []
        @children = []
      end

      # Append a node or nodes to this node as a child.
      # Arrays (including nested ones) are flattened into the child list.
      # NOTE: the return value is that of Array#flatten! (nil when nothing
      # needed flattening), so callers should not rely on it.
      def append(node)
        @children ||= []
        @children << node
        @children.flatten!
      end

      # Take a style or styles and add them to this node's stylesheet
      def stylize(styles)
        @style.merge styles
      end

      # Every subclass must implement to_html in order to be converted to HTML
      def to_html
        raise NotImplementedError
      end

      # Given a hash, create instance variables for each key in that hash.
      # This is used for communication between nodes in the hierarchy.
      def inform!(hash)
        hash.each do |k, v|
          instance_variable_set "@#{k}", v
        end
      end

    end

    # Raised when content is added to a node that cannot hold it.
    # Inherits from StandardError (not Exception) so that an ordinary
    # `rescue` clause catches it.
    class BadContentError < StandardError
    end
  end
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
# A list node
|
2
|
+
|
3
|
+
module Swordfish
  module Node
    # A list whose children are list items. Rendered as <ul> when the
    # stylesheet has the bullet style, and as <ol> otherwise.
    class List < Base

      def to_html
        if @style.bullet?
          "<ul>#{@children.map(&:to_html).join}</ul>"
        else
          "<ol>#{@children.map(&:to_html).join}</ol>"
        end
      end

      # Get the zero-indexed depth of the bottommost child list
      # (This is not the deepest node, just the last child)
      def depth_of_final_node
        depth = 0
        list = self
        # Follow the chain "last item -> its nested list" until it ends. Each
        # step checks the list currently being walked, so an empty list
        # anywhere along the chain simply stops the descent.
        while (item = list.last_list_item) && (list = item.nested_list)
          depth += 1
        end
        depth
      end

      # Return the final child list item (no nesting), or nil for an empty list
      def last_list_item
        @children.last
      end

    end
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
# A list item node
|
2
|
+
|
3
|
+
module Swordfish
  module Node
    # One entry of a list; rendered as an <li> element wrapping its children.
    class ListItem < Base

      # Render every child in order inside a single <li>.
      def to_html
        inner = @children.map { |child| child.to_html }.join
        "<li>#{inner}</li>"
      end

      # The list nested under this item, i.e. the final child when that child
      # is a List; nil when the item has no nested list.
      def nested_list
        final_child = @children.last
        return final_child if final_child.is_a?(Swordfish::Node::List)

        nil
      end

    end
  end
end
|
@@ -0,0 +1,85 @@
|
|
1
|
+
# A table node
|
2
|
+
|
3
|
+
module Swordfish
  module Node
    # A table whose children are rows, each of which holds the cells.
    class Table < Base

      # Get the number of rows in the table
      def rows
        @children.length
      end

      # Get the number of columns in the table, taken from the first row
      # (0 for a table without rows)
      def cols
        @children.empty? ? 0 : @children[0].children.length
      end

      # Return the TableCell object at a given position, or nil when there is
      # no such row or the row has no cell in that column
      def cell_at(row, col)
        row_node = @children[row]
        row_node && row_node.children[col]
      end

      # Work out the merged-cell layout, then render the table.
      def to_html
        collapse_cells!
        "<table><tbody>#{@children.map(&:to_html).join}</tbody></table>"
      end

      private

      # A Swordfish::Node::Table always contains rows*cols cells, even
      # if some of them are to be merged. This method determines how
      # cells ought to be merged together and then informs each cell
      # of its configuration, so that each cell will then properly know
      # how to render itself (if at all).
      def collapse_cells!
        # Create a 2D array representing each cell, and give each one
        # an initial colspan and rowspan of 1
        structure = Array.new(rows) do
          Array.new(cols) { {:colspan => 1, :rowspan => 1} }
        end

        # Iterate over each table cell and see if it has the merge_up
        # or merge_left properties set. If so, find the corresponding
        # "parent" cell and increment its colspan or rowspan appropriately.
        # If the cell is to be merged up or left, set its value to nil
        # within the "structure" variable.
        rows.times do |r|
          cols.times do |c|
            cell = cell_at(r, c)
            next unless cell # row is shorter than the first row

            if cell.merge_up?
              (r - 1).downto(0) do |i|
                unless structure[i][c].nil?
                  structure[i][c][:rowspan] += 1
                  break
                end
              end
              structure[r][c] = nil
            end
            if cell.merge_left?
              (c - 1).downto(0) do |i|
                unless structure[r][i].nil?
                  structure[r][i][:colspan] += 1
                  break
                end
              end
              structure[r][c] = nil
            end
          end
        end

        # Inform every table cell of its calculated colspan and rowspan.
        # If the cell is not to be drawn, set its rowspan and colspan to 0.
        rows.times do |r|
          cols.times do |c|
            cell = cell_at(r, c)
            cell.inform!(structure[r][c] || {:colspan => 0, :rowspan => 0}) if cell
          end
        end
      end

    end
  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
# A table cell node
|
2
|
+
|
3
|
+
module Swordfish
  module Node
    # A single table cell. The merge flags are set through the accessors;
    # rowspan and colspan are only readable here and are read back from the
    # @rowspan / @colspan instance variables when rendering.
    class TableCell < Base

      attr_accessor :merge_left, :merge_up
      attr_reader :rowspan, :colspan

      # True if this cell is merged with the one to the left
      def merge_left?
        @merge_left ? true : false
      end

      # True if this cell is merged with the one above
      def merge_up?
        @merge_up ? true : false
      end

      # Render the cell as a <td>. Returns nil when both spans are 0, which
      # marks a cell that has been absorbed into a neighbour and is not drawn.
      # Span attributes are only written when greater than 1.
      def to_html
        return nil if @colspan == 0 && @rowspan == 0

        span_attributes = []
        span_attributes << " rowspan=#{@rowspan}" if @rowspan && @rowspan > 1
        span_attributes << " colspan=#{@colspan}" if @colspan && @colspan > 1
        inner = @children.map { |child| child.to_html }.join

        "<td#{span_attributes.join}>#{inner}</td>"
      end

    end
  end
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# A generic text node
|
2
|
+
|
3
|
+
require 'cgi'

module Swordfish
  module Node
    # A run of text carrying inline styles (bold, italic, ...).
    class Text < Base

      # Override Base append because a text node should never have children
      #
      # Raises BadContentError unconditionally.
      def append(node)
        raise BadContentError
      end

      # Render the text wrapped in one tag per applied inline style.
      # The content comes from the source document, so it is HTML-escaped
      # first: characters such as "<" and "&" must show up as text instead of
      # being interpreted as markup.
      def to_html
        html = CGI.escapeHTML(@content.to_s)
        html = "<i>#{html}</i>" if @style.italic?
        html = "<b>#{html}</b>" if @style.bold?
        html = "<u>#{html}</u>" if @style.underline?
        html = "<strike>#{html}</strike>" if @style.strikethrough?
        html = "<sup>#{html}</sup>" if @style.superscript?
        html = "<sub>#{html}</sub>" if @style.subscript?
        html
      end

    end
  end
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
# Swordfish::Stylesheet represents formatting applied to a node
|
2
|
+
|
3
|
+
module Swordfish
  # A set of formatting flags applied to a node. Only the styles listed in
  # SUPPORTED_STYLES are kept; anything else is silently ignored.
  class Stylesheet

    # Define all supported values here
    SUPPORTED_STYLES = [
      # Inline styles
      :bold, :italic, :underline, :superscript, :subscript, :strikethrough,
      # List enumeration styles
      :bullet, :decimal, :lowerLetter, :lowerRoman
    ].freeze

    # Initialize a stylesheet with an optional list of styles
    #
    # styles - a single style symbol or an array of them (defaults to none)
    def initialize(styles = [])
      @styles = []
      merge styles
    end

    # Take a style or list of styles and add them to an existing stylesheet.
    # Unsupported styles are dropped and duplicates are not stored twice.
    #
    # Returns the array of styles currently set.
    def merge(styles)
      styles = [styles] unless styles.is_a?(Array)
      @styles |= styles.select { |s| SUPPORTED_STYLES.include?(s) }
    end

    # For each supported style, define a boolean method to check its presence
    # (i.e., :bold?, :italic?, etc.)
    SUPPORTED_STYLES.each do |style|
      define_method "#{style}?".to_sym do
        has_style?(style)
      end
    end

    private

    # Check if a style is included in a stylesheet
    def has_style?(style)
      @styles.include? style
    end

  end
end
|
data/lib/swordfish.rb
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
require 'swordfish/document'
|
2
|
+
require 'swordfish/formats/docx'
|
3
|
+
|
4
|
+
module Swordfish

  # Main entry point into the parser. Pass in a filepath and return a parsed document.
  #
  # The format is chosen from the file extension (case-insensitive). The
  # extension is taken from the final path component only, so a dot in a
  # directory name is never mistaken for one.
  #
  # Raises UnsupportedFormatError when the extension is not recognized.
  def self.open(filepath)
    extension = File.extname(filepath.to_s).sub(/\A\./, '').downcase
    case extension
    when 'docx'
      Swordfish::DOCX.open(filepath)
    else
      raise UnsupportedFormatError, "'#{extension}' is not a recognized file format"
    end
  end

  # Raised when a file's extension does not match any supported format
  class UnsupportedFormatError < LoadError
  end
end
|
metadata
ADDED
@@ -0,0 +1,101 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: swordfish
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Martin Posthumus
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-05-27 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ! '>='
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ! '>='
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: nokogiri
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ! '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ! '>='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rubyzip
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ! '>='
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ! '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
description: A simple library for various word processor formats focusing primarily
|
56
|
+
around conversion to HTML
|
57
|
+
email: martin.posthumus@gmail.com
|
58
|
+
executables: []
|
59
|
+
extensions: []
|
60
|
+
extra_rdoc_files: []
|
61
|
+
files:
|
62
|
+
- README.md
|
63
|
+
- lib/swordfish.rb
|
64
|
+
- lib/swordfish/document.rb
|
65
|
+
- lib/swordfish/formats/docx.rb
|
66
|
+
- lib/swordfish/nodes/base.rb
|
67
|
+
- lib/swordfish/nodes/hyperlink.rb
|
68
|
+
- lib/swordfish/nodes/list.rb
|
69
|
+
- lib/swordfish/nodes/list_item.rb
|
70
|
+
- lib/swordfish/nodes/paragraph.rb
|
71
|
+
- lib/swordfish/nodes/table.rb
|
72
|
+
- lib/swordfish/nodes/table_cell.rb
|
73
|
+
- lib/swordfish/nodes/table_row.rb
|
74
|
+
- lib/swordfish/nodes/text.rb
|
75
|
+
- lib/swordfish/stylesheet.rb
|
76
|
+
homepage: https://github.com/voikya/swordfish
|
77
|
+
licenses:
|
78
|
+
- MIT
|
79
|
+
metadata: {}
|
80
|
+
post_install_message:
|
81
|
+
rdoc_options: []
|
82
|
+
require_paths:
|
83
|
+
- lib
|
84
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
85
|
+
requirements:
|
86
|
+
- - ! '>='
|
87
|
+
- !ruby/object:Gem::Version
|
88
|
+
version: 1.9.3
|
89
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
90
|
+
requirements:
|
91
|
+
- - ! '>='
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: '0'
|
94
|
+
requirements: []
|
95
|
+
rubyforge_project:
|
96
|
+
rubygems_version: 2.2.2
|
97
|
+
signing_key:
|
98
|
+
specification_version: 4
|
99
|
+
summary: A simple library for various word processor formats
|
100
|
+
test_files: []
|
101
|
+
has_rdoc:
|