swordfish 0.0.0 → 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- NzVjN2FlM2EwNzY2ZWQxOTJiZjdiZmVlZTZiZDdhMTE2MjYyNWQ1Ng==
4
+ MGMzODc2Y2FlNzUzMzdiMGNmMjlmODA1MjUwYjg3MWJhMzYwZWViOQ==
5
5
  data.tar.gz: !binary |-
6
- MTU0ZjQzMTRiOTkzMGU5NDdkMDk0MzAyZTc5NTkyNTBiNzQwOGNiMA==
6
+ ODgzMmMyNDBkNTEzZDg1M2NkNWFiY2RlN2VlZDBjYmE2M2I3NTNlYw==
7
7
  SHA512:
8
8
  metadata.gz: !binary |-
9
- ZWM2M2IzZjJmMmExMGYxNGU3MTllNmE2ZjQ2YTdlZDhiOTE0YWU3YjYwYzBl
10
- MjBiY2ZiYTE0MWQ2OWRlYTkyYzE3ZTg1Y2I4NDIwOWYxNDY3MDk1ZWM0NjYw
11
- YzcxYmVjZjEzNWEyNjI5NmU5OWM1Y2IyZTg3YzBhNWFhNjliNjg=
9
+ ZjY4M2JiYmZlZGM1MTMxY2FlYWI2MjVlOTg1YmU0OTMxZDZmOTY1MWRmZjc3
10
+ ZTFiYTJiZTAyODJiZjA0OWM1MjAzNDE0YWQyODVjYTY4ZjYwYzlmNmFmM2Y1
11
+ MzE5NThjZDQwMGRmNDJjNjc4YzI3NmZkNWM3Y2I3Nzc4ZjQ2MWQ=
12
12
  data.tar.gz: !binary |-
13
- NmI2NGQ4ZjIxOWI2YzEzNWI5OTEwMzE4NzEyYjRmZjg2MmJjOTcwMDQ1OWYx
14
- NzZjZjBjZmEyMzhhMTcwNjEyMzE1M2RkNzM2ZWQwZTE1YzhiMjVhNWRmOTQx
15
- NWJlZTdkMTY1MDg0YTY5NDcwMGUzODdkM2I4ODFmOWEzMWM1MjE=
13
+ OTRhZmZkYjQzOWQ5NGIxOGQ1NmNiNGE1MWUwNTBjYTIyMWFmOWUxNDdkN2U1
14
+ MzBmYWZhM2UwOGYxNmFkN2M0ZjY1ZWI1Y2MzZjczMzgxMGU5MmY2M2MwZWVh
15
+ N2QxYjIzMTkzYTdkODkzMDg2Mjg3OWMwYTcwZGQ0ZmI4Yzk1MjM=
@@ -50,12 +50,16 @@ module Swordfish
50
50
  @xml.xpath('//w:body').children.each do |node|
51
51
  case node.name
52
52
  when 'p'
53
- if node.xpath('.//w:numPr').length == 0
53
+ if node.xpath('.//w:numPr').length == 0 && (@buffer.is_a?(Swordfish::Node::List) ? node.xpath('.//w:ind').length.zero? : true)
54
54
  # Regular paragraph
55
+ # (The buffer check makes sure that this isn't an indented paragraph immediately after a list item,
56
+ # which means we're most likely dealing with a multi-paragraph list item)
55
57
  flush
56
58
  @swordfish_doc.append _node_parse_paragraph(node)
57
- else
59
+ elsif node.xpath('.//w:numPr/ancestor::w:pPrChange').length.zero?
58
60
  # List paragraph
61
+ # (must have a numPr node, but cannot have a pPrChange ancestor, since that means
62
+ # we are just looking at historical changes)
59
63
  # (Don't flush because we need to first ensure the list is fully parsed)
60
64
  _node_parse_list(node)
61
65
  end
@@ -86,6 +90,12 @@ module Swordfish
86
90
  elsif style_node['w:val'] == 'subscript'
87
91
  swordfish_node.stylize :subscript
88
92
  end
93
+ when 'rStyle'
94
+ if style_node['w:val'] == 'Strong'
95
+ swordfish_node.stylize :strong
96
+ elsif style_node['w:val'] == 'Emphasis'
97
+ swordfish_node.stylize :emphasis
98
+ end
89
99
  end
90
100
  end
91
101
  end
@@ -142,9 +152,13 @@ module Swordfish
142
152
  when 'r'
143
153
  # A true run node
144
154
  text = Swordfish::Node::Text.new
145
- text.content = run_xml.xpath('./w:t')[0].content
146
- get_styles_for_node(text, run_xml.xpath('./w:rPr')[0])
147
- texts << text
155
+ if run_xml.xpath('./w:t').length > 0
156
+ # Only examine the run if it includes text codes. The run may also include
157
+ # things like comment nodes, which should be ignored.
158
+ text.content = run_xml.xpath('./w:t')[0].content
159
+ get_styles_for_node(text, run_xml.xpath('./w:rPr')[0])
160
+ texts << text
161
+ end
148
162
  when 'hyperlink'
149
163
  # Hyperlink nodes are placed amongst other run nodes, but
150
164
  # they themselves also contain runs. Hyperlinks include
@@ -174,6 +188,15 @@ module Swordfish
174
188
  # since the only way to tell the list has been fully parsed is to encounter
175
189
  # a non-list element.
176
190
 
191
+ # Handle paragraphs with no level, which represent multi-paragraph list items
192
+ if node.xpath(".//w:numPr/w:ilvl").length.zero?
193
+ para = Swordfish::Node::Paragraph.new
194
+ _node_parse_runs(node).each {|r| para.append(r)}
195
+ @buffer.last_list_item(:recurse => true).wrap_children(Swordfish::Node::Text, Swordfish::Node::Paragraph)
196
+ @buffer.last_list_item(:recurse => true).append para
197
+ return
198
+ end
199
+
177
200
  # Get the list item's abstract numbering and level
178
201
  list_item = Swordfish::Node::ListItem.new
179
202
  _node_parse_runs(node).each {|r| list_item.append(r)}
@@ -184,22 +207,23 @@ module Swordfish
184
207
  unless @buffer
185
208
  @buffer = Swordfish::Node::List.new
186
209
  @buffer.stylize @numbering[numbering_scheme][level].to_sym
210
+ @buffer_initial_value = level # Lists may have an arbitrary initial level
187
211
  end
188
212
 
189
213
  # Compare the level of this list item to the bottommost node in
190
214
  # the build buffer to determine where in the hierarchy to add
191
215
  # this node (i.e., are we dealing with list nesting or not?)
192
- if @buffer.depth_of_final_node >= level
216
+ if @buffer.depth_of_final_node >= level || @buffer.children.empty?
193
217
  # Add sibling to existing list
194
218
  target = @buffer
195
- level.times do
219
+ (level - @buffer_initial_value).times do
196
220
  target = target.last_list_item.nested_list
197
221
  end
198
222
  target.append list_item
199
223
  elsif @buffer.depth_of_final_node < level
200
224
  # Add new nested list
201
225
  target = @buffer
202
- (level - 1).times do
226
+ (level - @buffer_initial_value- 1).times do
203
227
  target = target.last_list_item.nested_list
204
228
  end
205
229
  list = Swordfish::Node::List.new
@@ -5,7 +5,7 @@ module Swordfish
5
5
  class Base
6
6
 
7
7
  attr_accessor :content
8
- attr_reader :children
8
+ attr_accessor :children
9
9
  attr_reader :style
10
10
 
11
11
  # Initialize with a blank stylesheet and no children
@@ -21,6 +21,11 @@ module Swordfish
21
21
  @children.flatten!
22
22
  end
23
23
 
24
+ # Replace a child node at a given index
25
+ def replace(node, idx)
26
+ @children[idx] = node
27
+ end
28
+
24
29
  # Take a style or styles and add them to this node's stylesheet
25
30
  def stylize(styles)
26
31
  @style.merge styles
@@ -39,6 +44,22 @@ module Swordfish
39
44
  end
40
45
  end
41
46
 
47
+ # Delete all child nodes
48
+ def clear_children
49
+ @children = []
50
+ end
51
+
52
+ # Wrap all children of type child_class with a new node of type wrapper_class
53
+ def wrap_children(child_class, wrapper_class)
54
+ new_node = wrapper_class.new
55
+ new_node.append @children.select{|n| n.is_a? child_class}
56
+ unless new_node.children.empty?
57
+ idx = @children.find_index(new_node.children[0])
58
+ @children = @children - new_node.children
59
+ @children.insert idx, new_node
60
+ end
61
+ end
62
+
42
63
  end
43
64
 
44
65
  class BadContentError < Exception
@@ -23,9 +23,27 @@ module Swordfish
23
23
  depth
24
24
  end
25
25
 
26
- # Return the final child list item (no nesting)
27
- def last_list_item
28
- @children.last
26
+ # Return the final child list
27
+ def last_list
28
+ node = self
29
+ while node.children && node.last_list_item.nested_list
30
+ node = node.last_list_item.nested_list
31
+ end
32
+ node
33
+ end
34
+
35
+ # Return the final child list item
36
+ def last_list_item(opts = {})
37
+ if opts[:recurse]
38
+ node = self
39
+ li = @children.last
40
+ while node.children && node = node.last_list_item.nested_list
41
+ li = node.children.last
42
+ end
43
+ li
44
+ else
45
+ @children.last
46
+ end
29
47
  end
30
48
 
31
49
  end
@@ -8,7 +8,8 @@ module Swordfish
8
8
  if @content
9
9
  "<p>#{@content}</p>"
10
10
  else
11
- "<p>#{@children.map(&:to_html).join}</p>"
11
+ text = @children.map(&:to_html).join
12
+ "<p>#{text}</p>" unless text.length.zero?
12
13
  end
13
14
  end
14
15
 
@@ -17,6 +17,8 @@ module Swordfish
17
17
  html = "<strike>#{html}</strike>" if @style.strikethrough?
18
18
  html = "<sup>#{html}</sup>" if @style.superscript?
19
19
  html = "<sub>#{html}</sub>" if @style.subscript?
20
+ html = "<strong>#{html}</strong>" if @style.strong?
21
+ html = "<em>#{html}</em>" if @style.emphasis?
20
22
  html
21
23
  end
22
24
 
@@ -6,7 +6,7 @@ module Swordfish
6
6
  # Define all supported values here
7
7
  SUPPORTED_STYLES = [
8
8
  # Inline styles
9
- :bold, :italic, :underline, :superscript, :subscript, :strikethrough,
9
+ :bold, :italic, :underline, :superscript, :subscript, :strikethrough, :strong, :emphasis,
10
10
  # List enumeration styles
11
11
  :bullet, :decimal, :lowerLetter, :lowerRoman
12
12
  ]
data/lib/swordfish.rb CHANGED
@@ -4,10 +4,10 @@ require 'swordfish/formats/docx'
4
4
  module Swordfish
5
5
 
6
6
  # Main entry point into the parser. Pass in a filepath and return a parsed document.
7
- def self.open(filepath)
8
- extension = filepath.split('.').last.downcase
7
+ def self.open(filepath, opts={})
8
+ extension = (opts[:extension] || filepath.split('.').last).downcase.to_sym
9
9
  case extension
10
- when 'docx'
10
+ when :docx
11
11
  Swordfish::DOCX.open(filepath)
12
12
  else
13
13
  raise UnsupportedFormatError, "'#{extension}' is not a recognized file format"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: swordfish
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.0
4
+ version: 0.0.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Martin Posthumus