swordfish 0.0.12 → 0.0.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- ODBlMzJjZjVkMmUxZTdlNmVmMTUxOTI1NjFiMTI3NmNjMGVmNWM5OQ==
4
+ MDZkZmUxZDg4NTE1Y2RmNzJiMWVkOTVmYTIyOGU5Yzk5NjQ0ODI5Yw==
5
5
  data.tar.gz: !binary |-
6
- NWQ4ZDMxOTJkN2VmZThkMmM3MDNmYjM5OWZjMWZkYjcxMTBkNjdmNQ==
6
+ YzkzMmVhNzkzNTExMjI3YjUyMTVlYzMzNGVhMWU2N2VkZGFhMmMxZA==
7
7
  SHA512:
8
8
  metadata.gz: !binary |-
9
- ZTc0NTZkNjExNGFkODBjZTBkZjk1YTJiNzliMGZjZmRhY2I0MzQyNjQzNTUw
10
- MDQ4YWE2YTAwMTYzN2EzYjY1Mzk5YjhiOTZmOWRmOTE3OTQyMTkyZDRkMzNh
11
- NDNjMzUwNjZhOTg1YzIzZWViODk5NDZmNTQyYjcwNzBkZDhmN2I=
9
+ MWE1NDQxNzJkMWJjMzY0MDQyOGViMmUxNTI3NzUyN2RlMzFmNWJhMzMzODRl
10
+ NzA1YjI4ZWMwMmJiMTVhMzk5MmVmNWY4M2E3ZTg1ZWI2MjEyNWZmODIzNjdl
11
+ NGE1OWIzMjc5Y2ZkNWI1NmQ3NmI4MTcyNDM3NGEwNmM2OGM4OTU=
12
12
  data.tar.gz: !binary |-
13
- OGQzZThlMmIzNGNlNDBhY2QwNThiNzYyMmY4ZmZmNGQ3ZmJhZGYyZGEyYjFl
14
- MDg1ZTQ4ZjllNGEyODU5NmYzMzUxMGNiODUzMzhlYjI3MTY0OTY2MDU2NTg1
15
- YzY4MGU0ZWM0NjkyNmMzMmZlZTU0NTM5YTk5MDY5YWY5ZTM4ZjM=
13
+ N2UwNTRkYWIwNTA1NWNlNzgwMTJlNWE4N2FjODU0NzVhMWU1Y2FhNjI3MWY1
14
+ MmM1ZmUxMDIxZjczZWUxNDU3ODg0NGZmOGMwZmIzOTZhZTlhZDc0ZGU0ODIy
15
+ MjFiNjhkOWVjNDViMmMwNzQ3YThmZjg2YWVmYjY1YmE5NjdiMDA=
data/README.md CHANGED
@@ -1,4 +1,54 @@
1
- swordfish
2
- =====
1
+ ![Swordfish](https://raw.githubusercontent.com/voikya/swordfish/master/swordfish.png)
3
2
 
4
- Document parser
3
+ Swordfish is a simple document processing library for Ruby. It enables the conversion of Microsoft Word XML documents (.docx) into clean, semantic HTML5, without all the mess that normal export tools or copy-and-paste would produce.
4
+
5
+ Features
6
+ -----
7
+
8
+ Swordfish currently supports identifying the following features:
9
+
10
+ - Paragraphs
11
+ - Formatting: bold, italic, underline, superscript, subscript, strikethrough
12
+ - Links
13
+ - Lists (including nested lists)
14
+ - Tables
15
+ - Footnotes and Endnotes
16
+ - Images (except for Word Drawings)
17
+
18
+ Installation
19
+ -----
20
+
21
+ Swordfish is available through RubyGems, so you can install it with `gem install swordfish`.
22
+
23
+ Converting a Document
24
+ -----
25
+
26
+ Converting a Word document into HTML requires just two calls, one to parse the document and one to generate the markup:
27
+
28
+ ```ruby
29
+ require 'swordfish'
30
+ Swordfish.open('~/Documents/my_word_doc.docx').to_html
31
+ ```
32
+
33
+ Additional configuration options may be provided by calling `settings` with a hash of parameters prior to generating the final markup. For instance, if you want to enable footnotes (appearing as a block at the end of the HTML document), enable the `footnotes` option:
34
+
35
+ ```ruby
36
+ Swordfish.open('~/Documents/my_word_doc.docx').settings(:footnotes => true).to_html
37
+ ```
38
+
39
+ The following settings are currently available (all are boolean and default to `false`):
40
+
41
+ - `guess_headers` — When true, attempt to identify headers within the text and assign them the appropriate `<h1>` through `<h6>` tags. When false, all text will be presented as normal paragraphs.
42
+ - `footnotes` — When true, preserve footnote and endnote content in a block at the end of the generated HTML, including links back to the original reference points in the text. When false, footnotes will be ignored.
43
+ - `smart_br` — When true, attempt to clean up unnecessary linebreaks often present in Word markup, such as at the very beginning or end of a paragraph. When false, linebreaks will be preserved exactly as in the original Word markup.
44
+ - `full_document` — When true, the generated HTML will represent a complete HTML document, including a doctype and header. When false, the output will be an HTML fragment suitable for insertion into the DOM, for example.
45
+
46
+ Images within the Word document are available after parsing by calling the `images` method, which returns a hash of file names and temporary files.
47
+
48
+ ```ruby
49
+ # Print the file name and size of each image in a document
50
+ doc = Swordfish.open('~/Documents/my_word_doc.docx')
51
+ doc.images.each do |filename, tempfile|
52
+ puts "#{filename}: #{tempfile.size}"
53
+ end
54
+ ```
@@ -67,7 +67,9 @@ module Swordfish
67
67
  @xml.xpath('//w:body').children.each do |node|
68
68
  case node.name
69
69
  when 'p'
70
- if node.xpath('.//w:numPr').length == 0 && (@buffer.is_a?(Swordfish::Node::List) ? node.xpath('.//w:ind[@w:left]').length.zero? : true)
70
+ no_numbering_prop = node.xpath('.//w:numPr').length.zero? || node.xpath('.//w:numPr/w:ilvl | .//w:numPr/w:numId').length.zero?
71
+ not_multiparagraph_list_item = (@buffer.is_a?(Swordfish::Node::List) ? node.xpath('.//w:ind[@w:left]').length.zero? : true)
72
+ if no_numbering_prop && not_multiparagraph_list_item
71
73
  # Regular paragraph
72
74
  # (The buffer check makes sure that this isn't an indented paragraph immediately after a list item,
73
75
  # which means we're most likely dealing with a multi-paragraph list item)
@@ -53,7 +53,7 @@ module Swordfish
53
53
  if instruction =~ /^\s*HYPERLINK/
54
54
  # A hyperlink
55
55
  complex_field = Swordfish::Node::Hyperlink.new
56
- complex_field.href = instruction.match(/^\s*HYPERLINK (?:\\l )?"([^"]+)"/).captures[0]
56
+ complex_field.href = instruction.match(/^\s*HYPERLINK (?:"" )?(?:\\l )?"([^"]+)"/).captures[0]
57
57
  else
58
58
  # Anything else
59
59
  complex_field = Swordfish::Node::Text.new
@@ -127,7 +127,7 @@ module Swordfish
127
127
  if node.xpath(".//w:numPr/w:ilvl").length.zero?
128
128
  para = Swordfish::Node::Paragraph.new
129
129
  _node_parse_runs(node).each {|r| para.append(r)}
130
- @buffer.last_list_item(:recurse => true).wrap_children(Swordfish::Node::Text, Swordfish::Node::Paragraph)
130
+ @buffer.last_list_item(:recurse => true).wrap_children(Swordfish::Node::Inline, Swordfish::Node::Paragraph)
131
131
  @buffer.last_list_item(:recurse => true).append para
132
132
  return
133
133
  end
@@ -85,6 +85,9 @@ module Swordfish
85
85
  end
86
86
  end
87
87
 
88
+ class Inline < Base
89
+ end
90
+
88
91
  class BadContentError < Exception
89
92
  end
90
93
  end
@@ -2,7 +2,7 @@
2
2
 
3
3
  module Swordfish
4
4
  module Node
5
- class Hyperlink < Base
5
+ class Hyperlink < Inline
6
6
 
7
7
  attr_accessor :href
8
8
 
@@ -2,7 +2,7 @@
2
2
 
3
3
  module Swordfish
4
4
  module Node
5
- class Text < Base
5
+ class Text < Inline
6
6
 
7
7
  # Override Base append because a text node should never have children
8
8
  def append(node)
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: swordfish
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.12
4
+ version: 0.0.13
5
5
  platform: ruby
6
6
  authors:
7
7
  - Martin Posthumus
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-07-10 00:00:00.000000000 Z
11
+ date: 2014-08-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler