doctor_ninja 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +62 -0
- data/Rakefile +6 -0
- data/bin/ninjadoc +29 -0
- data/doctor_ninja.gemspec +29 -0
- data/lib/doctor_ninja.rb +9 -0
- data/lib/doctor_ninja/document.rb +29 -0
- data/lib/doctor_ninja/errors.rb +4 -0
- data/lib/doctor_ninja/parser.rb +59 -0
- data/lib/doctor_ninja/parsers/base.rb +26 -0
- data/lib/doctor_ninja/parsers/blip_fill.rb +43 -0
- data/lib/doctor_ninja/parsers/drawing.rb +35 -0
- data/lib/doctor_ninja/parsers/math.rb +29 -0
- data/lib/doctor_ninja/parsers/math_para.rb +13 -0
- data/lib/doctor_ninja/parsers/omml2mml.xsl +2068 -0
- data/lib/doctor_ninja/parsers/paragraph.rb +38 -0
- data/lib/doctor_ninja/parsers/run.rb +24 -0
- data/lib/doctor_ninja/parsers/text.rb +12 -0
- data/lib/doctor_ninja/relationships.rb +20 -0
- data/lib/doctor_ninja/version.rb +3 -0
- data/test/document_test.rb +15 -0
- data/test/fixtures/img.docx +0 -0
- data/test/fixtures/img_crop.docx +0 -0
- data/test/fixtures/img_rot.docx +0 -0
- data/test/fixtures/img_rot_crop.docx +0 -0
- data/test/fixtures/invalid_file.docx +0 -0
- data/test/fixtures/limites.docx +0 -0
- data/test/integration_test.rb +33 -0
- data/test/minitest_helper.rb +3 -0
- metadata +186 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: ec64e692dfb01d06fbd09bb42162997d2b47bfb8
|
4
|
+
data.tar.gz: 1b80f7d87e7f22966c3733587ade0fdeffceafae
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 9eaf28e4b06b699f63c9062e3f53b67c436817383178da6971479a643f57dfa697ea79d442631f35764712038f546d38a1e847f265f75d3a76602a0259b778b4
|
7
|
+
data.tar.gz: 80c0b08e752ad68265d70db184ff38447ffcbfdcfd5d8850003ede10973a31bc961db8e5a573406b4009c6218bcd54a7e5985cd1b446e7fda9bd6f00629ad12d
|
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2014 Bernardo Amorim
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,62 @@
|
|
1
|
+
Here be dragons! This is a WIP.
|
2
|
+
|
3
|
+
# DoctorNinja
|
4
|
+
|
5
|
+
DoctorNinja is a library to convert word documents into html files like a ninja.
|
6
|
+
|
7
|
+
## Why DoctorNinja?
|
8
|
+
|
9
|
+
Well, the name is simple, **Doctor** comes from **Doc** (.docx) and **Ninja** comes from our first MVP (Ninja das Dúvidas).
|
10
|
+
|
11
|
+
## Features
|
12
|
+
|
13
|
+
* Converts images and applies the following transformations:
|
14
|
+
* Crop
|
15
|
+
* Convert Microsoft's Math to MathML
|
16
|
+
* Accepts bold, italic and underline
|
17
|
+
* Sets text-align to center when there are only non-text children in paragraphs (this is for our own use case; we should probably provide an interface to configure it, maybe by passing a lambda)
|
18
|
+
|
19
|
+
## TODO
|
20
|
+
|
21
|
+
* Pass styleMap as an option(to choose what tag in paragraph rendering)
|
22
|
+
* Add general configurations:
|
23
|
+
1. style map
|
24
|
+
2. image inline(src=data)/as file
|
25
|
+
3. MathML/AsciiMath/LaTeX
|
26
|
+
|
27
|
+
## Installation
|
28
|
+
|
29
|
+
Add this line to your application's Gemfile:
|
30
|
+
|
31
|
+
```ruby
|
32
|
+
gem 'doctor_ninja'
|
33
|
+
```
|
34
|
+
|
35
|
+
And then execute:
|
36
|
+
|
37
|
+
$ bundle
|
38
|
+
|
39
|
+
Or install it yourself as:
|
40
|
+
|
41
|
+
$ gem install doctor_ninja
|
42
|
+
|
43
|
+
## Usage
|
44
|
+
|
45
|
+
To use it, first load the .docx file and then call `to_html` to get the HTML.
|
46
|
+
|
47
|
+
```ruby
|
48
|
+
doc = DoctorNinja::Document.new('path_to_file.docx')
|
49
|
+
doc.to_html
|
50
|
+
```
|
51
|
+
|
52
|
+
Alternatively, you can use our binary, which wraps the output in an HTML boilerplate and adds MathJax to correctly render the MathML:
|
53
|
+
|
54
|
+
$> ninjadoc document.docx document.html
|
55
|
+
|
56
|
+
## Contributing
|
57
|
+
|
58
|
+
1. Fork it ( https://github.com/bamorim/doctor_ninja/fork )
|
59
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
60
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
61
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
62
|
+
5. Create a new Pull Request
|
data/Rakefile
ADDED
data/bin/ninjadoc
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Command-line front end for DoctorNinja.
#
# Usage: ninjadoc INPUT.docx [OUTPUT.html]
#
# Converts INPUT.docx to HTML, wraps the result in a minimal HTML page that
# loads MathJax, and writes it to OUTPUT.html (or to standard output when no
# output path is given).

# Script tag that loads MathJax so the generated MathML can be rendered.
# NOTE(review): this is a plain-HTTP CDN URL — confirm it is still served and
# consider an HTTPS location.
MATHJAX = <<-HTML
<script type="text/javascript"
src="http://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML">
</script>
HTML

# Everything emitted before the converted document body.
PREFIX = <<-HTML
<!doctype HTML>
<html>
<head>
<meta charset="utf-8"/>
#{MATHJAX}
</head>
<body>
HTML

# Everything emitted after the converted document body.
SUFFIX = <<-HTML
</body>
</html>
HTML

require 'doctor_ninja'

# Fail early with a readable message instead of passing nil to the library.
abort "usage: #{File.basename($PROGRAM_NAME)} INPUT.docx [OUTPUT.html]" if ARGV.empty?

doc = DoctorNinja::Document.new(ARGV[0])
begin
  # Convert first, so an existing output file is only truncated once the
  # conversion has actually succeeded.
  html = "#{PREFIX}#{doc.to_html}#{SUFFIX}"

  if ARGV[1]
    # Block form closes the file even if the write raises.
    File.open(ARGV[1], "w+") { |file| file.write(html) }
  else
    # Standard output is left open; it is not ours to close.
    STDOUT.write(html)
  end
ensure
  doc.close
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
# coding: utf-8
# Make the gem's own lib/ directory loadable so the version constant can be
# required below without the gem being installed.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'doctor_ninja/version'

# Gem specification for doctor_ninja: metadata, packaged files and
# dependencies.
Gem::Specification.new do |spec|
  spec.name          = "doctor_ninja"
  spec.version       = DoctorNinja::VERSION
  spec.authors       = ["Bernardo Amorim"]
  spec.email         = ["contato@bamorim.com"]
  spec.summary       = %q{DoctorNinja is a library to convert word documents into html files like a ninja.}
  spec.description   = %q{DoctorNinja is built by the Responde Aí Team (www.respondeai.com.br/time) in an attempt to fill our needs to convert docx files into html that also converts Word Formulas into MathML.}
  spec.homepage      = "https://github.com/bamorim/doctor_ninja"
  spec.license       = "MIT"

  # The packaged file list is whatever git tracks (NUL-separated output).
  spec.files         = `git ls-files -z`.split("\x0")
  # Every tracked file under bin/ is installed as an executable.
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.7"
  spec.add_development_dependency "rake", "~> 10.0"
  spec.add_development_dependency "minitest", "~> 5.4"
  spec.add_development_dependency "minitest-reporters", "~> 1.0"

  spec.add_dependency "nokogiri", "~> 1.6"
  spec.add_dependency "rubyzip", "~> 1.1"
  spec.add_dependency "rmagick", "~> 2.13"
end
|
data/lib/doctor_ninja.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
require "zip"
|
2
|
+
require "doctor_ninja/parser"
|
3
|
+
require "doctor_ninja/relationships"
|
4
|
+
|
5
|
+
module DoctorNinja
  # A .docx file opened as a zip archive, with helpers to read its entries
  # and to render it as HTML.
  class Document
    # Opens +file+ with Zip::File.
    #
    # Raises InvalidDocumentError when the zip library reports a Zip::Error.
    def initialize(file)
      begin
        @file = Zip::File.new(file)
      rescue Zip::Error
        raise InvalidDocumentError
      end
    end

    # Returns the DoctorNinja::Relationships for this document; the object is
    # built on first use and reused afterwards.
    def relationships
      @relationships ||= DoctorNinja::Relationships.new(self)
    end

    # Reads the archive entry named +file+ by delegating to the zip file.
    def read(file)
      @file.read(file)
    end

    # Runs a DoctorNinja::Parser over this document and returns its result.
    def to_html
      parser = DoctorNinja::Parser.new(self)
      parser.parse
    end

    # Closes the underlying zip file.
    def close
      @file.close
    end
  end
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
Dir[File.dirname(__FILE__) + '/parsers/*.rb'].each {|file| require file }
|
3
|
+
|
4
|
+
module DoctorNinja
  # Walks the XML of "word/document.xml" and delegates each node to the first
  # parser class that declares itself applicable to it.
  class Parser
    # Fallback parser: applicable to every node, with all behaviour inherited
    # from Parsers::Base.
    class Noop < DoctorNinja::Parsers::Base
      def self.applicable_to?(node)
        true
      end
    end

    # doc - object responding to #read (used to fetch "word/document.xml");
    #       it is also handed to every parser instance.
    def initialize(doc)
      @docx = doc
      @xmldoc = Nokogiri::XML(@docx.read("word/document.xml"))
    end

    # Parses the whole document, starting at the XML root with an empty
    # context hash.
    def parse
      parse_node(@xmldoc.root, {})
    end

    # Parses a single node with the first applicable parser and returns that
    # parser's result. Debug output is produced first when enabled.
    def parse_node(node, context)
      # This local keeps the name "parsers" because the method's binding is
      # passed to the debugger below.
      parsers = parsers_for(node, context)
      debug(node, binding) if debug?(node, parsers)
      parsers.first.parse
    end

    # Returns one instance of every parser class applicable to +node+, in
    # registration order. Each instance receives #parse_node as its callback.
    def parsers_for(node, context)
      applicable = parsers.select { |klass| klass.applicable_to?(node) }
      applicable.map do |klass|
        klass.new(node, self.public_method(:parse_node), @docx, context)
      end
    end

    # Every constant found in DoctorNinja::Parsers, followed by Noop so the
    # fallback always comes last.
    def parsers
      registered = DoctorNinja::Parsers.constants.map do |name|
        DoctorNinja::Parsers.const_get(name)
      end
      registered + [Noop]
    end

    # Decides from ENV["DEBUG"] whether +node+ should be debugged:
    # "all" matches everything, "missing" matches nodes with exactly one
    # applicable parser, otherwise the value is compared with the node name
    # and with "prefix:name".
    def debug?(node, parsers)
      mode = ENV["DEBUG"]
      return true if mode == "all"
      return true if mode == "missing" && parsers.length == 1
      return true if mode == node.name

      node.namespace && mode == "#{node.namespace.prefix}:#{node.name}"
    end

    # Opens a pry session on binding +b+ when ENV["DEBUG_MODE"] is "pry";
    # otherwise prints the node's XML between BEGIN/END markers.
    def debug(node, b)
      if ENV["DEBUG_MODE"] != "pry"
        puts "---BEGIN---\n#{node.to_xml}\n----END----"
      else
        # Loaded lazily so pry is only needed when explicitly requested.
        require "pry"
        b.pry
      end
    end
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
module DoctorNinja
  module Parsers
    # Common parent of all node parsers. On its own it applies to no node and
    # simply concatenates whatever its children produce.
    class Base
      class << self
        # Whether this parser handles +node+. The base class never does.
        def applicable_to?(node)
          false
        end
      end

      # node     - the node to parse; must respond to #children.
      # yielder  - callable invoked as yielder.call(child, context) for each
      #            child node.
      # document - stored for use by subclasses.
      # context  - value passed along to the children by default.
      def initialize(node, yielder, document, context)
        @node, @yielder, @document, @context = node, yielder, document, context
      end

      # Default behaviour: the result is the concatenated output of the
      # children.
      def parse
        parse_children
      end

      # Calls the yielder for every child with +context+ and returns the
      # results, each converted with #to_s, joined into one String.
      def parse_children(context=@context)
        output = ""
        @node.children.each do |child|
          output << @yielder.call(child, context).to_s
        end
        output
      end
    end
  end
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
require_relative "./base"
|
2
|
+
|
3
|
+
# Parser for "blipFill" nodes: loads the referenced image, crops it according
# to the node's srcRect child (if any) and assigns it to the context as its
# background image.
class DoctorNinja::Parsers::BlipFill < DoctorNinja::Parsers::Base
  def self.applicable_to?(node)
    node.name == "blipFill"
  end

  # Loads the image blob looked up through the document's relationships,
  # applies the crop and sets it as @context.background_image.
  # NOTE(review): assumes @context responds to #background_image= — confirm
  # against the parser that supplies the context.
  def parse
    blob = @document.relationships[rel_id]
    @image = Magick::Image.from_blob(blob).first
    transform
    @context.background_image = @image
  end

  private

  # Value of the "embed" attribute of the a:blip child.
  def rel_id
    blip = @node.xpath("./a:blip", "a" => xmlns_a)
    blip.attribute("embed").value
  end

  # Crops @image using the l/t/r/b attributes of srcRect. Each attribute is
  # multiplied by the matching image dimension and divided by 100000 to get
  # the number of pixels to cut from that side. Does nothing without srcRect.
  def transform
    rect = src_rect
    return unless rect

    left   = rect["l"].to_i * @image.columns / 100000
    top    = rect["t"].to_i * @image.rows / 100000
    right  = rect["r"].to_i * @image.columns / 100000
    bottom = rect["b"].to_i * @image.rows / 100000

    cropped_width  = @image.columns - left - right
    cropped_height = @image.rows - bottom - top

    @image = @image.crop(left, top, cropped_width, cropped_height)
  end

  # First a:srcRect child of the node, or nil when there is none.
  def src_rect
    @node.xpath("./a:srcRect", "a" => xmlns_a)[0]
  end

  # Namespace URI bound to the "a" prefix as seen from this node.
  def xmlns_a
    @node.namespaces["xmlns:a"]
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
require "rvg/rvg"
|
2
|
+
require "base64"
|
3
|
+
|
4
|
+
# Parser for "drawing" nodes: renders the drawing on an RVG canvas and returns
# it as an <img> tag whose source is an inline base64 PNG.
class DoctorNinja::Parsers::Drawing < DoctorNinja::Parsers::Base
  include Magick

  def self.applicable_to?(node)
    node.name == "drawing"
  end

  # Returns an "<img>" tag String with the rendered drawing embedded as a
  # data URI.
  def parse
    # The extent is divided by EMU_PER_PIXEL to obtain the canvas size.
    width = extent[:x] / DoctorNinja::EMU_PER_PIXEL
    height = extent[:y] / DoctorNinja::EMU_PER_PIXEL

    # Children are parsed with the canvas as their context so they can draw
    # onto it; their return values are not used here.
    rvg = RVG.new(width, height).viewbox(0, 0, width, height) do |canvas|
      parse_children(canvas)
    end

    image_format = "png"
    blob = rvg.draw.to_blob { self.format = image_format }
    # strict_encode64 emits no line feeds; encode64 would insert one every
    # 60 characters, which would end up inside the src attribute.
    base64 = Base64.strict_encode64(blob)

    "<img src=\"data:image/#{image_format};base64,#{base64}\"/>"
  end

  # Integer values of the "cx" and "cy" attributes of the wp:extent node, as
  # { x: cx, y: cy }. Computed once and cached.
  def extent
    @extent ||= {
      x: extent_node.attribute("cx").value.to_i,
      y: extent_node.attribute("cy").value.to_i
    }
  end

  # The wp:extent descendants of this node.
  def extent_node
    @node.xpath(".//wp:extent")
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
require_relative "./base"
|
2
|
+
|
3
|
+
# Parser for "oMath" nodes: converts Office Math markup to MathML by running
# the bundled omml2mml.xsl stylesheet through Nokogiri::XSLT.
class DoctorNinja::Parsers::Math < DoctorNinja::Parsers::Base
  # Location of the stylesheet, next to this file.
  XSL_PATH = File.join(File.dirname(__FILE__), "omml2mml.xsl").freeze

  def self.applicable_to?(node)
    node.name == "oMath"
  end

  # The compiled stylesheet. It is read and compiled on first use and then
  # reused, instead of being rebuilt for every math node.
  def self.stylesheet
    @stylesheet ||= Nokogiri::XSLT(File.read(XSL_PATH))
  end

  # Returns the MathML for this node as an XML String. Namespaces produced by
  # the transformation are removed and the MathML namespace is set on the
  # root; display="block" is added when the context marks a math paragraph.
  def parse
    doc = Nokogiri::XML(mml)
    doc.remove_namespaces!
    doc.root.set_attribute "xmlns", "http://www.w3.org/1998/Math/MathML"
    doc.root.set_attribute "display", "block" if @context[:is_math_para]
    doc.root.to_xml
  end

  # Wraps the node in a <root> element carrying the node's namespace
  # declarations and returns the stylesheet's output as an XML String.
  def mml
    doc = Nokogiri::XML("<root>#{@node.to_xml(encoding: "utf-8")}</root>")

    @node.namespaces.each do |k,v|
      # Keys look like "xmlns:prefix"; a key without ":" yields a nil prefix.
      doc.root.add_namespace k.split(":")[1], v
    end

    # The wrapper is serialized and parsed again before transforming —
    # presumably so the prefixes declared on <root> are resolved for the
    # nested elements. NOTE(review): confirm before simplifying.
    self.class.stylesheet.transform(Nokogiri::XML(doc.to_xml)).to_xml
  end
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
require_relative "./base"
|
2
|
+
|
3
|
+
# Parser for "oMathPara" nodes: flags the context so that nested parsers can
# tell they are inside a math paragraph, then parses the children.
class DoctorNinja::Parsers::MathPara < DoctorNinja::Parsers::Base
  def self.applicable_to?(node)
    node.name == "oMathPara"
  end

  # Works on a copy of the context so the caller's context is left untouched,
  # sets :is_math_para on it and returns the children's combined output.
  def parse
    @context = @context.dup.tap { |ctx| ctx[:is_math_para] = true }
    parse_children
  end
end
|