doctor_ninja 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +62 -0
- data/Rakefile +6 -0
- data/bin/ninjadoc +29 -0
- data/doctor_ninja.gemspec +29 -0
- data/lib/doctor_ninja.rb +9 -0
- data/lib/doctor_ninja/document.rb +29 -0
- data/lib/doctor_ninja/errors.rb +4 -0
- data/lib/doctor_ninja/parser.rb +59 -0
- data/lib/doctor_ninja/parsers/base.rb +26 -0
- data/lib/doctor_ninja/parsers/blip_fill.rb +43 -0
- data/lib/doctor_ninja/parsers/drawing.rb +35 -0
- data/lib/doctor_ninja/parsers/math.rb +29 -0
- data/lib/doctor_ninja/parsers/math_para.rb +13 -0
- data/lib/doctor_ninja/parsers/omml2mml.xsl +2068 -0
- data/lib/doctor_ninja/parsers/paragraph.rb +38 -0
- data/lib/doctor_ninja/parsers/run.rb +24 -0
- data/lib/doctor_ninja/parsers/text.rb +12 -0
- data/lib/doctor_ninja/relationships.rb +20 -0
- data/lib/doctor_ninja/version.rb +3 -0
- data/test/document_test.rb +15 -0
- data/test/fixtures/img.docx +0 -0
- data/test/fixtures/img_crop.docx +0 -0
- data/test/fixtures/img_rot.docx +0 -0
- data/test/fixtures/img_rot_crop.docx +0 -0
- data/test/fixtures/invalid_file.docx +0 -0
- data/test/fixtures/limites.docx +0 -0
- data/test/integration_test.rb +33 -0
- data/test/minitest_helper.rb +3 -0
- metadata +186 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: ec64e692dfb01d06fbd09bb42162997d2b47bfb8
|
4
|
+
data.tar.gz: 1b80f7d87e7f22966c3733587ade0fdeffceafae
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 9eaf28e4b06b699f63c9062e3f53b67c436817383178da6971479a643f57dfa697ea79d442631f35764712038f546d38a1e847f265f75d3a76602a0259b778b4
|
7
|
+
data.tar.gz: 80c0b08e752ad68265d70db184ff38447ffcbfdcfd5d8850003ede10973a31bc961db8e5a573406b4009c6218bcd54a7e5985cd1b446e7fda9bd6f00629ad12d
|
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2014 Bernardo Amorim
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,62 @@
|
|
1
|
+
Here be dragons! This is a WIP.
|
2
|
+
|
3
|
+
# DoctorNinja
|
4
|
+
|
5
|
+
DoctorNinja is a library to convert word documents into html files like a ninja.
|
6
|
+
|
7
|
+
## Why DoctorNinja?
|
8
|
+
|
9
|
+
Well, the name is simple, **Doctor** comes from **Doc** (.docx) and **Ninja** comes from our first MVP (Ninja das Dúvidas).
|
10
|
+
|
11
|
+
## Features
|
12
|
+
|
13
|
+
* Converts images and applies the following transformations:
|
14
|
+
* Crop
|
15
|
+
* Convert Microsoft's Math to MathML
|
16
|
+
* Accepts bold, italic and underline
|
17
|
+
* Sets text-align to center when there are only non-text children in a paragraph (this is for our own use case; we should probably provide an interface to configure it, maybe by passing a lambda)
|
18
|
+
|
19
|
+
## TODO
|
20
|
+
|
21
|
+
* Pass styleMap as an option(to choose what tag in paragraph rendering)
|
22
|
+
* Add general configurations:
|
23
|
+
1. style map
|
24
|
+
2. image inline(src=data)/as file
|
25
|
+
3. MathML/AsciiMath/LaTeX
|
26
|
+
|
27
|
+
## Installation
|
28
|
+
|
29
|
+
Add this line to your application's Gemfile:
|
30
|
+
|
31
|
+
```ruby
|
32
|
+
gem 'doctor_ninja'
|
33
|
+
```
|
34
|
+
|
35
|
+
And then execute:
|
36
|
+
|
37
|
+
$ bundle
|
38
|
+
|
39
|
+
Or install it yourself as:
|
40
|
+
|
41
|
+
$ gem install doctor_ninja
|
42
|
+
|
43
|
+
## Usage
|
44
|
+
|
45
|
+
To use it, first load the .docx file and then call `to_html` to get the HTML.
|
46
|
+
|
47
|
+
```ruby
|
48
|
+
doc = DoctorNinja::Document.new('path_to_file.docx')
|
49
|
+
doc.to_html
|
50
|
+
```
|
51
|
+
|
52
|
+
Alternatively, you can use our binary, which wraps the output in an HTML boilerplate and adds MathJax to correctly render the MathML:
|
53
|
+
|
54
|
+
$ ninjadoc document.docx document.html
|
55
|
+
|
56
|
+
## Contributing
|
57
|
+
|
58
|
+
1. Fork it ( https://github.com/bamorim/doctor_ninja/fork )
|
59
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
60
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
61
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
62
|
+
5. Create a new Pull Request
|
data/Rakefile
ADDED
data/bin/ninjadoc
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
#!/usr/bin/env ruby
# Command-line front end for DoctorNinja.
#
# Usage: ninjadoc INPUT.docx [OUTPUT.html]
#
# Converts INPUT.docx to HTML, wraps the result in a minimal HTML page that
# loads MathJax (so the generated MathML renders), and writes the page to
# OUTPUT.html, or to standard output when no output path is given.

# NOTE(review): cdn.mathjax.org was retired in 2017 — confirm and switch this
# URL to a maintained CDN; the string is left untouched here.
MATHJAX = <<-HTML
<script type="text/javascript"
src="http://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML">
</script>
HTML

PREFIX = <<-HTML
<!doctype HTML>
<html>
<head>
<meta charset="utf-8"/>
#{MATHJAX}
</head>
<body>
HTML

SUFFIX = <<-HTML
</body>
</html>
HTML

require 'doctor_ninja'

# Fail early with a usage message instead of handing nil to the document loader.
if ARGV.empty?
  warn "Usage: #{File.basename($PROGRAM_NAME)} INPUT.docx [OUTPUT.html]"
  exit 1
end

doc = DoctorNinja::Document.new(ARGV[0])

# Convert before touching the output file, so a failed conversion never
# truncates an existing file, and always release the archive.
begin
  page = "#{PREFIX}#{doc.to_html}#{SUFFIX}"
ensure
  doc.close
end

if ARGV[1]
  # Block form closes the file even if the write raises.
  File.open(ARGV[1], "w+") { |file| file.write(page) }
else
  # STDOUT belongs to the process; write to it but do not close it.
  STDOUT.write(page)
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
# coding: utf-8
# Gem specification for doctor_ninja.

# Make lib/ loadable so the version constant can be read without installing the gem.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'doctor_ninja/version'

Gem::Specification.new do |spec|
  spec.name          = "doctor_ninja"
  spec.version       = DoctorNinja::VERSION
  spec.authors       = ["Bernardo Amorim"]
  spec.email         = ["contato@bamorim.com"]
  spec.summary       = %q{DoctorNinja is a library to convert word documents into html files like a ninja.}
  spec.description   = %q{DoctorNinja is built by the Responde Aí Team (www.respondeai.com.br/time) in an attempt to fill our needs to convert docx files into html that also converts Word Formulas into MathML.}
  spec.homepage      = "https://github.com/bamorim/doctor_ninja"
  spec.license       = "MIT"

  # Package every file tracked by git; executables and tests are derived from that list.
  spec.files         = `git ls-files -z`.split("\x0")
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  # Development-only tooling.
  spec.add_development_dependency "bundler", "~> 1.7"
  spec.add_development_dependency "rake", "~> 10.0"
  spec.add_development_dependency "minitest", "~> 5.4"
  spec.add_development_dependency "minitest-reporters", "~> 1.0"

  # Runtime dependencies.
  spec.add_dependency "nokogiri", "~> 1.6"
  spec.add_dependency "rubyzip", "~> 1.1"
  spec.add_dependency "rmagick", "~> 2.13"
end
|
data/lib/doctor_ninja.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
require "zip"
|
2
|
+
require "doctor_ninja/parser"
|
3
|
+
require "doctor_ninja/relationships"
|
4
|
+
|
5
|
+
module DoctorNinja
  # A .docx archive opened for conversion.
  #
  # Wraps the underlying Zip::File and exposes the pieces the parsers need:
  # raw entry contents (#read), the relationships lookup (#relationships)
  # and the HTML rendering itself (#to_html).
  class Document
    # Opens the archive at +file+.
    #
    # Raises InvalidDocumentError when the zip library rejects the file.
    def initialize(file)
      begin
        @file = Zip::File.new(file)
      rescue Zip::Error
        raise InvalidDocumentError
      end
    end

    # Returns the relationships helper for this document, built on first use
    # and reused afterwards.
    def relationships
      return @relationships if @relationships

      @relationships = DoctorNinja::Relationships.new(self)
    end

    # Returns the contents of the archive entry named +file+.
    def read(file)
      @file.read(file)
    end

    # Renders the document as an HTML string.
    def to_html
      parser = DoctorNinja::Parser.new(self)
      parser.parse
    end

    # Closes the underlying archive.
    def close
      @file.close
    end
  end
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
Dir[File.dirname(__FILE__) + '/parsers/*.rb'].each {|file| require file }
|
3
|
+
|
4
|
+
module DoctorNinja
  # Renders a document's word/document.xml to HTML by handing each XML node
  # to the first parser class that declares itself applicable to it.
  #
  # Debugging is driven by environment variables:
  #   DEBUG=all            dump every node
  #   DEBUG=missing        dump nodes that only the Noop fallback handles
  #   DEBUG=<name>         dump nodes with that local name (e.g. "drawing")
  #   DEBUG=<prefix:name>  dump nodes with that qualified name (e.g. "w:p")
  #   DEBUG_MODE=pry       open a pry session instead of dumping the XML
  class Parser
    # Fallback parser: applies to every node and inherits Base#parse, which
    # just renders the node's children.
    class Noop < DoctorNinja::Parsers::Base
      def self.applicable_to?(_node)
        true
      end
    end

    # doc - the document object; it must respond to #read and is passed on
    #       to every parser instance.
    def initialize(doc)
      @docx = doc
      @xmldoc = Nokogiri::XML(@docx.read("word/document.xml"))
    end

    # Renders the whole document, starting at the XML root with an empty context.
    def parse
      parse_node(@xmldoc.root, {})
    end

    # Renders a single node. Parsers receive this method as their "yielder"
    # so they can recurse into children.
    def parse_node(node, context)
      # Kept as a local named `parsers` so it is visible under that name in
      # the pry binding handed to #debug.
      parsers = parsers_for(node, context)

      debug(node, binding) if debug?(node, parsers)

      parsers.first.parse
    end

    # Returns an instance of every applicable parser for +node+, in lookup
    # order. Noop is always last, so the list is never empty.
    def parsers_for(node, context)
      yielder = public_method(:parse_node)
      parsers
        .select { |klass| klass.applicable_to?(node) }
        .map { |klass| klass.new(node, yielder, @docx, context) }
    end

    # All constants under DoctorNinja::Parsers followed by the Noop fallback.
    # Memoized per Parser instance: the reflection used to run once per XML
    # node, although the list does not change during a parse.
    def parsers
      @parsers ||= DoctorNinja::Parsers.constants.map { |c| DoctorNinja::Parsers.const_get(c) } + [Noop]
    end

    # Truthy when the DEBUG environment variable selects this node.
    # `parsers.length == 1` means only the Noop fallback applied.
    def debug?(node, parsers)
      selector = ENV["DEBUG"]

      selector == "all" ||
        (selector == "missing" && parsers.length == 1) ||
        selector == node.name ||
        (node.namespace && selector == "#{node.namespace.prefix}:#{node.name}")
    end

    # Either opens a pry session on binding +b+ or dumps the node's XML.
    def debug(node, b)
      if ENV["DEBUG_MODE"] == "pry"
        require "pry" # deliberate lazy load: pry is only needed when debugging
        b.pry
      else
        # Written to stderr so the dump does not end up inside HTML that the
        # command-line tool may be writing to stdout.
        warn "---BEGIN---\n#{node.to_xml}\n----END----"
      end
    end
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
module DoctorNinja
  module Parsers
    # Common superclass for node parsers.
    #
    # A subclass overrides .applicable_to? to claim the nodes it handles and
    # #parse to produce output for such a node. The default #parse simply
    # renders the node's children.
    class Base
      # Whether this parser handles +node+. The base class handles nothing.
      def self.applicable_to?(node)
        false
      end

      # node     - the XML node to render; must respond to #children.
      # yielder  - a callable taking (child_node, context) that renders a child.
      # document - the document object, stored for subclasses.
      # context  - state handed down to children.
      def initialize(node, yielder, document, context)
        @node = node
        @yielder = yielder
        @document = document
        @context = context
      end

      # Renders the node. By default this is the concatenation of its children.
      def parse
        parse_children
      end

      # Renders every child through the yielder and returns the results,
      # converted with #to_s, concatenated into one String ("" when there are
      # no children).
      #
      # context - the context passed to each child; defaults to this
      #           parser's own context.
      def parse_children(context = @context)
        # map + join instead of appending to a "" literal, so this keeps
        # working when string literals are frozen.
        @node.children.map { |child| @yielder.call(child, context).to_s }.join
      end
    end
  end
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
require_relative "./base"
|
2
|
+
|
3
|
+
# Handles "blipFill" nodes: loads the embedded picture referenced by the
# node's a:blip child, applies the optional a:srcRect crop, and stores the
# resulting image as the context's background image.
class DoctorNinja::Parsers::BlipFill < DoctorNinja::Parsers::Base
  def self.applicable_to?(node)
    node.name == "blipFill"
  end

  # Loads, crops and assigns the image. The context must respond to
  # #background_image=.
  def parse
    blob = @document.relationships[rel_id]
    @image = Magick::Image.from_blob(blob).first
    transform
    @context.background_image = @image
  end

  private

  # Value of the "embed" attribute on the a:blip child.
  def rel_id
    blip = @node.xpath("./a:blip", "a" => xmlns_a)
    blip.attribute("embed").value
  end

  # Crops @image according to a:srcRect; does nothing when the node has none.
  # The l/t/r/b attributes are scaled against the image size by dividing by
  # 100000; a missing attribute counts as 0.
  def transform
    rect = src_rect
    return unless rect

    left   = rect["l"].to_i * @image.columns / 100000
    top    = rect["t"].to_i * @image.rows / 100000
    right  = rect["r"].to_i * @image.columns / 100000
    bottom = rect["b"].to_i * @image.rows / 100000

    cropped_width  = @image.columns - left - right
    cropped_height = @image.rows - bottom - top

    @image = @image.crop(left, top, cropped_width, cropped_height)
  end

  # First a:srcRect child, or nil when there is none.
  def src_rect
    @node.xpath("./a:srcRect", "a" => xmlns_a)[0]
  end

  # Namespace URI bound to the "a" prefix at this node.
  def xmlns_a
    @node.namespaces["xmlns:a"]
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
require "rvg/rvg"
|
2
|
+
require "base64"
|
3
|
+
|
4
|
+
# Handles "drawing" nodes: renders the drawing onto an RVG canvas sized from
# its wp:extent and returns it as an <img> tag with an inline PNG data URI.
class DoctorNinja::Parsers::Drawing < DoctorNinja::Parsers::Base
  include Magick

  def self.applicable_to?(node)
    node.name == "drawing"
  end

  # Returns the <img> tag String for this drawing.
  def parse
    width = extent[:x] / DoctorNinja::EMU_PER_PIXEL
    height = extent[:y] / DoctorNinja::EMU_PER_PIXEL

    # Children are parsed with the canvas as their context, so they can draw
    # on it; the String returned by parse_children is not used here.
    rvg = RVG.new(width, height).viewbox(0, 0, width, height) do |canvas|
      parse_children(canvas)
    end

    format = "png"
    blob = rvg.draw.to_blob { self.format = format }
    # strict_encode64 emits no line feeds; encode64 would insert one every
    # 60 characters plus a trailing one, all of which would land inside the
    # src attribute.
    base64 = Base64.strict_encode64(blob)

    "<img src=\"data:image/#{format};base64,#{base64}\"/>"
  end

  # Size of the drawing as { x:, y: }, read from the cx/cy attributes of
  # wp:extent and converted to Integer. Memoized.
  def extent
    @extent ||= {
      x: extent_node.attribute("cx").value.to_i,
      y: extent_node.attribute("cy").value.to_i
    }
  end

  # The wp:extent descendants of this node (as returned by xpath).
  def extent_node
    @node.xpath(".//wp:extent")
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
require_relative "./base"
|
2
|
+
|
3
|
+
# Handles "oMath" nodes: converts the Office Math markup to MathML with the
# bundled omml2mml.xsl stylesheet, applied through Nokogiri's XSLT support.
class DoctorNinja::Parsers::Math < DoctorNinja::Parsers::Base
  # Location of the OMML -> MathML stylesheet shipped next to this file.
  XSL_PATH = File.join(File.dirname(__FILE__), "omml2mml.xsl").freeze

  def self.applicable_to?(node)
    node.name == "oMath"
  end

  # The compiled stylesheet. Read and compiled once per process instead of
  # once per equation.
  def self.stylesheet
    @stylesheet ||= Nokogiri::XSLT(File.read(XSL_PATH))
  end

  # Returns the MathML for this node as an XML String. Namespaces are
  # stripped from the transform result, the MathML namespace is set on the
  # root, and display="block" is added when the context marks the equation
  # as part of a math paragraph.
  def parse
    doc = Nokogiri::XML(mml)
    doc.remove_namespaces!
    doc.root.set_attribute "xmlns", "http://www.w3.org/1998/Math/MathML"
    doc.root.set_attribute "display", "block" if @context[:is_math_para]
    doc.root.to_xml
  end

  # Runs the stylesheet over this node and returns the raw result as XML.
  def mml
    # Wrap the node in a synthetic root so it can be parsed on its own.
    doc = Nokogiri::XML("<root>#{@node.to_xml(encoding: "utf-8")}</root>")

    # Re-declare the namespaces in scope at the node on the synthetic root.
    # Keys look like "xmlns:prefix"; a key without a prefix yields nil here.
    @node.namespaces.each do |k, v|
      doc.root.add_namespace k.split(":")[1], v
    end

    # The wrapped document is serialized and re-parsed before transforming,
    # exactly as before.
    self.class.stylesheet.transform(Nokogiri::XML(doc.to_xml)).to_xml
  end
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
require_relative "./base"
|
2
|
+
|
3
|
+
# Handles "oMathPara" nodes: renders the children with a copy of the context
# in which :is_math_para is set, so nested equations know they belong to a
# math paragraph.
class DoctorNinja::Parsers::MathPara < DoctorNinja::Parsers::Base
  def self.applicable_to?(node)
    node.name == "oMathPara"
  end

  # Returns the rendered children.
  def parse
    # Work on a copy so the flag does not leak into the caller's context.
    @context = @context.dup.tap { |ctx| ctx[:is_math_para] = true }
    parse_children
  end
end
|