doctor_ninja 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: ec64e692dfb01d06fbd09bb42162997d2b47bfb8
4
+ data.tar.gz: 1b80f7d87e7f22966c3733587ade0fdeffceafae
5
+ SHA512:
6
+ metadata.gz: 9eaf28e4b06b699f63c9062e3f53b67c436817383178da6971479a643f57dfa697ea79d442631f35764712038f546d38a1e847f265f75d3a76602a0259b778b4
7
+ data.tar.gz: 80c0b08e752ad68265d70db184ff38447ffcbfdcfd5d8850003ede10973a31bc961db8e5a573406b4009c6218bcd54a7e5985cd1b446e7fda9bd6f00629ad12d
@@ -0,0 +1,14 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
10
+ *.bundle
11
+ *.so
12
+ *.o
13
+ *.a
14
+ mkmf.log
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in doctor_ninja.gemspec
4
+ gemspec
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2014 Bernardo Amorim
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,62 @@
1
+ Here be dragons! This is a WIP.
2
+
3
+ # DoctorNinja
4
+
5
+ DoctorNinja is a library to convert word documents into html files like a ninja.
6
+
7
+ ## Why DoctorNinja?
8
+
9
+ Well, the name is simple, **Doctor** comes from **Doc** (.docx) and **Ninja** comes from our first MVP (Ninja das Dúvidas).
10
+
11
+ ## Features
12
+
13
+ * Converts images and applies the following transformations:
14
+ * Crop
15
+ * Convert Microsoft's Math to MathML
16
+ * Accepts bold, italic and underline
17
+ * Sets text-align to center when there are only non-text children in a paragraph (this is for our own use case; we should probably provide an interface to configure it, maybe by passing a lambda)
18
+
19
+ ## TODO
20
+
21
+ * Pass styleMap as an option(to choose what tag in paragraph rendering)
22
+ * Add general configurations:
23
+ 1. style map
24
+ 2. image inline(src=data)/as file
25
+ 3. MathML/ansimath/latex
26
+
27
+ ## Installation
28
+
29
+ Add this line to your application's Gemfile:
30
+
31
+ ```ruby
32
+ gem 'doctor_ninja'
33
+ ```
34
+
35
+ And then execute:
36
+
37
+ $ bundle
38
+
39
+ Or install it yourself as:
40
+
41
+ $ gem install doctor_ninja
42
+
43
+ ## Usage
44
+
45
+ To use it, first load the .docx file and then call `to_html` to get the HTML.
46
+
47
+ ```ruby
48
+ doc = DoctorNinja::Document.new('path_to_file.docx')
49
+ doc.to_html
50
+ ```
51
+
52
+ Alternatively, you can use our binary, which wraps the output in an HTML boilerplate and adds MathJax to correctly render the MathML:
53
+
54
+ $> doctor_ninja document.docx document.html
55
+
56
+ ## Contributing
57
+
58
+ 1. Fork it ( https://github.com/bamorim/doctor_ninja/fork )
59
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
60
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
61
+ 4. Push to the branch (`git push origin my-new-feature`)
62
+ 5. Create a new Pull Request
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+
3
# `rake` with no task name falls through to :test.
task default: :test

# Requires every file matching ./test/*_test.rb (relative to the directory
# rake is run from).
task :test do
  Dir['./test/*_test.rb'].each do |test_file|
    require test_file
  end
end
@@ -0,0 +1,29 @@
1
#!/usr/bin/env ruby
# Command-line wrapper around DoctorNinja::Document.
#
# Usage: doctor_ninja INPUT.docx [OUTPUT.html]
#
# Converts INPUT.docx to HTML, wraps it in a minimal HTML page that loads
# MathJax, and writes the page to OUTPUT.html (or to STDOUT when no output
# path is given).

# NOTE(review): this CDN host was announced for retirement in 2017 -- confirm
# the URL still resolves before relying on it.
MATHJAX = <<-HTML
<script type="text/javascript"
src="http://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML">
</script>
HTML

PREFIX = <<-HTML
<!doctype HTML>
<html>
<head>
<meta charset="utf-8"/>
#{MATHJAX}
</head>
<body>
HTML

SUFFIX = <<-HTML
</body>
</html>
HTML

require 'doctor_ninja'

input_path, output_path = ARGV
abort "Usage: doctor_ninja INPUT.docx [OUTPUT.html]" unless input_path

doc = DoctorNinja::Document.new(input_path)
begin
  # Convert before touching the output path so a failed conversion does not
  # leave a truncated file behind.
  page = "#{PREFIX}#{doc.to_html}#{SUFFIX}"

  if output_path
    # Block form closes the file even if the write raises.
    File.open(output_path, "w+") { |out| out.write(page) }
  else
    # STDOUT is written to but deliberately not closed.
    STDOUT.write(page)
  end
ensure
  doc.close
end
@@ -0,0 +1,29 @@
1
# coding: utf-8
# Gem specification for doctor_ninja: metadata, packaged files and dependencies.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'doctor_ninja/version'

Gem::Specification.new do |spec|
  spec.name          = "doctor_ninja"
  spec.version       = DoctorNinja::VERSION
  spec.authors       = ["Bernardo Amorim"]
  spec.email         = ["contato@bamorim.com"]
  spec.summary       = %q{DoctorNinja is a library to convert word documents into html files like a ninja.}
  spec.description   = %q{DoctorNinja is built by the Responde Aí Team (www.respondeai.com.br/time) in an attempt to fill our needs to convert docx files into html that also converts Word Formulas into MathML.}
  spec.homepage      = "https://github.com/bamorim/doctor_ninja"
  spec.license       = "MIT"

  # Package exactly the files tracked by git (NUL-separated listing).
  spec.files         = `git ls-files -z`.split("\x0")
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.7"
  spec.add_development_dependency "rake", "~> 10.0"
  spec.add_development_dependency "minitest", "~> 5.4"
  spec.add_development_dependency "minitest-reporters", "~> 1.0"

  spec.add_dependency "nokogiri", "~> 1.6"
  spec.add_dependency "rubyzip", "~> 1.1"
  spec.add_dependency "rmagick", "~> 2.13"
end
@@ -0,0 +1,9 @@
1
+ require "doctor_ninja/version"
2
+ require "doctor_ninja/document"
3
+ require "doctor_ninja/parser"
4
+
5
module DoctorNinja
  # Pixels per inch used when converting EMU measurements to pixels.
  RESOLUTION = 96

  # Number of EMUs in one inch.
  EMU_PER_INCH = 914_400

  # Integer number of EMUs per pixel at RESOLUTION.
  EMU_PER_PIXEL = EMU_PER_INCH.div(RESOLUTION)
end
@@ -0,0 +1,29 @@
1
+ require "zip"
2
+ require "doctor_ninja/parser"
3
+ require "doctor_ninja/relationships"
4
+
5
module DoctorNinja
  # A .docx file opened as a zip archive. Gives access to the archive
  # entries and converts the document to HTML through DoctorNinja::Parser.
  class Document
    # Opens the archive at +file+.
    #
    # Raises InvalidDocumentError when the zip library reports an error.
    #
    # NOTE(review): nothing required in this file appears to define
    # InvalidDocumentError -- confirm the file declaring it is loaded
    # before this rescue can fire, otherwise a NameError is raised instead.
    def initialize(file)
      @file = Zip::File.new(file)
    rescue Zip::Error
      raise InvalidDocumentError
    end

    # Lazily built relationships object for this document.
    def relationships
      return @relationships if @relationships

      @relationships = DoctorNinja::Relationships.new(self)
    end

    # Returns the contents of the archive entry named +file+.
    def read(file)
      @file.read(file)
    end

    # Renders the document as an HTML string.
    def to_html
      html_parser = DoctorNinja::Parser.new(self)
      html_parser.parse
    end

    # Closes the underlying archive.
    def close
      @file.close
    end
  end
end
@@ -0,0 +1,4 @@
1
module DoctorNinja
  # Root of the DoctorNinja error hierarchy.
  Error = Class.new(StandardError)

  # Signals that a file could not be opened as a document.
  InvalidDocumentError = Class.new(Error)
end
@@ -0,0 +1,59 @@
1
+ require 'nokogiri'
2
+ Dir[File.dirname(__FILE__) + '/parsers/*.rb'].each {|file| require file }
3
+
4
module DoctorNinja
  # Reads word/document.xml from a document and renders it by handing each
  # XML node to the first parser class that declares itself applicable.
  #
  # Debugging is driven by environment variables:
  #   DEBUG=all            every node
  #   DEBUG=missing        nodes that only the fallback parser accepts
  #   DEBUG=name           nodes with that name
  #   DEBUG=prefix:name    nodes with that namespace prefix and name
  #   DEBUG_MODE=pry       open a pry session instead of dumping the node
  class Parser
    # Fallback parser that accepts every node; it inherits Base#parse.
    class Noop < DoctorNinja::Parsers::Base
      def self.applicable_to?(node)
        true
      end
    end

    # doc must respond to #read(path); the result for "word/document.xml"
    # is parsed as XML.
    def initialize(doc)
      @docx = doc
      @xmldoc = Nokogiri::XML(@docx.read("word/document.xml"))
    end

    # Renders the whole document starting at the XML root with an empty context.
    def parse
      parse_node(@xmldoc.root, {})
    end

    # Renders a single node using the first applicable parser.
    # Noop is always applicable, so the list is never empty.
    def parse_node(node, context)
      parsers = parsers_for(node, context)

      debug(node, binding) if debug?(node, parsers)

      parsers.first.parse
    end

    # Instantiates every parser class applicable to +node+. Each instance
    # receives parse_node as its callback for rendering child nodes.
    def parsers_for(node, context)
      applicable = parsers.select { |klass| klass.applicable_to?(node) }
      applicable.map do |klass|
        klass.new(node, public_method(:parse_node), @docx, context)
      end
    end

    # All constants defined under DoctorNinja::Parsers, followed by Noop.
    def parsers
      registered = DoctorNinja::Parsers.constants.map do |const_name|
        DoctorNinja::Parsers.const_get(const_name)
      end
      registered + [Noop]
    end

    # Whether the DEBUG environment variable selects +node+.
    # +parsers+ is the list returned by parsers_for; length 1 means only
    # the Noop fallback matched.
    def debug?(node, parsers)
      target = ENV["DEBUG"]
      return true if target == "all"
      return true if target == "missing" && parsers.length == 1
      return true if target == node.name

      node.namespace && target == "#{node.namespace.prefix}:#{node.name}"
    end

    # Opens pry on binding +b+ when DEBUG_MODE=pry, otherwise dumps the
    # node's XML. The dump goes to STDERR so it cannot mix with HTML that a
    # caller writes to STDOUT.
    def debug(node, b)
      if ENV["DEBUG_MODE"] == "pry"
        require "pry" # loaded lazily: only needed in this debug mode
        b.pry
      else
        $stderr.puts "---BEGIN---\n#{node.to_xml}\n----END----"
      end
    end
  end
end
@@ -0,0 +1,26 @@
1
module DoctorNinja
  module Parsers
    # Common behaviour for node parsers. Subclasses override
    # .applicable_to? to claim nodes and #parse to render them.
    class Base
      # Whether this parser handles +node+. The base class handles nothing.
      def self.applicable_to?(node)
        false
      end

      # node     - the node to render; must respond to #children
      # yielder  - callable invoked as yielder.call(child, context) to
      #            render a child node
      # document - the document being converted
      # context  - value passed along to child parsers
      def initialize(node, yielder, document, context)
        @node = node
        @yielder = yielder
        @document = document
        @context = context
      end

      # Default rendering: the concatenated rendering of all children.
      def parse
        parse_children
      end

      # Renders every child through the yielder and concatenates the results
      # (each converted with #to_s). Returns a String.
      #
      # Uses map + join rather than appending to a "" literal, so no string
      # literal is mutated when frozen string literals are enabled.
      def parse_children(context=@context)
        @node.children.map { |child| @yielder.call(child, context).to_s }.join
      end
    end
  end
end
@@ -0,0 +1,43 @@
1
+ require_relative "./base"
2
+
3
# Handles "blipFill" nodes: loads the referenced image, applies the optional
# srcRect crop and assigns the result as the context's background image.
class DoctorNinja::Parsers::BlipFill < DoctorNinja::Parsers::Base
  def self.applicable_to?(node)
    node.name == "blipFill"
  end

  # Loads the image blob looked up by relationship id, crops it, and sets it
  # on @context via background_image=.
  def parse
    blob = @document.relationships[rel_id]
    @image = Magick::Image.from_blob(blob).first
    transform
    @context.background_image = @image
  end

  private

  # Value of the "embed" attribute on the a:blip child.
  def rel_id
    blip = @node.xpath("./a:blip", "a" => xmlns_a)
    blip.attribute("embed").value
  end

  # Crops @image by the srcRect insets, if a srcRect is present. The l/t/r/b
  # attributes are scaled as 1/100000ths of the image width or height.
  def transform
    rect = src_rect
    return unless rect

    left   = rect["l"].to_i * @image.columns / 100000
    top    = rect["t"].to_i * @image.rows / 100000
    right  = rect["r"].to_i * @image.columns / 100000
    bottom = rect["b"].to_i * @image.rows / 100000

    cropped_width  = @image.columns - left - right
    cropped_height = @image.rows - bottom - top

    @image = @image.crop(left, top, cropped_width, cropped_height)
  end

  # First a:srcRect child, or nil when there is none.
  def src_rect
    @node.xpath("./a:srcRect", "a" => xmlns_a)[0]
  end

  # Namespace URI bound to the "a" prefix at this node.
  def xmlns_a
    @node.namespaces["xmlns:a"]
  end
end
@@ -0,0 +1,35 @@
1
# Required explicitly, like the sibling parsers do, so this file does not
# depend on being loaded after base.rb.
require_relative "./base"
require "rvg/rvg"
require "base64"

# Handles "drawing" nodes: renders the children onto an RVG canvas sized from
# the wp:extent element and returns an <img> tag with the PNG inlined as a
# base64 data URI.
class DoctorNinja::Parsers::Drawing < DoctorNinja::Parsers::Base
  include Magick

  def self.applicable_to?(node)
    node.name == "drawing"
  end

  # Returns the <img> tag as a String.
  def parse
    width  = extent[:x] / DoctorNinja::EMU_PER_PIXEL
    height = extent[:y] / DoctorNinja::EMU_PER_PIXEL

    # The canvas is passed to the children as their context.
    rvg = RVG.new(width, height).viewbox(0, 0, width, height) do |canvas|
      parse_children(canvas)
    end

    format = "png"
    blob = rvg.draw.to_blob { self.format = format }

    # strict_encode64 emits no line feeds, keeping the src attribute on one
    # line.
    base64 = Base64.strict_encode64(blob)

    "<img src=\"data:image/#{format};base64,#{base64}\"/>"
  end

  # Integer cx/cy attributes of the extent node, memoized as {x:, y:}.
  def extent
    @extent ||= {
      x: extent_node.attribute("cx").value.to_i,
      y: extent_node.attribute("cy").value.to_i
    }
  end

  # wp:extent descendants of this node.
  def extent_node
    @node.xpath(".//wp:extent")
  end
end
@@ -0,0 +1,29 @@
1
+ require_relative "./base"
2
+
3
# Handles "oMath" nodes by converting them to MathML with the omml2mml.xsl
# stylesheet that sits next to this file.
class DoctorNinja::Parsers::Math < DoctorNinja::Parsers::Base
  @@xsl = File.join(File.dirname(__FILE__),"omml2mml.xsl")

  def self.applicable_to?(node)
    node.name == "oMath"
  end

  # Compiled stylesheet, built on first use and reused afterwards so the XSL
  # file is not re-read and re-compiled for every formula.
  def self.stylesheet
    @stylesheet ||= Nokogiri::XSLT(File.read(@@xsl))
  end

  # Returns the MathML markup as a String. Namespaces produced by the
  # transform are stripped and the MathML xmlns is set on the root;
  # display="block" is added when the context marks a math paragraph.
  def parse
    doc = Nokogiri::XML(mml)
    doc.remove_namespaces!
    doc.root.set_attribute "xmlns", "http://www.w3.org/1998/Math/MathML"
    doc.root.set_attribute "display", "block" if @context[:is_math_para]
    doc.root.to_xml
  end

  # Runs the XSLT transform (via Nokogiri::XSLT) and returns the result as
  # an XML string.
  def mml
    # Wrap the node in a synthetic root that carries the node's namespace
    # declarations.
    doc = Nokogiri::XML("<root>#{@node.to_xml(encoding: "utf-8")}</root>")

    @node.namespaces.each do |k,v|
      # Keys look like "xmlns:prefix"; a bare "xmlns" yields a nil prefix.
      doc.root.add_namespace k.split(":")[1], v
    end

    self.class.stylesheet.transform(Nokogiri::XML(doc.to_xml)).to_xml
  end
end
@@ -0,0 +1,13 @@
1
+ require_relative "./base"
2
+
3
# Handles "oMathPara" nodes: renders the children with :is_math_para set in
# the context.
class DoctorNinja::Parsers::MathPara < DoctorNinja::Parsers::Base
  def self.applicable_to?(node)
    node.name == "oMathPara"
  end

  # Works on a copy of the context so the object received from the caller is
  # not modified.
  def parse
    paragraph_context = @context.dup
    paragraph_context[:is_math_para] = true
    @context = paragraph_context
    parse_children
  end
end