doc2text 0.3.2 → 0.4.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/bin/doc2text +43 -0
- data/lib/doc2text.rb +16 -5
- data/lib/doc2text/docx/docx.rb +31 -0
- data/lib/doc2text/docx/docx_xml_namespaces.rb +55 -0
- data/lib/doc2text/docx/markdown_docx_parser.rb +81 -0
- data/lib/doc2text/generic_xml_nodes.rb +78 -0
- data/lib/doc2text/odt/markdown_odt_parser.rb +81 -0
- data/lib/doc2text/odt/odt.rb +43 -0
- data/lib/doc2text/odt/odt_xml_namespaces.rb +240 -0
- data/lib/doc2text/pptx/markdown_pptx_parser.rb +55 -0
- data/lib/doc2text/pptx/pptx.rb +30 -0
- data/lib/doc2text/pptx/pptx_xml_namespaces.rb +55 -0
- data/lib/doc2text/resolution.rb +14 -0
- data/lib/doc2text/styles_parser.rb +28 -0
- data/lib/doc2text/xml_based_document_file.rb +48 -0
- metadata +26 -29
- data/lib/doc2text/content.rb +0 -25
- data/lib/doc2text/markdown_odt_parser.rb +0 -72
- data/lib/doc2text/odt.rb +0 -73
- data/lib/doc2text/odt_xml_namespaces.rb +0 -215
- data/lib/doc2text/odt_xml_node.rb +0 -99
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: e803f834de30ec59e70080fa881b47e4647e467f1d443cca097aaa726d80d48a
|
4
|
+
data.tar.gz: 5ff4569486e0a8f59a089918abd5f3025e46bf3970b7e7c5d50e6d2bd38e9718
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 29766bf3c446cd231277da1d8f41f6d3e2c8c8b46e01f58acefe9b62f123646de7757680cd94687637ef06a439bb69a8066a623fc25235a97e70970d311886dd
|
7
|
+
data.tar.gz: 585d4d505d4ffa9c9885e813debe512edefda10943fdf1693388838c858830e0a4802f12716fbc956ac8b8de4c0899892d9b35a9a8fe32e1e3707ff3c8fd7e00
|
data/bin/doc2text
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Command-line entry point for doc2text.
#
# Accepts either two positional arguments (source document, output file) or
# the -s/--source and -o/--output flags, and hands both paths to
# Doc2Text::Resolution.parse_and_save.
#
# Exit status: 0 on success or after --help; 1 when the command line is
# invalid or either the source or the output path is missing.

$LOAD_PATH.unshift File.dirname(__FILE__) + '/../lib'
require 'doc2text'
require 'optparse'

options = {}
opt_parse = OptionParser.new do |opts|
  opts.banner = "Usage: doc2text document.odt output.md
   or: doc2text [OPTION]... -s input[.odt] -o output[.md]\n\n"

  opts.on('-s FILE', '--source FILE', 'Odt FILE document to be processed') do |file|
    options[:source] = file
  end

  opts.on('-o FILE', '--output FILE', 'Output to markdown document FILE') do |file|
    options[:output] = file
  end

  opts.on_tail('-h', '--help', 'Show this message') do
    puts opts
    exit
  end
end

begin
  opt_parse.parse!
rescue OptionParser::InvalidOption, OptionParser::MissingArgument => e
  # Usage errors go to stderr and must not look like success to the caller.
  warn e.message
  warn opt_parse.to_s
  exit 1
end

# The positional form is only honoured when no flag was given at all.
options[:source], options[:output] = *ARGV if options.empty? && ARGV.size == 2

# Previously a lone -s or -o slipped through and passed nil to the converter.
unless options[:source] && options[:output]
  warn opt_parse.to_s
  exit 1
end

Doc2Text::Resolution.parse_and_save options[:source], options[:output]
|
data/lib/doc2text.rb
CHANGED
@@ -2,10 +2,21 @@ require 'nokogiri'
|
|
2
2
|
#require 'nokogiri/xml'
|
3
3
|
require 'fileutils'
|
4
4
|
|
5
|
-
require 'doc2text/
|
6
|
-
require 'doc2text/
|
7
|
-
require 'doc2text/
|
8
|
-
require 'doc2text/markdown_odt_parser'
|
5
|
+
require 'doc2text/xml_based_document_file'
|
6
|
+
require 'doc2text/generic_xml_nodes'
|
7
|
+
require 'doc2text/resolution'
|
9
8
|
require 'doc2text/errors'
|
10
9
|
|
11
|
-
require 'doc2text/
|
10
|
+
require 'doc2text/odt/odt'
|
11
|
+
require 'doc2text/odt/odt_xml_namespaces'
|
12
|
+
require 'doc2text/odt/markdown_odt_parser'
|
13
|
+
|
14
|
+
require 'doc2text/docx/docx'
|
15
|
+
require 'doc2text/docx/markdown_docx_parser'
|
16
|
+
require 'doc2text/docx/docx_xml_namespaces'
|
17
|
+
|
18
|
+
require 'doc2text/pptx/pptx'
|
19
|
+
require 'doc2text/pptx/markdown_pptx_parser'
|
20
|
+
require 'doc2text/pptx/pptx_xml_namespaces'
|
21
|
+
|
22
|
+
require 'doc2text/styles_parser'
|
@@ -0,0 +1,31 @@
|
|
1
|
+
module Doc2Text
  module Docx
    # A .docx document, handled as an unpacked archive of XML files.
    #
    # NOTE(review): unpack, parse_styles, parse, clean and extract_path are not
    # defined here; presumably they come from XmlBasedDocument::DocumentFile.
    class Document < XmlBasedDocument::DocumentFile

      # Converts the document at +input+ and writes the result to
      # +output_filename+. The unpacked files are cleaned up afterwards, even
      # when the conversion raises.
      def self.parse_and_save(input, output_filename)
        document = new(input)
        begin
          document.unpack
          styles_root = document.parse_styles
          sink = File.open(output_filename, 'w')
          parser = Markdown::DocxParser.new(sink, styles_root)
          begin
            document.parse(parser)
          ensure
            # Closing the parser closes the output file it writes to.
            parser.close
          end
        ensure
          document.clean
        end
      end

      # Suffix used for the directory the archive is unpacked into.
      def extract_extension
        'unpacked_docx'
      end

      # True when the extraction directory holds the '[Content_Types].xml' file.
      def contains_extracted_files?
        content_types = File.join(extract_path, '[Content_Types].xml')
        File.exist?(content_types)
      end
    end
  end
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
module Doc2Text
  module Docx
    module XmlNodes
      # Base class for docx XML nodes; also the factory for concrete node classes.
      class Node < XmlBasedDocument::XmlNodes::Node
        # Builds the node object for <prefix:name>.
        #
        # Looks up the class "<Prefix>::W<name>" under XmlNodes (for example
        # W::Wbody for <w:body>) and falls back to Generic when the lookup
        # raises NameError.
        def self.create_node(prefix, name, parent = nil, attrs = [], markdown_odt_parser = nil)
          node_class =
            begin
              XmlNodes.const_get "#{prefix.capitalize}::W#{name}"
            rescue NameError
              # markdown_odt_parser.logger.warn "No such <#{prefix}:#{name}> found"
              Generic
            end
          node_class.new(parent, attrs, prefix, name, markdown_odt_parser)
        end

        # Only the document body node answers true (see W::Wbody).
        def body?
          false
        end
      end

      # Character data found inside an element.
      class PlainText < XmlBasedDocument::XmlNodes::PlainText
        def body?
          false
        end
      end

      # Used for every element that has no dedicated class.
      class Generic < Node
      end

      # Node classes for the "w" prefix.
      module W
        # <w:body>: the parser writes out each child of this node as soon as
        # the child's end tag is seen.
        class Wbody < Node
          def body?
            true
          end
        end

        # <w:br>: rendered as an HTML line break.
        class Wbr < Node
          def open
            '<br/>'
          end
        end

        # <w:p>: a paragraph, wrapped in newlines.
        class Wp < Node
          def open
            "\n"
          end

          def close
            "\n"
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,81 @@
|
|
1
|
+
require 'logger'
|
2
|
+
|
3
|
+
module Doc2Text
  module Markdown
    # SAX handler that builds a tree of Docx::XmlNodes while the document is
    # parsed and writes the expanded text of every child of the body node to
    # +output+ as soon as that child's end tag is reached.
    class DocxParser < Nokogiri::XML::SAX::Document
      # output          - object the converted text is appended to with <<;
      #                   it is closed by #close.
      # styles_xml_root - optional root node of the parsed styles document,
      #                   also searched by #xpath.
      def initialize(output, styles_xml_root = nil)
        @styles_xml_root = styles_xml_root
        @output = output
        @automatic_styles = {}
      end

      # SAX callback: creates a node for the opened element and makes it the
      # current node. The first element seen becomes the root.
      def start_element_namespace(name, attrs = [], prefix = nil, uri = nil, ns = [])
        if @xml_root
          new_node = Docx::XmlNodes::Node.create_node prefix, name, @current_node, attrs, self
          @current_node.children << new_node
          @current_node = new_node
        else
          @xml_root = @current_node = Docx::XmlNodes::Node.create_node prefix, name, nil, attrs, self
        end
      end

      # SAX callback: when the closed element is a direct child of the body,
      # its expansion is written out and its subtree dropped, so the tree does
      # not grow with the document.
      def end_element_namespace(name, prefix = nil, uri = nil)
        if @current_node.parent && @current_node.parent.body?
          @output << @current_node.expand
          @current_node.delete
        end
        @current_node = @current_node.parent
      end

      # SAX callback: keeps character data unless it is whitespace only.
      def characters(string)
        return if string.strip.empty?

        @current_node.children << Docx::XmlNodes::PlainText.new(string)
      end

      # Closes the output the parser writes to.
      def close
        @output.close
      end

      # Debug helper: prints +node+ and all of its descendants, one per line.
      def print_tree(node)
        puts node
        node.children.each do |child|
          print_tree child
        end
      end

      # Select nodes xpath style
      # - supports selecting from the root node
      #
      # +string+ is one or more absolute paths such as "/a:b/c:d", separated by
      # '|'. Each path is searched in the content tree and, when present, in
      # the styles tree. Returns an array of matching nodes.
      #
      # Raises Doc2Text::XmlError for any other syntax. The whole pattern must
      # match (\A ... \z): with ^ and $ a multi-line string passed validation
      # on a single line and tokens from the other lines leaked into the path.
      def xpath(string)
        patterns = string.split '|'
        raise Doc2Text::XmlError, 'it does not support this xpath syntax' if patterns.empty?

        # A root is nil before anything was parsed; skip it instead of crashing.
        roots = [@xml_root, @styles_xml_root].compact
        result = []
        patterns.each do |pattern|
          unless pattern =~ %r{\A(/[\w:\-]+)+\z}
            raise Doc2Text::XmlError, 'it does not support this xpath syntax'
          end

          path = pattern.scan(/[\w:\-]+/)
          roots.each { |root| result += xpath_search_nodes(path, root) }
        end
        result
      end

      # Walks +path+ (an array of "prefix:name" strings) down from +xml_root+,
      # keeping at every level only the nodes whose xml_name matches. Returns
      # the nodes matching the last path element.
      def xpath_search_nodes(path, xml_root)
        seek_nodes = [xml_root]
        last_index = path.length - 1
        path.each_with_index do |xml_name, index|
          seek_nodes = seek_nodes.select { |node| node.xml_name == xml_name }
          # Array() tolerates nodes that report no children list at all.
          seek_nodes = seek_nodes.flat_map { |node| Array(node.children) } unless index == last_index
          break if seek_nodes.empty?
        end
        seek_nodes
      end

      # Lazily created logger writing to STDOUT.
      def logger
        @logger ||= Logger.new(STDOUT)
      end
    end
  end
end
|
@@ -0,0 +1,78 @@
|
|
1
|
+
module Doc2Text
  module XmlBasedDocument
    module XmlNodes
      # A node of the in-memory XML tree built while a document is parsed.
      # Subclasses override #open and #close to emit the text that surrounds
      # the expansion of their children.
      class Node
        attr_reader :parent, :children, :attrs, :prefix, :name
        attr_accessor :text

        # Gives every subclass a class-level titleize helper that turns a
        # dash-separated tag name into CamelCase ("table-cell" -> "TableCell").
        def self.inherited(subclass)
          def subclass.titleize(tag)
            tag.split('-').map(&:capitalize).join
          end
        end

        # parent              - enclosing node, nil for the root.
        # attrs               - attributes as handed over by the SAX parser.
        # prefix, name        - namespace prefix and local name of the element.
        # markdown_odt_parser - the parser that created the node.
        def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_odt_parser = nil)
          @parent, @attrs, @prefix, @name, @xml_parser = parent, attrs, prefix, name, markdown_odt_parser
          @children = []
          @has_text = false
        end

        def root?
          !@parent
        end

        def has_text?
          @has_text
        end

        # Text emitted before the children; empty unless overridden.
        def open
          ''
        end

        # Text emitted after the children; empty unless overridden.
        def close
          ''
        end

        # Recursively drops the whole subtree below this node.
        def delete
          return true unless @children
          @children.each { |child| child.delete }
          @children = []
        end

        # Two nodes are eql? when they carry the same "prefix:name".
        def eql?(object)
          return false unless object.is_a? Node
          object.xml_name == xml_name
        end

        # NOTE(review): true only for direct instances of this base class;
        # instances of any subclass answer false - confirm this is intended.
        def generic?
          instance_of? Node
        end

        def xml_name
          "#{@prefix}:#{@name}"
        end

        def to_s
          "#{xml_name} : #{attrs}"
        end

        # Returns open + the expansion of all children + close as a new
        # string, and drops the children afterwards.
        def expand
          expanded = "#{open}#{@children.map(&:expand).join}#{close}"
          delete
          expanded
        end
      end

      # Leaf node holding character data; it expands to its text.
      class PlainText < Node

        attr_accessor :text

        alias_method :expand, :text

        def initialize(text)
          # Initialise the Node state as well: without it #children was nil
          # and every tree walk (children.each) failed on a text node.
          super()
          @text = text
        end
      end
    end
  end
end
|
@@ -0,0 +1,81 @@
|
|
1
|
+
require 'logger'
|
2
|
+
|
3
|
+
module Doc2Text
  module Markdown
    # SAX handler that builds a tree of Odt::XmlNodes while the document is
    # parsed and writes the expanded text of every child of the office:text
    # node to +output+ as soon as that child's end tag is reached.
    class OdtParser < Nokogiri::XML::SAX::Document
      # output          - object the converted text is written to with #write;
      #                   it is closed by #close.
      # styles_xml_root - optional root node of the parsed styles document,
      #                   also searched by #xpath.
      def initialize(output, styles_xml_root = nil)
        @styles_xml_root = styles_xml_root
        @output = output
        @automatic_styles = {}
      end

      # SAX callback: creates a node for the opened element and makes it the
      # current node. The first element seen becomes the root.
      def start_element_namespace(name, attrs = [], prefix = nil, uri = nil, ns = [])
        if @xml_root
          new_node = Odt::XmlNodes::Node.create_node prefix, name, @current_node, attrs, self
          @current_node.children << new_node
          @current_node = new_node
        else
          @xml_root = @current_node = Odt::XmlNodes::Node.create_node prefix, name, nil, attrs, self
        end
      end

      # SAX callback: when the closed element is a direct child of the
      # office:text node, its expansion is written out and its subtree
      # dropped, so the tree does not grow with the document.
      def end_element_namespace(name, prefix = nil, uri = nil)
        if @current_node.parent && @current_node.parent.office_text?
          @output.write @current_node.expand
          @current_node.delete
        end
        @current_node = @current_node.parent
      end

      # SAX callback: keeps character data unless it is whitespace only.
      def characters(string)
        return if string.strip.empty?

        @current_node.children << Odt::XmlNodes::PlainText.new(string)
      end

      # Closes the output the parser writes to.
      def close
        @output.close
      end

      # Debug helper: prints +node+ and all of its descendants, one per line.
      def print_tree(node)
        puts node
        node.children.each do |child|
          print_tree child
        end
      end

      # Select nodes xpath style
      # - supports selecting from the root node
      #
      # +string+ is one or more absolute paths such as "/a:b/c:d", separated by
      # '|'. Each path is searched in the content tree and, when present, in
      # the styles tree. Returns an array of matching nodes.
      #
      # Raises Doc2Text::XmlError for any other syntax. The whole pattern must
      # match (\A ... \z): with ^ and $ a multi-line string passed validation
      # on a single line and tokens from the other lines leaked into the path.
      def xpath(string)
        patterns = string.split '|'
        raise Doc2Text::XmlError, 'it does not support this xpath syntax' if patterns.empty?

        # A root is nil before anything was parsed; skip it instead of crashing.
        roots = [@xml_root, @styles_xml_root].compact
        result = []
        patterns.each do |pattern|
          unless pattern =~ %r{\A(/[\w:\-]+)+\z}
            raise Doc2Text::XmlError, 'it does not support this xpath syntax'
          end

          path = pattern.scan(/[\w:\-]+/)
          roots.each { |root| result += xpath_search_nodes(path, root) }
        end
        result
      end

      # Walks +path+ (an array of "prefix:name" strings) down from +xml_root+,
      # keeping at every level only the nodes whose xml_name matches. Returns
      # the nodes matching the last path element.
      def xpath_search_nodes(path, xml_root)
        seek_nodes = [xml_root]
        last_index = path.length - 1
        path.each_with_index do |xml_name, index|
          seek_nodes = seek_nodes.select { |node| node.xml_name == xml_name }
          # Array() tolerates nodes that report no children list at all.
          seek_nodes = seek_nodes.flat_map { |node| Array(node.children) } unless index == last_index
          break if seek_nodes.empty?
        end
        seek_nodes
      end

      # Lazily created logger writing to STDOUT.
      def logger
        @logger ||= Logger.new(STDOUT)
      end
    end
  end
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
module Doc2Text
  module Odt
    # An .odt document, handled as an unpacked archive of XML files.
    #
    # NOTE(review): unpack, clean, extract_path and (presumably) open are not
    # defined here; they are expected from XmlBasedDocument::DocumentFile.
    class Document < XmlBasedDocument::DocumentFile

      # Suffix used for the directory the archive is unpacked into.
      def extract_extension
        'unpacked_odt'
      end

      # Converts the document at +input+ and writes the result to
      # +output_filename+. The unpacked files are cleaned up afterwards, even
      # when the conversion raises.
      def self.parse_and_save(input, output_filename)
        document = new(input)
        begin
          document.unpack
          styles_root = document.parse_styles
          sink = File.open(output_filename, 'w')
          parser = Markdown::OdtParser.new(sink, styles_root)
          begin
            document.parse(parser)
          ensure
            # Closing the parser closes the output file it writes to.
            parser.close
          end
        ensure
          document.clean
        end
      end

      # Runs a SAX parse of 'styles.xml' and returns the root node collected
      # by the styles handler.
      def parse_styles
        handler = Doc2Text::Odt::StylesParser.new
        Nokogiri::XML::SAX::Parser.new(handler).parse(open('styles.xml'))
        handler.xml_root
      end

      # Feeds 'content.xml' through a SAX parser driven by +markdown+.
      def parse(markdown)
        sax = Nokogiri::XML::SAX::Parser.new(markdown) # { |config| config.strict}
        sax.parse(open('content.xml'))
      end

      # True when both 'content.xml' and 'mimetype' exist in the extraction
      # directory.
      def contains_extracted_files?
        %w[content.xml mimetype].all? do |entry|
          File.exist?(File.join(extract_path, entry))
        end
      end
    end
  end
end
|