doc2text 0.3.2 → 0.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/bin/doc2text +43 -0
- data/lib/doc2text.rb +16 -5
- data/lib/doc2text/docx/docx.rb +31 -0
- data/lib/doc2text/docx/docx_xml_namespaces.rb +55 -0
- data/lib/doc2text/docx/markdown_docx_parser.rb +81 -0
- data/lib/doc2text/generic_xml_nodes.rb +78 -0
- data/lib/doc2text/odt/markdown_odt_parser.rb +81 -0
- data/lib/doc2text/odt/odt.rb +43 -0
- data/lib/doc2text/odt/odt_xml_namespaces.rb +240 -0
- data/lib/doc2text/pptx/markdown_pptx_parser.rb +55 -0
- data/lib/doc2text/pptx/pptx.rb +30 -0
- data/lib/doc2text/pptx/pptx_xml_namespaces.rb +55 -0
- data/lib/doc2text/resolution.rb +14 -0
- data/lib/doc2text/styles_parser.rb +28 -0
- data/lib/doc2text/xml_based_document_file.rb +48 -0
- metadata +26 -29
- data/lib/doc2text/content.rb +0 -25
- data/lib/doc2text/markdown_odt_parser.rb +0 -72
- data/lib/doc2text/odt.rb +0 -73
- data/lib/doc2text/odt_xml_namespaces.rb +0 -215
- data/lib/doc2text/odt_xml_node.rb +0 -99
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: e803f834de30ec59e70080fa881b47e4647e467f1d443cca097aaa726d80d48a
|
4
|
+
data.tar.gz: 5ff4569486e0a8f59a089918abd5f3025e46bf3970b7e7c5d50e6d2bd38e9718
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 29766bf3c446cd231277da1d8f41f6d3e2c8c8b46e01f58acefe9b62f123646de7757680cd94687637ef06a439bb69a8066a623fc25235a97e70970d311886dd
|
7
|
+
data.tar.gz: 585d4d505d4ffa9c9885e813debe512edefda10943fdf1693388838c858830e0a4802f12716fbc956ac8b8de4c0899892d9b35a9a8fe32e1e3707ff3c8fd7e00
|
data/bin/doc2text
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
$LOAD_PATH.unshift File.dirname(__FILE__) + '/../lib'
|
4
|
+
require 'doc2text'
|
5
|
+
require 'optparse'
|
6
|
+
|
7
|
+
# Command-line entry point: collects a source document path and an output
# path (either via -s/-o switches or as two positional arguments) and hands
# them to Doc2Text::Resolution.parse_and_save.
options = {}
opt_parse = OptionParser.new do |opts|
  opts.banner = "Usage: doc2text document.odt output.md\n" \
                "or: doc2text [OPTION]... -s input[.odt] -o output[.md]\n\n"

  opts.on('-s FILE', '--source FILE', 'Odt FILE document to be processed') do |file|
    options[:source] = file
  end

  opts.on('-o FILE', '--output FILE', 'Output to markdown document FILE') do |file|
    options[:output] = file
  end

  opts.on_tail('-h', '--help', 'Show this message') do
    puts opts
    exit
  end
end

begin
  opt_parse.parse!
rescue OptionParser::ParseError => e
  # Any malformed command line is a usage error: report it on stderr and
  # signal failure to the calling shell instead of exiting with status 0.
  warn e.message
  warn opt_parse.help
  exit 1
end

# Positional form: used only when no switch was given at all.
options[:source], options[:output] = *ARGV if options.empty? && ARGV.size == 2

# Both paths are mandatory; without this check a lone -s or -o would pass
# nil further down and fail with an unrelated error.
unless options[:source] && options[:output]
  warn opt_parse.help
  exit 1
end

Doc2Text::Resolution.parse_and_save options[:source], options[:output]
|
data/lib/doc2text.rb
CHANGED
@@ -2,10 +2,21 @@ require 'nokogiri'
|
|
2
2
|
#require 'nokogiri/xml'
|
3
3
|
require 'fileutils'
|
4
4
|
|
5
|
-
require 'doc2text/
|
6
|
-
require 'doc2text/
|
7
|
-
require 'doc2text/
|
8
|
-
require 'doc2text/markdown_odt_parser'
|
5
|
+
require 'doc2text/xml_based_document_file'
|
6
|
+
require 'doc2text/generic_xml_nodes'
|
7
|
+
require 'doc2text/resolution'
|
9
8
|
require 'doc2text/errors'
|
10
9
|
|
11
|
-
require 'doc2text/
|
10
|
+
require 'doc2text/odt/odt'
|
11
|
+
require 'doc2text/odt/odt_xml_namespaces'
|
12
|
+
require 'doc2text/odt/markdown_odt_parser'
|
13
|
+
|
14
|
+
require 'doc2text/docx/docx'
|
15
|
+
require 'doc2text/docx/markdown_docx_parser'
|
16
|
+
require 'doc2text/docx/docx_xml_namespaces'
|
17
|
+
|
18
|
+
require 'doc2text/pptx/pptx'
|
19
|
+
require 'doc2text/pptx/markdown_pptx_parser'
|
20
|
+
require 'doc2text/pptx/pptx_xml_namespaces'
|
21
|
+
|
22
|
+
require 'doc2text/styles_parser'
|
@@ -0,0 +1,31 @@
|
|
1
|
+
module Doc2Text
  module Docx
    # A DOCX file handled through the shared XmlBasedDocument::DocumentFile API.
    class Document < XmlBasedDocument::DocumentFile
      class << self
        # Unpacks +input+, parses it and writes the result to +output_filename+.
        # The parser (and with it the output file) is closed once parsing
        # finishes or fails, and the document's clean step always runs last.
        def parse_and_save(input, output_filename)
          document = new(input)
          begin
            document.unpack
            styles_root = document.parse_styles
            sink = File.open(output_filename, 'w')
            parser = Markdown::DocxParser.new(sink, styles_root)
            begin
              document.parse(parser)
            ensure
              parser.close
            end
          ensure
            document.clean
          end
        end
      end

      # True when '[Content_Types].xml' exists under the extraction path.
      def contains_extracted_files?
        marker = File.join(extract_path, '[Content_Types].xml')
        File.exist?(marker)
      end

      # Name fragment identifying DOCX extraction directories.
      def extract_extension
        'unpacked_docx'
      end
    end
  end
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
module Doc2Text
  module Docx
    module XmlNodes
      # Base class of all DOCX element nodes and factory for them.
      class Node < XmlBasedDocument::XmlNodes::Node
        # Builds the node for <prefix:name>. The class is looked up as
        # XmlNodes::<Prefix>::W<name>; when that lookup raises NameError
        # (which also covers the NoMethodError raised for a nil prefix),
        # a Generic node is created instead.
        def self.create_node(prefix, name, parent = nil, attrs = [], markdown_odt_parser = nil)
          node_class =
            begin
              XmlNodes.const_get "#{prefix.capitalize}::W#{name}"
            rescue NameError
              # markdown_odt_parser.logger.warn "No such <#{prefix}:#{name}> found"
              Generic
            end
          node_class.new(parent, attrs, prefix, name, markdown_odt_parser)
        end

        # Only the body element answers true.
        def body?
          false
        end
      end

      # Text content inside a DOCX element.
      class PlainText < XmlBasedDocument::XmlNodes::PlainText
        def body?
          false
        end
      end

      # Fallback for elements that have no dedicated node class.
      class Generic < Node
      end

      # Node classes for elements carrying the "w" prefix.
      module W
        # <w:body>
        class Wbody < Node
          def body?
            true
          end
        end

        # <w:br> opens with an HTML line break.
        class Wbr < Node
          def open
            '<br/>'
          end
        end

        # <w:p> is wrapped in newlines on both sides.
        class Wp < Node
          def open
            "\n"
          end

          def close
            "\n"
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,81 @@
|
|
1
|
+
require 'logger'
|
2
|
+
|
3
|
+
module Doc2Text
  module Markdown
    # SAX handler that builds a tree of Docx::XmlNodes while parsing and
    # writes each node to +output+ as soon as a direct child of the body
    # element is closed.
    class DocxParser < Nokogiri::XML::SAX::Document
      # output          - object receiving the expanded text via << and closed by #close
      # styles_xml_root - optional root node of an already parsed styles tree
      def initialize(output, styles_xml_root = nil)
        @styles_xml_root = styles_xml_root
        @output = output
        @automatic_styles = {}
      end

      # The first element seen becomes the tree root; every later element is
      # appended to the current node and becomes the new current node.
      def start_element_namespace(name, attrs = [], prefix = nil, uri = nil, ns = [])
        if @xml_root
          new_node = Docx::XmlNodes::Node.create_node prefix, name, @current_node, attrs, self
          @current_node.children << new_node
          @current_node = new_node
        else
          @xml_root = @current_node = Docx::XmlNodes::Node.create_node prefix, name, nil, attrs, self
        end
      end

      # Flushes the closed node when its parent is the body, then moves back
      # up to the parent.
      def end_element_namespace(name, prefix = nil, uri = nil)
        parent = @current_node.parent
        if parent && parent.body?
          @output << @current_node.expand
          @current_node.delete
        end
        @current_node = parent
      end

      # Adds a text node to the current element; whitespace-only strings are skipped.
      def characters(string)
        return if string.strip.empty?

        @current_node.children << Docx::XmlNodes::PlainText.new(string)
      end

      # Closes the output object.
      def close
        @output.close
      end

      # Debug helper: prints +node+ and all of its descendants, depth first.
      def print_tree(node)
        puts node
        node.children.each { |child| print_tree child }
      end

      # Select nodes xpath style
      # - supports selecting from the root node
      #
      # +string+ holds one or more absolute paths ("/a:b/c:d") separated by
      # '|'. Both the content tree and, when present, the styles tree are
      # searched. Raises Doc2Text::XmlError for any other syntax.
      def xpath(string)
        patterns = string.split '|'
        raise Doc2Text::XmlError, 'it does not support this xpath syntax' if patterns.empty?

        patterns.inject([]) do |result, pattern|
          # \A/\z anchor the whole pattern; ^/$ would accept a string whose
          # first line merely looks like a path.
          unless /\A(?:\/[\w:\-]+)+\z/ =~ pattern
            raise Doc2Text::XmlError, 'it does not support this xpath syntax'
          end

          path = pattern.scan(/[\w:\-]+/)
          result + xpath_search_nodes(path, @xml_root) + xpath_search_nodes(path, @styles_xml_root)
        end
      end

      # Walks +path+ (an array of "prefix:name" strings) down from +xml_root+
      # and returns the nodes matching the last step. Returns [] when there is
      # no root to search.
      def xpath_search_nodes(path, xml_root)
        return [] unless xml_root

        seek_nodes = [xml_root]
        last_index = path.length - 1
        path.each_with_index do |xml_name, index|
          seek_nodes.select! { |node| node.xml_name == xml_name }
          seek_nodes = seek_nodes.map(&:children).flatten unless index == last_index
          break if seek_nodes.empty?
        end
        seek_nodes
      end

      # Lazily created logger writing to STDOUT.
      def logger
        @logger ||= Logger.new(STDOUT)
      end
    end
  end
end
|
@@ -0,0 +1,78 @@
|
|
1
|
+
module Doc2Text
  module XmlBasedDocument
    module XmlNodes
      # Generic element node of the in-memory XML tree. Subclasses override
      # #open and #close to emit the text that surrounds their children.
      class Node
        attr_reader :parent, :children, :attrs, :prefix, :name
        attr_accessor :text

        # Gives every subclass a +titleize+ class method that turns a dashed
        # tag name into CamelCase ("table-cell" => "TableCell").
        def self.inherited(subclass)
          super
          subclass.define_singleton_method(:titleize) do |tag|
            tag.split('-').map(&:capitalize).join
          end
        end

        # parent              - enclosing node, nil for the root
        # attrs               - attributes as handed over by the parser
        # prefix, name        - namespace prefix and local name of the element
        # markdown_odt_parser - the parser that created this node
        def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_odt_parser = nil)
          @parent = parent
          @attrs = attrs
          @prefix = prefix
          @name = name
          @xml_parser = markdown_odt_parser
          @children = []
          @has_text = false
        end

        # True when the node has no parent.
        def root?
          !@parent
        end

        def has_text?
          @has_text
        end

        # Text emitted before the children; empty by default.
        def open
          ''
        end

        # Text emitted after the children; empty by default.
        def close
          ''
        end

        # Recursively drops all descendants so the subtree can be collected.
        def delete
          return true unless @children

          @children.each(&:delete)
          @children = []
        end

        # Two nodes are eql? when they carry the same "prefix:name".
        def eql?(object)
          return false unless object.is_a? Node

          object.xml_name == xml_name
        end

        # True only for direct instances of this base class.
        def generic?
          instance_of? Node
        end

        def xml_name
          "#{@prefix}:#{@name}"
        end

        def to_s
          "#{xml_name} : #{attrs}"
        end

        # Returns open + expanded children + close, then releases the children.
        def expand
          expanded = "#{open}#{@children.map(&:expand).join}#{close}"
          delete
          expanded
        end
      end

      # Leaf node holding raw character data; expands to its text unchanged.
      class PlainText < Node
        alias_method :expand, :text

        def initialize(text)
          # Run the base initializer so #children is an empty array rather
          # than nil; tree walks that call children.each rely on that.
          super()
          @text = text
        end
      end
    end
  end
end
|
@@ -0,0 +1,81 @@
|
|
1
|
+
require 'logger'
|
2
|
+
|
3
|
+
module Doc2Text
  module Markdown
    # SAX handler that builds a tree of Odt::XmlNodes while parsing and
    # writes each node to +output+ as soon as a direct child of the
    # office-text element is closed.
    class OdtParser < Nokogiri::XML::SAX::Document
      # output          - object receiving the expanded text via #write and closed by #close
      # styles_xml_root - optional root node of an already parsed styles tree
      def initialize(output, styles_xml_root = nil)
        @styles_xml_root = styles_xml_root
        @output = output
        @automatic_styles = {}
      end

      # The first element seen becomes the tree root; every later element is
      # appended to the current node and becomes the new current node.
      def start_element_namespace(name, attrs = [], prefix = nil, uri = nil, ns = [])
        if @xml_root
          new_node = Odt::XmlNodes::Node.create_node prefix, name, @current_node, attrs, self
          @current_node.children << new_node
          @current_node = new_node
        else
          @xml_root = @current_node = Odt::XmlNodes::Node.create_node prefix, name, nil, attrs, self
        end
      end

      # Flushes the closed node when its parent answers office_text?, then
      # moves back up to the parent.
      def end_element_namespace(name, prefix = nil, uri = nil)
        parent = @current_node.parent
        if parent && parent.office_text?
          @output.write @current_node.expand
          @current_node.delete
        end
        @current_node = parent
      end

      # Adds a text node to the current element; whitespace-only strings are skipped.
      def characters(string)
        return if string.strip.empty?

        @current_node.children << Odt::XmlNodes::PlainText.new(string)
      end

      # Closes the output object.
      def close
        @output.close
      end

      # Debug helper: prints +node+ and all of its descendants, depth first.
      def print_tree(node)
        puts node
        node.children.each { |child| print_tree child }
      end

      # Select nodes xpath style
      # - supports selecting from the root node
      #
      # +string+ holds one or more absolute paths ("/a:b/c:d") separated by
      # '|'. Both the content tree and, when present, the styles tree are
      # searched. Raises Doc2Text::XmlError for any other syntax.
      def xpath(string)
        patterns = string.split '|'
        raise Doc2Text::XmlError, 'it does not support this xpath syntax' if patterns.empty?

        patterns.inject([]) do |result, pattern|
          # \A/\z anchor the whole pattern; ^/$ would accept a string whose
          # first line merely looks like a path.
          unless /\A(?:\/[\w:\-]+)+\z/ =~ pattern
            raise Doc2Text::XmlError, 'it does not support this xpath syntax'
          end

          path = pattern.scan(/[\w:\-]+/)
          result + xpath_search_nodes(path, @xml_root) + xpath_search_nodes(path, @styles_xml_root)
        end
      end

      # Walks +path+ (an array of "prefix:name" strings) down from +xml_root+
      # and returns the nodes matching the last step. Returns [] when there is
      # no root to search.
      def xpath_search_nodes(path, xml_root)
        return [] unless xml_root

        seek_nodes = [xml_root]
        last_index = path.length - 1
        path.each_with_index do |xml_name, index|
          seek_nodes.select! { |node| node.xml_name == xml_name }
          seek_nodes = seek_nodes.map(&:children).flatten unless index == last_index
          break if seek_nodes.empty?
        end
        seek_nodes
      end

      # Lazily created logger writing to STDOUT.
      def logger
        @logger ||= Logger.new(STDOUT)
      end
    end
  end
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
module Doc2Text
  module Odt
    # An ODT file handled through the shared XmlBasedDocument::DocumentFile API.
    class Document < XmlBasedDocument::DocumentFile
      class << self
        # Unpacks +input+, parses it and writes the result to +output_filename+.
        # The parser (and with it the output file) is closed once parsing
        # finishes or fails, and the document's clean step always runs last.
        def parse_and_save(input, output_filename)
          document = new(input)
          begin
            document.unpack
            styles_root = document.parse_styles
            sink = File.open(output_filename, 'w')
            parser = Markdown::OdtParser.new(sink, styles_root)
            begin
              document.parse(parser)
            ensure
              parser.close
            end
          ensure
            document.clean
          end
        end
      end

      # Name fragment identifying ODT extraction directories.
      def extract_extension
        'unpacked_odt'
      end

      # Runs a SAX parse of 'styles.xml' and returns the resulting root node.
      def parse_styles
        handler = Doc2Text::Odt::StylesParser.new
        sax = Nokogiri::XML::SAX::Parser.new(handler)
        sax.parse(open('styles.xml'))
        handler.xml_root
      end

      # Feeds 'content.xml' through a SAX parser driven by +markdown+.
      def parse(markdown)
        sax = Nokogiri::XML::SAX::Parser.new(markdown) # { |config| config.strict}
        sax.parse(open('content.xml'))
      end

      # True when both 'content.xml' and 'mimetype' exist under the extraction path.
      def contains_extracted_files?
        required = [File.join(extract_path, 'content.xml'), File.join(extract_path, 'mimetype')]
        required.all? { |path| File.exist?(path) }
      end
    end
  end
end
|