doc2text 0.3.3 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/bin/doc2text +43 -0
- data/lib/doc2text.rb +11 -5
- data/lib/doc2text/docx/docx.rb +31 -0
- data/lib/doc2text/docx/docx_xml_namespaces.rb +55 -0
- data/lib/doc2text/docx/markdown_docx_parser.rb +81 -0
- data/lib/doc2text/{odt_xml_node.rb → generic_xml_nodes.rb} +11 -32
- data/lib/doc2text/{markdown_odt_parser.rb → odt/markdown_odt_parser.rb} +0 -0
- data/lib/doc2text/odt/odt.rb +43 -0
- data/lib/doc2text/{odt_xml_namespaces.rb → odt/odt_xml_namespaces.rb} +83 -64
- data/lib/doc2text/resolution.rb +2 -7
- data/lib/doc2text/styles_parser.rb +2 -2
- data/lib/doc2text/xml_based_document_file.rb +48 -0
- metadata +22 -18
- data/lib/doc2text/odt.rb +0 -80
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 7b9b5aeaa63d276696f0f4f716242b181d8d3aef2e47861053c03f9623cdf498
|
4
|
+
data.tar.gz: a9ac2a3e0314334dda782f8ce8ef0d5a0691015ae70da0cc7a7fd79b2d6d7cd2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 88fcdc3ade712a061c62641dd7713274c676f41d37c7020348ad401e0e7af3a86b07a3726a057870396ee68a290867fccf635d3191c8376b45850507e2f566e9
|
7
|
+
data.tar.gz: a96c1f4cbfbb42079f5e5d6eea757531d7d7e852e01724c40cc58f94ee5ebd27e700bdffc221cc7c7202a3686a6f3c40d8a5e153f07dcdca7aa4fd542b13eac9
|
data/bin/doc2text
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
$LOAD_PATH.unshift File.dirname(__FILE__) + '/../lib'
|
4
|
+
require 'doc2text'
|
5
|
+
require 'optparse'
|
6
|
+
|
7
|
+
options = {}
|
8
|
+
opt_parse = OptionParser.new do |opts|
|
9
|
+
opts.banner = "Usage: doc2text document.odt output.md
|
10
|
+
or: doc2text [OPTION]... -s input[.odt] -o output[.md]\n\n"
|
11
|
+
|
12
|
+
opts.on('-s FILE', '--source FILE', 'Odt FILE document to be processed') do |file|
|
13
|
+
options[:source] = file
|
14
|
+
end
|
15
|
+
|
16
|
+
opts.on('-o FILE', '--output FILE', 'Output to markdown document FILE') do |file|
|
17
|
+
options[:output] = file
|
18
|
+
end
|
19
|
+
|
20
|
+
opts.on_tail('-h', '--help', 'Show this message') do
|
21
|
+
puts opts
|
22
|
+
exit
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
begin
|
27
|
+
opt_parse.parse!
|
28
|
+
if options.empty?
|
29
|
+
if ARGV.size == 2
|
30
|
+
options[:source], options[:output] = *ARGV
|
31
|
+
else
|
32
|
+
puts opt_parse
|
33
|
+
exit
|
34
|
+
end
|
35
|
+
end
|
36
|
+
rescue OptionParser::InvalidOption, OptionParser::MissingArgument
|
37
|
+
puts $!.to_s
|
38
|
+
puts opt_parse
|
39
|
+
exit
|
40
|
+
end
|
41
|
+
|
42
|
+
|
43
|
+
Doc2Text::Resolution.parse_and_save options[:source], options[:output]
|
data/lib/doc2text.rb
CHANGED
@@ -2,11 +2,17 @@ require 'nokogiri'
|
|
2
2
|
#require 'nokogiri/xml'
|
3
3
|
require 'fileutils'
|
4
4
|
|
5
|
+
require 'doc2text/xml_based_document_file'
|
6
|
+
require 'doc2text/generic_xml_nodes'
|
5
7
|
require 'doc2text/resolution'
|
6
|
-
require 'doc2text/odt'
|
7
|
-
require 'doc2text/odt_xml_node'
|
8
|
-
require 'doc2text/odt_xml_namespaces'
|
9
|
-
require 'doc2text/markdown_odt_parser'
|
10
8
|
require 'doc2text/errors'
|
11
9
|
|
12
|
-
require 'doc2text/
|
10
|
+
require 'doc2text/odt/odt'
|
11
|
+
require 'doc2text/odt/odt_xml_namespaces'
|
12
|
+
require 'doc2text/odt/markdown_odt_parser'
|
13
|
+
|
14
|
+
require 'doc2text/docx/docx'
|
15
|
+
require 'doc2text/docx/markdown_docx_parser'
|
16
|
+
require 'doc2text/docx/docx_xml_namespaces'
|
17
|
+
|
18
|
+
require 'doc2text/styles_parser'
|
@@ -0,0 +1,31 @@
|
|
1
|
+
module Doc2Text
|
2
|
+
module Docx
|
3
|
+
class Document < XmlBasedDocument::DocumentFile
|
4
|
+
|
5
|
+
def self.parse_and_save(input, output_filename)
|
6
|
+
docx = new input
|
7
|
+
begin
|
8
|
+
docx.unpack
|
9
|
+
styles_xml_root = docx.parse_styles
|
10
|
+
output = File.open output_filename, 'w'
|
11
|
+
markdown = Markdown::DocxParser.new output, styles_xml_root
|
12
|
+
begin
|
13
|
+
docx.parse markdown
|
14
|
+
ensure
|
15
|
+
markdown.close
|
16
|
+
end
|
17
|
+
ensure
|
18
|
+
docx.clean
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
def contains_extracted_files?
|
23
|
+
File.exist? File.join(extract_path, '[Content_Types].xml')
|
24
|
+
end
|
25
|
+
|
26
|
+
def extract_extension
|
27
|
+
'unpacked_docx'
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
31
|
+
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
module Doc2Text
|
2
|
+
module Docx
|
3
|
+
module XmlNodes
|
4
|
+
class Node < XmlBasedDocument::XmlNodes::Node
|
5
|
+
def self.create_node(prefix, name, parent = nil, attrs = [], markdown_odt_parser = nil)
|
6
|
+
begin
|
7
|
+
clazz = XmlNodes.const_get "#{prefix.capitalize}::W#{name}"
|
8
|
+
rescue NameError => e
|
9
|
+
# markdown_odt_parser.logger.warn "No such <#{prefix}:#{name}> found"
|
10
|
+
Generic.new(parent, attrs, prefix, name, markdown_odt_parser)
|
11
|
+
else
|
12
|
+
clazz.new(parent, attrs, prefix, name, markdown_odt_parser)
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
def body?
|
17
|
+
false
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
class PlainText < XmlBasedDocument::XmlNodes::PlainText
|
22
|
+
def body?
|
23
|
+
false
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
class Generic < Node
|
28
|
+
end
|
29
|
+
|
30
|
+
module W
|
31
|
+
class Wbody < Node
|
32
|
+
def body?
|
33
|
+
true
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
class Wbr < Node
|
38
|
+
def open
|
39
|
+
'<br/>'
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
class Wp < Node
|
44
|
+
def open
|
45
|
+
"\n"
|
46
|
+
end
|
47
|
+
|
48
|
+
def close
|
49
|
+
"\n"
|
50
|
+
end
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
54
|
+
end
|
55
|
+
end
|
@@ -0,0 +1,81 @@
|
|
1
|
+
require 'logger'
|
2
|
+
|
3
|
+
module Doc2Text
|
4
|
+
module Markdown
|
5
|
+
class DocxParser < Nokogiri::XML::SAX::Document
|
6
|
+
def initialize(output, styles_xml_root = nil)
|
7
|
+
@styles_xml_root = styles_xml_root
|
8
|
+
@output = output
|
9
|
+
@automatic_styles = {}
|
10
|
+
end
|
11
|
+
|
12
|
+
def start_element_namespace(name ,attrs = [], prefix = nil, uri = nil, ns = [])
|
13
|
+
unless @xml_root
|
14
|
+
@xml_root = @current_node = Docx::XmlNodes::Node.create_node prefix, name, nil, attrs, self
|
15
|
+
else
|
16
|
+
new_node = Docx::XmlNodes::Node.create_node prefix, name, @current_node, attrs, self
|
17
|
+
@current_node.children << new_node
|
18
|
+
@current_node = new_node
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
def end_element_namespace(name, prefix = nil, uri = nil)
|
23
|
+
if @current_node.parent and @current_node.parent.body?
|
24
|
+
@output << @current_node.expand
|
25
|
+
@current_node.delete
|
26
|
+
end
|
27
|
+
@current_node = @current_node.parent
|
28
|
+
end
|
29
|
+
|
30
|
+
def characters(string)
|
31
|
+
unless string.strip.empty?
|
32
|
+
plain_text = Docx::XmlNodes::PlainText.new(string)
|
33
|
+
@current_node.children << plain_text
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
def close
|
38
|
+
@output.close
|
39
|
+
end
|
40
|
+
|
41
|
+
def print_tree(node)
|
42
|
+
puts node
|
43
|
+
node.children.each do |child|
|
44
|
+
print_tree child
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
# Select nodes xpath style
|
49
|
+
# - supports selecting from the root node
|
50
|
+
def xpath(string)
|
51
|
+
patterns = string.split '|'
|
52
|
+
raise Doc2Text::XmlError, 'it does not support this xpath syntax' if patterns.length == 0
|
53
|
+
result = []
|
54
|
+
patterns.each do |pattern|
|
55
|
+
if /^(\/[\w:\-]+)+$/ =~ pattern
|
56
|
+
path = pattern.scan /[\w:\-]+/
|
57
|
+
result += xpath_search_nodes(path, @xml_root)
|
58
|
+
result += xpath_search_nodes(path, @styles_xml_root) if @styles_xml_root
|
59
|
+
else
|
60
|
+
raise Doc2Text::XmlError, 'it does not support this xpath syntax'
|
61
|
+
end
|
62
|
+
end
|
63
|
+
result
|
64
|
+
end
|
65
|
+
|
66
|
+
def xpath_search_nodes(path, xml_root)
|
67
|
+
seek_nodes = [xml_root]
|
68
|
+
path.each_with_index do |xml_name, index|
|
69
|
+
seek_nodes.select! { |node| node.xml_name == xml_name }
|
70
|
+
seek_nodes = seek_nodes.map(&:children).flatten unless index == path.length - 1
|
71
|
+
break if seek_nodes.empty?
|
72
|
+
end
|
73
|
+
seek_nodes
|
74
|
+
end
|
75
|
+
|
76
|
+
def logger
|
77
|
+
@logger ||= Logger.new(STDOUT)
|
78
|
+
end
|
79
|
+
end
|
80
|
+
end
|
81
|
+
end
|
@@ -1,25 +1,16 @@
|
|
1
1
|
module Doc2Text
|
2
|
-
module
|
2
|
+
module XmlBasedDocument
|
3
3
|
module XmlNodes
|
4
|
-
|
4
|
+
class Node
|
5
5
|
attr_reader :parent, :children, :attrs, :prefix, :name
|
6
6
|
attr_accessor :text
|
7
7
|
|
8
|
-
def self.
|
9
|
-
|
10
|
-
|
11
|
-
rescue NameError => e
|
12
|
-
# markdown_odt_parser.logger.warn "No such <#{prefix}:#{name}> found"
|
13
|
-
Generic.new(parent, attrs, prefix, name, markdown_odt_parser)
|
14
|
-
else
|
15
|
-
clazz.new(parent, attrs, prefix, name, markdown_odt_parser)
|
8
|
+
def self.inherited(subclass)
|
9
|
+
def subclass.titleize(tag)
|
10
|
+
tag.split('-').map(&:capitalize).join
|
16
11
|
end
|
17
12
|
end
|
18
13
|
|
19
|
-
def self.titleize(tag)
|
20
|
-
tag.split('-').map(&:capitalize).join
|
21
|
-
end
|
22
|
-
|
23
14
|
def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_odt_parser = nil)
|
24
15
|
@parent, @attrs, @prefix, @name, @xml_parser = parent, attrs, prefix, name, markdown_odt_parser
|
25
16
|
@children = []
|
@@ -42,10 +33,6 @@ module Doc2Text
|
|
42
33
|
''
|
43
34
|
end
|
44
35
|
|
45
|
-
def office_text?
|
46
|
-
false
|
47
|
-
end
|
48
|
-
|
49
36
|
def delete
|
50
37
|
return true unless @children
|
51
38
|
@children.each { |child| child.delete }
|
@@ -74,24 +61,16 @@ module Doc2Text
|
|
74
61
|
delete
|
75
62
|
expanded.clone
|
76
63
|
end
|
64
|
+
end
|
77
65
|
|
78
|
-
|
79
|
-
!root? && parent.class.not_enclosing_tags && parent.class.not_enclosing_tags.find do |tag|
|
80
|
-
@prefix == parent.prefix && @name == tag
|
81
|
-
end
|
82
|
-
end
|
66
|
+
class PlainText < Node
|
83
67
|
|
84
|
-
|
85
|
-
base.extend ClassMethods
|
86
|
-
end
|
68
|
+
attr_accessor :text
|
87
69
|
|
88
|
-
|
89
|
-
attr_reader :not_enclosing_tags
|
70
|
+
alias_method :expand, :text
|
90
71
|
|
91
|
-
|
92
|
-
|
93
|
-
@not_enclosing_tags << tag
|
94
|
-
end
|
72
|
+
def initialize(text)
|
73
|
+
@text = text
|
95
74
|
end
|
96
75
|
end
|
97
76
|
end
|
File without changes
|
@@ -0,0 +1,43 @@
|
|
1
|
+
module Doc2Text
|
2
|
+
module Odt
|
3
|
+
class Document < XmlBasedDocument::DocumentFile
|
4
|
+
|
5
|
+
def extract_extension
|
6
|
+
'unpacked_odt'
|
7
|
+
end
|
8
|
+
|
9
|
+
def self.parse_and_save(input, output_filename)
|
10
|
+
odt = new input
|
11
|
+
begin
|
12
|
+
odt.unpack
|
13
|
+
styles_xml_root = odt.parse_styles
|
14
|
+
output = File.open output_filename, 'w'
|
15
|
+
markdown = Markdown::OdtParser.new output, styles_xml_root
|
16
|
+
begin
|
17
|
+
odt.parse markdown
|
18
|
+
ensure
|
19
|
+
markdown.close
|
20
|
+
end
|
21
|
+
ensure
|
22
|
+
odt.clean
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
def parse_styles
|
27
|
+
styles_parser = Doc2Text::Odt::StylesParser.new
|
28
|
+
xml = Nokogiri::XML::SAX::Parser.new(styles_parser)
|
29
|
+
xml.parse open 'styles.xml'
|
30
|
+
styles_parser.xml_root
|
31
|
+
end
|
32
|
+
|
33
|
+
def parse(markdown)
|
34
|
+
parser = Nokogiri::XML::SAX::Parser.new(markdown) # { |config| config.strict}
|
35
|
+
parser.parse open 'content.xml'
|
36
|
+
end
|
37
|
+
|
38
|
+
def contains_extracted_files?
|
39
|
+
[File.join(extract_path, 'content.xml'), File.join(extract_path, 'mimetype')].all? { |file| File.exist?(file) }
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
43
|
+
end
|
@@ -1,37 +1,40 @@
|
|
1
1
|
module Doc2Text
|
2
2
|
module Odt
|
3
3
|
module XmlNodes
|
4
|
-
class
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
4
|
+
class Node < XmlBasedDocument::XmlNodes::Node
|
5
|
+
def self.create_node(prefix, name, parent = nil, attrs = [], markdown_odt_parser = nil)
|
6
|
+
begin
|
7
|
+
clazz = XmlNodes.const_get "#{titleize prefix}::#{titleize name}"
|
8
|
+
rescue NameError => e
|
9
|
+
# markdown_odt_parser.logger.warn "No such <#{prefix}:#{name}> found"
|
10
|
+
Generic.new(parent, attrs, prefix, name, markdown_odt_parser)
|
11
|
+
else
|
12
|
+
clazz.new(parent, attrs, prefix, name, markdown_odt_parser)
|
13
|
+
end
|
14
|
+
end
|
10
15
|
|
11
|
-
def
|
12
|
-
|
16
|
+
def office_text?
|
17
|
+
false
|
13
18
|
end
|
14
19
|
end
|
15
20
|
|
16
|
-
class
|
17
|
-
include Node
|
21
|
+
class PlainText < XmlBasedDocument::XmlNodes::PlainText
|
18
22
|
end
|
19
23
|
|
24
|
+
class Generic < Node
|
25
|
+
end
|
20
26
|
#
|
21
27
|
# These are the namespaces available in the open document format
|
22
28
|
# http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os.html
|
23
29
|
#
|
24
30
|
module Office
|
25
|
-
class AutomaticStyles
|
26
|
-
include Node
|
31
|
+
class AutomaticStyles < Node
|
27
32
|
end
|
28
33
|
|
29
|
-
class DocumentContent
|
30
|
-
include Node
|
34
|
+
class DocumentContent < Node
|
31
35
|
end
|
32
36
|
|
33
|
-
class Text
|
34
|
-
include Node
|
37
|
+
class Text < Node
|
35
38
|
|
36
39
|
def office_text?
|
37
40
|
true
|
@@ -52,8 +55,7 @@ module Doc2Text
|
|
52
55
|
module Presentation; end
|
53
56
|
module Script; end
|
54
57
|
module Table
|
55
|
-
class TableRow
|
56
|
-
include Node
|
58
|
+
class TableRow < Node
|
57
59
|
|
58
60
|
def expand
|
59
61
|
header_delimiter = parent.children.count >= 2 && parent.children[1] == self ? "\n|---|---|" : ''
|
@@ -63,8 +65,7 @@ module Doc2Text
|
|
63
65
|
end
|
64
66
|
end
|
65
67
|
|
66
|
-
class TableCell
|
67
|
-
include Node
|
68
|
+
class TableCell < Node
|
68
69
|
|
69
70
|
def open
|
70
71
|
' | '
|
@@ -72,12 +73,10 @@ module Doc2Text
|
|
72
73
|
end
|
73
74
|
end
|
74
75
|
module Style
|
75
|
-
class Style
|
76
|
-
include Node
|
76
|
+
class Style < Node
|
77
77
|
end
|
78
78
|
|
79
|
-
class TextProperties
|
80
|
-
include Node
|
79
|
+
class TextProperties < Node
|
81
80
|
end
|
82
81
|
end
|
83
82
|
module XslFoCompatible; end
|
@@ -86,48 +85,73 @@ module Doc2Text
|
|
86
85
|
module Of; end
|
87
86
|
|
88
87
|
module Text
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
88
|
+
def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_odt_parser = nil)
|
89
|
+
super parent, attrs, prefix, name
|
90
|
+
@xml_parser = markdown_odt_parser
|
91
|
+
style_index = attrs.index { |attr| attr.prefix == 'text' && attr.localname == 'style-name' }
|
92
|
+
@enclosing_style = []
|
93
|
+
if style_index and fetch_style?
|
94
|
+
elem_style = find_style attrs[style_index].value
|
95
|
+
fetch_style elem_style
|
96
|
+
end
|
97
97
|
end
|
98
|
-
end
|
99
98
|
|
100
|
-
|
101
|
-
|
102
|
-
|
99
|
+
def fetch_style?
|
100
|
+
true
|
101
|
+
end
|
103
102
|
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
103
|
+
def fetch_style(elem_style)
|
104
|
+
if elem_style
|
105
|
+
elem_style.children.select { |style_property| style_property.xml_name == 'style:text-properties' }.each { |text_property|
|
106
|
+
text_property.attrs.each { |attr|
|
107
|
+
if attr.prefix == 'style'
|
108
|
+
if attr.localname == 'font-style-complex' && attr.value == 'italic'
|
109
|
+
@enclosing_style << '_'
|
110
|
+
elsif attr.localname == 'font-weight-complex' && attr.value == 'bold'
|
111
|
+
@enclosing_style << '**'
|
112
|
+
end
|
113
113
|
end
|
114
|
-
|
114
|
+
}
|
115
115
|
}
|
116
|
-
|
116
|
+
end
|
117
|
+
end
|
118
|
+
|
119
|
+
def find_style(style_name)
|
120
|
+
styles = @xml_parser.xpath '/office:document-content/office:automatic-styles/style:style'
|
121
|
+
styles.find { |style| style.attrs.index { |attr| attr.prefix == 'style' && attr.localname == 'family' } &&
|
122
|
+
style.attrs.index { |attr| attr.prefix == 'style' && attr.localname == 'name' && attr.value == style_name } }
|
117
123
|
end
|
118
|
-
end
|
119
124
|
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
125
|
+
# http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#__RefHeading__1419212_253892949
|
126
|
+
class H < Node
|
127
|
+
include Text
|
128
|
+
def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_odt_parser = nil)
|
129
|
+
super parent, attrs, prefix, name, markdown_odt_parser
|
130
|
+
outline_level_index = attrs.index { |attr| attr.prefix == 'text' && attr.localname == 'outline-level' }
|
131
|
+
if outline_level_index and fetch_style?
|
132
|
+
@elem_outline_level = attrs[outline_level_index].value.to_i
|
133
|
+
else
|
134
|
+
@elem_outline_level = 0
|
135
|
+
end
|
136
|
+
|
137
|
+
end
|
138
|
+
|
139
|
+
def self.style_family
|
140
|
+
'paragraph'
|
141
|
+
end
|
142
|
+
|
143
|
+
def open
|
144
|
+
"\n#{'#' * @elem_outline_level} "
|
145
|
+
end
|
146
|
+
|
147
|
+
def close
|
148
|
+
"\n\n"
|
149
|
+
end
|
124
150
|
end
|
125
151
|
|
126
152
|
# http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#__RefHeading__1419256_253892949
|
127
|
-
class P
|
128
|
-
include Node
|
153
|
+
class P < Node
|
129
154
|
include Text
|
130
|
-
|
131
155
|
def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_odt_parser = nil)
|
132
156
|
super parent, attrs, prefix, name, markdown_odt_parser
|
133
157
|
end
|
@@ -145,8 +169,7 @@ module Doc2Text
|
|
145
169
|
end
|
146
170
|
end
|
147
171
|
|
148
|
-
class LineBreak
|
149
|
-
include Node
|
172
|
+
class LineBreak < Node
|
150
173
|
|
151
174
|
def open
|
152
175
|
'<br/>'
|
@@ -154,8 +177,7 @@ module Doc2Text
|
|
154
177
|
end
|
155
178
|
|
156
179
|
# http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#__RefHeading__1419264_253892949
|
157
|
-
class Span
|
158
|
-
include Node
|
180
|
+
class Span < Node
|
159
181
|
include Text
|
160
182
|
|
161
183
|
def self.style_family
|
@@ -178,10 +200,8 @@ module Doc2Text
|
|
178
200
|
end
|
179
201
|
|
180
202
|
# http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#__RefHeading__1415154_253892949
|
181
|
-
class ListItem
|
182
|
-
include Node
|
203
|
+
class ListItem < Node
|
183
204
|
include Text
|
184
|
-
|
185
205
|
def expand
|
186
206
|
result = "* #{@children.map(&:expand).join.strip.gsub /\n{2,}/, "\n"}\n"
|
187
207
|
delete
|
@@ -193,8 +213,7 @@ module Doc2Text
|
|
193
213
|
end
|
194
214
|
end
|
195
215
|
|
196
|
-
class List
|
197
|
-
include Node
|
216
|
+
class List < Node
|
198
217
|
include Text
|
199
218
|
|
200
219
|
def open
|
data/lib/doc2text/resolution.rb
CHANGED
@@ -2,13 +2,8 @@ module Doc2Text
|
|
2
2
|
class Resolution
|
3
3
|
def self.parse_and_save(source, output)
|
4
4
|
case File.extname source
|
5
|
-
when '.
|
6
|
-
|
7
|
-
File.basename(source, File.extname(source)) + '.odt')
|
8
|
-
system "soffice --headless --convert-to odt #{source} --outdir #{File.dirname output}"
|
9
|
-
source = mid_name
|
10
|
-
Doc2Text::Odt::Document.parse_and_save source, output
|
11
|
-
File.delete(mid_name)
|
5
|
+
when '.docx'
|
6
|
+
Doc2Text::Docx::Document.parse_and_save source, output
|
12
7
|
else
|
13
8
|
Doc2Text::Odt::Document.parse_and_save source, output
|
14
9
|
end
|
@@ -5,9 +5,9 @@ module Doc2Text
|
|
5
5
|
|
6
6
|
def start_element_namespace(name ,attrs = [], prefix = nil, uri = nil, ns = [])
|
7
7
|
unless @xml_root
|
8
|
-
@xml_root = @current_node = Odt::XmlNodes::Node.create_node prefix, name, nil, attrs, self
|
8
|
+
@xml_root = @current_node = Doc2Text::Odt::XmlNodes::Node.create_node prefix, name, nil, attrs, self
|
9
9
|
else
|
10
|
-
new_node = Odt::XmlNodes::Node.create_node prefix, name, @current_node, attrs, self
|
10
|
+
new_node = Doc2Text::Odt::XmlNodes::Node.create_node prefix, name, @current_node, attrs, self
|
11
11
|
@current_node.children << new_node
|
12
12
|
@current_node = new_node
|
13
13
|
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
require 'zip'
|
2
|
+
|
3
|
+
module Doc2Text
|
4
|
+
module XmlBasedDocument
|
5
|
+
class DocumentFile
|
6
|
+
def initialize(document_path)
|
7
|
+
@document_path = document_path
|
8
|
+
end
|
9
|
+
|
10
|
+
def unpack
|
11
|
+
Zip::File.open(@document_path) {
|
12
|
+
|zip_file|
|
13
|
+
Dir.mkdir(extract_path)
|
14
|
+
zip_file.each do |entry|
|
15
|
+
zipped_file_extract_path = File.join extract_path, entry.name
|
16
|
+
FileUtils.mkdir_p File.dirname(zipped_file_extract_path)
|
17
|
+
zip_file.extract entry, zipped_file_extract_path
|
18
|
+
end
|
19
|
+
}
|
20
|
+
end
|
21
|
+
|
22
|
+
def contains_extracted_files?
|
23
|
+
false
|
24
|
+
end
|
25
|
+
|
26
|
+
def clean
|
27
|
+
if File.exist?(extract_path) and contains_extracted_files?
|
28
|
+
FileUtils.rm_r extract_path
|
29
|
+
else
|
30
|
+
puts 'Failed to clean temp files'
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
# Open file from the current odt
|
35
|
+
def open(filename)
|
36
|
+
File.open File.join(extract_path, filename), 'r'
|
37
|
+
end
|
38
|
+
|
39
|
+
def extract_extension
|
40
|
+
'unpacked'
|
41
|
+
end
|
42
|
+
|
43
|
+
def extract_path
|
44
|
+
File.join File.dirname(@document_path), ".#{File.basename(@document_path)}_#{extract_extension}"
|
45
|
+
end
|
46
|
+
end
|
47
|
+
end
|
48
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: doc2text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Valentin Aitken
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2019-01-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -16,57 +16,62 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '1.
|
19
|
+
version: '1.8'
|
20
20
|
- - ">="
|
21
21
|
- !ruby/object:Gem::Version
|
22
|
-
version: 1.
|
22
|
+
version: 1.8.2
|
23
23
|
type: :runtime
|
24
24
|
prerelease: false
|
25
25
|
version_requirements: !ruby/object:Gem::Requirement
|
26
26
|
requirements:
|
27
27
|
- - "~>"
|
28
28
|
- !ruby/object:Gem::Version
|
29
|
-
version: '1.
|
29
|
+
version: '1.8'
|
30
30
|
- - ">="
|
31
31
|
- !ruby/object:Gem::Version
|
32
|
-
version: 1.
|
32
|
+
version: 1.8.2
|
33
33
|
- !ruby/object:Gem::Dependency
|
34
34
|
name: rubyzip
|
35
35
|
requirement: !ruby/object:Gem::Requirement
|
36
36
|
requirements:
|
37
37
|
- - "~>"
|
38
38
|
- !ruby/object:Gem::Version
|
39
|
-
version: '1.
|
39
|
+
version: '1.2'
|
40
40
|
- - ">="
|
41
41
|
- !ruby/object:Gem::Version
|
42
|
-
version: 1.
|
42
|
+
version: 1.2.2
|
43
43
|
type: :runtime
|
44
44
|
prerelease: false
|
45
45
|
version_requirements: !ruby/object:Gem::Requirement
|
46
46
|
requirements:
|
47
47
|
- - "~>"
|
48
48
|
- !ruby/object:Gem::Version
|
49
|
-
version: '1.
|
49
|
+
version: '1.2'
|
50
50
|
- - ">="
|
51
51
|
- !ruby/object:Gem::Version
|
52
|
-
version: 1.
|
52
|
+
version: 1.2.2
|
53
53
|
description: Parses odt to markdown
|
54
|
-
email:
|
54
|
+
email: valentin@nalisbg.com
|
55
55
|
executables: []
|
56
56
|
extensions: []
|
57
57
|
extra_rdoc_files: []
|
58
58
|
files:
|
59
|
+
- bin/doc2text
|
59
60
|
- lib/doc2text.rb
|
61
|
+
- lib/doc2text/docx/docx.rb
|
62
|
+
- lib/doc2text/docx/docx_xml_namespaces.rb
|
63
|
+
- lib/doc2text/docx/markdown_docx_parser.rb
|
60
64
|
- lib/doc2text/errors.rb
|
61
|
-
- lib/doc2text/
|
62
|
-
- lib/doc2text/odt.rb
|
63
|
-
- lib/doc2text/
|
64
|
-
- lib/doc2text/
|
65
|
+
- lib/doc2text/generic_xml_nodes.rb
|
66
|
+
- lib/doc2text/odt/markdown_odt_parser.rb
|
67
|
+
- lib/doc2text/odt/odt.rb
|
68
|
+
- lib/doc2text/odt/odt_xml_namespaces.rb
|
65
69
|
- lib/doc2text/resolution.rb
|
66
70
|
- lib/doc2text/styles_parser.rb
|
71
|
+
- lib/doc2text/xml_based_document_file.rb
|
67
72
|
homepage: http://doc2text.com
|
68
73
|
licenses:
|
69
|
-
-
|
74
|
+
- Apache-2.0
|
70
75
|
metadata: {}
|
71
76
|
post_install_message:
|
72
77
|
rdoc_options: []
|
@@ -84,9 +89,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
84
89
|
version: '0'
|
85
90
|
requirements: []
|
86
91
|
rubyforge_project:
|
87
|
-
rubygems_version: 2.
|
92
|
+
rubygems_version: 2.7.8
|
88
93
|
signing_key:
|
89
94
|
specification_version: 4
|
90
95
|
summary: Translates odt to markdown
|
91
96
|
test_files: []
|
92
|
-
has_rdoc:
|
data/lib/doc2text/odt.rb
DELETED
@@ -1,80 +0,0 @@
|
|
1
|
-
require 'zip'
|
2
|
-
|
3
|
-
module Doc2Text
|
4
|
-
module Odt
|
5
|
-
class Document
|
6
|
-
EXTRACT_EXTENSION = 'unpacked_odt'
|
7
|
-
|
8
|
-
def self.parse_and_save(input, output_filename)
|
9
|
-
odt = new input
|
10
|
-
begin
|
11
|
-
odt.unpack
|
12
|
-
styles_xml_root = odt.parse_styles
|
13
|
-
output = File.open output_filename, 'w'
|
14
|
-
markdown = Markdown::OdtParser.new output, styles_xml_root
|
15
|
-
begin
|
16
|
-
odt.parse markdown
|
17
|
-
ensure
|
18
|
-
markdown.close
|
19
|
-
end
|
20
|
-
ensure
|
21
|
-
odt.clean
|
22
|
-
end
|
23
|
-
end
|
24
|
-
|
25
|
-
def parse_styles
|
26
|
-
styles_parser = Doc2Text::Odt::StylesParser.new
|
27
|
-
xml = Nokogiri::XML::SAX::Parser.new(styles_parser)
|
28
|
-
xml.parse open 'styles.xml'
|
29
|
-
styles_parser.xml_root
|
30
|
-
end
|
31
|
-
|
32
|
-
def parse(markdown)
|
33
|
-
parser = Nokogiri::XML::SAX::Parser.new(markdown) # { |config| config.strict}
|
34
|
-
parser.parse open 'content.xml'
|
35
|
-
end
|
36
|
-
|
37
|
-
def initialize(document_path)
|
38
|
-
@document_path = document_path
|
39
|
-
end
|
40
|
-
|
41
|
-
def unpack
|
42
|
-
Zip::File.open(@document_path) {
|
43
|
-
|zip_file|
|
44
|
-
Dir.mkdir(extract_path)
|
45
|
-
zip_file.each do |entry|
|
46
|
-
zipped_file_extract_path = File.join extract_path, entry.name
|
47
|
-
FileUtils.mkdir_p File.dirname(zipped_file_extract_path)
|
48
|
-
zip_file.extract entry, zipped_file_extract_path
|
49
|
-
end
|
50
|
-
}
|
51
|
-
end
|
52
|
-
|
53
|
-
def clean
|
54
|
-
if [extract_path, File.join(extract_path, 'content.xml'), File.join(extract_path, 'mimetype')].all? { |file| File.exist?(file) }
|
55
|
-
FileUtils.rm_r extract_path
|
56
|
-
end
|
57
|
-
end
|
58
|
-
|
59
|
-
# Open file from the current odt
|
60
|
-
def open(filename)
|
61
|
-
File.open File.join(extract_path, filename), 'r'
|
62
|
-
end
|
63
|
-
|
64
|
-
# Parse xml file from the current odt
|
65
|
-
def xml_file(filename, rood_node_name)
|
66
|
-
Nokogiri::XML::Document.parse(open(filename)) { |config| config.strict }
|
67
|
-
root_node = doc.root
|
68
|
-
if root_node.name != rood_node_name or root_node.namespace.prefix != 'office'
|
69
|
-
raise XmlError, 'Document does not have correct root element'
|
70
|
-
else
|
71
|
-
open(filename)
|
72
|
-
end
|
73
|
-
end
|
74
|
-
|
75
|
-
def extract_path
|
76
|
-
File.join File.dirname(@document_path), ".#{File.basename(@document_path)}_#{EXTRACT_EXTENSION}"
|
77
|
-
end
|
78
|
-
end
|
79
|
-
end
|
80
|
-
end
|