doc2text 0.3.3 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/bin/doc2text +43 -0
- data/lib/doc2text.rb +11 -5
- data/lib/doc2text/docx/docx.rb +31 -0
- data/lib/doc2text/docx/docx_xml_namespaces.rb +55 -0
- data/lib/doc2text/docx/markdown_docx_parser.rb +81 -0
- data/lib/doc2text/{odt_xml_node.rb → generic_xml_nodes.rb} +11 -32
- data/lib/doc2text/{markdown_odt_parser.rb → odt/markdown_odt_parser.rb} +0 -0
- data/lib/doc2text/odt/odt.rb +43 -0
- data/lib/doc2text/{odt_xml_namespaces.rb → odt/odt_xml_namespaces.rb} +83 -64
- data/lib/doc2text/resolution.rb +2 -7
- data/lib/doc2text/styles_parser.rb +2 -2
- data/lib/doc2text/xml_based_document_file.rb +48 -0
- metadata +22 -18
- data/lib/doc2text/odt.rb +0 -80
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 7b9b5aeaa63d276696f0f4f716242b181d8d3aef2e47861053c03f9623cdf498
|
4
|
+
data.tar.gz: a9ac2a3e0314334dda782f8ce8ef0d5a0691015ae70da0cc7a7fd79b2d6d7cd2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 88fcdc3ade712a061c62641dd7713274c676f41d37c7020348ad401e0e7af3a86b07a3726a057870396ee68a290867fccf635d3191c8376b45850507e2f566e9
|
7
|
+
data.tar.gz: a96c1f4cbfbb42079f5e5d6eea757531d7d7e852e01724c40cc58f94ee5ebd27e700bdffc221cc7c7202a3686a6f3c40d8a5e153f07dcdca7aa4fd542b13eac9
|
data/bin/doc2text
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
$LOAD_PATH.unshift File.dirname(__FILE__) + '/../lib'
|
4
|
+
require 'doc2text'
|
5
|
+
require 'optparse'
|
6
|
+
|
7
|
+
options = {}
|
8
|
+
opt_parse = OptionParser.new do |opts|
|
9
|
+
opts.banner = "Usage: doc2text document.odt output.md
|
10
|
+
or: doc2text [OPTION]... -s input[.odt] -o output[.md]\n\n"
|
11
|
+
|
12
|
+
opts.on('-s FILE', '--source FILE', 'Odt FILE document to be processed') do |file|
|
13
|
+
options[:source] = file
|
14
|
+
end
|
15
|
+
|
16
|
+
opts.on('-o FILE', '--output FILE', 'Output to markdown document FILE') do |file|
|
17
|
+
options[:output] = file
|
18
|
+
end
|
19
|
+
|
20
|
+
opts.on_tail('-h', '--help', 'Show this message') do
|
21
|
+
puts opts
|
22
|
+
exit
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
begin
|
27
|
+
opt_parse.parse!
|
28
|
+
if options.empty?
|
29
|
+
if ARGV.size == 2
|
30
|
+
options[:source], options[:output] = *ARGV
|
31
|
+
else
|
32
|
+
puts opt_parse
|
33
|
+
exit
|
34
|
+
end
|
35
|
+
end
|
36
|
+
rescue OptionParser::InvalidOption, OptionParser::MissingArgument
|
37
|
+
puts $!.to_s
|
38
|
+
puts opt_parse
|
39
|
+
exit
|
40
|
+
end
|
41
|
+
|
42
|
+
|
43
|
+
Doc2Text::Resolution.parse_and_save options[:source], options[:output]
|
data/lib/doc2text.rb
CHANGED
@@ -2,11 +2,17 @@ require 'nokogiri'
|
|
2
2
|
#require 'nokogiri/xml'
|
3
3
|
require 'fileutils'
|
4
4
|
|
5
|
+
require 'doc2text/xml_based_document_file'
|
6
|
+
require 'doc2text/generic_xml_nodes'
|
5
7
|
require 'doc2text/resolution'
|
6
|
-
require 'doc2text/odt'
|
7
|
-
require 'doc2text/odt_xml_node'
|
8
|
-
require 'doc2text/odt_xml_namespaces'
|
9
|
-
require 'doc2text/markdown_odt_parser'
|
10
8
|
require 'doc2text/errors'
|
11
9
|
|
12
|
-
require 'doc2text/
|
10
|
+
require 'doc2text/odt/odt'
|
11
|
+
require 'doc2text/odt/odt_xml_namespaces'
|
12
|
+
require 'doc2text/odt/markdown_odt_parser'
|
13
|
+
|
14
|
+
require 'doc2text/docx/docx'
|
15
|
+
require 'doc2text/docx/markdown_docx_parser'
|
16
|
+
require 'doc2text/docx/docx_xml_namespaces'
|
17
|
+
|
18
|
+
require 'doc2text/styles_parser'
|
@@ -0,0 +1,31 @@
|
|
1
|
+
module Doc2Text
|
2
|
+
module Docx
|
3
|
+
class Document < XmlBasedDocument::DocumentFile
|
4
|
+
|
5
|
+
def self.parse_and_save(input, output_filename)
|
6
|
+
docx = new input
|
7
|
+
begin
|
8
|
+
docx.unpack
|
9
|
+
styles_xml_root = docx.parse_styles
|
10
|
+
output = File.open output_filename, 'w'
|
11
|
+
markdown = Markdown::DocxParser.new output, styles_xml_root
|
12
|
+
begin
|
13
|
+
docx.parse markdown
|
14
|
+
ensure
|
15
|
+
markdown.close
|
16
|
+
end
|
17
|
+
ensure
|
18
|
+
docx.clean
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
def contains_extracted_files?
|
23
|
+
File.exist? File.join(extract_path, '[Content_Types].xml')
|
24
|
+
end
|
25
|
+
|
26
|
+
def extract_extension
|
27
|
+
'unpacked_docx'
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
31
|
+
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
module Doc2Text
|
2
|
+
module Docx
|
3
|
+
module XmlNodes
|
4
|
+
class Node < XmlBasedDocument::XmlNodes::Node
|
5
|
+
def self.create_node(prefix, name, parent = nil, attrs = [], markdown_odt_parser = nil)
|
6
|
+
begin
|
7
|
+
clazz = XmlNodes.const_get "#{prefix.capitalize}::W#{name}"
|
8
|
+
rescue NameError => e
|
9
|
+
# markdown_odt_parser.logger.warn "No such <#{prefix}:#{name}> found"
|
10
|
+
Generic.new(parent, attrs, prefix, name, markdown_odt_parser)
|
11
|
+
else
|
12
|
+
clazz.new(parent, attrs, prefix, name, markdown_odt_parser)
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
def body?
|
17
|
+
false
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
class PlainText < XmlBasedDocument::XmlNodes::PlainText
|
22
|
+
def body?
|
23
|
+
false
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
class Generic < Node
|
28
|
+
end
|
29
|
+
|
30
|
+
module W
|
31
|
+
class Wbody < Node
|
32
|
+
def body?
|
33
|
+
true
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
class Wbr < Node
|
38
|
+
def open
|
39
|
+
'<br/>'
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
class Wp < Node
|
44
|
+
def open
|
45
|
+
"\n"
|
46
|
+
end
|
47
|
+
|
48
|
+
def close
|
49
|
+
"\n"
|
50
|
+
end
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
54
|
+
end
|
55
|
+
end
|
@@ -0,0 +1,81 @@
|
|
1
|
+
require 'logger'
|
2
|
+
|
3
|
+
module Doc2Text
|
4
|
+
module Markdown
|
5
|
+
class DocxParser < Nokogiri::XML::SAX::Document
|
6
|
+
def initialize(output, styles_xml_root = nil)
|
7
|
+
@styles_xml_root = styles_xml_root
|
8
|
+
@output = output
|
9
|
+
@automatic_styles = {}
|
10
|
+
end
|
11
|
+
|
12
|
+
def start_element_namespace(name ,attrs = [], prefix = nil, uri = nil, ns = [])
|
13
|
+
unless @xml_root
|
14
|
+
@xml_root = @current_node = Docx::XmlNodes::Node.create_node prefix, name, nil, attrs, self
|
15
|
+
else
|
16
|
+
new_node = Docx::XmlNodes::Node.create_node prefix, name, @current_node, attrs, self
|
17
|
+
@current_node.children << new_node
|
18
|
+
@current_node = new_node
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
def end_element_namespace(name, prefix = nil, uri = nil)
|
23
|
+
if @current_node.parent and @current_node.parent.body?
|
24
|
+
@output << @current_node.expand
|
25
|
+
@current_node.delete
|
26
|
+
end
|
27
|
+
@current_node = @current_node.parent
|
28
|
+
end
|
29
|
+
|
30
|
+
def characters(string)
|
31
|
+
unless string.strip.empty?
|
32
|
+
plain_text = Docx::XmlNodes::PlainText.new(string)
|
33
|
+
@current_node.children << plain_text
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
def close
|
38
|
+
@output.close
|
39
|
+
end
|
40
|
+
|
41
|
+
def print_tree(node)
|
42
|
+
puts node
|
43
|
+
node.children.each do |child|
|
44
|
+
print_tree child
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
# Select nodes xpath style
|
49
|
+
# - supports selecting from the root node
|
50
|
+
def xpath(string)
|
51
|
+
patterns = string.split '|'
|
52
|
+
raise Doc2Text::XmlError, 'it does not support this xpath syntax' if patterns.length == 0
|
53
|
+
result = []
|
54
|
+
patterns.each do |pattern|
|
55
|
+
if /^(\/[\w:\-]+)+$/ =~ pattern
|
56
|
+
path = pattern.scan /[\w:\-]+/
|
57
|
+
result += xpath_search_nodes(path, @xml_root)
|
58
|
+
result += xpath_search_nodes(path, @styles_xml_root) if @styles_xml_root
|
59
|
+
else
|
60
|
+
raise Doc2Text::XmlError, 'it does not support this xpath syntax'
|
61
|
+
end
|
62
|
+
end
|
63
|
+
result
|
64
|
+
end
|
65
|
+
|
66
|
+
def xpath_search_nodes(path, xml_root)
|
67
|
+
seek_nodes = [xml_root]
|
68
|
+
path.each_with_index do |xml_name, index|
|
69
|
+
seek_nodes.select! { |node| node.xml_name == xml_name }
|
70
|
+
seek_nodes = seek_nodes.map(&:children).flatten unless index == path.length - 1
|
71
|
+
break if seek_nodes.empty?
|
72
|
+
end
|
73
|
+
seek_nodes
|
74
|
+
end
|
75
|
+
|
76
|
+
def logger
|
77
|
+
@logger ||= Logger.new(STDOUT)
|
78
|
+
end
|
79
|
+
end
|
80
|
+
end
|
81
|
+
end
|
@@ -1,25 +1,16 @@
|
|
1
1
|
module Doc2Text
|
2
|
-
module
|
2
|
+
module XmlBasedDocument
|
3
3
|
module XmlNodes
|
4
|
-
|
4
|
+
class Node
|
5
5
|
attr_reader :parent, :children, :attrs, :prefix, :name
|
6
6
|
attr_accessor :text
|
7
7
|
|
8
|
-
def self.
|
9
|
-
|
10
|
-
|
11
|
-
rescue NameError => e
|
12
|
-
# markdown_odt_parser.logger.warn "No such <#{prefix}:#{name}> found"
|
13
|
-
Generic.new(parent, attrs, prefix, name, markdown_odt_parser)
|
14
|
-
else
|
15
|
-
clazz.new(parent, attrs, prefix, name, markdown_odt_parser)
|
8
|
+
def self.inherited(subclass)
|
9
|
+
def subclass.titleize(tag)
|
10
|
+
tag.split('-').map(&:capitalize).join
|
16
11
|
end
|
17
12
|
end
|
18
13
|
|
19
|
-
def self.titleize(tag)
|
20
|
-
tag.split('-').map(&:capitalize).join
|
21
|
-
end
|
22
|
-
|
23
14
|
def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_odt_parser = nil)
|
24
15
|
@parent, @attrs, @prefix, @name, @xml_parser = parent, attrs, prefix, name, markdown_odt_parser
|
25
16
|
@children = []
|
@@ -42,10 +33,6 @@ module Doc2Text
|
|
42
33
|
''
|
43
34
|
end
|
44
35
|
|
45
|
-
def office_text?
|
46
|
-
false
|
47
|
-
end
|
48
|
-
|
49
36
|
def delete
|
50
37
|
return true unless @children
|
51
38
|
@children.each { |child| child.delete }
|
@@ -74,24 +61,16 @@ module Doc2Text
|
|
74
61
|
delete
|
75
62
|
expanded.clone
|
76
63
|
end
|
64
|
+
end
|
77
65
|
|
78
|
-
|
79
|
-
!root? && parent.class.not_enclosing_tags && parent.class.not_enclosing_tags.find do |tag|
|
80
|
-
@prefix == parent.prefix && @name == tag
|
81
|
-
end
|
82
|
-
end
|
66
|
+
class PlainText < Node
|
83
67
|
|
84
|
-
|
85
|
-
base.extend ClassMethods
|
86
|
-
end
|
68
|
+
attr_accessor :text
|
87
69
|
|
88
|
-
|
89
|
-
attr_reader :not_enclosing_tags
|
70
|
+
alias_method :expand, :text
|
90
71
|
|
91
|
-
|
92
|
-
|
93
|
-
@not_enclosing_tags << tag
|
94
|
-
end
|
72
|
+
def initialize(text)
|
73
|
+
@text = text
|
95
74
|
end
|
96
75
|
end
|
97
76
|
end
|
File without changes
|
@@ -0,0 +1,43 @@
|
|
1
|
+
module Doc2Text
|
2
|
+
module Odt
|
3
|
+
class Document < XmlBasedDocument::DocumentFile
|
4
|
+
|
5
|
+
def extract_extension
|
6
|
+
'unpacked_odt'
|
7
|
+
end
|
8
|
+
|
9
|
+
def self.parse_and_save(input, output_filename)
|
10
|
+
odt = new input
|
11
|
+
begin
|
12
|
+
odt.unpack
|
13
|
+
styles_xml_root = odt.parse_styles
|
14
|
+
output = File.open output_filename, 'w'
|
15
|
+
markdown = Markdown::OdtParser.new output, styles_xml_root
|
16
|
+
begin
|
17
|
+
odt.parse markdown
|
18
|
+
ensure
|
19
|
+
markdown.close
|
20
|
+
end
|
21
|
+
ensure
|
22
|
+
odt.clean
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
def parse_styles
|
27
|
+
styles_parser = Doc2Text::Odt::StylesParser.new
|
28
|
+
xml = Nokogiri::XML::SAX::Parser.new(styles_parser)
|
29
|
+
xml.parse open 'styles.xml'
|
30
|
+
styles_parser.xml_root
|
31
|
+
end
|
32
|
+
|
33
|
+
def parse(markdown)
|
34
|
+
parser = Nokogiri::XML::SAX::Parser.new(markdown) # { |config| config.strict}
|
35
|
+
parser.parse open 'content.xml'
|
36
|
+
end
|
37
|
+
|
38
|
+
def contains_extracted_files?
|
39
|
+
[File.join(extract_path, 'content.xml'), File.join(extract_path, 'mimetype')].all? { |file| File.exist?(file) }
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
43
|
+
end
|
@@ -1,37 +1,40 @@
|
|
1
1
|
module Doc2Text
|
2
2
|
module Odt
|
3
3
|
module XmlNodes
|
4
|
-
class
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
4
|
+
class Node < XmlBasedDocument::XmlNodes::Node
|
5
|
+
def self.create_node(prefix, name, parent = nil, attrs = [], markdown_odt_parser = nil)
|
6
|
+
begin
|
7
|
+
clazz = XmlNodes.const_get "#{titleize prefix}::#{titleize name}"
|
8
|
+
rescue NameError => e
|
9
|
+
# markdown_odt_parser.logger.warn "No such <#{prefix}:#{name}> found"
|
10
|
+
Generic.new(parent, attrs, prefix, name, markdown_odt_parser)
|
11
|
+
else
|
12
|
+
clazz.new(parent, attrs, prefix, name, markdown_odt_parser)
|
13
|
+
end
|
14
|
+
end
|
10
15
|
|
11
|
-
def
|
12
|
-
|
16
|
+
def office_text?
|
17
|
+
false
|
13
18
|
end
|
14
19
|
end
|
15
20
|
|
16
|
-
class
|
17
|
-
include Node
|
21
|
+
class PlainText < XmlBasedDocument::XmlNodes::PlainText
|
18
22
|
end
|
19
23
|
|
24
|
+
class Generic < Node
|
25
|
+
end
|
20
26
|
#
|
21
27
|
# These are the namespaces available in the open document format
|
22
28
|
# http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os.html
|
23
29
|
#
|
24
30
|
module Office
|
25
|
-
class AutomaticStyles
|
26
|
-
include Node
|
31
|
+
class AutomaticStyles < Node
|
27
32
|
end
|
28
33
|
|
29
|
-
class DocumentContent
|
30
|
-
include Node
|
34
|
+
class DocumentContent < Node
|
31
35
|
end
|
32
36
|
|
33
|
-
class Text
|
34
|
-
include Node
|
37
|
+
class Text < Node
|
35
38
|
|
36
39
|
def office_text?
|
37
40
|
true
|
@@ -52,8 +55,7 @@ module Doc2Text
|
|
52
55
|
module Presentation; end
|
53
56
|
module Script; end
|
54
57
|
module Table
|
55
|
-
class TableRow
|
56
|
-
include Node
|
58
|
+
class TableRow < Node
|
57
59
|
|
58
60
|
def expand
|
59
61
|
header_delimiter = parent.children.count >= 2 && parent.children[1] == self ? "\n|---|---|" : ''
|
@@ -63,8 +65,7 @@ module Doc2Text
|
|
63
65
|
end
|
64
66
|
end
|
65
67
|
|
66
|
-
class TableCell
|
67
|
-
include Node
|
68
|
+
class TableCell < Node
|
68
69
|
|
69
70
|
def open
|
70
71
|
' | '
|
@@ -72,12 +73,10 @@ module Doc2Text
|
|
72
73
|
end
|
73
74
|
end
|
74
75
|
module Style
|
75
|
-
class Style
|
76
|
-
include Node
|
76
|
+
class Style < Node
|
77
77
|
end
|
78
78
|
|
79
|
-
class TextProperties
|
80
|
-
include Node
|
79
|
+
class TextProperties < Node
|
81
80
|
end
|
82
81
|
end
|
83
82
|
module XslFoCompatible; end
|
@@ -86,48 +85,73 @@ module Doc2Text
|
|
86
85
|
module Of; end
|
87
86
|
|
88
87
|
module Text
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
88
|
+
def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_odt_parser = nil)
|
89
|
+
super parent, attrs, prefix, name
|
90
|
+
@xml_parser = markdown_odt_parser
|
91
|
+
style_index = attrs.index { |attr| attr.prefix == 'text' && attr.localname == 'style-name' }
|
92
|
+
@enclosing_style = []
|
93
|
+
if style_index and fetch_style?
|
94
|
+
elem_style = find_style attrs[style_index].value
|
95
|
+
fetch_style elem_style
|
96
|
+
end
|
97
97
|
end
|
98
|
-
end
|
99
98
|
|
100
|
-
|
101
|
-
|
102
|
-
|
99
|
+
def fetch_style?
|
100
|
+
true
|
101
|
+
end
|
103
102
|
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
103
|
+
def fetch_style(elem_style)
|
104
|
+
if elem_style
|
105
|
+
elem_style.children.select { |style_property| style_property.xml_name == 'style:text-properties' }.each { |text_property|
|
106
|
+
text_property.attrs.each { |attr|
|
107
|
+
if attr.prefix == 'style'
|
108
|
+
if attr.localname == 'font-style-complex' && attr.value == 'italic'
|
109
|
+
@enclosing_style << '_'
|
110
|
+
elsif attr.localname == 'font-weight-complex' && attr.value == 'bold'
|
111
|
+
@enclosing_style << '**'
|
112
|
+
end
|
113
113
|
end
|
114
|
-
|
114
|
+
}
|
115
115
|
}
|
116
|
-
|
116
|
+
end
|
117
|
+
end
|
118
|
+
|
119
|
+
def find_style(style_name)
|
120
|
+
styles = @xml_parser.xpath '/office:document-content/office:automatic-styles/style:style'
|
121
|
+
styles.find { |style| style.attrs.index { |attr| attr.prefix == 'style' && attr.localname == 'family' } &&
|
122
|
+
style.attrs.index { |attr| attr.prefix == 'style' && attr.localname == 'name' && attr.value == style_name } }
|
117
123
|
end
|
118
|
-
end
|
119
124
|
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
125
|
+
# http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#__RefHeading__1419212_253892949
|
126
|
+
class H < Node
|
127
|
+
include Text
|
128
|
+
def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_odt_parser = nil)
|
129
|
+
super parent, attrs, prefix, name, markdown_odt_parser
|
130
|
+
outline_level_index = attrs.index { |attr| attr.prefix == 'text' && attr.localname == 'outline-level' }
|
131
|
+
if outline_level_index and fetch_style?
|
132
|
+
@elem_outline_level = attrs[outline_level_index].value.to_i
|
133
|
+
else
|
134
|
+
@elem_outline_level = 0
|
135
|
+
end
|
136
|
+
|
137
|
+
end
|
138
|
+
|
139
|
+
def self.style_family
|
140
|
+
'paragraph'
|
141
|
+
end
|
142
|
+
|
143
|
+
def open
|
144
|
+
"\n#{'#' * @elem_outline_level} "
|
145
|
+
end
|
146
|
+
|
147
|
+
def close
|
148
|
+
"\n\n"
|
149
|
+
end
|
124
150
|
end
|
125
151
|
|
126
152
|
# http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#__RefHeading__1419256_253892949
|
127
|
-
class P
|
128
|
-
include Node
|
153
|
+
class P < Node
|
129
154
|
include Text
|
130
|
-
|
131
155
|
def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_odt_parser = nil)
|
132
156
|
super parent, attrs, prefix, name, markdown_odt_parser
|
133
157
|
end
|
@@ -145,8 +169,7 @@ module Doc2Text
|
|
145
169
|
end
|
146
170
|
end
|
147
171
|
|
148
|
-
class LineBreak
|
149
|
-
include Node
|
172
|
+
class LineBreak < Node
|
150
173
|
|
151
174
|
def open
|
152
175
|
'<br/>'
|
@@ -154,8 +177,7 @@ module Doc2Text
|
|
154
177
|
end
|
155
178
|
|
156
179
|
# http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#__RefHeading__1419264_253892949
|
157
|
-
class Span
|
158
|
-
include Node
|
180
|
+
class Span < Node
|
159
181
|
include Text
|
160
182
|
|
161
183
|
def self.style_family
|
@@ -178,10 +200,8 @@ module Doc2Text
|
|
178
200
|
end
|
179
201
|
|
180
202
|
# http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#__RefHeading__1415154_253892949
|
181
|
-
class ListItem
|
182
|
-
include Node
|
203
|
+
class ListItem < Node
|
183
204
|
include Text
|
184
|
-
|
185
205
|
def expand
|
186
206
|
result = "* #{@children.map(&:expand).join.strip.gsub /\n{2,}/, "\n"}\n"
|
187
207
|
delete
|
@@ -193,8 +213,7 @@ module Doc2Text
|
|
193
213
|
end
|
194
214
|
end
|
195
215
|
|
196
|
-
class List
|
197
|
-
include Node
|
216
|
+
class List < Node
|
198
217
|
include Text
|
199
218
|
|
200
219
|
def open
|
data/lib/doc2text/resolution.rb
CHANGED
@@ -2,13 +2,8 @@ module Doc2Text
|
|
2
2
|
class Resolution
|
3
3
|
def self.parse_and_save(source, output)
|
4
4
|
case File.extname source
|
5
|
-
when '.
|
6
|
-
|
7
|
-
File.basename(source, File.extname(source)) + '.odt')
|
8
|
-
system "soffice --headless --convert-to odt #{source} --outdir #{File.dirname output}"
|
9
|
-
source = mid_name
|
10
|
-
Doc2Text::Odt::Document.parse_and_save source, output
|
11
|
-
File.delete(mid_name)
|
5
|
+
when '.docx'
|
6
|
+
Doc2Text::Docx::Document.parse_and_save source, output
|
12
7
|
else
|
13
8
|
Doc2Text::Odt::Document.parse_and_save source, output
|
14
9
|
end
|
@@ -5,9 +5,9 @@ module Doc2Text
|
|
5
5
|
|
6
6
|
def start_element_namespace(name ,attrs = [], prefix = nil, uri = nil, ns = [])
|
7
7
|
unless @xml_root
|
8
|
-
@xml_root = @current_node = Odt::XmlNodes::Node.create_node prefix, name, nil, attrs, self
|
8
|
+
@xml_root = @current_node = Doc2Text::Odt::XmlNodes::Node.create_node prefix, name, nil, attrs, self
|
9
9
|
else
|
10
|
-
new_node = Odt::XmlNodes::Node.create_node prefix, name, @current_node, attrs, self
|
10
|
+
new_node = Doc2Text::Odt::XmlNodes::Node.create_node prefix, name, @current_node, attrs, self
|
11
11
|
@current_node.children << new_node
|
12
12
|
@current_node = new_node
|
13
13
|
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
require 'zip'
|
2
|
+
|
3
|
+
module Doc2Text
|
4
|
+
module XmlBasedDocument
|
5
|
+
class DocumentFile
|
6
|
+
def initialize(document_path)
|
7
|
+
@document_path = document_path
|
8
|
+
end
|
9
|
+
|
10
|
+
def unpack
|
11
|
+
Zip::File.open(@document_path) {
|
12
|
+
|zip_file|
|
13
|
+
Dir.mkdir(extract_path)
|
14
|
+
zip_file.each do |entry|
|
15
|
+
zipped_file_extract_path = File.join extract_path, entry.name
|
16
|
+
FileUtils.mkdir_p File.dirname(zipped_file_extract_path)
|
17
|
+
zip_file.extract entry, zipped_file_extract_path
|
18
|
+
end
|
19
|
+
}
|
20
|
+
end
|
21
|
+
|
22
|
+
def contains_extracted_files?
|
23
|
+
false
|
24
|
+
end
|
25
|
+
|
26
|
+
def clean
|
27
|
+
if File.exist?(extract_path) and contains_extracted_files?
|
28
|
+
FileUtils.rm_r extract_path
|
29
|
+
else
|
30
|
+
puts 'Failed to clean temp files'
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
# Open file from the current odt
|
35
|
+
def open(filename)
|
36
|
+
File.open File.join(extract_path, filename), 'r'
|
37
|
+
end
|
38
|
+
|
39
|
+
def extract_extension
|
40
|
+
'unpacked'
|
41
|
+
end
|
42
|
+
|
43
|
+
def extract_path
|
44
|
+
File.join File.dirname(@document_path), ".#{File.basename(@document_path)}_#{extract_extension}"
|
45
|
+
end
|
46
|
+
end
|
47
|
+
end
|
48
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: doc2text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Valentin Aitken
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2019-01-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -16,57 +16,62 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '1.
|
19
|
+
version: '1.8'
|
20
20
|
- - ">="
|
21
21
|
- !ruby/object:Gem::Version
|
22
|
-
version: 1.
|
22
|
+
version: 1.8.2
|
23
23
|
type: :runtime
|
24
24
|
prerelease: false
|
25
25
|
version_requirements: !ruby/object:Gem::Requirement
|
26
26
|
requirements:
|
27
27
|
- - "~>"
|
28
28
|
- !ruby/object:Gem::Version
|
29
|
-
version: '1.
|
29
|
+
version: '1.8'
|
30
30
|
- - ">="
|
31
31
|
- !ruby/object:Gem::Version
|
32
|
-
version: 1.
|
32
|
+
version: 1.8.2
|
33
33
|
- !ruby/object:Gem::Dependency
|
34
34
|
name: rubyzip
|
35
35
|
requirement: !ruby/object:Gem::Requirement
|
36
36
|
requirements:
|
37
37
|
- - "~>"
|
38
38
|
- !ruby/object:Gem::Version
|
39
|
-
version: '1.
|
39
|
+
version: '1.2'
|
40
40
|
- - ">="
|
41
41
|
- !ruby/object:Gem::Version
|
42
|
-
version: 1.
|
42
|
+
version: 1.2.2
|
43
43
|
type: :runtime
|
44
44
|
prerelease: false
|
45
45
|
version_requirements: !ruby/object:Gem::Requirement
|
46
46
|
requirements:
|
47
47
|
- - "~>"
|
48
48
|
- !ruby/object:Gem::Version
|
49
|
-
version: '1.
|
49
|
+
version: '1.2'
|
50
50
|
- - ">="
|
51
51
|
- !ruby/object:Gem::Version
|
52
|
-
version: 1.
|
52
|
+
version: 1.2.2
|
53
53
|
description: Parses odt to markdown
|
54
|
-
email:
|
54
|
+
email: valentin@nalisbg.com
|
55
55
|
executables: []
|
56
56
|
extensions: []
|
57
57
|
extra_rdoc_files: []
|
58
58
|
files:
|
59
|
+
- bin/doc2text
|
59
60
|
- lib/doc2text.rb
|
61
|
+
- lib/doc2text/docx/docx.rb
|
62
|
+
- lib/doc2text/docx/docx_xml_namespaces.rb
|
63
|
+
- lib/doc2text/docx/markdown_docx_parser.rb
|
60
64
|
- lib/doc2text/errors.rb
|
61
|
-
- lib/doc2text/
|
62
|
-
- lib/doc2text/odt.rb
|
63
|
-
- lib/doc2text/
|
64
|
-
- lib/doc2text/
|
65
|
+
- lib/doc2text/generic_xml_nodes.rb
|
66
|
+
- lib/doc2text/odt/markdown_odt_parser.rb
|
67
|
+
- lib/doc2text/odt/odt.rb
|
68
|
+
- lib/doc2text/odt/odt_xml_namespaces.rb
|
65
69
|
- lib/doc2text/resolution.rb
|
66
70
|
- lib/doc2text/styles_parser.rb
|
71
|
+
- lib/doc2text/xml_based_document_file.rb
|
67
72
|
homepage: http://doc2text.com
|
68
73
|
licenses:
|
69
|
-
-
|
74
|
+
- Apache-2.0
|
70
75
|
metadata: {}
|
71
76
|
post_install_message:
|
72
77
|
rdoc_options: []
|
@@ -84,9 +89,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
84
89
|
version: '0'
|
85
90
|
requirements: []
|
86
91
|
rubyforge_project:
|
87
|
-
rubygems_version: 2.
|
92
|
+
rubygems_version: 2.7.8
|
88
93
|
signing_key:
|
89
94
|
specification_version: 4
|
90
95
|
summary: Translates odt to markdown
|
91
96
|
test_files: []
|
92
|
-
has_rdoc:
|
data/lib/doc2text/odt.rb
DELETED
@@ -1,80 +0,0 @@
|
|
1
|
-
require 'zip'
|
2
|
-
|
3
|
-
module Doc2Text
|
4
|
-
module Odt
|
5
|
-
class Document
|
6
|
-
EXTRACT_EXTENSION = 'unpacked_odt'
|
7
|
-
|
8
|
-
def self.parse_and_save(input, output_filename)
|
9
|
-
odt = new input
|
10
|
-
begin
|
11
|
-
odt.unpack
|
12
|
-
styles_xml_root = odt.parse_styles
|
13
|
-
output = File.open output_filename, 'w'
|
14
|
-
markdown = Markdown::OdtParser.new output, styles_xml_root
|
15
|
-
begin
|
16
|
-
odt.parse markdown
|
17
|
-
ensure
|
18
|
-
markdown.close
|
19
|
-
end
|
20
|
-
ensure
|
21
|
-
odt.clean
|
22
|
-
end
|
23
|
-
end
|
24
|
-
|
25
|
-
def parse_styles
|
26
|
-
styles_parser = Doc2Text::Odt::StylesParser.new
|
27
|
-
xml = Nokogiri::XML::SAX::Parser.new(styles_parser)
|
28
|
-
xml.parse open 'styles.xml'
|
29
|
-
styles_parser.xml_root
|
30
|
-
end
|
31
|
-
|
32
|
-
def parse(markdown)
|
33
|
-
parser = Nokogiri::XML::SAX::Parser.new(markdown) # { |config| config.strict}
|
34
|
-
parser.parse open 'content.xml'
|
35
|
-
end
|
36
|
-
|
37
|
-
def initialize(document_path)
|
38
|
-
@document_path = document_path
|
39
|
-
end
|
40
|
-
|
41
|
-
def unpack
|
42
|
-
Zip::File.open(@document_path) {
|
43
|
-
|zip_file|
|
44
|
-
Dir.mkdir(extract_path)
|
45
|
-
zip_file.each do |entry|
|
46
|
-
zipped_file_extract_path = File.join extract_path, entry.name
|
47
|
-
FileUtils.mkdir_p File.dirname(zipped_file_extract_path)
|
48
|
-
zip_file.extract entry, zipped_file_extract_path
|
49
|
-
end
|
50
|
-
}
|
51
|
-
end
|
52
|
-
|
53
|
-
def clean
|
54
|
-
if [extract_path, File.join(extract_path, 'content.xml'), File.join(extract_path, 'mimetype')].all? { |file| File.exist?(file) }
|
55
|
-
FileUtils.rm_r extract_path
|
56
|
-
end
|
57
|
-
end
|
58
|
-
|
59
|
-
# Open file from the current odt
|
60
|
-
def open(filename)
|
61
|
-
File.open File.join(extract_path, filename), 'r'
|
62
|
-
end
|
63
|
-
|
64
|
-
# Parse xml file from the current odt
|
65
|
-
def xml_file(filename, rood_node_name)
|
66
|
-
Nokogiri::XML::Document.parse(open(filename)) { |config| config.strict }
|
67
|
-
root_node = doc.root
|
68
|
-
if root_node.name != rood_node_name or root_node.namespace.prefix != 'office'
|
69
|
-
raise XmlError, 'Document does not have correct root element'
|
70
|
-
else
|
71
|
-
open(filename)
|
72
|
-
end
|
73
|
-
end
|
74
|
-
|
75
|
-
def extract_path
|
76
|
-
File.join File.dirname(@document_path), ".#{File.basename(@document_path)}_#{EXTRACT_EXTENSION}"
|
77
|
-
end
|
78
|
-
end
|
79
|
-
end
|
80
|
-
end
|