doc2text 0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/lib/doc2text.rb +11 -0
- data/lib/doc2text/content.rb +25 -0
- data/lib/doc2text/errors.rb +5 -0
- data/lib/doc2text/markdown.rb +85 -0
- data/lib/doc2text/namespaces.rb +169 -0
- data/lib/doc2text/odt.rb +73 -0
- data/lib/doc2text/odt_xml_node.rb +98 -0
- metadata +51 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 1821e833815ea821090507cea37fdec0c68dc2af
|
4
|
+
data.tar.gz: 7815dd2e3f7fbf1f822959e4a120ef0e0bcbef79
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 5b3a0e9729eccccd888432527455336214d52ad996920593e30edab1d2e5f29bcdefdca19dfdbf9f7bf686bfabb8a6888cf706a386f058d7ee07b9bacac1e9e8
|
7
|
+
data.tar.gz: 6ffbb43bd9c8e4eac000b4733f3bcb149a6cee3b8cd608a22a0ca4516fbbc7317a2d30b4b0f15d8dee0c103df509ee24b029393503bd4bb40e16d22ed6c1543a
|
data/lib/doc2text.rb
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
module Doc2Text
|
2
|
+
module Odt
|
3
|
+
module Content
|
4
|
+
# SAX handler that relays Nokogiri parse events to a markdown document.
#
# Element starts and ends are forwarded as new_node / close_node calls, and
# text chunks are appended with <<.
class Document < ::Nokogiri::XML::SAX::Document
  # markdown_document - receiver of new_node, close_node and << calls.
  def initialize(markdown_document)
    @markdown_document = markdown_document
  end

  # Called for every opening tag; only prefix, name and attrs are forwarded.
  def start_element_namespace(name, attrs = [], prefix = nil, uri = nil, ns = [])
    @markdown_document.new_node(prefix, name, attrs)
  end

  # Called for every closing tag; only prefix and name are forwarded.
  def end_element_namespace(name, prefix = nil, uri = nil)
    @markdown_document.close_node(prefix, name)
  end

  # Called for text content; chunks consisting only of whitespace are dropped.
  def characters(string)
    return if string.strip.empty?

    @markdown_document << string
  end
end
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
@@ -0,0 +1,85 @@
|
|
1
|
+
module Doc2Text
|
2
|
+
module Markdown
|
3
|
+
# Writes markdown to an output sink while tracking the XML elements that are
# currently open as a tree of nodes.
#
# Nodes are created through Odt::XmlNodes::Node.create_node. The text returned
# by a node's #open / #close is appended to the output when the element starts
# / ends.
class Document
  # output - any object responding to << and close (e.g. a File).
  def initialize(output)
    @output = output
    @automatic_styles = {}
  end

  # Registers a newly opened element.
  #
  # The first element becomes the tree root and emits nothing. Every later
  # element is appended to the current node, becomes the current node, and
  # has its #open text written to the output.
  def new_node(prefix, name, attrs)
    if @xml_root
      child = Odt::XmlNodes::Node.create_node prefix, name, @current_node, attrs, self
      @current_node.children << child
      @current_node = child
      self << @current_node.open
    else
      @xml_root = @current_node = Odt::XmlNodes::Node.create_node prefix, name, nil, attrs, self
    end
  end

  # Handles a closing tag.
  #
  # The tag must name either the current node (one level is closed) or its
  # parent (two levels are closed). Closed nodes are detached from the tree
  # when the node named by the tag answers true to delete_on_close?.
  #
  # Raises Doc2Text::XmlError when the tag matches neither, including when
  # every node has already been closed.
  def close_node(prefix, name)
    # A throw-away node is used only for the name comparison done by eql?.
    closing = Odt::XmlNodes::Node.create_node(prefix, name)

    if closing.eql? @current_node
      remove_current_node! @current_node.delete_on_close?
    elsif @current_node && closing.eql?(@current_node.parent)
      # Decide once, before the first removal changes @current_node.
      remove = @current_node.parent.delete_on_close?
      remove_current_node! remove
      remove_current_node! remove
    else
      # TODO: remove this redundant (tree build algorithm) check
      raise Doc2Text::XmlError, "!Close node child #{prefix} #{name} IS NOT correct, CURRENT_ELEM #{@current_node}"
    end
  end

  # Writes the current node's #close text and moves up to its parent.
  #
  # remove - when true the closed node is also popped from its parent's
  #          children; when false it stays in the tree.
  #
  # Does nothing when there is no current node.
  def remove_current_node!(remove = true)
    return unless @current_node

    self << @current_node.close
    closed_node = @current_node
    @current_node = closed_node.parent
    return unless @current_node

    @current_node.remove_last_child! closed_node if remove
  end

  # Appends a string to the output sink.
  def <<(string)
    @output << string
  end

  # Closes the output sink.
  def close
    @output.close
  end

  # Debug helper: prints node and all of its descendants, depth first.
  def print_tree(node)
    puts node
    node.children.each { |child| print_tree child }
  end

  # Select nodes xpath style
  # - supports selecting from the root node
  #
  # string - an absolute path made only of "/prefix:name" steps, for example
  #          "/office:document-content/office:automatic-styles/style:style".
  #
  # Returns an Array of the nodes whose chain of xml_names matches the path;
  # the Array is empty when nothing matches or no root has been created yet.
  # Raises Doc2Text::XmlError for any other syntax.
  def xpath(string)
    # \A and \z anchor the whole string; ^ and $ would accept a valid first
    # line followed by arbitrary further lines.
    unless /\A(\/[\w:\-]+)+\z/ =~ string
      raise Doc2Text::XmlError, 'it does not support this xpath syntax'
    end
    return [] unless @xml_root

    steps = string.scan(/[\w:\-]+/)
    seek_nodes = [@xml_root]
    steps.each_with_index do |xml_name, index|
      seek_nodes = seek_nodes.select { |node| node.xml_name == xml_name }
      # Descend one level for every step except the last one.
      seek_nodes = seek_nodes.flat_map(&:children) unless index == steps.length - 1
      break if seek_nodes.empty?
    end
    seek_nodes
  end
end
|
84
|
+
end
|
85
|
+
end
|
@@ -0,0 +1,169 @@
|
|
1
|
+
module Doc2Text
|
2
|
+
module Odt
|
3
|
+
module XmlNodes
  # Node class used for tags that have no dedicated class below.
  class Generic
    include Node
  end

  #
  # These are the namespaces available in the open document format
  # http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os.html
  #
  module Office
    # office:automatic-styles - kept in the tree after it is closed.
    class AutomaticStyles
      include Node

      def visit
        :automatic_styles
      end

      def delete_on_close?
        false
      end
    end

    # office:document-content - kept in the tree after it is closed.
    class DocumentContent
      include Node

      def delete_on_close?
        false # required for testing purposes. After a document has been parsed, some tests could be run against the tree built
      end
    end
  end

  module Animation; end
  module Chart; end
  module Config; end
  module Database; end
  module Dr3d; end
  module Drawing; end
  module Form; end
  module Manifest; end
  module Meta; end
  module DataStyle; end
  module Presentation; end
  module Script; end
  module Table; end

  module Style
    # style:style - kept in the tree after it is closed.
    class Style
      include Node

      def delete_on_close?
        false
      end
    end

    # style:text-properties - kept in the tree after it is closed.
    class TextProperties
      include Node

      def delete_on_close?
        false
      end
    end
  end

  module XslFoCompatible; end
  module SvgCompatible; end
  module SmilCompatible; end
  module Of; end

  # Namespace for the text: elements and, at the same time, a mixin that
  # resolves the markdown markers implied by an element's text:style-name.
  module Text
    # Stores the markdown document and, when a text:style-name attribute is
    # present, collects the enclosing markers of the style it refers to.
    def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_document = nil)
      super parent, attrs, prefix, name, markdown_document
      @markdown_document = markdown_document
      @enclosing_style = []
      style_attr = attrs.find { |attr| attr.prefix == 'text' && attr.localname == 'style-name' }
      fetch_style style_attr.value if style_attr
    end

    # Appends '_' for an italic and '**' for a bold complex-font setting found
    # in the style:text-properties children of style. A nil style is ignored.
    def fetch_common_style(style)
      return unless style

      text_properties = style.children.select { |property| property.xml_name == 'style:text-properties' }
      text_properties.each do |text_property|
        text_property.attrs.each do |attr|
          next unless attr.prefix == 'style'

          if attr.localname == 'font-style-complex' && attr.value == 'italic'
            @enclosing_style << '_'
          elsif attr.localname == 'font-weight-complex' && attr.value == 'bold'
            @enclosing_style << '**'
          end
        end
      end
    end

    # Looks up the automatic style called style_name whose style:family equals
    # the including class' style_family and applies it.
    def fetch_style(style_name)
      candidates = @markdown_document.xpath '/office:document-content/office:automatic-styles/style:style'
      matching_style = candidates.find do |candidate|
        # style_family is looked up lazily, only when a style:family attribute is inspected
        candidate.attrs.any? { |attr| attr.prefix == 'style' && attr.localname == 'family' && attr.value == self.class.style_family } &&
          candidate.attrs.any? { |attr| attr.prefix == 'style' && attr.localname == 'name' && attr.value == style_name }
      end
      fetch_common_style matching_style
    end

    # http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#__RefHeading__1419256_253892949
    class P
      include Node
      include Text

      def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_document = nil)
        super parent, attrs, prefix, name, markdown_document
      end

      def self.style_family
        'paragraph'
      end

      # A paragraph starts on a new line, followed by its opening markers.
      def open
        "\n#{@enclosing_style.join}"
      end

      # The markers are closed in reverse order, then the line is ended.
      def close
        "#{@enclosing_style.reverse.join}\n"
      end
    end

    class LineBreak
      include Node

      def open
        '<br/>'
      end
    end

    # http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#__RefHeading__1419264_253892949
    class Span
      include Node
      include Text

      def self.style_family
        'text'
      end

      def open
        @enclosing_style.join
      end

      def close
        @enclosing_style.reverse.join
      end
    end

    # http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#__RefHeading__1415154_253892949
    class ListItem
      include Node
      include Text

      not_enclosing 'p'

      def open
        '* '
      end

      def close
        "\n"
      end
    end
  end
end
|
168
|
+
end
|
169
|
+
end
|
data/lib/doc2text/odt.rb
ADDED
@@ -0,0 +1,73 @@
|
|
1
|
+
require 'zip'
|
2
|
+
|
3
|
+
module Doc2Text
|
4
|
+
module Odt
|
5
|
+
# An .odt archive on disk.
#
# The archive is unpacked into a hidden directory next to the document
# (see #extract_path), content.xml is parsed from there, and the directory is
# removed again by #clean.
class Document
  EXTRACT_EXTENSION = 'unpacked_odt'

  # Unpacks input, writes its markdown rendering to output_filename and
  # removes the unpacked files afterwards, even when parsing fails.
  def self.parse_and_save(input, output_filename)
    odt = new input
    begin
      odt.unpack
      output = File.open output_filename, 'w'
      markdown = Markdown::Document.new output
      begin
        odt.parse markdown
      ensure
        markdown.close
      end
    ensure
      odt.clean
    end
  end

  # Feeds the unpacked content.xml through a SAX parser that drives markdown.
  # The file handle is closed once parsing finishes or fails.
  def parse(markdown)
    content = ::Doc2Text::Odt::Content::Document.new markdown
    parser = Nokogiri::XML::SAX::Parser.new(content) # { |config| config.strict}
    content_xml = open 'content.xml'
    begin
      parser.parse content_xml
    ensure
      content_xml.close unless content_xml.closed?
    end
  end

  # document_path - path of the .odt file.
  def initialize(document_path)
    @document_path = document_path
  end

  # Extracts every archive entry below #extract_path.
  #
  # Raises XmlError for an entry whose name would place it outside the
  # extraction directory (for example "../x"), before anything is written
  # for that entry.
  def unpack
    Zip::File.open(@document_path) do |zip_file|
      Dir.mkdir(extract_path)
      extract_root = File.expand_path(extract_path)
      zip_file.each do |entry|
        zipped_file_extract_path = File.join extract_path, entry.name
        # Entry names come from the archive and are untrusted.
        resolved = File.expand_path(zipped_file_extract_path)
        unless resolved == extract_root || resolved.start_with?(extract_root + File::SEPARATOR)
          raise XmlError, "Archive entry #{entry.name} is outside of the extraction directory"
        end
        FileUtils.mkdir_p File.dirname(zipped_file_extract_path)
        zip_file.extract entry, zipped_file_extract_path
      end
    end
  end

  # Removes the extraction directory, but only when it contains both
  # content.xml and mimetype; any other directory is left untouched.
  def clean
    expected = [extract_path, File.join(extract_path, 'content.xml'), File.join(extract_path, 'mimetype')]
    FileUtils.rm_r extract_path if expected.all? { |file| File.exist?(file) }
  end

  # Open file from the current odt
  #
  # Returns a File opened for reading; the caller is responsible for closing it.
  def open(filename)
    File.open File.join(extract_path, filename), 'r'
  end

  # Parse xml file from the current odt
  #
  # Verifies that the root element of filename is office:<rood_node_name>.
  # Returns a freshly opened File for filename (the caller closes it).
  # Raises XmlError when the root element is missing or different.
  def xml_file(filename, rood_node_name)
    source = open(filename)
    begin
      doc = Nokogiri::XML::Document.parse(source) { |config| config.strict }
    ensure
      source.close unless source.closed?
    end

    root_node = doc.root
    namespace = root_node && root_node.namespace
    if root_node.nil? || root_node.name != rood_node_name || namespace.nil? || namespace.prefix != 'office'
      raise XmlError, 'Document does not have correct root element'
    end

    open(filename)
  end

  # Directory the archive is unpacked into: a dot-prefixed sibling of the
  # document named "<basename>_unpacked_odt".
  def extract_path
    File.join File.dirname(@document_path), ".#{File.basename(@document_path)}_#{EXTRACT_EXTENSION}"
  end
end
|
72
|
+
end
|
73
|
+
end
|
@@ -0,0 +1,98 @@
|
|
1
|
+
module Doc2Text
|
2
|
+
module Odt
|
3
|
+
module XmlNodes
|
4
|
+
# Mixin shared by every node of the parsed XML tree.
#
# A node knows its parent, children, attributes and its prefix:name tag.
# Including classes may override #open / #close (text emitted when the
# element starts / ends) and #delete_on_close?.
module Node
  attr_reader :parent, :children, :attrs, :prefix, :name

  # Builds the node for the tag prefix:name.
  #
  # The class is looked up as XmlNodes::<Prefix>::<Name> (see titleize);
  # Generic is used when no such node class exists.
  def self.create_node(prefix, name, parent = nil, attrs = [], markdown_document = nil)
    node_class(prefix, name).new(parent, attrs, prefix, name, markdown_document)
  end

  # Resolves the class for a tag, falling back to Generic.
  #
  # Constants are looked up without ancestor search (inherit = false) so that
  # tags such as text:time or form:file cannot resolve to core classes like
  # ::Time or ::File; anything that is not a class including Node is rejected.
  def self.node_class(prefix, name)
    candidate = [prefix, name].inject(XmlNodes) do |scope, tag|
      scope.const_get titleize(tag), false
    end
    candidate.is_a?(Class) && candidate.include?(Node) ? candidate : Generic
  rescue NameError
    Generic
  end
  private_class_method :node_class

  # Converts a dash separated tag into a constant name,
  # e.g. "document-content" => "DocumentContent". nil yields "".
  def self.titleize(tag)
    tag.to_s.split('-').map(&:capitalize).join
  end

  def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_document = nil)
    @parent, @attrs, @prefix, @name = parent, attrs, prefix, name
    @children = []
    @has_text = false
  end

  # True for a node without a parent.
  def root?
    !@parent
  end

  def has_text?
    @has_text
  end

  # Text emitted when the element starts; empty by default.
  def open
    ''
  end

  # Text emitted when the element ends; empty by default.
  def close
    ''
  end

  # Appends a child node.
  def <<(child)
    @children << child
  end

  # Whether the node is detached from its parent once it has been closed.
  def delete_on_close?
    true
  end

  # Two nodes are eql? when they carry the same prefix:name tag.
  def eql?(object)
    return false unless object.is_a? Node

    object.xml_name == xml_name
  end

  # True when the node is the Generic fallback rather than a dedicated class.
  def generic?
    instance_of? Generic
  end

  # Removes child, which must be the most recently added child.
  #
  # Returns the removed child.
  # Raises Doc2Text::XmlError when child is not the last child.
  def remove_last_child!(child)
    unless child === @children.last
      # TODO: remove this redundant (tree build algorithm) check
      raise Doc2Text::XmlError, "!The child #{child} IS NOT among the children of #{self}"
    end

    @children.pop
  end

  # The qualified tag name, "prefix:name".
  def xml_name
    "#{@prefix}:#{@name}"
  end

  def to_s
    "#{xml_name} : #{attrs}"
  end

  # Truthy when the parent's class declared this node's tag via not_enclosing
  # and both share the same prefix; always false for a root node.
  def not_enclosing?
    return false if root?

    tags = parent.class.not_enclosing_tags
    tags && tags.find { |tag| @prefix == parent.prefix && @name == tag }
  end

  # Adds the class-level macros to every class that includes Node.
  def self.included(base)
    base.extend ClassMethods
  end

  module ClassMethods
    attr_reader :not_enclosing_tags

    # Declares a child tag name for the not_enclosing? check.
    def not_enclosing(tag)
      @not_enclosing_tags ||= []
      @not_enclosing_tags << tag
    end
  end
end
|
96
|
+
end
|
97
|
+
end
|
98
|
+
end
|
metadata
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: doc2text
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: '0.1'
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Valentin Aitken
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-07-12 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: Parses odt to markdown
|
14
|
+
email: bostko@gmail.com
|
15
|
+
executables: []
|
16
|
+
extensions: []
|
17
|
+
extra_rdoc_files: []
|
18
|
+
files:
|
19
|
+
- lib/doc2text.rb
|
20
|
+
- lib/doc2text/content.rb
|
21
|
+
- lib/doc2text/errors.rb
|
22
|
+
- lib/doc2text/markdown.rb
|
23
|
+
- lib/doc2text/namespaces.rb
|
24
|
+
- lib/doc2text/odt.rb
|
25
|
+
- lib/doc2text/odt_xml_node.rb
|
26
|
+
homepage: https://github.com/bostko/doc2text
|
27
|
+
licenses:
|
28
|
+
- GPL
|
29
|
+
metadata: {}
|
30
|
+
post_install_message:
|
31
|
+
rdoc_options: []
|
32
|
+
require_paths:
|
33
|
+
- lib
|
34
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
35
|
+
requirements:
|
36
|
+
- - ">="
|
37
|
+
- !ruby/object:Gem::Version
|
38
|
+
version: '0'
|
39
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
40
|
+
requirements:
|
41
|
+
- - ">="
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: '0'
|
44
|
+
requirements: []
|
45
|
+
rubyforge_project:
|
46
|
+
rubygems_version: 2.3.0
|
47
|
+
signing_key:
|
48
|
+
specification_version: 4
|
49
|
+
summary: Translates odt to markdown
|
50
|
+
test_files: []
|
51
|
+
has_rdoc:
|