doc2text 0.1 → 0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/doc2text/content.rb +5 -5
- data/lib/doc2text/{markdown.rb → markdown_odt_parser.rb} +21 -26
- data/lib/doc2text/odt.rb +1 -1
- data/lib/doc2text/{namespaces.rb → odt_xml_namespaces.rb} +94 -20
- data/lib/doc2text/odt_xml_node.rb +27 -18
- data/lib/doc2text.rb +2 -2
- metadata +5 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 555e469e3b8ba226e8ffe20fa8052e377f037a74
|
4
|
+
data.tar.gz: 0ba30a01a5a55188202b3224a8ade79becf9099d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a608fd6d8f606c6dd29669a50e6219ce0ca505bce2aa58ad7d6005e81b22bf34a6b9a9db59cade78fa66c04906f71fede4059f3e03666883dbd7cc6934c41d7f
|
7
|
+
data.tar.gz: 01e299c31d90f31c4cdaaadb2010ad50f67c305364a12e8afb826f2e9af93b7e7a93e49f77778366c1d8c40a138d8506846c3d65a1fa9c9ee31c3532c4d13414
|
data/lib/doc2text/content.rb
CHANGED
@@ -2,21 +2,21 @@ module Doc2Text
|
|
2
2
|
module Odt
|
3
3
|
module Content
|
4
4
|
class Document < ::Nokogiri::XML::SAX::Document
|
5
|
-
def initialize(
|
6
|
-
@
|
5
|
+
def initialize(markdown_odt_parser)
|
6
|
+
@markdown_odt_parser = markdown_odt_parser
|
7
7
|
end
|
8
8
|
|
9
9
|
def start_element_namespace(name ,attrs = [], prefix = nil, uri = nil, ns = [])
|
10
|
-
@
|
10
|
+
@markdown_odt_parser.new_node prefix, name, attrs
|
11
11
|
end
|
12
12
|
|
13
13
|
def end_element_namespace(name, prefix = nil, uri = nil)
|
14
|
-
@
|
14
|
+
@markdown_odt_parser.close_node prefix, name
|
15
15
|
end
|
16
16
|
|
17
17
|
def characters(string)
|
18
18
|
unless string.strip.empty?
|
19
|
-
@
|
19
|
+
@markdown_odt_parser.text string
|
20
20
|
end
|
21
21
|
end
|
22
22
|
end
|
@@ -1,6 +1,8 @@
|
|
1
|
+
require 'logger'
|
2
|
+
|
1
3
|
module Doc2Text
|
2
4
|
module Markdown
|
3
|
-
class
|
5
|
+
class OdtParser
|
4
6
|
def initialize(output)
|
5
7
|
@output = output
|
6
8
|
@automatic_styles = {}
|
@@ -13,24 +15,23 @@ module Doc2Text
|
|
13
15
|
new_node = Odt::XmlNodes::Node.create_node prefix, name, @current_node, attrs, self
|
14
16
|
@current_node.children << new_node
|
15
17
|
@current_node = new_node
|
16
|
-
self << @current_node.open
|
17
18
|
end
|
18
19
|
end
|
19
20
|
|
20
21
|
def close_node(prefix, name)
|
21
|
-
if Odt::XmlNodes::Node.create_node(prefix, name).eql? @current_node
|
22
|
+
if Odt::XmlNodes::Node.create_node(prefix, name, nil, [], self).eql? @current_node
|
23
|
+
return if !@current_node
|
22
24
|
if @current_node.delete_on_close?
|
23
|
-
|
24
|
-
|
25
|
-
|
25
|
+
# if @current_node.parent
|
26
|
+
# @output << @current_node.parent.expand
|
27
|
+
# @current_node.parent.un_delete
|
28
|
+
# else
|
29
|
+
@output << @current_node.expand
|
30
|
+
# end
|
26
31
|
end
|
27
|
-
|
28
|
-
if @current_node.
|
29
|
-
|
30
|
-
remove_current_node!
|
31
|
-
else
|
32
|
-
remove_current_node! false
|
33
|
-
remove_current_node! false
|
32
|
+
@current_node = @current_node.parent
|
33
|
+
if @current_node && @current_node.delete_on_close?
|
34
|
+
@current_node.delete
|
34
35
|
end
|
35
36
|
else
|
36
37
|
# TODO remove this redundant(tree build algorithm) checks
|
@@ -38,19 +39,9 @@ module Doc2Text
|
|
38
39
|
end
|
39
40
|
end
|
40
41
|
|
41
|
-
def
|
42
|
-
|
43
|
-
|
44
|
-
node_for_deletion = @current_node
|
45
|
-
@current_node = @current_node.parent
|
46
|
-
return unless @current_node
|
47
|
-
if remove
|
48
|
-
@current_node.remove_last_child! node_for_deletion
|
49
|
-
end
|
50
|
-
end
|
51
|
-
|
52
|
-
def <<(string)
|
53
|
-
@output << string
|
42
|
+
def text(string)
|
43
|
+
plain_text = Odt::XmlNodes::PlainText.new(string)
|
44
|
+
@current_node.children << plain_text
|
54
45
|
end
|
55
46
|
|
56
47
|
def close
|
@@ -80,6 +71,10 @@ module Doc2Text
|
|
80
71
|
raise Doc2Text::XmlError, 'it does not support this xpath syntax'
|
81
72
|
end
|
82
73
|
end
|
74
|
+
|
75
|
+
def logger
|
76
|
+
@logger ||= Logger.new(STDOUT)
|
77
|
+
end
|
83
78
|
end
|
84
79
|
end
|
85
80
|
end
|
data/lib/doc2text/odt.rb
CHANGED
@@ -1,6 +1,18 @@
|
|
1
1
|
module Doc2Text
|
2
2
|
module Odt
|
3
3
|
module XmlNodes
|
4
|
+
class PlainText
|
5
|
+
include Node
|
6
|
+
|
7
|
+
attr_accessor :text
|
8
|
+
|
9
|
+
alias_method :expand, :text
|
10
|
+
|
11
|
+
def initialize(text)
|
12
|
+
@text = text
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
4
16
|
class Generic
|
5
17
|
include Node
|
6
18
|
end
|
@@ -26,7 +38,15 @@ module Doc2Text
|
|
26
38
|
include Node
|
27
39
|
|
28
40
|
def delete_on_close?
|
29
|
-
|
41
|
+
true
|
42
|
+
end
|
43
|
+
end
|
44
|
+
|
45
|
+
class Text
|
46
|
+
include Node
|
47
|
+
|
48
|
+
def delete_on_close?
|
49
|
+
true
|
30
50
|
end
|
31
51
|
end
|
32
52
|
end
|
@@ -43,7 +63,26 @@ module Doc2Text
|
|
43
63
|
module DataStyle; end
|
44
64
|
module Presentation; end
|
45
65
|
module Script; end
|
46
|
-
module Table
|
66
|
+
module Table
|
67
|
+
class TableRow
|
68
|
+
include Node
|
69
|
+
|
70
|
+
def expand
|
71
|
+
header_delimiter = parent.children.count >= 2 && parent.children[1] == self ? "\n|---|---|" : ''
|
72
|
+
result = "\n#{@children.select(&:not_deleted?).map(&:expand).join.strip.gsub "\n", ''} |#{header_delimiter}"
|
73
|
+
delete
|
74
|
+
result
|
75
|
+
end
|
76
|
+
end
|
77
|
+
|
78
|
+
class TableCell
|
79
|
+
include Node
|
80
|
+
|
81
|
+
def open
|
82
|
+
' | '
|
83
|
+
end
|
84
|
+
end
|
85
|
+
end
|
47
86
|
module Style
|
48
87
|
class Style
|
49
88
|
include Node
|
@@ -67,19 +106,24 @@ module Doc2Text
|
|
67
106
|
module Of; end
|
68
107
|
|
69
108
|
module Text
|
70
|
-
def initialize(parent = nil, attrs = [], prefix = nil, name = nil,
|
71
|
-
super parent, attrs, prefix, name
|
72
|
-
@
|
109
|
+
def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_odt_parser = nil)
|
110
|
+
super parent, attrs, prefix, name
|
111
|
+
@markdown_odt_parser = markdown_odt_parser
|
73
112
|
style_index = attrs.index { |attr| attr.prefix == 'text' && attr.localname == 'style-name' }
|
74
113
|
@enclosing_style = []
|
75
|
-
if style_index
|
76
|
-
|
114
|
+
if style_index and fetch_style?
|
115
|
+
elem_style = find_style attrs[style_index].value
|
116
|
+
fetch_style elem_style
|
77
117
|
end
|
78
118
|
end
|
79
119
|
|
80
|
-
def
|
81
|
-
|
82
|
-
|
120
|
+
def fetch_style?
|
121
|
+
true
|
122
|
+
end
|
123
|
+
|
124
|
+
def fetch_style(elem_style)
|
125
|
+
if elem_style
|
126
|
+
elem_style.children.select { |style_property| style_property.xml_name == 'style:text-properties' }.each { |text_property|
|
83
127
|
text_property.attrs.each { |attr|
|
84
128
|
if attr.prefix == 'style'
|
85
129
|
if attr.localname == 'font-style-complex' && attr.value == 'italic'
|
@@ -93,11 +137,10 @@ module Doc2Text
|
|
93
137
|
end
|
94
138
|
end
|
95
139
|
|
96
|
-
def
|
97
|
-
styles = @
|
98
|
-
|
140
|
+
def find_style(style_name)
|
141
|
+
styles = @markdown_odt_parser.xpath '/office:document-content/office:automatic-styles/style:style'
|
142
|
+
styles.find { |style| style.attrs.index { |attr| attr.prefix == 'style' && attr.localname == 'family' } &&
|
99
143
|
style.attrs.index { |attr| attr.prefix == 'style' && attr.localname == 'name' && attr.value == style_name } }
|
100
|
-
fetch_common_style style
|
101
144
|
end
|
102
145
|
|
103
146
|
# http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#__RefHeading__1419256_253892949
|
@@ -105,8 +148,8 @@ module Doc2Text
|
|
105
148
|
include Node
|
106
149
|
include Text
|
107
150
|
|
108
|
-
def initialize(parent = nil, attrs = [], prefix = nil, name = nil,
|
109
|
-
super parent, attrs, prefix, name,
|
151
|
+
def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_odt_parser = nil)
|
152
|
+
super parent, attrs, prefix, name, markdown_odt_parser
|
110
153
|
end
|
111
154
|
|
112
155
|
def self.style_family
|
@@ -153,15 +196,46 @@ module Doc2Text
|
|
153
196
|
include Node
|
154
197
|
include Text
|
155
198
|
|
156
|
-
|
199
|
+
def expand
|
200
|
+
result = "* #{@children.select(&:not_deleted?).map(&:expand).join.strip.gsub /\n{2,}/, "\n"}\n"
|
201
|
+
delete
|
202
|
+
result.clone
|
203
|
+
end
|
157
204
|
|
158
|
-
def
|
159
|
-
|
205
|
+
def fetch_style?
|
206
|
+
false
|
160
207
|
end
|
161
208
|
|
162
|
-
def
|
209
|
+
def delete_on_close?
|
210
|
+
false
|
211
|
+
end
|
212
|
+
end
|
213
|
+
|
214
|
+
class List
|
215
|
+
include Node
|
216
|
+
include Text
|
217
|
+
|
218
|
+
def open
|
163
219
|
"\n"
|
164
220
|
end
|
221
|
+
|
222
|
+
def fetch_style(elem_style)
|
223
|
+
if elem_style
|
224
|
+
elem_style.children.select { |style_property| style_property.xml_name == 'style:text-properties' }.each { |text_property|
|
225
|
+
text_property.attrs.each { |attr|
|
226
|
+
if attr.prefix == 'style'
|
227
|
+
if attr.localname == 'list-level-style-number' && attr.value == 'Numbering_20_Symbols'
|
228
|
+
@enclosing_style << '_'
|
229
|
+
end
|
230
|
+
end
|
231
|
+
}
|
232
|
+
}
|
233
|
+
end
|
234
|
+
end
|
235
|
+
|
236
|
+
def delete_on_close?
|
237
|
+
false
|
238
|
+
end
|
165
239
|
end
|
166
240
|
end
|
167
241
|
end
|
@@ -3,14 +3,16 @@ module Doc2Text
|
|
3
3
|
module XmlNodes
|
4
4
|
module Node
|
5
5
|
attr_reader :parent, :children, :attrs, :prefix, :name
|
6
|
+
attr_accessor :text
|
6
7
|
|
7
|
-
def self.create_node(prefix, name, parent = nil, attrs = [],
|
8
|
+
def self.create_node(prefix, name, parent = nil, attrs = [], markdown_odt_parser = nil)
|
8
9
|
begin
|
9
10
|
clazz = XmlNodes.const_get "#{titleize prefix}::#{titleize name}"
|
10
11
|
rescue NameError => e
|
11
|
-
|
12
|
+
# markdown_odt_parser.logger.warn "No such <#{prefix}:#{name}> found"
|
13
|
+
Generic.new(parent, attrs, prefix, name, markdown_odt_parser)
|
12
14
|
else
|
13
|
-
clazz.new(parent, attrs, prefix, name,
|
15
|
+
clazz.new(parent, attrs, prefix, name, markdown_odt_parser)
|
14
16
|
end
|
15
17
|
end
|
16
18
|
|
@@ -18,8 +20,8 @@ module Doc2Text
|
|
18
20
|
tag.split('-').map(&:capitalize).join
|
19
21
|
end
|
20
22
|
|
21
|
-
def initialize(parent = nil, attrs = [], prefix = nil, name = nil,
|
22
|
-
@parent, @attrs, @prefix, @name = parent, attrs, prefix, name
|
23
|
+
def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_odt_parser = nil)
|
24
|
+
@parent, @attrs, @prefix, @name, @markdown_odt_parser = parent, attrs, prefix, name, markdown_odt_parser
|
23
25
|
@children = []
|
24
26
|
@has_text = false
|
25
27
|
end
|
@@ -40,12 +42,22 @@ module Doc2Text
|
|
40
42
|
''
|
41
43
|
end
|
42
44
|
|
43
|
-
def
|
44
|
-
|
45
|
+
def delete_on_close?
|
46
|
+
false
|
45
47
|
end
|
46
48
|
|
47
|
-
def
|
48
|
-
|
49
|
+
def not_deleted?
|
50
|
+
!@deleted
|
51
|
+
end
|
52
|
+
|
53
|
+
def delete
|
54
|
+
@deleted = true
|
55
|
+
# @children.each { |child| child.delete }
|
56
|
+
# @children = []
|
57
|
+
end
|
58
|
+
|
59
|
+
def un_delete
|
60
|
+
@deleted = false
|
49
61
|
end
|
50
62
|
|
51
63
|
def eql?(object)
|
@@ -57,15 +69,6 @@ module Doc2Text
|
|
57
69
|
instance_of? Node
|
58
70
|
end
|
59
71
|
|
60
|
-
def remove_last_child!(child)
|
61
|
-
unless child === @children.last
|
62
|
-
# TODO remove this redundant(tree build algorithm) checks
|
63
|
-
raise Doc2Text::XmlError, "!The child #{child} IS NOT among the children of #{self}"
|
64
|
-
else
|
65
|
-
@children.pop
|
66
|
-
end
|
67
|
-
end
|
68
|
-
|
69
72
|
def xml_name
|
70
73
|
"#{@prefix}:#{@name}"
|
71
74
|
end
|
@@ -74,6 +77,12 @@ module Doc2Text
|
|
74
77
|
"#{xml_name} : #{attrs}"
|
75
78
|
end
|
76
79
|
|
80
|
+
def expand
|
81
|
+
expanded = "#{open}#{@children.select(&:not_deleted?).map(&:expand).join}#{close}"
|
82
|
+
delete
|
83
|
+
expanded.clone
|
84
|
+
end
|
85
|
+
|
77
86
|
def not_enclosing?
|
78
87
|
!root? && parent.class.not_enclosing_tags && parent.class.not_enclosing_tags.find do |tag|
|
79
88
|
@prefix == parent.prefix && @name == tag
|
data/lib/doc2text.rb
CHANGED
@@ -4,8 +4,8 @@ require 'fileutils'
|
|
4
4
|
|
5
5
|
require 'doc2text/odt'
|
6
6
|
require 'doc2text/odt_xml_node'
|
7
|
-
require 'doc2text/
|
8
|
-
require 'doc2text/
|
7
|
+
require 'doc2text/odt_xml_namespaces'
|
8
|
+
require 'doc2text/markdown_odt_parser'
|
9
9
|
require 'doc2text/errors'
|
10
10
|
|
11
11
|
require 'doc2text/content'
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: doc2text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: '0.
|
4
|
+
version: '0.2'
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Valentin Aitken
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-10-11 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: Parses odt to markdown
|
14
14
|
email: bostko@gmail.com
|
@@ -19,9 +19,9 @@ files:
|
|
19
19
|
- lib/doc2text.rb
|
20
20
|
- lib/doc2text/content.rb
|
21
21
|
- lib/doc2text/errors.rb
|
22
|
-
- lib/doc2text/
|
23
|
-
- lib/doc2text/namespaces.rb
|
22
|
+
- lib/doc2text/markdown_odt_parser.rb
|
24
23
|
- lib/doc2text/odt.rb
|
24
|
+
- lib/doc2text/odt_xml_namespaces.rb
|
25
25
|
- lib/doc2text/odt_xml_node.rb
|
26
26
|
homepage: https://github.com/bostko/doc2text
|
27
27
|
licenses:
|
@@ -43,7 +43,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
43
43
|
version: '0'
|
44
44
|
requirements: []
|
45
45
|
rubyforge_project:
|
46
|
-
rubygems_version: 2.
|
46
|
+
rubygems_version: 2.2.2
|
47
47
|
signing_key:
|
48
48
|
specification_version: 4
|
49
49
|
summary: Translates odt to markdown
|