doc2text 0.1 → 0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/doc2text/content.rb +5 -5
- data/lib/doc2text/{markdown.rb → markdown_odt_parser.rb} +21 -26
- data/lib/doc2text/odt.rb +1 -1
- data/lib/doc2text/{namespaces.rb → odt_xml_namespaces.rb} +94 -20
- data/lib/doc2text/odt_xml_node.rb +27 -18
- data/lib/doc2text.rb +2 -2
- metadata +5 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 555e469e3b8ba226e8ffe20fa8052e377f037a74
|
4
|
+
data.tar.gz: 0ba30a01a5a55188202b3224a8ade79becf9099d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a608fd6d8f606c6dd29669a50e6219ce0ca505bce2aa58ad7d6005e81b22bf34a6b9a9db59cade78fa66c04906f71fede4059f3e03666883dbd7cc6934c41d7f
|
7
|
+
data.tar.gz: 01e299c31d90f31c4cdaaadb2010ad50f67c305364a12e8afb826f2e9af93b7e7a93e49f77778366c1d8c40a138d8506846c3d65a1fa9c9ee31c3532c4d13414
|
data/lib/doc2text/content.rb
CHANGED
@@ -2,21 +2,21 @@ module Doc2Text
|
|
2
2
|
module Odt
|
3
3
|
module Content
|
4
4
|
class Document < ::Nokogiri::XML::SAX::Document
|
5
|
-
def initialize(
|
6
|
-
@
|
5
|
+
def initialize(markdown_odt_parser)
|
6
|
+
@markdown_odt_parser = markdown_odt_parser
|
7
7
|
end
|
8
8
|
|
9
9
|
def start_element_namespace(name ,attrs = [], prefix = nil, uri = nil, ns = [])
|
10
|
-
@
|
10
|
+
@markdown_odt_parser.new_node prefix, name, attrs
|
11
11
|
end
|
12
12
|
|
13
13
|
def end_element_namespace(name, prefix = nil, uri = nil)
|
14
|
-
@
|
14
|
+
@markdown_odt_parser.close_node prefix, name
|
15
15
|
end
|
16
16
|
|
17
17
|
def characters(string)
|
18
18
|
unless string.strip.empty?
|
19
|
-
@
|
19
|
+
@markdown_odt_parser.text string
|
20
20
|
end
|
21
21
|
end
|
22
22
|
end
|
@@ -1,6 +1,8 @@
|
|
1
|
+
require 'logger'
|
2
|
+
|
1
3
|
module Doc2Text
|
2
4
|
module Markdown
|
3
|
-
class
|
5
|
+
class OdtParser
|
4
6
|
def initialize(output)
|
5
7
|
@output = output
|
6
8
|
@automatic_styles = {}
|
@@ -13,24 +15,23 @@ module Doc2Text
|
|
13
15
|
new_node = Odt::XmlNodes::Node.create_node prefix, name, @current_node, attrs, self
|
14
16
|
@current_node.children << new_node
|
15
17
|
@current_node = new_node
|
16
|
-
self << @current_node.open
|
17
18
|
end
|
18
19
|
end
|
19
20
|
|
20
21
|
def close_node(prefix, name)
|
21
|
-
if Odt::XmlNodes::Node.create_node(prefix, name).eql? @current_node
|
22
|
+
if Odt::XmlNodes::Node.create_node(prefix, name, nil, [], self).eql? @current_node
|
23
|
+
return if !@current_node
|
22
24
|
if @current_node.delete_on_close?
|
23
|
-
|
24
|
-
|
25
|
-
|
25
|
+
# if @current_node.parent
|
26
|
+
# @output << @current_node.parent.expand
|
27
|
+
# @current_node.parent.un_delete
|
28
|
+
# else
|
29
|
+
@output << @current_node.expand
|
30
|
+
# end
|
26
31
|
end
|
27
|
-
|
28
|
-
if @current_node.
|
29
|
-
|
30
|
-
remove_current_node!
|
31
|
-
else
|
32
|
-
remove_current_node! false
|
33
|
-
remove_current_node! false
|
32
|
+
@current_node = @current_node.parent
|
33
|
+
if @current_node && @current_node.delete_on_close?
|
34
|
+
@current_node.delete
|
34
35
|
end
|
35
36
|
else
|
36
37
|
# TODO remove this redundant(tree build algorithm) checks
|
@@ -38,19 +39,9 @@ module Doc2Text
|
|
38
39
|
end
|
39
40
|
end
|
40
41
|
|
41
|
-
def
|
42
|
-
|
43
|
-
|
44
|
-
node_for_deletion = @current_node
|
45
|
-
@current_node = @current_node.parent
|
46
|
-
return unless @current_node
|
47
|
-
if remove
|
48
|
-
@current_node.remove_last_child! node_for_deletion
|
49
|
-
end
|
50
|
-
end
|
51
|
-
|
52
|
-
def <<(string)
|
53
|
-
@output << string
|
42
|
+
def text(string)
|
43
|
+
plain_text = Odt::XmlNodes::PlainText.new(string)
|
44
|
+
@current_node.children << plain_text
|
54
45
|
end
|
55
46
|
|
56
47
|
def close
|
@@ -80,6 +71,10 @@ module Doc2Text
|
|
80
71
|
raise Doc2Text::XmlError, 'it does not support this xpath syntax'
|
81
72
|
end
|
82
73
|
end
|
74
|
+
|
75
|
+
def logger
|
76
|
+
@logger ||= Logger.new(STDOUT)
|
77
|
+
end
|
83
78
|
end
|
84
79
|
end
|
85
80
|
end
|
data/lib/doc2text/odt.rb
CHANGED
@@ -1,6 +1,18 @@
|
|
1
1
|
module Doc2Text
|
2
2
|
module Odt
|
3
3
|
module XmlNodes
|
4
|
+
class PlainText
|
5
|
+
include Node
|
6
|
+
|
7
|
+
attr_accessor :text
|
8
|
+
|
9
|
+
alias_method :expand, :text
|
10
|
+
|
11
|
+
def initialize(text)
|
12
|
+
@text = text
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
4
16
|
class Generic
|
5
17
|
include Node
|
6
18
|
end
|
@@ -26,7 +38,15 @@ module Doc2Text
|
|
26
38
|
include Node
|
27
39
|
|
28
40
|
def delete_on_close?
|
29
|
-
|
41
|
+
true
|
42
|
+
end
|
43
|
+
end
|
44
|
+
|
45
|
+
class Text
|
46
|
+
include Node
|
47
|
+
|
48
|
+
def delete_on_close?
|
49
|
+
true
|
30
50
|
end
|
31
51
|
end
|
32
52
|
end
|
@@ -43,7 +63,26 @@ module Doc2Text
|
|
43
63
|
module DataStyle; end
|
44
64
|
module Presentation; end
|
45
65
|
module Script; end
|
46
|
-
module Table
|
66
|
+
module Table
|
67
|
+
class TableRow
|
68
|
+
include Node
|
69
|
+
|
70
|
+
def expand
|
71
|
+
header_delimiter = parent.children.count >= 2 && parent.children[1] == self ? "\n|---|---|" : ''
|
72
|
+
result = "\n#{@children.select(&:not_deleted?).map(&:expand).join.strip.gsub "\n", ''} |#{header_delimiter}"
|
73
|
+
delete
|
74
|
+
result
|
75
|
+
end
|
76
|
+
end
|
77
|
+
|
78
|
+
class TableCell
|
79
|
+
include Node
|
80
|
+
|
81
|
+
def open
|
82
|
+
' | '
|
83
|
+
end
|
84
|
+
end
|
85
|
+
end
|
47
86
|
module Style
|
48
87
|
class Style
|
49
88
|
include Node
|
@@ -67,19 +106,24 @@ module Doc2Text
|
|
67
106
|
module Of; end
|
68
107
|
|
69
108
|
module Text
|
70
|
-
def initialize(parent = nil, attrs = [], prefix = nil, name = nil,
|
71
|
-
super parent, attrs, prefix, name
|
72
|
-
@
|
109
|
+
def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_odt_parser = nil)
|
110
|
+
super parent, attrs, prefix, name
|
111
|
+
@markdown_odt_parser = markdown_odt_parser
|
73
112
|
style_index = attrs.index { |attr| attr.prefix == 'text' && attr.localname == 'style-name' }
|
74
113
|
@enclosing_style = []
|
75
|
-
if style_index
|
76
|
-
|
114
|
+
if style_index and fetch_style?
|
115
|
+
elem_style = find_style attrs[style_index].value
|
116
|
+
fetch_style elem_style
|
77
117
|
end
|
78
118
|
end
|
79
119
|
|
80
|
-
def
|
81
|
-
|
82
|
-
|
120
|
+
def fetch_style?
|
121
|
+
true
|
122
|
+
end
|
123
|
+
|
124
|
+
def fetch_style(elem_style)
|
125
|
+
if elem_style
|
126
|
+
elem_style.children.select { |style_property| style_property.xml_name == 'style:text-properties' }.each { |text_property|
|
83
127
|
text_property.attrs.each { |attr|
|
84
128
|
if attr.prefix == 'style'
|
85
129
|
if attr.localname == 'font-style-complex' && attr.value == 'italic'
|
@@ -93,11 +137,10 @@ module Doc2Text
|
|
93
137
|
end
|
94
138
|
end
|
95
139
|
|
96
|
-
def
|
97
|
-
styles = @
|
98
|
-
|
140
|
+
def find_style(style_name)
|
141
|
+
styles = @markdown_odt_parser.xpath '/office:document-content/office:automatic-styles/style:style'
|
142
|
+
styles.find { |style| style.attrs.index { |attr| attr.prefix == 'style' && attr.localname == 'family' } &&
|
99
143
|
style.attrs.index { |attr| attr.prefix == 'style' && attr.localname == 'name' && attr.value == style_name } }
|
100
|
-
fetch_common_style style
|
101
144
|
end
|
102
145
|
|
103
146
|
# http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#__RefHeading__1419256_253892949
|
@@ -105,8 +148,8 @@ module Doc2Text
|
|
105
148
|
include Node
|
106
149
|
include Text
|
107
150
|
|
108
|
-
def initialize(parent = nil, attrs = [], prefix = nil, name = nil,
|
109
|
-
super parent, attrs, prefix, name,
|
151
|
+
def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_odt_parser = nil)
|
152
|
+
super parent, attrs, prefix, name, markdown_odt_parser
|
110
153
|
end
|
111
154
|
|
112
155
|
def self.style_family
|
@@ -153,15 +196,46 @@ module Doc2Text
|
|
153
196
|
include Node
|
154
197
|
include Text
|
155
198
|
|
156
|
-
|
199
|
+
def expand
|
200
|
+
result = "* #{@children.select(&:not_deleted?).map(&:expand).join.strip.gsub /\n{2,}/, "\n"}\n"
|
201
|
+
delete
|
202
|
+
result.clone
|
203
|
+
end
|
157
204
|
|
158
|
-
def
|
159
|
-
|
205
|
+
def fetch_style?
|
206
|
+
false
|
160
207
|
end
|
161
208
|
|
162
|
-
def
|
209
|
+
def delete_on_close?
|
210
|
+
false
|
211
|
+
end
|
212
|
+
end
|
213
|
+
|
214
|
+
class List
|
215
|
+
include Node
|
216
|
+
include Text
|
217
|
+
|
218
|
+
def open
|
163
219
|
"\n"
|
164
220
|
end
|
221
|
+
|
222
|
+
def fetch_style(elem_style)
|
223
|
+
if elem_style
|
224
|
+
elem_style.children.select { |style_property| style_property.xml_name == 'style:text-properties' }.each { |text_property|
|
225
|
+
text_property.attrs.each { |attr|
|
226
|
+
if attr.prefix == 'style'
|
227
|
+
if attr.localname == 'list-level-style-number' && attr.value == 'Numbering_20_Symbols'
|
228
|
+
@enclosing_style << '_'
|
229
|
+
end
|
230
|
+
end
|
231
|
+
}
|
232
|
+
}
|
233
|
+
end
|
234
|
+
end
|
235
|
+
|
236
|
+
def delete_on_close?
|
237
|
+
false
|
238
|
+
end
|
165
239
|
end
|
166
240
|
end
|
167
241
|
end
|
@@ -3,14 +3,16 @@ module Doc2Text
|
|
3
3
|
module XmlNodes
|
4
4
|
module Node
|
5
5
|
attr_reader :parent, :children, :attrs, :prefix, :name
|
6
|
+
attr_accessor :text
|
6
7
|
|
7
|
-
def self.create_node(prefix, name, parent = nil, attrs = [],
|
8
|
+
def self.create_node(prefix, name, parent = nil, attrs = [], markdown_odt_parser = nil)
|
8
9
|
begin
|
9
10
|
clazz = XmlNodes.const_get "#{titleize prefix}::#{titleize name}"
|
10
11
|
rescue NameError => e
|
11
|
-
|
12
|
+
# markdown_odt_parser.logger.warn "No such <#{prefix}:#{name}> found"
|
13
|
+
Generic.new(parent, attrs, prefix, name, markdown_odt_parser)
|
12
14
|
else
|
13
|
-
clazz.new(parent, attrs, prefix, name,
|
15
|
+
clazz.new(parent, attrs, prefix, name, markdown_odt_parser)
|
14
16
|
end
|
15
17
|
end
|
16
18
|
|
@@ -18,8 +20,8 @@ module Doc2Text
|
|
18
20
|
tag.split('-').map(&:capitalize).join
|
19
21
|
end
|
20
22
|
|
21
|
-
def initialize(parent = nil, attrs = [], prefix = nil, name = nil,
|
22
|
-
@parent, @attrs, @prefix, @name = parent, attrs, prefix, name
|
23
|
+
def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_odt_parser = nil)
|
24
|
+
@parent, @attrs, @prefix, @name, @markdown_odt_parser = parent, attrs, prefix, name, markdown_odt_parser
|
23
25
|
@children = []
|
24
26
|
@has_text = false
|
25
27
|
end
|
@@ -40,12 +42,22 @@ module Doc2Text
|
|
40
42
|
''
|
41
43
|
end
|
42
44
|
|
43
|
-
def
|
44
|
-
|
45
|
+
def delete_on_close?
|
46
|
+
false
|
45
47
|
end
|
46
48
|
|
47
|
-
def
|
48
|
-
|
49
|
+
def not_deleted?
|
50
|
+
!@deleted
|
51
|
+
end
|
52
|
+
|
53
|
+
def delete
|
54
|
+
@deleted = true
|
55
|
+
# @children.each { |child| child.delete }
|
56
|
+
# @children = []
|
57
|
+
end
|
58
|
+
|
59
|
+
def un_delete
|
60
|
+
@deleted = false
|
49
61
|
end
|
50
62
|
|
51
63
|
def eql?(object)
|
@@ -57,15 +69,6 @@ module Doc2Text
|
|
57
69
|
instance_of? Node
|
58
70
|
end
|
59
71
|
|
60
|
-
def remove_last_child!(child)
|
61
|
-
unless child === @children.last
|
62
|
-
# TODO remove this redundant(tree build algorithm) checks
|
63
|
-
raise Doc2Text::XmlError, "!The child #{child} IS NOT among the children of #{self}"
|
64
|
-
else
|
65
|
-
@children.pop
|
66
|
-
end
|
67
|
-
end
|
68
|
-
|
69
72
|
def xml_name
|
70
73
|
"#{@prefix}:#{@name}"
|
71
74
|
end
|
@@ -74,6 +77,12 @@ module Doc2Text
|
|
74
77
|
"#{xml_name} : #{attrs}"
|
75
78
|
end
|
76
79
|
|
80
|
+
def expand
|
81
|
+
expanded = "#{open}#{@children.select(&:not_deleted?).map(&:expand).join}#{close}"
|
82
|
+
delete
|
83
|
+
expanded.clone
|
84
|
+
end
|
85
|
+
|
77
86
|
def not_enclosing?
|
78
87
|
!root? && parent.class.not_enclosing_tags && parent.class.not_enclosing_tags.find do |tag|
|
79
88
|
@prefix == parent.prefix && @name == tag
|
data/lib/doc2text.rb
CHANGED
@@ -4,8 +4,8 @@ require 'fileutils'
|
|
4
4
|
|
5
5
|
require 'doc2text/odt'
|
6
6
|
require 'doc2text/odt_xml_node'
|
7
|
-
require 'doc2text/
|
8
|
-
require 'doc2text/
|
7
|
+
require 'doc2text/odt_xml_namespaces'
|
8
|
+
require 'doc2text/markdown_odt_parser'
|
9
9
|
require 'doc2text/errors'
|
10
10
|
|
11
11
|
require 'doc2text/content'
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: doc2text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: '0.
|
4
|
+
version: '0.2'
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Valentin Aitken
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-10-11 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: Parses odt to markdown
|
14
14
|
email: bostko@gmail.com
|
@@ -19,9 +19,9 @@ files:
|
|
19
19
|
- lib/doc2text.rb
|
20
20
|
- lib/doc2text/content.rb
|
21
21
|
- lib/doc2text/errors.rb
|
22
|
-
- lib/doc2text/
|
23
|
-
- lib/doc2text/namespaces.rb
|
22
|
+
- lib/doc2text/markdown_odt_parser.rb
|
24
23
|
- lib/doc2text/odt.rb
|
24
|
+
- lib/doc2text/odt_xml_namespaces.rb
|
25
25
|
- lib/doc2text/odt_xml_node.rb
|
26
26
|
homepage: https://github.com/bostko/doc2text
|
27
27
|
licenses:
|
@@ -43,7 +43,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
43
43
|
version: '0'
|
44
44
|
requirements: []
|
45
45
|
rubyforge_project:
|
46
|
-
rubygems_version: 2.
|
46
|
+
rubygems_version: 2.2.2
|
47
47
|
signing_key:
|
48
48
|
specification_version: 4
|
49
49
|
summary: Translates odt to markdown
|