doc2text 0.3.2 → 0.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/bin/doc2text +43 -0
- data/lib/doc2text.rb +16 -5
- data/lib/doc2text/docx/docx.rb +31 -0
- data/lib/doc2text/docx/docx_xml_namespaces.rb +55 -0
- data/lib/doc2text/docx/markdown_docx_parser.rb +81 -0
- data/lib/doc2text/generic_xml_nodes.rb +78 -0
- data/lib/doc2text/odt/markdown_odt_parser.rb +81 -0
- data/lib/doc2text/odt/odt.rb +43 -0
- data/lib/doc2text/odt/odt_xml_namespaces.rb +240 -0
- data/lib/doc2text/pptx/markdown_pptx_parser.rb +55 -0
- data/lib/doc2text/pptx/pptx.rb +30 -0
- data/lib/doc2text/pptx/pptx_xml_namespaces.rb +55 -0
- data/lib/doc2text/resolution.rb +14 -0
- data/lib/doc2text/styles_parser.rb +28 -0
- data/lib/doc2text/xml_based_document_file.rb +48 -0
- metadata +26 -29
- data/lib/doc2text/content.rb +0 -25
- data/lib/doc2text/markdown_odt_parser.rb +0 -72
- data/lib/doc2text/odt.rb +0 -73
- data/lib/doc2text/odt_xml_namespaces.rb +0 -215
- data/lib/doc2text/odt_xml_node.rb +0 -99
@@ -0,0 +1,240 @@
|
|
1
|
+
module Doc2Text
|
2
|
+
module Odt
|
3
|
+
module XmlNodes
|
4
|
+
# Base class of the ODT element nodes and factory for choosing a node class per element.
class Node < XmlBasedDocument::XmlNodes::Node
  # Builds the node for the element <prefix:name>.
  #
  # The class is looked up under XmlNodes as "<titleized prefix>::<titleized name>".
  # If that lookup raises NameError the element is represented by a Generic node.
  # The instantiation itself happens outside the rescue, so errors raised by a
  # node constructor are not swallowed.
  def self.create_node(prefix, name, parent = nil, attrs = [], markdown_odt_parser = nil)
    node_class =
      begin
        XmlNodes.const_get "#{titleize prefix}::#{titleize name}"
      rescue NameError
        # markdown_odt_parser.logger.warn "No such <#{prefix}:#{name}> found"
        Generic
      end
    node_class.new(parent, attrs, prefix, name, markdown_odt_parser)
  end

  # Default answer; subclasses such as Office::Text override it.
  def office_text?
    false
  end
end
|
20
|
+
|
21
|
+
# ODT flavour of the shared plain-text node; adds nothing to the base class.
class PlainText < XmlBasedDocument::XmlNodes::PlainText; end
|
23
|
+
|
24
|
+
# Fallback node used by Node.create_node when no dedicated class exists for an element.
class Generic < Node; end
|
26
|
+
#
|
27
|
+
# These are the namespaces available in the open document format
|
28
|
+
# http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os.html
|
29
|
+
#
|
30
|
+
# Node classes resolved for elements whose prefix titleizes to "Office".
module Office
  class AutomaticStyles < Node; end

  class DocumentContent < Node; end

  # The only node in this file that reports itself as the office text node.
  class Text < Node
    def office_text?
      true
    end
  end
end
|
44
|
+
|
45
|
+
# Namespaces declared without any node classes of their own.
module Animation
end

module Chart
end

module Config
end

module Database
end

module Dr3d
end

module Drawing
end

module Form
end

module Manifest
end

module Meta
end

module DataStyle
end

module Presentation
end

module Script
end
|
57
|
+
# Node classes that render table rows and cells as a Markdown pipe table.
module Table
  class TableRow < Node
    # Renders the row on a single line, terminated by " |", then removes the node.
    #
    # When this row is the second child of its parent, a header delimiter line is
    # appended after it.
    # NOTE(review): the delimiter is fixed at two columns ("|---|---|") whatever the
    # number of cells in the row - confirm whether wider tables are expected.
    def expand
      siblings = parent.children
      is_header_row = siblings.count >= 2 && siblings[1] == self
      header_delimiter = is_header_row ? "\n|---|---|" : ''
      # Line breaks coming from the cells are dropped so the row stays on one line.
      cells = @children.map(&:expand).join.strip.gsub("\n", '')
      result = "\n#{cells} |#{header_delimiter}"
      delete
      result
    end
  end

  class TableCell < Node
    # Every cell is introduced by a pipe separator.
    def open
      ' | '
    end
  end
end
|
75
|
+
# Node classes resolved for elements whose prefix titleizes to "Style".
module Style
  class Style < Node; end

  class TextProperties < Node; end
end
|
82
|
+
# Further namespaces declared without any node classes of their own.
module XslFoCompatible
end

module SvgCompatible
end

module SmilCompatible
end

module Of
end
|
86
|
+
|
87
|
+
module Text
|
88
|
+
# Shared constructor for the text nodes that mix this module in.
#
# Remembers the parser, starts with no enclosing style markers and, when the
# element carries a text:style-name attribute and fetch_style? allows it,
# resolves that style and collects its markers through fetch_style.
def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_odt_parser = nil)
  super parent, attrs, prefix, name
  @xml_parser = markdown_odt_parser
  style_attr = attrs.find { |attr| attr.prefix == 'text' && attr.localname == 'style-name' }
  @enclosing_style = []
  return unless style_attr && fetch_style?

  fetch_style find_style(style_attr.value)
end
|
98
|
+
|
99
|
+
# Whether the constructor should resolve the element's style; nodes may override this.
def fetch_style?
  true
end
|
102
|
+
|
103
|
+
# Collects Markdown markers from a style node into @enclosing_style.
#
# Only children named 'style:text-properties' are inspected, and only their
# attributes with the 'style' prefix: an italic font-style-complex adds '_',
# a bold font-weight-complex adds '**'. Does nothing when elem_style is nil.
def fetch_style(elem_style)
  return unless elem_style

  text_properties = elem_style.children.select do |style_property|
    style_property.xml_name == 'style:text-properties'
  end
  text_properties.each do |text_property|
    text_property.attrs.each do |attr|
      next unless attr.prefix == 'style'

      marker =
        if attr.localname == 'font-style-complex' && attr.value == 'italic'
          '_'
        elsif attr.localname == 'font-weight-complex' && attr.value == 'bold'
          '**'
        end
      @enclosing_style << marker if marker
    end
  end
end
|
118
|
+
|
119
|
+
# Looks up an automatic style by name through the parser's xpath method.
#
# Returns the first style node that has a style:family attribute and whose
# style:name equals style_name, or nil when none matches.
# NOTE(review): the family attribute is only checked for presence, its value is
# not compared with anything - confirm that this is intended.
def find_style(style_name)
  candidates = @xml_parser.xpath('/office:document-content/office:automatic-styles/style:style')
  candidates.find do |style|
    has_family = style.attrs.any? { |attr| attr.prefix == 'style' && attr.localname == 'family' }
    has_family && style.attrs.any? do |attr|
      attr.prefix == 'style' && attr.localname == 'name' && attr.value == style_name
    end
  end
end
|
124
|
+
|
125
|
+
# http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#__RefHeading__1419212_253892949
|
126
|
+
# Heading node: rendered as a Markdown heading whose depth is the outline level.
class H < Node
  include Text

  def self.style_family
    'paragraph'
  end

  # Reads the text:outline-level attribute; the level is 0 when the attribute is
  # missing or when fetch_style? is false.
  def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_odt_parser = nil)
    super parent, attrs, prefix, name, markdown_odt_parser
    level_attr = attrs.find { |attr| attr.prefix == 'text' && attr.localname == 'outline-level' }
    @elem_outline_level = (level_attr && fetch_style?) ? level_attr.value.to_i : 0
  end

  # One '#' per outline level, followed by a space.
  def open
    hashes = '#' * @elem_outline_level
    "\n#{hashes} "
  end

  def close
    "\n\n"
  end
end
|
151
|
+
|
152
|
+
# http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#__RefHeading__1419256_253892949
|
153
|
+
# Paragraph node: its content is wrapped in the style markers collected at construction.
# Construction is handled entirely by the constructor mixed in from Text.
class P < Node
  include Text

  def self.style_family
    'paragraph'
  end

  # A line break followed by the opening markers.
  def open
    "\n" + @enclosing_style.join
  end

  # The markers in reverse order, so they nest correctly, followed by a line break.
  def close
    @enclosing_style.reverse.join + "\n"
  end
end
|
171
|
+
|
172
|
+
# Line break node, emitted as an inline HTML break.
class LineBreak < Node
  def open
    '<br/>'
  end
end
|
178
|
+
|
179
|
+
# http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#__RefHeading__1419264_253892949
|
180
|
+
# Inline span node: wraps its children in the style markers collected at construction.
class Span < Node
  include Text

  def self.style_family
    'text'
  end

  def open
    @enclosing_style.join
  end

  # Markers are closed in reverse order so they nest correctly.
  def close
    @enclosing_style.reverse.join
  end

  # Renders opening markers, children and closing markers, then removes the node.
  def expand
    rendered = [open, @children.map(&:expand).join, close].join
    delete
    rendered.clone
  end
end
|
201
|
+
|
202
|
+
# http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#__RefHeading__1415154_253892949
|
203
|
+
# List item node, rendered as a Markdown bullet.
class ListItem < Node
  include Text

  # List items never resolve a style of their own.
  def fetch_style?
    false
  end

  # Renders the children behind "* ", collapsing runs of line breaks into one,
  # then removes the node.
  def expand
    content = @children.map(&:expand).join.strip.gsub(/\n{2,}/, "\n")
    result = "* #{content}\n"
    delete
    result.clone
  end
end
|
215
|
+
|
216
|
+
# List node: starts on a new line and has its own style inspection.
class List < Node
  include Text

  def open
    "\n"
  end

  # Adds '_' to @enclosing_style for every 'style'-prefixed attribute
  # list-level-style-number="Numbering_20_Symbols" found on the
  # 'style:text-properties' children of elem_style. Does nothing for nil.
  def fetch_style(elem_style)
    return unless elem_style

    text_properties = elem_style.children.select do |style_property|
      style_property.xml_name == 'style:text-properties'
    end
    text_properties.each do |text_property|
      text_property.attrs.each do |attr|
        next unless attr.prefix == 'style'
        next unless attr.localname == 'list-level-style-number' && attr.value == 'Numbering_20_Symbols'

        @enclosing_style << '_'
      end
    end
  end
end
|
237
|
+
end
|
238
|
+
end
|
239
|
+
end
|
240
|
+
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
module Doc2Text
  module Pptx
    # Node classes used while parsing the XML of a presentation.
    module XmlNodes
      # Base class of the presentation nodes and factory for choosing a node class.
      class Node < XmlBasedDocument::XmlNodes::Node
        # Builds the node for the element <prefix:name>.
        #
        # The class is looked up under XmlNodes as "<Capitalized prefix>::W<name>";
        # if that lookup raises NameError a Generic node is used. Instantiation
        # happens outside the rescue, so constructor errors are not swallowed.
        # NOTE(review): the only dedicated classes in this module live under W -
        # confirm which prefixes the parsed XML actually uses.
        def self.create_node(prefix, name, parent = nil, attrs = [], markdown_odt_parser = nil)
          node_class =
            begin
              XmlNodes.const_get "#{prefix.capitalize}::W#{name}"
            rescue NameError
              # markdown_odt_parser.logger.warn "No such <#{prefix}:#{name}> found"
              Generic
            end
          node_class.new(parent, attrs, prefix, name, markdown_odt_parser)
        end

        # Default answer; W::Wbody overrides it.
        def body?
          false
        end
      end

      # Text content node; never the body.
      class PlainText < XmlBasedDocument::XmlNodes::PlainText
        def body?
          false
        end
      end

      # Fallback node for elements without a dedicated class.
      class Generic < Node; end

      module W
        # The node that reports itself as the body.
        class Wbody < Node
          def body?
            true
          end
        end

        # Break node, emitted as an inline HTML break.
        class Wbr < Node
          def open
            '<br/>'
          end
        end

        # Paragraph node, surrounded by line breaks.
        class Wp < Node
          def open
            "\n"
          end

          def close
            "\n"
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module Doc2Text
  module Pptx
    # A presentation file handled as an unpacked, XML based document.
    class Document < XmlBasedDocument::DocumentFile
      # Unpacks input, parses it into output_filename and always removes the
      # unpacked files afterwards.
      #
      # The parser is closed even when parsing fails.
      # NOTE(review): the parser is Markdown::DocxParser although this is the Pptx
      # document - confirm that this is the intended parser.
      def self.parse_and_save(input, output_filename)
        document = new(input)
        begin
          document.unpack
          sink = File.open(output_filename, 'w')
          parser = Markdown::DocxParser.new(sink, nil)
          begin
            document.parse(parser)
          ensure
            parser.close
          end
        ensure
          document.clean
        end
      end

      # The unpacked directory is recognised by its [Content_Types].xml file.
      def contains_extracted_files?
        marker = File.join(extract_path, '[Content_Types].xml')
        File.exist?(marker)
      end

      # Suffix of the directory the presentation is unpacked into.
      def extract_extension
        'unpacked_pptx'
      end
    end
  end
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
module Doc2Text
  module Pptx
    # Node classes used while parsing the XML of a presentation.
    module XmlNodes
      # Base class of the presentation nodes and factory for choosing a node class.
      class Node < XmlBasedDocument::XmlNodes::Node
        # Builds the node for the element <prefix:name>.
        #
        # The class is looked up under XmlNodes as "<Capitalized prefix>::W<name>";
        # if that lookup raises NameError a Generic node is used. Instantiation
        # happens outside the rescue, so constructor errors are not swallowed.
        # NOTE(review): the only dedicated classes in this module live under W -
        # confirm which prefixes the parsed XML actually uses.
        def self.create_node(prefix, name, parent = nil, attrs = [], markdown_odt_parser = nil)
          node_class =
            begin
              XmlNodes.const_get "#{prefix.capitalize}::W#{name}"
            rescue NameError
              # markdown_odt_parser.logger.warn "No such <#{prefix}:#{name}> found"
              Generic
            end
          node_class.new(parent, attrs, prefix, name, markdown_odt_parser)
        end

        # Default answer; W::Wbody overrides it.
        def body?
          false
        end
      end

      # Text content node; never the body.
      class PlainText < XmlBasedDocument::XmlNodes::PlainText
        def body?
          false
        end
      end

      # Fallback node for elements without a dedicated class.
      class Generic < Node; end

      module W
        # The node that reports itself as the body.
        class Wbody < Node
          def body?
            true
          end
        end

        # Break node, emitted as an inline HTML break.
        class Wbr < Node
          def open
            '<br/>'
          end
        end

        # Paragraph node, surrounded by line breaks.
        class Wp < Node
          def open
            "\n"
          end

          def close
            "\n"
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
module Doc2Text
  # Chooses the document class for a source file from its file extension.
  class Resolution
    # Converts source and writes the result to output.
    #
    # The extension is compared case-insensitively, so "report.DOCX" is handled
    # like "report.docx". Anything that is neither .docx nor .pptx is handed to
    # the ODT document class.
    #
    # @param source [String] path of the document to convert
    # @param output [String] path of the file to write
    # @return whatever the selected document class returns from parse_and_save
    def self.parse_and_save(source, output)
      case File.extname(source).downcase
      when '.docx'
        Doc2Text::Docx::Document.parse_and_save source, output
      when '.pptx'
        Doc2Text::Pptx::Document.parse_and_save source, output
      else
        Doc2Text::Odt::Document.parse_and_save source, output
      end
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module Doc2Text
  module Odt
    # SAX handler that builds a tree of ODT nodes from element events only.
    class StylesParser < Nokogiri::XML::SAX::Document
      # Root node of the tree; nil until the first element has been seen.
      attr_reader :xml_root

      # The first element becomes the root; every later element is appended to
      # the current node and then becomes the current node itself.
      def start_element_namespace(name, attrs = [], prefix = nil, uri = nil, ns = [])
        if @xml_root
          child = Doc2Text::Odt::XmlNodes::Node.create_node(prefix, name, @current_node, attrs, self)
          @current_node.children << child
          @current_node = child
        else
          @xml_root = @current_node = Doc2Text::Odt::XmlNodes::Node.create_node(prefix, name, nil, attrs, self)
        end
      end

      # Closing an element moves the cursor back to its parent.
      def end_element_namespace(name, prefix = nil, uri = nil)
        @current_node = @current_node.parent
      end

      # Character data is ignored.
      def characters(_); end

      # Nodes are created with this handler as their parser and may call xpath on
      # it; no lookup is performed here, the result is always empty.
      def xpath(_)
        []
      end
    end
  end
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
require 'zip'
|
2
|
+
|
3
|
+
module Doc2Text
  module XmlBasedDocument
    # A zipped, XML based document that is unpacked into a hidden directory next
    # to the document itself. Subclasses override contains_extracted_files? and
    # extract_extension.
    class DocumentFile
      # @param document_path [String] path of the zipped document
      def initialize(document_path)
        @document_path = document_path
      end

      # Extracts every archive entry below extract_path.
      #
      # Entry names come from the archive and are therefore untrusted: an entry
      # whose name would resolve outside extract_path (for example "../x") is
      # skipped with a warning instead of being written.
      #
      # Raises whatever Dir.mkdir raises when extract_path already exists.
      def unpack
        Zip::File.open(@document_path) do |zip_file|
          Dir.mkdir(extract_path)
          extract_root = File.expand_path(extract_path)
          zip_file.each do |entry|
            zipped_file_extract_path = File.join extract_path, entry.name
            unless inside_directory?(extract_root, zipped_file_extract_path)
              warn "Skipped archive entry '#{entry.name}': it points outside of #{extract_path}"
              next
            end
            FileUtils.mkdir_p File.dirname(zipped_file_extract_path)
            zip_file.extract entry, zipped_file_extract_path
          end
        end
      end

      # Whether extract_path holds files unpacked by this class. The base class
      # always answers false, so clean never deletes anything unless a subclass
      # overrides this check.
      def contains_extracted_files?
        false
      end

      # Removes the unpacked directory, but only when it exists and is recognised
      # by contains_extracted_files?; otherwise a message is printed.
      def clean
        if File.exist?(extract_path) && contains_extracted_files?
          FileUtils.rm_r extract_path
        else
          puts 'Failed to clean temp files'
        end
      end

      # Opens a file of the unpacked document for reading.
      #
      # @param filename [String] path relative to extract_path
      # @return [File] the opened file; closing it is up to the caller
      def open(filename)
        File.open File.join(extract_path, filename), 'r'
      end

      # Suffix of the directory the document is unpacked into.
      def extract_extension
        'unpacked'
      end

      # Hidden directory next to the document: ".<document name>_<extract_extension>".
      def extract_path
        File.join File.dirname(@document_path), ".#{File.basename(@document_path)}_#{extract_extension}"
      end

      private

      # True when candidate, once resolved, lies strictly below the directory root.
      def inside_directory?(root, candidate)
        File.expand_path(candidate).start_with?(root + File::SEPARATOR)
      end
    end
  end
end
|