doc2text 0.3.2 → 0.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,240 @@
module Doc2Text
  module Odt
    # Node classes for the XML elements of an ODT document. An element
    # <prefix:name> is turned into an object by Node.create_node; elements
    # without a dedicated class below are represented by Generic.
    module XmlNodes
      # Common base class of all ODT element nodes.
      class Node < XmlBasedDocument::XmlNodes::Node
        # Instantiates the node class for the element <prefix:name>.
        #
        # The class is looked up below XmlNodes using the title-ized prefix
        # and name (`titleize` is not defined in this file). When the lookup
        # raises NameError a Generic node is created instead.
        #
        # @param prefix [String] namespace prefix of the element
        # @param name [String] local name of the element
        # @param parent [Node, nil] node the new node belongs to
        # @param attrs [Array] attributes of the element
        # @param markdown_odt_parser [Object, nil] parser passed on to the node
        # @return [Node] the newly created node
        def self.create_node(prefix, name, parent = nil, attrs = [], markdown_odt_parser = nil)
          clazz = XmlNodes.const_get "#{titleize prefix}::#{titleize name}"
        rescue NameError
          # markdown_odt_parser.logger.warn "No such <#{prefix}:#{name}> found"
          Generic.new(parent, attrs, prefix, name, markdown_odt_parser)
        else
          # Runs only when the lookup succeeded; errors raised by the
          # constructor itself are deliberately not rescued.
          clazz.new(parent, attrs, prefix, name, markdown_odt_parser)
        end

        # @return [Boolean] false for every node except Office::Text
        def office_text?
          false
        end
      end

      # Text content node; behaviour comes entirely from the shared base class.
      class PlainText < XmlBasedDocument::XmlNodes::PlainText
      end

      # Fallback node for elements that have no dedicated class.
      class Generic < Node
      end

      #
      # These are the namespaces available in the open document format
      # http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os.html
      #
      module Office
        class AutomaticStyles < Node
        end

        class DocumentContent < Node
        end

        # The <office:text> element.
        class Text < Node
          # @return [Boolean] always true for this node type
          def office_text?
            true
          end
        end
      end

      module Animation; end
      module Chart; end
      module Config; end
      module Database; end
      module Dr3d; end
      module Drawing; end
      module Form; end
      module Manifest; end
      module Meta; end
      module DataStyle; end
      module Presentation; end
      module Script; end

      module Table
        # A table row, rendered as one Markdown table line.
        class TableRow < Node
          # Renders the row and, when this node is the second child of its
          # parent, appends the Markdown header delimiter line.
          #
          # The delimiter has one `---` column per TableCell child of this
          # row. It previously was always two columns wide, which produced
          # a malformed table for any other column count.
          #
          # @return [String] the Markdown text of the row
          def expand
            # Count the cells before the children are expanded.
            cell_count = @children.count { |child| child.is_a?(TableCell) }
            # Keep the historical two-column delimiter when no cell is found.
            columns = cell_count.zero? ? 2 : cell_count
            delimiter_row = parent.children.count >= 2 && parent.children[1] == self
            header_delimiter = delimiter_row ? "\n|#{'---|' * columns}" : ''

            cells = @children.map(&:expand).join.strip.gsub("\n", '')
            result = "\n#{cells} |#{header_delimiter}"
            delete
            result
          end
        end

        # A table cell; contributes the column separator in front of its content.
        class TableCell < Node
          def open
            ' | '
          end
        end
      end

      module Style
        class Style < Node
        end

        class TextProperties < Node
        end
      end

      module XslFoCompatible; end
      module SvgCompatible; end
      module SmilCompatible; end
      module Of; end

      # Namespace of the text:* element classes and, at the same time, a
      # mixin (included by those classes) that resolves the element's
      # text:style-name attribute into Markdown emphasis markers.
      module Text
        # Stores the parser and collects the enclosing style markers.
        #
        # When the element carries a text:style-name attribute and
        # fetch_style? is true, the style is looked up with find_style and
        # handed to fetch_style.
        def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_odt_parser = nil)
          super parent, attrs, prefix, name
          @xml_parser = markdown_odt_parser
          # Must exist before fetch_style runs, which appends to it.
          @enclosing_style = []
          style_attr = attrs.find { |attr| attr.prefix == 'text' && attr.localname == 'style-name' }
          fetch_style find_style(style_attr.value) if style_attr && fetch_style?
        end

        # @return [Boolean] whether the element's style should be resolved
        def fetch_style?
          true
        end

        # Appends '_' for an italic and '**' for a bold text property of
        # the given style node to @enclosing_style.
        #
        # @param elem_style [Object, nil] style node as returned by find_style
        def fetch_style(elem_style)
          each_style_text_attr(elem_style) do |attr|
            if attr.localname == 'font-style-complex' && attr.value == 'italic'
              @enclosing_style << '_'
            elsif attr.localname == 'font-weight-complex' && attr.value == 'bold'
              @enclosing_style << '**'
            end
          end
        end

        # Finds the automatic style with the given name.
        #
        # Only the presence of a style:family attribute is checked here, not
        # its value.
        # NOTE(review): the classes below define style_family, which is not
        # consulted here - confirm whether the family was meant to be matched.
        #
        # @param style_name [String] value of the style:name attribute
        # @return [Object, nil] the matching style node, if any
        def find_style(style_name)
          styles = @xml_parser.xpath '/office:document-content/office:automatic-styles/style:style'
          styles.find do |style|
            style.attrs.any? { |attr| attr.prefix == 'style' && attr.localname == 'family' } &&
              style.attrs.any? { |attr| attr.prefix == 'style' && attr.localname == 'name' && attr.value == style_name }
          end
        end

        # Yields every attribute with the 'style' prefix found on the
        # style:text-properties children of elem_style. Does nothing when
        # elem_style is nil.
        def each_style_text_attr(elem_style)
          return unless elem_style

          elem_style.children.each do |style_property|
            next unless style_property.xml_name == 'style:text-properties'

            style_property.attrs.each { |attr| yield attr if attr.prefix == 'style' }
          end
        end
        private :each_style_text_attr

        # http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#__RefHeading__1419212_253892949
        class H < Node
          include Text

          # Additionally reads text:outline-level, which becomes the number
          # of '#' characters of the Markdown heading (0 when absent).
          def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_odt_parser = nil)
            super parent, attrs, prefix, name, markdown_odt_parser
            level_attr = attrs.find { |attr| attr.prefix == 'text' && attr.localname == 'outline-level' }
            @elem_outline_level = level_attr && fetch_style? ? level_attr.value.to_i : 0
          end

          def self.style_family
            'paragraph'
          end

          def open
            "\n#{'#' * @elem_outline_level} "
          end

          def close
            "\n\n"
          end
        end

        # http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#__RefHeading__1419256_253892949
        class P < Node
          include Text

          def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_odt_parser = nil)
            super parent, attrs, prefix, name, markdown_odt_parser
          end

          def self.style_family
            'paragraph'
          end

          # Starts a new line and opens the emphasis markers.
          def open
            "\n#{@enclosing_style.join}"
          end

          # Closes the emphasis markers in reverse order of opening.
          def close
            "#{@enclosing_style.reverse.join}\n"
          end
        end

        class LineBreak < Node
          def open
            '<br/>'
          end
        end

        # http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#__RefHeading__1419264_253892949
        class Span < Node
          include Text

          def self.style_family
            'text'
          end

          def open
            @enclosing_style.join
          end

          def close
            @enclosing_style.reverse.join
          end

          # @return [String] the expanded children wrapped in the emphasis markers
          def expand
            expanded = "#{open}#{@children.map(&:expand).join}#{close}"
            delete
            expanded
          end
        end

        # http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#__RefHeading__1415154_253892949
        class ListItem < Node
          include Text

          # Renders the item as a Markdown bullet; runs of two or more
          # newlines inside the item are collapsed to a single one.
          def expand
            body = @children.map(&:expand).join.strip.gsub(/\n{2,}/, "\n")
            result = "* #{body}\n"
            delete
            result
          end

          # List items do not resolve their style.
          def fetch_style?
            false
          end
        end

        class List < Node
          include Text

          def open
            "\n"
          end

          # Appends '_' to @enclosing_style for a
          # list-level-style-number attribute whose value is
          # 'Numbering_20_Symbols'.
          def fetch_style(elem_style)
            each_style_text_attr(elem_style) do |attr|
              if attr.localname == 'list-level-style-number' && attr.value == 'Numbering_20_Symbols'
                @enclosing_style << '_'
              end
            end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,55 @@
module Doc2Text
  module Pptx
    # Node classes used while parsing the XML of a PPTX document.
    module XmlNodes
      # Base class of all element nodes of this format.
      class Node < XmlBasedDocument::XmlNodes::Node
        # Creates the node for the element <prefix:name>.
        #
        # The node class is the constant "<Prefix>::W<name>" below XmlNodes;
        # Generic is used whenever resolving that constant raises NameError.
        #
        # @return [Node] the newly created node
        def self.create_node(prefix, name, parent = nil, attrs = [], markdown_odt_parser = nil)
          node_class =
            begin
              XmlNodes.const_get "#{prefix.capitalize}::W#{name}"
            rescue NameError
              # markdown_odt_parser.logger.warn "No such <#{prefix}:#{name}> found"
              Generic
            end
          node_class.new(parent, attrs, prefix, name, markdown_odt_parser)
        end

        # @return [Boolean] false unless overridden by a subclass
        def body?
          false
        end
      end

      # Text content node.
      class PlainText < XmlBasedDocument::XmlNodes::PlainText
        # @return [Boolean] always false: plain text is never the body element
        def body?
          false
        end
      end

      # Stand-in for every element without a dedicated class.
      class Generic < Node
      end

      # Classes for elements with the "w" prefix.
      module W
        # The body element.
        class Wbody < Node
          # @return [Boolean] always true
          def body?
            true
          end
        end

        # A line break, emitted as an HTML break tag.
        class Wbr < Node
          def open
            '<br/>'
          end
        end

        # A paragraph, surrounded by newlines.
        class Wp < Node
          def open
            "\n"
          end

          def close
            "\n"
          end
        end
      end
    end
  end
end
@@ -0,0 +1,30 @@
module Doc2Text
  module Pptx
    # A PPTX file that can be unpacked and converted to Markdown.
    class Document < XmlBasedDocument::DocumentFile
      # Unpacks +input+, parses it and writes the result to +output_filename+.
      #
      # The extracted files are removed again (via #clean) and the output
      # file is closed even when parsing or parser construction fails.
      #
      # NOTE(review): the parser used is Markdown::DocxParser - confirm that
      # reusing the DOCX parser is intended for PPTX input.
      #
      # @param input [String] path of the PPTX file
      # @param output_filename [String] path of the file to write
      # @return [Object] whatever #parse returns
      def self.parse_and_save(input, output_filename)
        pptx = new input
        begin
          pptx.unpack
          output = File.open output_filename, 'w'
          begin
            markdown = Markdown::DocxParser.new output, nil
            begin
              pptx.parse markdown
            ensure
              markdown.close
            end
          ensure
            # Do not leak the handle when the parser could not be created or
            # did not close the file itself.
            output.close unless output.closed?
          end
        ensure
          pptx.clean
        end
      end

      # @return [Boolean] true when the extract directory holds a
      #   '[Content_Types].xml' file
      def contains_extracted_files?
        File.exist? File.join(extract_path, '[Content_Types].xml')
      end

      # @return [String] suffix of the extract directory name
      def extract_extension
        'unpacked_pptx'
      end
    end
  end
end
@@ -0,0 +1,55 @@
module Doc2Text
  module Pptx
    # Node classes used while parsing the XML of a PPTX document.
    module XmlNodes
      # Base class of all element nodes of this format.
      class Node < XmlBasedDocument::XmlNodes::Node
        # Creates the node for the element <prefix:name>.
        #
        # The node class is the constant "<Prefix>::W<name>" below XmlNodes;
        # Generic is used whenever resolving that constant raises NameError.
        #
        # @return [Node] the newly created node
        def self.create_node(prefix, name, parent = nil, attrs = [], markdown_odt_parser = nil)
          node_class =
            begin
              XmlNodes.const_get "#{prefix.capitalize}::W#{name}"
            rescue NameError
              # markdown_odt_parser.logger.warn "No such <#{prefix}:#{name}> found"
              Generic
            end
          node_class.new(parent, attrs, prefix, name, markdown_odt_parser)
        end

        # @return [Boolean] false unless overridden by a subclass
        def body?
          false
        end
      end

      # Text content node.
      class PlainText < XmlBasedDocument::XmlNodes::PlainText
        # @return [Boolean] always false: plain text is never the body element
        def body?
          false
        end
      end

      # Stand-in for every element without a dedicated class.
      class Generic < Node
      end

      # Classes for elements with the "w" prefix.
      module W
        # The body element.
        class Wbody < Node
          # @return [Boolean] always true
          def body?
            true
          end
        end

        # A line break, emitted as an HTML break tag.
        class Wbr < Node
          def open
            '<br/>'
          end
        end

        # A paragraph, surrounded by newlines.
        class Wp < Node
          def open
            "\n"
          end

          def close
            "\n"
          end
        end
      end
    end
  end
end
@@ -0,0 +1,14 @@
module Doc2Text
  # Chooses the document class that handles a source file.
  class Resolution
    # Converts +source+ and writes the result to +output+, dispatching on the
    # file extension of +source+.
    #
    # The extension is compared case-insensitively, so 'REPORT.DOCX' is
    # handled like 'report.docx'. Every extension other than .docx and .pptx
    # (including none at all) is handled by the ODT document class.
    #
    # @param source [String] path of the document to convert
    # @param output [String] path of the file to write
    # @return [Object] whatever the selected parse_and_save returns
    def self.parse_and_save(source, output)
      case File.extname(source).downcase
      when '.docx'
        Doc2Text::Docx::Document.parse_and_save source, output
      when '.pptx'
        Doc2Text::Pptx::Document.parse_and_save source, output
      else
        Doc2Text::Odt::Document.parse_and_save source, output
      end
    end
  end
end
@@ -0,0 +1,28 @@
module Doc2Text
  module Odt
    # SAX handler that builds a tree of ODT nodes from the element events it
    # receives. Character data is ignored.
    class StylesParser < Nokogiri::XML::SAX::Document
      # The node created for the first element seen.
      attr_reader :xml_root

      # Creates a node for the opened element. The first element becomes the
      # root; every later one is appended to the children of the current node.
      # In both cases the new node becomes the current node.
      def start_element_namespace(name, attrs = [], prefix = nil, uri = nil, ns = [])
        if @xml_root
          child = Doc2Text::Odt::XmlNodes::Node.create_node prefix, name, @current_node, attrs, self
          @current_node.children << child
          @current_node = child
        else
          @xml_root = @current_node = Doc2Text::Odt::XmlNodes::Node.create_node prefix, name, nil, attrs, self
        end
      end

      # Moves back up to the parent of the current node.
      def end_element_namespace(name, prefix = nil, uri = nil)
        @current_node = @current_node.parent
      end

      # Text content is not relevant here and is dropped.
      def characters(_)
      end

      # Always returns an empty list; present because the nodes call xpath on
      # the parser they are given.
      def xpath(_)
        []
      end
    end
  end
end
@@ -0,0 +1,48 @@
require 'fileutils'
require 'zip'

module Doc2Text
  module XmlBasedDocument
    # A zip-packaged document that is unpacked into a hidden directory next
    # to the document file, read from there and cleaned up afterwards.
    class DocumentFile
      # @param document_path [String] path of the document file
      def initialize(document_path)
        @document_path = document_path
      end

      # Extracts every entry of the archive below #extract_path.
      #
      # Entries whose name would resolve to a location outside of
      # #extract_path (e.g. names containing '../') are skipped with a
      # warning, so a crafted archive cannot create files or directories
      # elsewhere.
      #
      # Dir.mkdir raises when the extract directory already exists.
      def unpack
        Zip::File.open(@document_path) do |zip_file|
          Dir.mkdir(extract_path)
          zip_file.each do |entry|
            target = safe_extract_target entry.name
            unless target
              warn "Skipping zip entry outside of the extract directory: #{entry.name}"
              next
            end
            FileUtils.mkdir_p File.dirname(target)
            zip_file.extract entry, target
          end
        end
      end

      # Tells whether #extract_path holds files extracted by this class.
      # Subclasses override this; the base implementation is always false,
      # which makes #clean refuse to delete anything.
      #
      # @return [Boolean]
      def contains_extracted_files?
        false
      end

      # Removes the extract directory, but only when it exists and
      # #contains_extracted_files? confirms its content; otherwise a message
      # is printed to standard output.
      def clean
        if File.exist?(extract_path) && contains_extracted_files?
          FileUtils.rm_r extract_path
        else
          puts 'Failed to clean temp files'
        end
      end

      # Opens a file of the unpacked document for reading.
      #
      # @param filename [String] path relative to #extract_path
      # @return [File] the opened file; closing it is up to the caller
      def open(filename)
        File.open File.join(extract_path, filename), 'r'
      end

      # @return [String] suffix of the extract directory name
      def extract_extension
        'unpacked'
      end

      # @return [String] the directory the document is unpacked into: a
      #   dot-prefixed sibling of the document file
      def extract_path
        File.join File.dirname(@document_path), ".#{File.basename(@document_path)}_#{extract_extension}"
      end

      private

      # Returns the path an entry is extracted to, or nil when that path
      # would not lie strictly inside #extract_path.
      def safe_extract_target(entry_name)
        root = File.expand_path(extract_path)
        target = File.join(extract_path, entry_name)
        # Compare the normalized form so that '..' segments are resolved.
        return target if File.expand_path(target).start_with?("#{root}#{File::SEPARATOR}")

        nil
      end
    end
  end
end