doc2text 0.3.2 → 0.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/bin/doc2text +43 -0
- data/lib/doc2text.rb +16 -5
- data/lib/doc2text/docx/docx.rb +31 -0
- data/lib/doc2text/docx/docx_xml_namespaces.rb +55 -0
- data/lib/doc2text/docx/markdown_docx_parser.rb +81 -0
- data/lib/doc2text/generic_xml_nodes.rb +78 -0
- data/lib/doc2text/odt/markdown_odt_parser.rb +81 -0
- data/lib/doc2text/odt/odt.rb +43 -0
- data/lib/doc2text/odt/odt_xml_namespaces.rb +240 -0
- data/lib/doc2text/pptx/markdown_pptx_parser.rb +55 -0
- data/lib/doc2text/pptx/pptx.rb +30 -0
- data/lib/doc2text/pptx/pptx_xml_namespaces.rb +55 -0
- data/lib/doc2text/resolution.rb +14 -0
- data/lib/doc2text/styles_parser.rb +28 -0
- data/lib/doc2text/xml_based_document_file.rb +48 -0
- metadata +26 -29
- data/lib/doc2text/content.rb +0 -25
- data/lib/doc2text/markdown_odt_parser.rb +0 -72
- data/lib/doc2text/odt.rb +0 -73
- data/lib/doc2text/odt_xml_namespaces.rb +0 -215
- data/lib/doc2text/odt_xml_node.rb +0 -99
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: doc2text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.2
|
4
|
+
version: 0.4.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
|
-
- Valentin
|
7
|
+
- Valentin A.
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-01-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -16,56 +16,54 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version:
|
20
|
-
- - ">="
|
21
|
-
- !ruby/object:Gem::Version
|
22
|
-
version: 1.6.3
|
19
|
+
version: 1.11.1
|
23
20
|
type: :runtime
|
24
21
|
prerelease: false
|
25
22
|
version_requirements: !ruby/object:Gem::Requirement
|
26
23
|
requirements:
|
27
24
|
- - "~>"
|
28
25
|
- !ruby/object:Gem::Version
|
29
|
-
version:
|
30
|
-
- - ">="
|
31
|
-
- !ruby/object:Gem::Version
|
32
|
-
version: 1.6.3
|
26
|
+
version: 1.11.1
|
33
27
|
- !ruby/object:Gem::Dependency
|
34
28
|
name: rubyzip
|
35
29
|
requirement: !ruby/object:Gem::Requirement
|
36
30
|
requirements:
|
37
31
|
- - "~>"
|
38
32
|
- !ruby/object:Gem::Version
|
39
|
-
version:
|
40
|
-
- - ">="
|
41
|
-
- !ruby/object:Gem::Version
|
42
|
-
version: 1.1.6
|
33
|
+
version: 2.3.0
|
43
34
|
type: :runtime
|
44
35
|
prerelease: false
|
45
36
|
version_requirements: !ruby/object:Gem::Requirement
|
46
37
|
requirements:
|
47
38
|
- - "~>"
|
48
39
|
- !ruby/object:Gem::Version
|
49
|
-
version:
|
50
|
-
- - ">="
|
51
|
-
- !ruby/object:Gem::Version
|
52
|
-
version: 1.1.6
|
40
|
+
version: 2.3.0
|
53
41
|
description: Parses odt to markdown
|
54
|
-
email:
|
55
|
-
executables:
|
42
|
+
email: valentin@nalisbg.com
|
43
|
+
executables:
|
44
|
+
- doc2text
|
56
45
|
extensions: []
|
57
46
|
extra_rdoc_files: []
|
58
47
|
files:
|
48
|
+
- bin/doc2text
|
59
49
|
- lib/doc2text.rb
|
60
|
-
- lib/doc2text/content.rb
|
50
|
+
- lib/doc2text/docx/docx.rb
|
51
|
+
- lib/doc2text/docx/docx_xml_namespaces.rb
|
52
|
+
- lib/doc2text/docx/markdown_docx_parser.rb
|
61
53
|
- lib/doc2text/errors.rb
|
62
|
-
- lib/doc2text/markdown_odt_parser.rb
|
63
|
-
- lib/doc2text/odt.rb
|
64
|
-
- lib/doc2text/odt_xml_namespaces.rb
|
65
|
-
- lib/doc2text/odt_xml_node.rb
|
54
|
+
- lib/doc2text/generic_xml_nodes.rb
|
55
|
+
- lib/doc2text/odt/markdown_odt_parser.rb
|
56
|
+
- lib/doc2text/odt/odt.rb
|
57
|
+
- lib/doc2text/odt/odt_xml_namespaces.rb
|
58
|
+
- lib/doc2text/pptx/markdown_pptx_parser.rb
|
59
|
+
- lib/doc2text/pptx/pptx.rb
|
60
|
+
- lib/doc2text/pptx/pptx_xml_namespaces.rb
|
61
|
+
- lib/doc2text/resolution.rb
|
62
|
+
- lib/doc2text/styles_parser.rb
|
63
|
+
- lib/doc2text/xml_based_document_file.rb
|
66
64
|
homepage: http://doc2text.com
|
67
65
|
licenses:
|
68
|
-
-
|
66
|
+
- Apache-2.0
|
69
67
|
metadata: {}
|
70
68
|
post_install_message:
|
71
69
|
rdoc_options: []
|
@@ -82,8 +80,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
82
80
|
- !ruby/object:Gem::Version
|
83
81
|
version: '0'
|
84
82
|
requirements: []
|
85
|
-
|
86
|
-
rubygems_version: 2.2.2
|
83
|
+
rubygems_version: 3.1.2
|
87
84
|
signing_key:
|
88
85
|
specification_version: 4
|
89
86
|
summary: Translates odt to markdown
|
data/lib/doc2text/content.rb
DELETED
@@ -1,25 +0,0 @@
|
|
1
|
-
module Doc2Text
|
2
|
-
module Odt
|
3
|
-
module Content
|
4
|
-
class Document < ::Nokogiri::XML::SAX::Document
|
5
|
-
def initialize(markdown_odt_parser)
|
6
|
-
@markdown_odt_parser = markdown_odt_parser
|
7
|
-
end
|
8
|
-
|
9
|
-
def start_element_namespace(name ,attrs = [], prefix = nil, uri = nil, ns = [])
|
10
|
-
@markdown_odt_parser.new_node prefix, name, attrs
|
11
|
-
end
|
12
|
-
|
13
|
-
def end_element_namespace(name, prefix = nil, uri = nil)
|
14
|
-
@markdown_odt_parser.close_node prefix, name
|
15
|
-
end
|
16
|
-
|
17
|
-
def characters(string)
|
18
|
-
unless string.strip.empty?
|
19
|
-
@markdown_odt_parser.text string
|
20
|
-
end
|
21
|
-
end
|
22
|
-
end
|
23
|
-
end
|
24
|
-
end
|
25
|
-
end
|
data/lib/doc2text/markdown_odt_parser.rb
DELETED
@@ -1,72 +0,0 @@
|
|
1
|
-
require 'logger'
|
2
|
-
|
3
|
-
module Doc2Text
|
4
|
-
module Markdown
|
5
|
-
class OdtParser
|
6
|
-
def initialize(output)
|
7
|
-
@output = output
|
8
|
-
@automatic_styles = {}
|
9
|
-
end
|
10
|
-
|
11
|
-
def new_node(prefix, name, attrs)
|
12
|
-
unless @xml_root
|
13
|
-
@xml_root = @current_node = Odt::XmlNodes::Node.create_node prefix, name, nil, attrs, self
|
14
|
-
else
|
15
|
-
new_node = Odt::XmlNodes::Node.create_node prefix, name, @current_node, attrs, self
|
16
|
-
@current_node.children << new_node
|
17
|
-
@current_node = new_node
|
18
|
-
end
|
19
|
-
end
|
20
|
-
|
21
|
-
def close_node(prefix, name)
|
22
|
-
# if Odt::XmlNodes::Node.create_node(prefix, name, nil, [], self).eql? @current_node
|
23
|
-
if @current_node.parent and @current_node.parent.office_text?
|
24
|
-
@output << @current_node.expand
|
25
|
-
@current_node.delete
|
26
|
-
end
|
27
|
-
@current_node = @current_node.parent
|
28
|
-
# else
|
29
|
-
# # TODO remove this redundant(tree build algorithm) checks
|
30
|
-
# raise Doc2Text::XmlError, "!Close node child #{prefix} #{name} IS NOT correct, CURRENT_ELEM #{@current_node}"
|
31
|
-
# end
|
32
|
-
end
|
33
|
-
|
34
|
-
def text(string)
|
35
|
-
plain_text = Odt::XmlNodes::PlainText.new(string)
|
36
|
-
@current_node.children << plain_text
|
37
|
-
end
|
38
|
-
|
39
|
-
def close
|
40
|
-
@output.close
|
41
|
-
end
|
42
|
-
|
43
|
-
def print_tree(node)
|
44
|
-
puts node
|
45
|
-
node.children.each do |child|
|
46
|
-
print_tree child
|
47
|
-
end
|
48
|
-
end
|
49
|
-
|
50
|
-
# Select nodes xpath style
|
51
|
-
# - supports selecting from the root node
|
52
|
-
def xpath(string)
|
53
|
-
if /^(\/[\w:\-]+)+$/ =~ string
|
54
|
-
path = string.scan /[\w:\-]+/
|
55
|
-
seek_nodes = [@xml_root]
|
56
|
-
path.each_with_index do |xml_name, index|
|
57
|
-
seek_nodes.select! { |node| node.xml_name == xml_name }
|
58
|
-
seek_nodes = seek_nodes.map(&:children).flatten unless index == path.length - 1
|
59
|
-
break if seek_nodes.empty?
|
60
|
-
end
|
61
|
-
seek_nodes
|
62
|
-
else
|
63
|
-
raise Doc2Text::XmlError, 'it does not support this xpath syntax'
|
64
|
-
end
|
65
|
-
end
|
66
|
-
|
67
|
-
def logger
|
68
|
-
@logger ||= Logger.new(STDOUT)
|
69
|
-
end
|
70
|
-
end
|
71
|
-
end
|
72
|
-
end
|
data/lib/doc2text/odt.rb
DELETED
@@ -1,73 +0,0 @@
|
|
1
|
-
require 'zip'
|
2
|
-
|
3
|
-
module Doc2Text
|
4
|
-
module Odt
|
5
|
-
class Document
|
6
|
-
EXTRACT_EXTENSION = 'unpacked_odt'
|
7
|
-
|
8
|
-
def self.parse_and_save(input, output_filename)
|
9
|
-
odt = new input
|
10
|
-
begin
|
11
|
-
odt.unpack
|
12
|
-
output = File.open output_filename, 'w'
|
13
|
-
markdown = Markdown::OdtParser.new output
|
14
|
-
begin
|
15
|
-
odt.parse markdown
|
16
|
-
ensure
|
17
|
-
markdown.close
|
18
|
-
end
|
19
|
-
ensure
|
20
|
-
odt.clean
|
21
|
-
end
|
22
|
-
end
|
23
|
-
|
24
|
-
def parse(markdown)
|
25
|
-
content = ::Doc2Text::Odt::Content::Document.new markdown
|
26
|
-
parser = Nokogiri::XML::SAX::Parser.new(content) # { |config| config.strict}
|
27
|
-
parser.parse open 'content.xml'
|
28
|
-
end
|
29
|
-
|
30
|
-
def initialize(document_path)
|
31
|
-
@document_path = document_path
|
32
|
-
end
|
33
|
-
|
34
|
-
def unpack
|
35
|
-
Zip::File.open(@document_path) {
|
36
|
-
|zip_file|
|
37
|
-
Dir.mkdir(extract_path)
|
38
|
-
zip_file.each do |entry|
|
39
|
-
zipped_file_extract_path = File.join extract_path, entry.name
|
40
|
-
FileUtils.mkdir_p File.dirname(zipped_file_extract_path)
|
41
|
-
zip_file.extract entry, zipped_file_extract_path
|
42
|
-
end
|
43
|
-
}
|
44
|
-
end
|
45
|
-
|
46
|
-
def clean
|
47
|
-
if [extract_path, File.join(extract_path, 'content.xml'), File.join(extract_path, 'mimetype')].all? { |file| File.exist?(file) }
|
48
|
-
FileUtils.rm_r extract_path
|
49
|
-
end
|
50
|
-
end
|
51
|
-
|
52
|
-
# Open file from the current odt
|
53
|
-
def open(filename)
|
54
|
-
File.open File.join(extract_path, filename), 'r'
|
55
|
-
end
|
56
|
-
|
57
|
-
# Parse xml file from the current odt
|
58
|
-
def xml_file(filename, rood_node_name)
|
59
|
-
Nokogiri::XML::Document.parse(open(filename)) { |config| config.strict }
|
60
|
-
root_node = doc.root
|
61
|
-
if root_node.name != rood_node_name or root_node.namespace.prefix != 'office'
|
62
|
-
raise XmlError, 'Document does not have correct root element'
|
63
|
-
else
|
64
|
-
open(filename)
|
65
|
-
end
|
66
|
-
end
|
67
|
-
|
68
|
-
def extract_path
|
69
|
-
File.join File.dirname(@document_path), ".#{File.basename(@document_path)}_#{EXTRACT_EXTENSION}"
|
70
|
-
end
|
71
|
-
end
|
72
|
-
end
|
73
|
-
end
|
data/lib/doc2text/odt_xml_namespaces.rb
DELETED
@@ -1,215 +0,0 @@
|
|
1
|
-
module Doc2Text
|
2
|
-
module Odt
|
3
|
-
module XmlNodes
|
4
|
-
class PlainText
|
5
|
-
include Node
|
6
|
-
|
7
|
-
attr_accessor :text
|
8
|
-
|
9
|
-
alias_method :expand, :text
|
10
|
-
|
11
|
-
def initialize(text)
|
12
|
-
@text = text
|
13
|
-
end
|
14
|
-
end
|
15
|
-
|
16
|
-
class Generic
|
17
|
-
include Node
|
18
|
-
end
|
19
|
-
|
20
|
-
#
|
21
|
-
# These are the namespaces available in the open document format
|
22
|
-
# http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os.html
|
23
|
-
#
|
24
|
-
module Office
|
25
|
-
class AutomaticStyles
|
26
|
-
include Node
|
27
|
-
end
|
28
|
-
|
29
|
-
class DocumentContent
|
30
|
-
include Node
|
31
|
-
end
|
32
|
-
|
33
|
-
class Text
|
34
|
-
include Node
|
35
|
-
|
36
|
-
def office_text?
|
37
|
-
true
|
38
|
-
end
|
39
|
-
end
|
40
|
-
end
|
41
|
-
|
42
|
-
module Animation; end
|
43
|
-
module Chart; end
|
44
|
-
module Config; end
|
45
|
-
module Database; end
|
46
|
-
module Dr3d; end
|
47
|
-
module Drawing; end
|
48
|
-
module Form; end
|
49
|
-
module Manifest; end
|
50
|
-
module Meta; end
|
51
|
-
module DataStyle; end
|
52
|
-
module Presentation; end
|
53
|
-
module Script; end
|
54
|
-
module Table
|
55
|
-
class TableRow
|
56
|
-
include Node
|
57
|
-
|
58
|
-
def expand
|
59
|
-
header_delimiter = parent.children.count >= 2 && parent.children[1] == self ? "\n|---|---|" : ''
|
60
|
-
result = "\n#{@children.map(&:expand).join.strip.gsub "\n", ''} |#{header_delimiter}"
|
61
|
-
delete
|
62
|
-
result
|
63
|
-
end
|
64
|
-
end
|
65
|
-
|
66
|
-
class TableCell
|
67
|
-
include Node
|
68
|
-
|
69
|
-
def open
|
70
|
-
' | '
|
71
|
-
end
|
72
|
-
end
|
73
|
-
end
|
74
|
-
module Style
|
75
|
-
class Style
|
76
|
-
include Node
|
77
|
-
end
|
78
|
-
|
79
|
-
class TextProperties
|
80
|
-
include Node
|
81
|
-
end
|
82
|
-
end
|
83
|
-
module XslFoCompatible; end
|
84
|
-
module SvgCompatible; end
|
85
|
-
module SmilCompatible; end
|
86
|
-
module Of; end
|
87
|
-
|
88
|
-
module Text
|
89
|
-
def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_odt_parser = nil)
|
90
|
-
super parent, attrs, prefix, name
|
91
|
-
@markdown_odt_parser = markdown_odt_parser
|
92
|
-
style_index = attrs.index { |attr| attr.prefix == 'text' && attr.localname == 'style-name' }
|
93
|
-
@enclosing_style = []
|
94
|
-
if style_index and fetch_style?
|
95
|
-
elem_style = find_style attrs[style_index].value
|
96
|
-
fetch_style elem_style
|
97
|
-
end
|
98
|
-
end
|
99
|
-
|
100
|
-
def fetch_style?
|
101
|
-
true
|
102
|
-
end
|
103
|
-
|
104
|
-
def fetch_style(elem_style)
|
105
|
-
if elem_style
|
106
|
-
elem_style.children.select { |style_property| style_property.xml_name == 'style:text-properties' }.each { |text_property|
|
107
|
-
text_property.attrs.each { |attr|
|
108
|
-
if attr.prefix == 'style'
|
109
|
-
if attr.localname == 'font-style-complex' && attr.value == 'italic'
|
110
|
-
@enclosing_style << '_'
|
111
|
-
elsif attr.localname == 'font-weight-complex' && attr.value == 'bold'
|
112
|
-
@enclosing_style << '**'
|
113
|
-
end
|
114
|
-
end
|
115
|
-
}
|
116
|
-
}
|
117
|
-
end
|
118
|
-
end
|
119
|
-
|
120
|
-
def find_style(style_name)
|
121
|
-
styles = @markdown_odt_parser.xpath '/office:document-content/office:automatic-styles/style:style'
|
122
|
-
styles.find { |style| style.attrs.index { |attr| attr.prefix == 'style' && attr.localname == 'family' } &&
|
123
|
-
style.attrs.index { |attr| attr.prefix == 'style' && attr.localname == 'name' && attr.value == style_name } }
|
124
|
-
end
|
125
|
-
|
126
|
-
# http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#__RefHeading__1419256_253892949
|
127
|
-
class P
|
128
|
-
include Node
|
129
|
-
include Text
|
130
|
-
|
131
|
-
def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_odt_parser = nil)
|
132
|
-
super parent, attrs, prefix, name, markdown_odt_parser
|
133
|
-
end
|
134
|
-
|
135
|
-
def self.style_family
|
136
|
-
'paragraph'
|
137
|
-
end
|
138
|
-
|
139
|
-
def open
|
140
|
-
"\n#{@enclosing_style.join}"
|
141
|
-
end
|
142
|
-
|
143
|
-
def close
|
144
|
-
"#{@enclosing_style.reverse.join}\n"
|
145
|
-
end
|
146
|
-
end
|
147
|
-
|
148
|
-
class LineBreak
|
149
|
-
include Node
|
150
|
-
|
151
|
-
def open
|
152
|
-
'<br/>'
|
153
|
-
end
|
154
|
-
end
|
155
|
-
|
156
|
-
# http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#__RefHeading__1419264_253892949
|
157
|
-
class Span
|
158
|
-
include Node
|
159
|
-
include Text
|
160
|
-
|
161
|
-
def self.style_family
|
162
|
-
'text'
|
163
|
-
end
|
164
|
-
|
165
|
-
def open
|
166
|
-
@enclosing_style.join
|
167
|
-
end
|
168
|
-
|
169
|
-
def close
|
170
|
-
@enclosing_style.reverse.join
|
171
|
-
end
|
172
|
-
end
|
173
|
-
|
174
|
-
# http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#__RefHeading__1415154_253892949
|
175
|
-
class ListItem
|
176
|
-
include Node
|
177
|
-
include Text
|
178
|
-
|
179
|
-
def expand
|
180
|
-
result = "* #{@children.map(&:expand).join.strip.gsub /\n{2,}/, "\n"}\n"
|
181
|
-
delete
|
182
|
-
result.clone
|
183
|
-
end
|
184
|
-
|
185
|
-
def fetch_style?
|
186
|
-
false
|
187
|
-
end
|
188
|
-
end
|
189
|
-
|
190
|
-
class List
|
191
|
-
include Node
|
192
|
-
include Text
|
193
|
-
|
194
|
-
def open
|
195
|
-
"\n"
|
196
|
-
end
|
197
|
-
|
198
|
-
def fetch_style(elem_style)
|
199
|
-
if elem_style
|
200
|
-
elem_style.children.select { |style_property| style_property.xml_name == 'style:text-properties' }.each { |text_property|
|
201
|
-
text_property.attrs.each { |attr|
|
202
|
-
if attr.prefix == 'style'
|
203
|
-
if attr.localname == 'list-level-style-number' && attr.value == 'Numbering_20_Symbols'
|
204
|
-
@enclosing_style << '_'
|
205
|
-
end
|
206
|
-
end
|
207
|
-
}
|
208
|
-
}
|
209
|
-
end
|
210
|
-
end
|
211
|
-
end
|
212
|
-
end
|
213
|
-
end
|
214
|
-
end
|
215
|
-
end
|