doc2text 0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/doc2text.rb +11 -0
- data/lib/doc2text/content.rb +25 -0
- data/lib/doc2text/errors.rb +5 -0
- data/lib/doc2text/markdown.rb +85 -0
- data/lib/doc2text/namespaces.rb +169 -0
- data/lib/doc2text/odt.rb +73 -0
- data/lib/doc2text/odt_xml_node.rb +98 -0
- metadata +51 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 1821e833815ea821090507cea37fdec0c68dc2af
|
4
|
+
data.tar.gz: 7815dd2e3f7fbf1f822959e4a120ef0e0bcbef79
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 5b3a0e9729eccccd888432527455336214d52ad996920593e30edab1d2e5f29bcdefdca19dfdbf9f7bf686bfabb8a6888cf706a386f058d7ee07b9bacac1e9e8
|
7
|
+
data.tar.gz: 6ffbb43bd9c8e4eac000b4733f3bcb149a6cee3b8cd608a22a0ca4516fbbc7317a2d30b4b0f15d8dee0c103df509ee24b029393503bd4bb40e16d22ed6c1543a
|
data/lib/doc2text.rb
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
module Doc2Text
  module Odt
    module Content
      # SAX event handler for an ODT content stream.
      #
      # Every callback is forwarded to the markdown document handed to the
      # constructor: element starts become +new_node+, element ends become
      # +close_node+, and text is appended with +<<+.
      class Document < ::Nokogiri::XML::SAX::Document
        # markdown_document:: receiver of +new_node+, +close_node+ and +<<+.
        def initialize(markdown_document)
          @markdown_document = markdown_document
        end

        # Called when an element opens; only the prefix, local name and
        # attributes are passed on (uri and ns are ignored).
        def start_element_namespace(name, attrs = [], prefix = nil, uri = nil, ns = [])
          @markdown_document.new_node(prefix, name, attrs)
        end

        # Called when an element closes; uri is ignored.
        def end_element_namespace(name, prefix = nil, uri = nil)
          @markdown_document.close_node(prefix, name)
        end

        # Called with character data. Chunks that are empty once stripped of
        # surrounding whitespace are dropped; everything else is forwarded
        # unmodified.
        def characters(string)
          return if string.strip.empty?

          @markdown_document << string
        end
      end
    end
  end
end
|
@@ -0,0 +1,85 @@
|
|
1
|
+
module Doc2Text
  module Markdown
    # Builds a tree of XML nodes from open/close element events and appends
    # the text each node produces (its +open+ and +close+ strings) to an
    # output sink.
    class Document
      # output:: object the generated text is appended to with +<<+; it is
      #          closed by #close.
      def initialize(output)
        @output = output
        @automatic_styles = {}
      end

      # Registers a newly opened element.
      #
      # The first element ever seen becomes the root of the tree (nothing is
      # written for it). Every later element is appended to the children of
      # the current node, becomes the current node, and its +open+ text is
      # written to the output.
      def new_node(prefix, name, attrs)
        if @xml_root
          child = Odt::XmlNodes::Node.create_node prefix, name, @current_node, attrs, self
          @current_node.children << child
          @current_node = child
          self << @current_node.open
        else
          @xml_root = @current_node = Odt::XmlNodes::Node.create_node prefix, name, nil, attrs, self
        end
      end

      # Handles the closing tag "prefix:name".
      #
      # * If it closes the current node, that node is closed (and detached
      #   from its parent when the node answers true to +delete_on_close?+).
      # * If it closes the parent of the current node, both the current node
      #   and the parent are closed; the parent's +delete_on_close?+ decides
      #   whether they are detached.
      # * Otherwise, or when no node is open at all, Doc2Text::XmlError is
      #   raised.
      #
      # Nodes are matched by their qualified name, which is the same rule the
      # nodes' own +eql?+ applies, without allocating a node for the lookup.
      def close_node(prefix, name)
        closing_name = "#{prefix}:#{name}"
        parent = @current_node && @current_node.parent

        if @current_node && closing_name == @current_node.xml_name
          remove_current_node! @current_node.delete_on_close?
        elsif parent && closing_name == parent.xml_name
          # Evaluate once: the parent's policy applies to both removals.
          remove = parent.delete_on_close?
          2.times { remove_current_node! remove }
        else
          # TODO remove this redundant(tree build algorithm) checks
          raise Doc2Text::XmlError, "!Close node child #{prefix} #{name} IS NOT correct, CURRENT_ELEM #{@current_node}"
        end
      end

      # Writes the +close+ text of the current node and makes its parent the
      # current node. When +remove+ is truthy the closed node is also popped
      # from the parent's children. Does nothing when no node is open.
      def remove_current_node!(remove = true)
        return unless @current_node

        self << @current_node.close
        node_for_deletion = @current_node
        @current_node = @current_node.parent
        return unless @current_node

        @current_node.remove_last_child! node_for_deletion if remove
      end

      # Appends +string+ to the output sink.
      def <<(string)
        @output << string
      end

      # Closes the output sink.
      def close
        @output.close
      end

      # Prints +node+ and, recursively, all of its descendants with +puts+.
      def print_tree(node)
        puts node
        node.children.each do |child|
          print_tree child
        end
      end

      # Select nodes xpath style
      # - supports selecting from the root node
      #
      # Only absolute paths made of plain element names are understood, e.g.
      # "/office:document-content/office:automatic-styles". Returns the array
      # of matching nodes, which is empty when nothing matches or when no
      # root element has been seen yet. Raises Doc2Text::XmlError for any
      # other syntax.
      def xpath(string)
        # \A and \z anchor the whole string; ^ and $ would accept any
        # multi-line input whose first line happens to look like a path.
        unless %r{\A(/[\w:\-]+)+\z} =~ string
          raise Doc2Text::XmlError, 'it does not support this xpath syntax'
        end

        path = string.scan(/[\w:\-]+/)
        seek_nodes = [@xml_root].compact
        path.each_with_index do |xml_name, index|
          seek_nodes = seek_nodes.select { |node| node.xml_name == xml_name }
          # Descend one level unless this was the last path segment.
          seek_nodes = seek_nodes.flat_map(&:children) unless index == path.length - 1
          break if seek_nodes.empty?
        end
        seek_nodes
      end
    end
  end
end
|
@@ -0,0 +1,169 @@
|
|
1
|
+
module Doc2Text
  module Odt
    module XmlNodes
      # Node used for every element that has no dedicated class below.
      class Generic
        include Node
      end

      #
      # These are the namespaces available in the open document format
      # http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os.html
      #
      module Office
        # office:automatic-styles; kept in the tree after it is closed.
        class AutomaticStyles
          include Node

          def visit
            :automatic_styles
          end

          def delete_on_close?
            false
          end
        end

        # office:document-content
        class DocumentContent
          include Node

          def delete_on_close?
            false # required for testing purposes. After a document has been parsed, some tests could be run against the tree built
          end
        end
      end

      module Animation; end
      module Chart; end
      module Config; end
      module Database; end
      module Dr3d; end
      module Drawing; end
      module Form; end
      module Manifest; end
      module Meta; end
      module DataStyle; end
      module Presentation; end
      module Script; end
      module Table; end

      module Style
        # style:style; kept in the tree after it is closed so that
        # Text#fetch_style can find it.
        class Style
          include Node

          def delete_on_close?
            false
          end
        end

        # style:text-properties; kept in the tree after it is closed so that
        # Text#fetch_common_style can read its attributes.
        class TextProperties
          include Node

          def delete_on_close?
            false
          end
        end
      end

      module XslFoCompatible; end
      module SvgCompatible; end
      module SmilCompatible; end
      module Of; end

      # Namespace of the text:* element classes and, at the same time, a
      # mixin that resolves the element's text:style-name attribute into the
      # Markdown markers stored in @enclosing_style.
      module Text
        # Builds the node and, when a text:style-name attribute is present,
        # looks up the referenced automatic style.
        def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_document = nil)
          super parent, attrs, prefix, name, markdown_document
          @markdown_document = markdown_document
          # Must exist before fetch_style runs, which appends to it.
          @enclosing_style = []
          style_attr = attrs.find { |attr| attr.prefix == 'text' && attr.localname == 'style-name' }
          fetch_style style_attr.value if style_attr
        end

        # Appends '_' for an italic and '**' for a bold complex-font setting
        # found in the style:text-properties children of +style+.
        # Does nothing when +style+ is nil.
        def fetch_common_style(style)
          return unless style

          text_properties = style.children.select { |child| child.xml_name == 'style:text-properties' }
          text_properties.each do |text_property|
            text_property.attrs.each do |attr|
              next unless attr.prefix == 'style'

              if attr.localname == 'font-style-complex' && attr.value == 'italic'
                @enclosing_style << '_'
              elsif attr.localname == 'font-weight-complex' && attr.value == 'bold'
                @enclosing_style << '**'
              end
            end
          end
        end

        # Finds the automatic style called +style_name+ whose style:family
        # equals the including class' +style_family+ and applies it.
        #
        # Nothing is looked up when the node was built without a markdown
        # document, or when the including class defines no +style_family+
        # (ListItem does not); previously both cases raised NoMethodError.
        def fetch_style(style_name)
          return unless @markdown_document && self.class.respond_to?(:style_family)

          family = self.class.style_family
          styles = @markdown_document.xpath '/office:document-content/office:automatic-styles/style:style'
          matching_style = styles.find do |candidate|
            candidate.attrs.any? { |attr| attr.prefix == 'style' && attr.localname == 'family' && attr.value == family } &&
              candidate.attrs.any? { |attr| attr.prefix == 'style' && attr.localname == 'name' && attr.value == style_name }
          end
          fetch_common_style matching_style
        end

        # http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#__RefHeading__1419256_253892949
        class P
          include Node
          include Text

          def self.style_family
            'paragraph'
          end

          # A paragraph starts on a new line, followed by its style markers.
          def open
            "\n#{@enclosing_style.join}"
          end

          # Style markers are closed in reverse order, then the line ends.
          def close
            "#{@enclosing_style.reverse.join}\n"
          end
        end

        # text:line-break, rendered as an HTML line break.
        class LineBreak
          include Node

          def open
            '<br/>'
          end
        end

        # http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#__RefHeading__1419264_253892949
        class Span
          include Node
          include Text

          def self.style_family
            'text'
          end

          def open
            @enclosing_style.join
          end

          def close
            @enclosing_style.reverse.join
          end
        end

        # http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#__RefHeading__1415154_253892949
        class ListItem
          include Node
          include Text

          not_enclosing 'p'

          def open
            '* '
          end

          def close
            "\n"
          end
        end
      end
    end
  end
end
|
data/lib/doc2text/odt.rb
ADDED
@@ -0,0 +1,73 @@
|
|
1
|
+
require 'zip'
|
2
|
+
|
3
|
+
module Doc2Text
  module Odt
    # An ODT file on disk: unpacks the archive next to the document, parses
    # its content.xml into Markdown and removes the unpacked files again.
    class Document
      EXTRACT_EXTENSION = 'unpacked_odt'

      # Converts the ODT file at +input+ and writes the result to
      # +output_filename+. The unpacked files are cleaned up even when
      # unpacking or parsing fails.
      def self.parse_and_save(input, output_filename)
        odt = new input
        begin
          odt.unpack
          output = File.open output_filename, 'w'
          markdown = Markdown::Document.new output
          begin
            odt.parse markdown
          ensure
            # Closes the output file as well.
            markdown.close
          end
        ensure
          odt.clean
        end
      end

      # Feeds the unpacked content.xml through a SAX parser that reports to
      # +markdown+. The file handle is closed once parsing is done.
      def parse(markdown)
        content = ::Doc2Text::Odt::Content::Document.new markdown
        parser = Nokogiri::XML::SAX::Parser.new(content) # { |config| config.strict}
        open('content.xml') { |io| parser.parse io }
      end

      # document_path:: path of the ODT file.
      def initialize(document_path)
        @document_path = document_path
      end

      # Extracts every entry of the archive below #extract_path.
      #
      # Raises SecurityError when an entry name would resolve outside of
      # #extract_path (e.g. "../../somewhere"), because entry names come from
      # the archive and must not be trusted. Dir.mkdir raises when the
      # extraction directory already exists.
      def unpack
        Zip::File.open(@document_path) do |zip_file|
          Dir.mkdir(extract_path)
          root = File.expand_path(extract_path)
          zip_file.each do |entry|
            zipped_file_extract_path = File.join extract_path, entry.name
            resolved = File.expand_path(File.join(root, entry.name))
            unless resolved.start_with?(root + File::SEPARATOR)
              raise SecurityError, "Archive entry #{entry.name.inspect} would be extracted outside of #{extract_path}"
            end

            FileUtils.mkdir_p File.dirname(zipped_file_extract_path)
            zip_file.extract entry, zipped_file_extract_path
          end
        end
      end

      # Removes the extraction directory, but only when it looks like an
      # unpacked ODT (it contains both content.xml and mimetype).
      def clean
        required = [extract_path, File.join(extract_path, 'content.xml'), File.join(extract_path, 'mimetype')]
        FileUtils.rm_r extract_path if required.all? { |file| File.exist?(file) }
      end

      # Open file from the current odt
      #
      # Without a block the open File is returned and the caller has to close
      # it; with a block the file is yielded, closed afterwards, and the
      # block's value is returned.
      def open(filename, &block)
        File.open File.join(extract_path, filename), 'r', &block
      end

      # Parse xml file from the current odt
      #
      # Strictly parses +filename+ and checks that its root element is
      # "office:<rood_node_name>". Returns a freshly opened File for
      # +filename+ (to be closed by the caller); raises XmlError when the
      # root element is missing or different.
      def xml_file(filename, rood_node_name)
        doc = open(filename) do |io|
          Nokogiri::XML::Document.parse(io) { |config| config.strict }
        end
        root_node = doc.root
        namespace = root_node && root_node.namespace
        unless root_node && namespace && root_node.name == rood_node_name && namespace.prefix == 'office'
          raise XmlError, 'Document does not have correct root element'
        end

        open(filename)
      end

      # Hidden directory next to the document that receives the unpacked
      # archive: ".<document file name>_unpacked_odt".
      def extract_path
        File.join File.dirname(@document_path), ".#{File.basename(@document_path)}_#{EXTRACT_EXTENSION}"
      end
    end
  end
end
|
@@ -0,0 +1,98 @@
|
|
1
|
+
module Doc2Text
  module Odt
    module XmlNodes
      # Behaviour shared by every element of the ODT tree. Including classes
      # also gain the class-level +not_enclosing+ macro (see ClassMethods).
      module Node
        attr_reader :parent, :children, :attrs, :prefix, :name

        # Builds the node for the element "prefix:name".
        #
        # The class is looked up as XmlNodes::<Prefix>::<Name>, with both
        # parts converted by ::titleize ("text", "list-item" becomes
        # Text::ListItem). Generic is used when there is no such class.
        def self.create_node(prefix, name, parent = nil, attrs = [], markdown_document = nil)
          node_class(prefix, name).new(parent, attrs, prefix, name, markdown_document)
        end

        # Resolves the node class for "prefix:name", falling back to Generic.
        #
        # The lookup is restricted to constants defined directly inside
        # XmlNodes (inherit = false). An inheriting lookup would let element
        # names taken from the document resolve to unrelated top-level
        # constants such as ::File. Constants that are not classes including
        # Node are ignored as well.
        def self.node_class(prefix, name)
          namespace = XmlNodes.const_get(titleize(prefix), false)
          clazz = namespace.const_get(titleize(name), false)
          clazz.is_a?(Class) && clazz.include?(Node) ? clazz : Generic
        rescue NameError
          # Also covers a nil prefix or name: NoMethodError is a NameError.
          Generic
        end
        private_class_method :node_class

        # "automatic-styles" => "AutomaticStyles"
        def self.titleize(tag)
          tag.split('-').map(&:capitalize).join
        end

        # markdown_document is accepted for signature compatibility with the
        # including classes; it is not stored here.
        def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_document = nil)
          @parent, @attrs, @prefix, @name = parent, attrs, prefix, name
          @children = []
          @has_text = false
        end

        # True when the node has no parent.
        def root?
          !@parent
        end

        def has_text?
          @has_text
        end

        # Text written when the element opens; empty by default.
        def open
          ''
        end

        # Text written when the element closes; empty by default.
        def close
          ''
        end

        # Appends +child+ to the children of this node.
        def <<(child)
          @children << child
        end

        # Whether the node is detached from its parent once it is closed.
        def delete_on_close?
          true
        end

        # Two nodes are considered equal when their qualified names match.
        def eql?(object)
          return false unless object.is_a? Node

          object.xml_name == xml_name
        end

        # True for nodes that have no dedicated class, i.e. Generic
        # instances. (Testing against the Node module itself can never be
        # true, because a module has no direct instances.)
        def generic?
          instance_of? Generic
        end

        # Pops +child+ from the children; it must be the very object that was
        # added last, otherwise Doc2Text::XmlError is raised.
        def remove_last_child!(child)
          unless child.equal?(@children.last)
            # TODO remove this redundant(tree build algorithm) checks
            raise Doc2Text::XmlError, "!The child #{child} IS NOT among the children of #{self}"
          end

          @children.pop
        end

        # Qualified element name, "prefix:name".
        def xml_name
          "#{@prefix}:#{@name}"
        end

        def to_s
          "#{xml_name} : #{attrs}"
        end

        # True when the parent's class declared this element's name with
        # +not_enclosing+ and both share the same prefix.
        def not_enclosing?
          return false if root?

          tags = parent.class.not_enclosing_tags
          return false unless tags

          @prefix == parent.prefix && tags.include?(@name)
        end

        def self.included(base)
          base.extend ClassMethods
        end

        # Class-level macros for classes including Node.
        module ClassMethods
          attr_reader :not_enclosing_tags

          # Declares the child element +tag+ (local name) as not enclosing.
          def not_enclosing(tag)
            @not_enclosing_tags ||= []
            @not_enclosing_tags << tag
          end
        end
      end
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: doc2text
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: '0.1'
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Valentin Aitken
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-07-12 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: Parses odt to markdown
|
14
|
+
email: bostko@gmail.com
|
15
|
+
executables: []
|
16
|
+
extensions: []
|
17
|
+
extra_rdoc_files: []
|
18
|
+
files:
|
19
|
+
- lib/doc2text.rb
|
20
|
+
- lib/doc2text/content.rb
|
21
|
+
- lib/doc2text/errors.rb
|
22
|
+
- lib/doc2text/markdown.rb
|
23
|
+
- lib/doc2text/namespaces.rb
|
24
|
+
- lib/doc2text/odt.rb
|
25
|
+
- lib/doc2text/odt_xml_node.rb
|
26
|
+
homepage: https://github.com/bostko/doc2text
|
27
|
+
licenses:
|
28
|
+
- GPL
|
29
|
+
metadata: {}
|
30
|
+
post_install_message:
|
31
|
+
rdoc_options: []
|
32
|
+
require_paths:
|
33
|
+
- lib
|
34
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
35
|
+
requirements:
|
36
|
+
- - ">="
|
37
|
+
- !ruby/object:Gem::Version
|
38
|
+
version: '0'
|
39
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
40
|
+
requirements:
|
41
|
+
- - ">="
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: '0'
|
44
|
+
requirements: []
|
45
|
+
rubyforge_project:
|
46
|
+
rubygems_version: 2.3.0
|
47
|
+
signing_key:
|
48
|
+
specification_version: 4
|
49
|
+
summary: Translates odt to markdown
|
50
|
+
test_files: []
|
51
|
+
has_rdoc:
|