doc2text 0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 1821e833815ea821090507cea37fdec0c68dc2af
4
+ data.tar.gz: 7815dd2e3f7fbf1f822959e4a120ef0e0bcbef79
5
+ SHA512:
6
+ metadata.gz: 5b3a0e9729eccccd888432527455336214d52ad996920593e30edab1d2e5f29bcdefdca19dfdbf9f7bf686bfabb8a6888cf706a386f058d7ee07b9bacac1e9e8
7
+ data.tar.gz: 6ffbb43bd9c8e4eac000b4733f3bcb149a6cee3b8cd608a22a0ca4516fbbc7317a2d30b4b0f15d8dee0c103df509ee24b029393503bd4bb40e16d22ed6c1543a
data/lib/doc2text.rb ADDED
@@ -0,0 +1,11 @@
1
+ require 'nokogiri'
2
+ #require 'nokogiri/xml'
3
+ require 'fileutils'
4
+
5
+ require 'doc2text/odt'
6
+ require 'doc2text/odt_xml_node'
7
+ require 'doc2text/namespaces'
8
+ require 'doc2text/markdown'
9
+ require 'doc2text/errors'
10
+
11
+ require 'doc2text/content'
@@ -0,0 +1,25 @@
1
module Doc2Text
  module Odt
    module Content
      # SAX handler that forwards parser events to a markdown document.
      #
      # Element starts and ends are relayed as +new_node+ / +close_node+
      # calls, and non-blank text is appended with +<<+.
      class Document < ::Nokogiri::XML::SAX::Document
        # markdown_document - receiver of the forwarded events; this class
        #                     calls +new_node+, +close_node+ and +<<+ on it.
        def initialize(markdown_document)
          @markdown_document = markdown_document
        end

        # Relays an opening tag. Only the prefix, the local name and the
        # attributes are passed on; +uri+ and +ns+ are accepted but unused.
        def start_element_namespace(name, attrs = [], prefix = nil, uri = nil, ns = [])
          @markdown_document.new_node(prefix, name, attrs)
        end

        # Relays a closing tag. +uri+ is accepted but unused.
        def end_element_namespace(name, prefix = nil, uri = nil)
          @markdown_document.close_node(prefix, name)
        end

        # Appends character data, skipping strings that contain nothing but
        # whitespace. Strings that do carry text are appended unmodified.
        def characters(string)
          return if string.strip.empty?

          @markdown_document << string
        end
      end
    end
  end
end
@@ -0,0 +1,5 @@
1
module Doc2Text
  # Raised when the XML being processed, or a request made against the node
  # tree built from it, is not something doc2text can handle.
  class XmlError < StandardError; end
end
@@ -0,0 +1,85 @@
1
module Doc2Text
  module Markdown
    # Builds a tree of ODT XML nodes from parser events and streams the
    # markdown text produced by those nodes to an output.
    #
    # The output only needs to respond to +<<+ and +close+.
    class Document
      # Paths accepted by #xpath: one or more "/name" steps and nothing else.
      # Anchored with \A and \z (not ^ and $) so that a multi-line string
      # cannot pass validation on the strength of a single line.
      SIMPLE_XPATH = %r{\A(/[\w:\-]+)+\z}

      # output - object the generated markdown is appended to.
      def initialize(output)
        @output = output
        @automatic_styles = {} # not read anywhere in this class
      end

      # Records an opening tag.
      #
      # The first node seen becomes the root and produces no output. Every
      # later node is attached to the node that is currently open, becomes the
      # current node itself, and its +open+ text is written to the output.
      def new_node(prefix, name, attrs)
        if @xml_root
          new_node = Odt::XmlNodes::Node.create_node prefix, name, @current_node, attrs, self
          @current_node.children << new_node
          @current_node = new_node
          self << @current_node.open
        else
          @xml_root = @current_node = Odt::XmlNodes::Node.create_node prefix, name, nil, attrs, self
        end
      end

      # Records a closing tag.
      #
      # The tag must name either the current node (one node is closed) or its
      # parent (both are closed). Whether closed nodes are also detached from
      # the tree is decided by +delete_on_close?+ of the node the tag names.
      #
      # Raises Doc2Text::XmlError when the tag matches neither, including the
      # case where no node is open at all.
      def close_node(prefix, name)
        # Nodes compare by their "prefix:name" string, so comparing the names
        # directly avoids instantiating a throwaway node for every closing tag.
        closing_name = "#{prefix}:#{name}"
        parent = @current_node && @current_node.parent

        if @current_node && closing_name == @current_node.xml_name
          remove_current_node! @current_node.delete_on_close?
        elsif parent && closing_name == parent.xml_name
          remove = parent.delete_on_close?
          2.times { remove_current_node! remove }
        else
          # TODO remove this redundant(tree build algorithm) checks
          raise Doc2Text::XmlError, "!Close node child #{prefix} #{name} IS NOT correct, CURRENT_ELEM #{@current_node}"
        end
      end

      # Writes the current node's +close+ text and makes its parent current.
      #
      # remove - when truthy the closed node is also detached from its parent.
      #
      # Does nothing when no node is open. The root node is never detached
      # because it has no parent to detach it from.
      def remove_current_node!(remove = true)
        return unless @current_node

        self << @current_node.close
        closed_node = @current_node
        @current_node = @current_node.parent
        return unless @current_node

        @current_node.remove_last_child! closed_node if remove
      end

      # Appends +string+ to the output.
      def <<(string)
        @output << string
      end

      # Closes the output.
      def close
        @output.close
      end

      # Prints +node+ and all of its descendants, depth first, one per line.
      def print_tree(node)
        puts node
        node.children.each { |child| print_tree child }
      end

      # Select nodes xpath style
      # - supports selecting from the root node
      #
      # Only absolute paths made of plain "prefix:name" steps are understood,
      # e.g. "/office:document-content/office:automatic-styles".
      #
      # Returns the matching nodes; the array is empty when nothing matches or
      # when no root node has been recorded yet.
      # Raises Doc2Text::XmlError for any other syntax.
      def xpath(string)
        unless SIMPLE_XPATH =~ string
          raise Doc2Text::XmlError, 'it does not support this xpath syntax'
        end
        return [] unless @xml_root

        steps = string.scan(/[\w:\-]+/)
        last_index = steps.length - 1
        seek_nodes = [@xml_root]
        steps.each_with_index do |xml_name, index|
          seek_nodes = seek_nodes.select { |node| node.xml_name == xml_name }
          # Descend one level for every step except the last one.
          seek_nodes = seek_nodes.flat_map(&:children) unless index == last_index
          break if seek_nodes.empty?
        end
        seek_nodes
      end
    end
  end
end
@@ -0,0 +1,169 @@
1
module Doc2Text
  module Odt
    module XmlNodes
      # Node class without any behaviour of its own beyond what Node provides.
      class Generic
        include Node
      end

      #
      # These are the namespaces available in the open document format
      # http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os.html
      #
      module Office
        # office:automatic-styles - stays in the tree after it is closed.
        class AutomaticStyles
          include Node

          def visit
            :automatic_styles
          end

          def delete_on_close?
            false
          end
        end

        # office:document-content - stays in the tree after it is closed.
        class DocumentContent
          include Node

          def delete_on_close?
            false # required for testing purposes. After a document has been parsed, some tests could be run against the tree built
          end
        end
      end

      module Animation; end
      module Chart; end
      module Config; end
      module Database; end
      module Dr3d; end
      module Drawing; end
      module Form; end
      module Manifest; end
      module Meta; end
      module DataStyle; end
      module Presentation; end
      module Script; end
      module Table; end

      module Style
        # style:style - stays in the tree after it is closed.
        class Style
          include Node

          def delete_on_close?
            false
          end
        end

        # style:text-properties - stays in the tree after it is closed.
        class TextProperties
          include Node

          def delete_on_close?
            false
          end
        end
      end

      module XslFoCompatible; end
      module SvgCompatible; end
      module SmilCompatible; end
      module Of; end

      # Namespace of the text elements. It is also a mixin: included after
      # Node, it resolves the element's text:style-name attribute into the
      # markdown markers (@enclosing_style) the element is wrapped in.
      module Text
        def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_document = nil)
          super(parent, attrs, prefix, name, markdown_document)
          @markdown_document = markdown_document
          style_attr = attrs.find { |attr| attr.prefix == 'text' && attr.localname == 'style-name' }
          @enclosing_style = []
          fetch_style(style_attr.value) if style_attr
        end

        # Collects markers from the style:text-properties children of +style+:
        # '_' for an italic font-style-complex and '**' for a bold
        # font-weight-complex. Does nothing when +style+ is nil.
        def fetch_common_style(style)
          return unless style

          text_properties = style.children.select { |child| child.xml_name == 'style:text-properties' }
          text_properties.each do |properties|
            properties.attrs.each do |attr|
              next unless attr.prefix == 'style'

              case [attr.localname, attr.value]
              when ['font-style-complex', 'italic'] then @enclosing_style << '_'
              when ['font-weight-complex', 'bold'] then @enclosing_style << '**'
              end
            end
          end
        end

        # Looks up the automatic style called +style_name+ whose family equals
        # the including class' +style_family+ and applies it.
        def fetch_style(style_name)
          candidates = @markdown_document.xpath '/office:document-content/office:automatic-styles/style:style'
          matching = candidates.find do |candidate|
            style_attrs = candidate.attrs.select { |attr| attr.prefix == 'style' }
            # style_family is only consulted once a family attribute is seen.
            style_attrs.any? { |attr| attr.localname == 'family' && attr.value == self.class.style_family } &&
              style_attrs.any? { |attr| attr.localname == 'name' && attr.value == style_name }
          end
          fetch_common_style matching
        end

        # http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#__RefHeading__1419256_253892949
        #
        # Construction is handled entirely by Text#initialize.
        class P
          include Node
          include Text

          def self.style_family
            'paragraph'
          end

          # Starts the paragraph on a new line, followed by the style markers.
          def open
            "\n" + @enclosing_style.join
          end

          # Closes the style markers in reverse order and ends the line.
          def close
            @enclosing_style.reverse.join + "\n"
          end
        end

        # text:line-break - emitted as an inline HTML break.
        class LineBreak
          include Node

          def open
            '<br/>'
          end
        end

        # http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#__RefHeading__1419264_253892949
        class Span
          include Node
          include Text

          def self.style_family
            'text'
          end

          # Opening style markers only; a span adds no whitespace of its own.
          def open
            @enclosing_style.join
          end

          # The same markers, closed in reverse order.
          def close
            @enclosing_style.reverse.join
          end
        end

        # http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#__RefHeading__1415154_253892949
        class ListItem
          include Node
          include Text

          not_enclosing 'p'

          # Bullet marker that starts the item.
          def open
            '* '
          end

          # Ends the item's line.
          def close
            "\n"
          end
        end
      end
    end
  end
end
@@ -0,0 +1,73 @@
1
+ require 'zip'
2
+
3
+ module Doc2Text
4
+ module Odt
5
+ class Document
6
+ EXTRACT_EXTENSION = 'unpacked_odt'
7
+
8
+ def self.parse_and_save(input, output_filename)
9
+ odt = new input
10
+ begin
11
+ odt.unpack
12
+ output = File.open output_filename, 'w'
13
+ markdown = Markdown::Document.new output
14
+ begin
15
+ odt.parse markdown
16
+ ensure
17
+ markdown.close
18
+ end
19
+ ensure
20
+ odt.clean
21
+ end
22
+ end
23
+
24
+ def parse(markdown)
25
+ content = ::Doc2Text::Odt::Content::Document.new markdown
26
+ parser = Nokogiri::XML::SAX::Parser.new(content) # { |config| config.strict}
27
+ parser.parse open 'content.xml'
28
+ end
29
+
30
+ def initialize(document_path)
31
+ @document_path = document_path
32
+ end
33
+
34
+ def unpack
35
+ Zip::File.open(@document_path) {
36
+ |zip_file|
37
+ Dir.mkdir(extract_path)
38
+ zip_file.each do |entry|
39
+ zipped_file_extract_path = File.join extract_path, entry.name
40
+ FileUtils.mkdir_p File.dirname(zipped_file_extract_path)
41
+ zip_file.extract entry, zipped_file_extract_path
42
+ end
43
+ }
44
+ end
45
+
46
+ def clean
47
+ if [extract_path, File.join(extract_path, 'content.xml'), File.join(extract_path, 'mimetype')].all? { |file| File.exist?(file) }
48
+ FileUtils.rm_r extract_path
49
+ end
50
+ end
51
+
52
+ # Open file from the current odt
53
+ def open(filename)
54
+ File.open File.join(extract_path, filename), 'r'
55
+ end
56
+
57
+ # Parse xml file from the current odt
58
+ def xml_file(filename, rood_node_name)
59
+ Nokogiri::XML::Document.parse(open(filename)) { |config| config.strict }
60
+ root_node = doc.root
61
+ if root_node.name != rood_node_name or root_node.namespace.prefix != 'office'
62
+ raise XmlError, 'Document does not have correct root element'
63
+ else
64
+ open(filename)
65
+ end
66
+ end
67
+
68
+ def extract_path
69
+ File.join File.dirname(@document_path), ".#{File.basename(@document_path)}_#{EXTRACT_EXTENSION}"
70
+ end
71
+ end
72
+ end
73
+ end
@@ -0,0 +1,98 @@
1
module Doc2Text
  module Odt
    module XmlNodes
      # Behaviour shared by every node of the tree built while parsing.
      #
      # Including classes also gain the class-level +not_enclosing+ macro from
      # ClassMethods.
      module Node
        attr_reader :parent, :children, :attrs, :prefix, :name

        # Builds the node for the element +prefix+:+name+.
        #
        # The class is XmlNodes::<Prefix>::<Name>, with both parts converted by
        # .titleize (e.g. "text", "line-break" -> Text::LineBreak). Generic is
        # used when no such node class exists.
        def self.create_node(prefix, name, parent = nil, attrs = [], markdown_document = nil)
          clazz = node_class_for(prefix, name) || Generic
          clazz.new(parent, attrs, prefix, name, markdown_document)
        end

        # Converts a dash separated tag into a constant name:
        # "automatic-styles" -> "AutomaticStyles". nil gives an empty string.
        def self.titleize(tag)
          tag.to_s.split('-').map(&:capitalize).join
        end

        # Returns the node class registered for +prefix+:+name+, or nil.
        #
        # Constants are looked up without inheritance so that an element can
        # never resolve to an unrelated top-level constant (file:stat must not
        # become ::File::Stat), and anything that is not a class including
        # Node is ignored.
        def self.node_class_for(prefix, name)
          namespace = XmlNodes.const_get titleize(prefix), false
          clazz = namespace.const_get titleize(name), false
          clazz if clazz.is_a?(Class) && clazz.include?(Node)
        rescue NameError
          # Unknown constant, or a tag that is not a valid constant name.
          nil
        end
        private_class_method :node_class_for

        # +markdown_document+ is accepted for signature compatibility with the
        # including classes but is not stored here.
        def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_document = nil)
          @parent = parent
          @attrs = attrs
          @prefix = prefix
          @name = name
          @children = []
          @has_text = false
        end

        # True for a node without a parent.
        def root?
          !@parent
        end

        def has_text?
          @has_text
        end

        # Text written when the element opens; nothing by default.
        def open
          ''
        end

        # Text written when the element closes; nothing by default.
        def close
          ''
        end

        # Appends +child+ to the children of this node.
        def <<(child)
          @children << child
        end

        # Whether the node is detached from the tree once it has been closed.
        def delete_on_close?
          true
        end

        # Nodes are considered equal when their "prefix:name" strings match.
        # Note that #hash is not overridden to agree with this.
        def eql?(object)
          object.is_a?(Node) && object.xml_name == xml_name
        end

        # True when the node is a plain Generic node, i.e. no dedicated class
        # exists for its element.
        def generic?
          instance_of? Generic
        end

        # Removes +child+, which has to be the last child of this node.
        #
        # Returns the removed child.
        # Raises Doc2Text::XmlError when +child+ is not the last child.
        def remove_last_child!(child)
          unless child == @children.last
            # TODO remove this redundant(tree build algorithm) checks
            raise Doc2Text::XmlError, "!The child #{child} IS NOT among the children of #{self}"
          end

          @children.pop
        end

        # Qualified element name, e.g. "text:p".
        def xml_name
          "#{@prefix}:#{@name}"
        end

        def to_s
          "#{xml_name} : #{attrs}"
        end

        # Truthy when the parent's class declared this node's name through
        # +not_enclosing+ and both nodes share the same prefix.
        def not_enclosing?
          return false if root?

          tags = parent.class.not_enclosing_tags
          tags && tags.find { |tag| @prefix == parent.prefix && @name == tag }
        end

        def self.included(base)
          base.extend ClassMethods
        end

        # Class-level macros available to every class that includes Node.
        module ClassMethods
          attr_reader :not_enclosing_tags

          # Registers +tag+ as a child element name for #not_enclosing?.
          def not_enclosing(tag)
            @not_enclosing_tags ||= []
            @not_enclosing_tags << tag
          end
        end
      end
    end
  end
end
metadata ADDED
@@ -0,0 +1,51 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: doc2text
3
+ version: !ruby/object:Gem::Version
4
+ version: '0.1'
5
+ platform: ruby
6
+ authors:
7
+ - Valentin Aitken
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-07-12 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Parses odt to markdown
14
+ email: bostko@gmail.com
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files: []
18
+ files:
19
+ - lib/doc2text.rb
20
+ - lib/doc2text/content.rb
21
+ - lib/doc2text/errors.rb
22
+ - lib/doc2text/markdown.rb
23
+ - lib/doc2text/namespaces.rb
24
+ - lib/doc2text/odt.rb
25
+ - lib/doc2text/odt_xml_node.rb
26
+ homepage: https://github.com/bostko/doc2text
27
+ licenses:
28
+ - GPL
29
+ metadata: {}
30
+ post_install_message:
31
+ rdoc_options: []
32
+ require_paths:
33
+ - lib
34
+ required_ruby_version: !ruby/object:Gem::Requirement
35
+ requirements:
36
+ - - ">="
37
+ - !ruby/object:Gem::Version
38
+ version: '0'
39
+ required_rubygems_version: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ version: '0'
44
+ requirements: []
45
+ rubyforge_project:
46
+ rubygems_version: 2.3.0
47
+ signing_key:
48
+ specification_version: 4
49
+ summary: Translates odt to markdown
50
+ test_files: []
51
+ has_rdoc: