doc2text 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 1821e833815ea821090507cea37fdec0c68dc2af
4
+ data.tar.gz: 7815dd2e3f7fbf1f822959e4a120ef0e0bcbef79
5
+ SHA512:
6
+ metadata.gz: 5b3a0e9729eccccd888432527455336214d52ad996920593e30edab1d2e5f29bcdefdca19dfdbf9f7bf686bfabb8a6888cf706a386f058d7ee07b9bacac1e9e8
7
+ data.tar.gz: 6ffbb43bd9c8e4eac000b4733f3bcb149a6cee3b8cd608a22a0ca4516fbbc7317a2d30b4b0f15d8dee0c103df509ee24b029393503bd4bb40e16d22ed6c1543a
data/lib/doc2text.rb ADDED
@@ -0,0 +1,11 @@
1
+ require 'nokogiri'
2
+ #require 'nokogiri/xml'
3
+ require 'fileutils'
4
+
5
+ require 'doc2text/odt'
6
+ require 'doc2text/odt_xml_node'
7
+ require 'doc2text/namespaces'
8
+ require 'doc2text/markdown'
9
+ require 'doc2text/errors'
10
+
11
+ require 'doc2text/content'
@@ -0,0 +1,25 @@
1
module Doc2Text
  module Odt
    module Content
      # SAX handler for an ODT content stream. Every parser event is
      # forwarded to the markdown document handed to the constructor.
      class Document < ::Nokogiri::XML::SAX::Document
        # markdown_document must respond to new_node, close_node and <<.
        def initialize(markdown_document)
          @markdown_document = markdown_document
        end

        # An element was opened: register it under its prefix and local name.
        def start_element_namespace(name, attrs = [], prefix = nil, uri = nil, ns = [])
          @markdown_document.new_node(prefix, name, attrs)
        end

        # An element was closed: let the markdown document close the node.
        def end_element_namespace(name, prefix = nil, uri = nil)
          @markdown_document.close_node(prefix, name)
        end

        # Character data: chunks consisting only of whitespace are dropped,
        # everything else is appended verbatim.
        def characters(string)
          return if string.strip.empty?

          @markdown_document << string
        end
      end
    end
  end
end
@@ -0,0 +1,5 @@
1
module Doc2Text
  # Error raised by the library when the XML being processed, or a query
  # against the node tree built from it, is not what the code expects.
  class XmlError < StandardError; end
end
@@ -0,0 +1,85 @@
1
module Doc2Text
  module Markdown
    # Receives element open/close events and text, keeps the tree of nodes
    # that are currently relevant and appends the text each node produces
    # (node.open / node.close) to the output.
    class Document
      # output - any object responding to << and close (e.g. a File).
      def initialize(output)
        @output = output
        @automatic_styles = {}
      end

      # Creates the node for an opening element and makes it the current
      # node. The first node becomes the root; its opening text is not
      # written to the output.
      def new_node(prefix, name, attrs)
        if @xml_root
          child = Odt::XmlNodes::Node.create_node prefix, name, @current_node, attrs, self
          @current_node.children << child
          @current_node = child
          self << @current_node.open
        else
          @xml_root = @current_node = Odt::XmlNodes::Node.create_node prefix, name, nil, attrs, self
        end
      end

      # Closes the element prefix:name. The element must be the current node
      # or its parent (in which case both are closed, using the parent's
      # delete_on_close? for both).
      #
      # Raises Doc2Text::XmlError when the closing tag matches neither, or
      # when there is no open node at all.
      def close_node(prefix, name)
        closing = "#{prefix}:#{name}"
        if named?(@current_node, closing)
          remove_current_node! @current_node.delete_on_close?
        elsif @current_node && named?(@current_node.parent, closing)
          remove = @current_node.parent.delete_on_close?
          2.times { remove_current_node! remove }
        else
          # TODO remove this redundant(tree build algorithm) checks
          raise Doc2Text::XmlError, "!Close node child #{prefix} #{name} IS NOT correct, CURRENT_ELEM #{@current_node}"
        end
      end

      # Writes the closing text of the current node, moves up to its parent
      # and, when +remove+ is true, detaches the closed node from that parent.
      def remove_current_node!(remove = true)
        return unless @current_node

        self << @current_node.close
        node_for_deletion = @current_node
        @current_node = @current_node.parent
        return unless @current_node

        @current_node.remove_last_child! node_for_deletion if remove
      end

      # Appends a string to the output.
      def <<(string)
        @output << string
      end

      # Closes the output.
      def close
        @output.close
      end

      # Debug helper: prints +node+ and all its descendants, depth first.
      def print_tree(node)
        puts node
        node.children.each { |child| print_tree child }
      end

      # Select nodes xpath style
      # - supports selecting from the root node
      #
      # Only absolute paths made of element names ("/a:b/c:d") are accepted.
      # The whole string is validated (\A..\z), so a multi-line string can no
      # longer slip through by having one valid line.
      #
      # Returns an Array of matching nodes; empty when nothing matches or when
      # no root node exists yet.
      # Raises Doc2Text::XmlError for any other syntax.
      def xpath(string)
        unless /\A(\/[\w:\-]+)+\z/ =~ string
          raise Doc2Text::XmlError, 'it does not support this xpath syntax'
        end
        return [] unless @xml_root

        path = string.scan(/[\w:\-]+/)
        last_index = path.length - 1
        seek_nodes = [@xml_root]
        path.each_with_index do |xml_name, index|
          seek_nodes = seek_nodes.select { |node| node.xml_name == xml_name }
          # Descend one level for every step except the final one.
          seek_nodes = seek_nodes.flat_map(&:children) unless index == last_index
          break if seek_nodes.empty?
        end
        seek_nodes
      end

      private

      # True when +node+ exists and its qualified name equals +xml_name+.
      def named?(node, xml_name)
        !node.nil? && node.xml_name == xml_name
      end
    end
  end
end
@@ -0,0 +1,169 @@
1
module Doc2Text
  module Odt
    module XmlNodes
      # Node class without any behaviour of its own beyond what Node provides.
      class Generic
        include Node
      end

      #
      # These are the namespaces available in the open document format
      # http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os.html
      #
      module Office
        # office:automatic-styles - kept in the tree after it is closed.
        class AutomaticStyles
          include Node

          def visit
            :automatic_styles
          end

          def delete_on_close?
            false
          end
        end

        # office:document-content - kept in the tree after it is closed.
        class DocumentContent
          include Node

          def delete_on_close?
            false # required for testing purposes. After a document has been parsed, some tests could be run against the tree built
          end
        end
      end

      module Animation; end
      module Chart; end
      module Config; end
      module Database; end
      module Dr3d; end
      module Drawing; end
      module Form; end
      module Manifest; end
      module Meta; end
      module DataStyle; end
      module Presentation; end
      module Script; end
      module Table; end

      module Style
        # style:style - kept in the tree after it is closed.
        class Style
          include Node

          def delete_on_close?
            false
          end
        end

        # style:text-properties - kept in the tree after it is closed.
        class TextProperties
          include Node

          def delete_on_close?
            false
          end
        end
      end

      module XslFoCompatible; end
      module SvgCompatible; end
      module SmilCompatible; end
      module Of; end

      # Namespace for text:* elements and, at the same time, a mixin for the
      # text nodes that wrap their content in markdown emphasis markers
      # collected in @enclosing_style.
      module Text
        # Same signature as Node#initialize. When a text:style-name attribute
        # is present, the referenced style is looked up and translated into
        # markers.
        def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_document = nil)
          super parent, attrs, prefix, name, markdown_document
          @markdown_document = markdown_document
          @enclosing_style = []
          style_attr = attrs.find { |attr| attr.prefix == 'text' && attr.localname == 'style-name' }
          fetch_style style_attr.value if style_attr
        end

        # Appends '_' for an italic and '**' for a bold setting found in the
        # style:text-properties children of +style+. Does nothing when
        # +style+ is nil.
        def fetch_common_style(style)
          return unless style

          text_properties = style.children.select { |property| property.xml_name == 'style:text-properties' }
          text_properties.each do |text_property|
            text_property.attrs.each do |attr|
              next unless attr.prefix == 'style'

              if attr.localname == 'font-style-complex' && attr.value == 'italic'
                @enclosing_style << '_'
              elsif attr.localname == 'font-weight-complex' && attr.value == 'bold'
                @enclosing_style << '**'
              end
            end
          end
        end

        # Finds the automatic style called +style_name+ whose family equals
        # the including class' style_family and applies it.
        #
        # Nothing is looked up when the node has no markdown document or its
        # class declares no style_family (ListItem does not), instead of
        # failing with NoMethodError.
        def fetch_style(style_name)
          return unless @markdown_document && self.class.respond_to?(:style_family)

          family = self.class.style_family
          styles = @markdown_document.xpath '/office:document-content/office:automatic-styles/style:style'
          style = styles.find do |candidate|
            candidate.attrs.any? { |attr| attr.prefix == 'style' && attr.localname == 'family' && attr.value == family } &&
              candidate.attrs.any? { |attr| attr.prefix == 'style' && attr.localname == 'name' && attr.value == style_name }
          end
          fetch_common_style style
        end

        # http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#__RefHeading__1419256_253892949
        class P
          include Node
          include Text

          def self.style_family
            'paragraph'
          end

          # A paragraph starts on a new line, followed by its opening markers.
          def open
            "\n#{@enclosing_style.join}"
          end

          # Markers are closed in reverse order, then the line ends.
          def close
            "#{@enclosing_style.reverse.join}\n"
          end
        end

        class LineBreak
          include Node

          def open
            '<br/>'
          end
        end

        # http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#__RefHeading__1419264_253892949
        class Span
          include Node
          include Text

          def self.style_family
            'text'
          end

          def open
            @enclosing_style.join
          end

          def close
            @enclosing_style.reverse.join
          end
        end

        # http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#__RefHeading__1415154_253892949
        class ListItem
          include Node
          include Text

          not_enclosing 'p'

          def open
            '* '
          end

          def close
            "\n"
          end
        end
      end
    end
  end
end
@@ -0,0 +1,73 @@
1
+ require 'zip'
2
+
3
module Doc2Text
  module Odt
    # An ODT file on disk. The archive is unpacked into a hidden directory
    # next to the document, parsed from there and removed again.
    class Document
      EXTRACT_EXTENSION = 'unpacked_odt'

      # Unpacks +input+, writes its markdown rendering to +output_filename+
      # and removes the unpacked files, also when parsing fails.
      def self.parse_and_save(input, output_filename)
        odt = new input
        begin
          odt.unpack
          output = File.open output_filename, 'w'
          markdown = Markdown::Document.new output
          begin
            odt.parse markdown
          ensure
            markdown.close
          end
        ensure
          odt.clean
        end
      end

      # Feeds the unpacked content.xml through a SAX parser into +markdown+.
      # The file handle is closed once parsing has finished.
      def parse(markdown)
        content = ::Doc2Text::Odt::Content::Document.new markdown
        parser = Nokogiri::XML::SAX::Parser.new(content) # { |config| config.strict}
        open('content.xml') { |io| parser.parse io }
      end

      # document_path - path of the ODT file.
      def initialize(document_path)
        @document_path = document_path
      end

      # Extracts every archive entry below extract_path.
      #
      # Raises Doc2Text::XmlError when an entry name would resolve outside of
      # the extraction directory (e.g. "../../x"), because entry names come
      # from the archive and must not be trusted.
      def unpack
        root = File.expand_path(extract_path)
        Zip::File.open(@document_path) do |zip_file|
          Dir.mkdir(extract_path)
          zip_file.each do |entry|
            target = File.expand_path(entry.name, root)
            unless target.start_with?(root + File::SEPARATOR)
              raise XmlError, "Refusing to extract #{entry.name} outside of #{root}"
            end

            FileUtils.mkdir_p File.dirname(target)
            zip_file.extract entry, target
          end
        end
      end

      # Removes the extraction directory, but only if it contains both
      # content.xml and mimetype.
      def clean
        required = [extract_path, File.join(extract_path, 'content.xml'), File.join(extract_path, 'mimetype')]
        FileUtils.rm_r extract_path if required.all? { |file| File.exist?(file) }
      end

      # Open file from the current odt
      #
      # Without a block the open File is returned and the caller has to close
      # it; with a block the file is yielded and closed afterwards.
      def open(filename, &block)
        File.open File.join(extract_path, filename), 'r', &block
      end

      # Parse xml file from the current odt
      #
      # Checks that the root element is office:<rood_node_name> and returns a
      # freshly opened handle on the file (to be closed by the caller).
      # Raises XmlError when the root element is missing or different.
      def xml_file(filename, rood_node_name)
        doc = open(filename) do |io|
          Nokogiri::XML::Document.parse(io) { |config| config.strict }
        end
        root_node = doc.root
        namespace = root_node && root_node.namespace
        unless root_node && root_node.name == rood_node_name && namespace && namespace.prefix == 'office'
          raise XmlError, 'Document does not have correct root element'
        end

        open(filename)
      end

      # Directory the archive is unpacked to: ".<file name>_unpacked_odt"
      # inside the directory of the document.
      def extract_path
        File.join File.dirname(@document_path), ".#{File.basename(@document_path)}_#{EXTRACT_EXTENSION}"
      end
    end
  end
end
@@ -0,0 +1,98 @@
1
module Doc2Text
  module Odt
    module XmlNodes
      # Mixin shared by all node classes of the document tree. Including it
      # also extends the class with ClassMethods (see not_enclosing).
      module Node
        attr_reader :parent, :children, :attrs, :prefix, :name

        # Builds the node for element prefix:name. The class is looked up as
        # XmlNodes::<Prefix>::<Name> (dash separated words are camel cased);
        # Generic is used when no such node class exists.
        def self.create_node(prefix, name, parent = nil, attrs = [], markdown_document = nil)
          node_class(prefix, name).new(parent, attrs, prefix, name, markdown_document)
        end

        # Resolves the class for prefix:name. Falls back to Generic when the
        # prefix or name is missing, the constant is unknown or not a valid
        # constant name, or the constant found is not a class including Node
        # (for example one of the empty namespace modules).
        def self.node_class(prefix, name)
          return Generic unless prefix && name

          candidate = XmlNodes.const_get "#{titleize prefix}::#{titleize name}"
          candidate.is_a?(Class) && candidate < Node ? candidate : Generic
        rescue NameError
          Generic
        end
        private_class_method :node_class

        # "list-item" => "ListItem"
        def self.titleize(tag)
          tag.split('-').map(&:capitalize).join
        end

        # markdown_document is accepted for signature compatibility with the
        # including classes and is not stored here.
        def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_document = nil)
          @parent, @attrs, @prefix, @name = parent, attrs, prefix, name
          @children = []
          @has_text = false
        end

        def root?
          !@parent
        end

        def has_text?
          @has_text
        end

        # Text written to the output when the element is opened.
        def open
          ''
        end

        # Text written to the output when the element is closed.
        def close
          ''
        end

        def <<(child)
          @children << child
        end

        # Whether the node is detached from its parent once it is closed.
        def delete_on_close?
          true
        end

        # Two nodes are eql? when they have the same qualified name.
        def eql?(object)
          return false unless object.is_a? Node

          object.xml_name == xml_name
        end

        # True for nodes without a dedicated class. Node is a module, so the
        # check has to be made against Generic.
        def generic?
          instance_of? Generic
        end

        # Removes +child+, which must be the most recently added child.
        # Raises Doc2Text::XmlError otherwise.
        def remove_last_child!(child)
          unless child.equal?(@children.last)
            # TODO remove this redundant(tree build algorithm) checks
            raise Doc2Text::XmlError, "!The child #{child} IS NOT among the children of #{self}"
          end

          @children.pop
        end

        # Qualified element name, "prefix:name".
        def xml_name
          "#{@prefix}:#{@name}"
        end

        def to_s
          "#{xml_name} : #{attrs}"
        end

        # Truthy when the parent's class listed this node's name through
        # not_enclosing and both share the same prefix.
        def not_enclosing?
          return false if root?

          tags = parent.class.not_enclosing_tags
          tags && tags.find { |tag| @prefix == parent.prefix && @name == tag }
        end

        def self.included(base)
          base.extend ClassMethods
        end

        module ClassMethods
          attr_reader :not_enclosing_tags

          # Registers a child tag name for not_enclosing?.
          def not_enclosing(tag)
            @not_enclosing_tags ||= []
            @not_enclosing_tags << tag
          end
        end
      end
    end
  end
end
metadata ADDED
@@ -0,0 +1,51 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: doc2text
3
+ version: !ruby/object:Gem::Version
4
+ version: '0.1'
5
+ platform: ruby
6
+ authors:
7
+ - Valentin Aitken
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-07-12 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Parses odt to markdown
14
+ email: bostko@gmail.com
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files: []
18
+ files:
19
+ - lib/doc2text.rb
20
+ - lib/doc2text/content.rb
21
+ - lib/doc2text/errors.rb
22
+ - lib/doc2text/markdown.rb
23
+ - lib/doc2text/namespaces.rb
24
+ - lib/doc2text/odt.rb
25
+ - lib/doc2text/odt_xml_node.rb
26
+ homepage: https://github.com/bostko/doc2text
27
+ licenses:
28
+ - GPL
29
+ metadata: {}
30
+ post_install_message:
31
+ rdoc_options: []
32
+ require_paths:
33
+ - lib
34
+ required_ruby_version: !ruby/object:Gem::Requirement
35
+ requirements:
36
+ - - ">="
37
+ - !ruby/object:Gem::Version
38
+ version: '0'
39
+ required_rubygems_version: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ version: '0'
44
+ requirements: []
45
+ rubyforge_project:
46
+ rubygems_version: 2.3.0
47
+ signing_key:
48
+ specification_version: 4
49
+ summary: Translates odt to markdown
50
+ test_files: []
51
+ has_rdoc: