doc2text 0.1 → 0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 1821e833815ea821090507cea37fdec0c68dc2af
4
- data.tar.gz: 7815dd2e3f7fbf1f822959e4a120ef0e0bcbef79
3
+ metadata.gz: 555e469e3b8ba226e8ffe20fa8052e377f037a74
4
+ data.tar.gz: 0ba30a01a5a55188202b3224a8ade79becf9099d
5
5
  SHA512:
6
- metadata.gz: 5b3a0e9729eccccd888432527455336214d52ad996920593e30edab1d2e5f29bcdefdca19dfdbf9f7bf686bfabb8a6888cf706a386f058d7ee07b9bacac1e9e8
7
- data.tar.gz: 6ffbb43bd9c8e4eac000b4733f3bcb149a6cee3b8cd608a22a0ca4516fbbc7317a2d30b4b0f15d8dee0c103df509ee24b029393503bd4bb40e16d22ed6c1543a
6
+ metadata.gz: a608fd6d8f606c6dd29669a50e6219ce0ca505bce2aa58ad7d6005e81b22bf34a6b9a9db59cade78fa66c04906f71fede4059f3e03666883dbd7cc6934c41d7f
7
+ data.tar.gz: 01e299c31d90f31c4cdaaadb2010ad50f67c305364a12e8afb826f2e9af93b7e7a93e49f77778366c1d8c40a138d8506846c3d65a1fa9c9ee31c3532c4d13414
@@ -2,21 +2,21 @@ module Doc2Text
2
2
  module Odt
3
3
  module Content
4
4
  class Document < ::Nokogiri::XML::SAX::Document
5
- def initialize(markdown_document)
6
- @markdown_document = markdown_document
5
+ def initialize(markdown_odt_parser)
6
+ @markdown_odt_parser = markdown_odt_parser
7
7
  end
8
8
 
9
9
  def start_element_namespace(name ,attrs = [], prefix = nil, uri = nil, ns = [])
10
- @markdown_document.new_node prefix, name, attrs
10
+ @markdown_odt_parser.new_node prefix, name, attrs
11
11
  end
12
12
 
13
13
  def end_element_namespace(name, prefix = nil, uri = nil)
14
- @markdown_document.close_node prefix, name
14
+ @markdown_odt_parser.close_node prefix, name
15
15
  end
16
16
 
17
17
  def characters(string)
18
18
  unless string.strip.empty?
19
- @markdown_document << string
19
+ @markdown_odt_parser.text string
20
20
  end
21
21
  end
22
22
  end
@@ -1,6 +1,8 @@
1
+ require 'logger'
2
+
1
3
  module Doc2Text
2
4
  module Markdown
3
- class Document
5
+ class OdtParser
4
6
  def initialize(output)
5
7
  @output = output
6
8
  @automatic_styles = {}
@@ -13,24 +15,23 @@ module Doc2Text
13
15
  new_node = Odt::XmlNodes::Node.create_node prefix, name, @current_node, attrs, self
14
16
  @current_node.children << new_node
15
17
  @current_node = new_node
16
- self << @current_node.open
17
18
  end
18
19
  end
19
20
 
20
21
  def close_node(prefix, name)
21
- if Odt::XmlNodes::Node.create_node(prefix, name).eql? @current_node
22
+ if Odt::XmlNodes::Node.create_node(prefix, name, nil, [], self).eql? @current_node
23
+ return if !@current_node
22
24
  if @current_node.delete_on_close?
23
- remove_current_node!
24
- else
25
- remove_current_node! false
25
+ # if @current_node.parent
26
+ # @output << @current_node.parent.expand
27
+ # @current_node.parent.un_delete
28
+ # else
29
+ @output << @current_node.expand
30
+ # end
26
31
  end
27
- elsif Odt::XmlNodes::Node.create_node(prefix, name).eql? @current_node.parent
28
- if @current_node.parent.delete_on_close?
29
- remove_current_node!
30
- remove_current_node!
31
- else
32
- remove_current_node! false
33
- remove_current_node! false
32
+ @current_node = @current_node.parent
33
+ if @current_node && @current_node.delete_on_close?
34
+ @current_node.delete
34
35
  end
35
36
  else
36
37
  # TODO remove this redundant(tree build algorithm) checks
@@ -38,19 +39,9 @@ module Doc2Text
38
39
  end
39
40
  end
40
41
 
41
- def remove_current_node!(remove = true)
42
- return if !@current_node
43
- self << @current_node.close
44
- node_for_deletion = @current_node
45
- @current_node = @current_node.parent
46
- return unless @current_node
47
- if remove
48
- @current_node.remove_last_child! node_for_deletion
49
- end
50
- end
51
-
52
- def <<(string)
53
- @output << string
42
+ def text(string)
43
+ plain_text = Odt::XmlNodes::PlainText.new(string)
44
+ @current_node.children << plain_text
54
45
  end
55
46
 
56
47
  def close
@@ -80,6 +71,10 @@ module Doc2Text
80
71
  raise Doc2Text::XmlError, 'it does not support this xpath syntax'
81
72
  end
82
73
  end
74
+
75
+ def logger
76
+ @logger ||= Logger.new(STDOUT)
77
+ end
83
78
  end
84
79
  end
85
80
  end
data/lib/doc2text/odt.rb CHANGED
@@ -10,7 +10,7 @@ module Doc2Text
10
10
  begin
11
11
  odt.unpack
12
12
  output = File.open output_filename, 'w'
13
- markdown = Markdown::Document.new output
13
+ markdown = Markdown::OdtParser.new output
14
14
  begin
15
15
  odt.parse markdown
16
16
  ensure
@@ -1,6 +1,18 @@
1
1
  module Doc2Text
2
2
  module Odt
3
3
  module XmlNodes
4
+ class PlainText
5
+ include Node
6
+
7
+ attr_accessor :text
8
+
9
+ alias_method :expand, :text
10
+
11
+ def initialize(text)
12
+ @text = text
13
+ end
14
+ end
15
+
4
16
  class Generic
5
17
  include Node
6
18
  end
@@ -26,7 +38,15 @@ module Doc2Text
26
38
  include Node
27
39
 
28
40
  def delete_on_close?
29
- false # required for testing purposes. After a document has been parsed, some tests could be run against the tree built
41
+ true
42
+ end
43
+ end
44
+
45
+ class Text
46
+ include Node
47
+
48
+ def delete_on_close?
49
+ true
30
50
  end
31
51
  end
32
52
  end
@@ -43,7 +63,26 @@ module Doc2Text
43
63
  module DataStyle; end
44
64
  module Presentation; end
45
65
  module Script; end
46
- module Table; end
66
+ module Table
67
+ class TableRow
68
+ include Node
69
+
70
+ def expand
71
+ header_delimiter = parent.children.count >= 2 && parent.children[1] == self ? "\n|---|---|" : ''
72
+ result = "\n#{@children.select(&:not_deleted?).map(&:expand).join.strip.gsub "\n", ''} |#{header_delimiter}"
73
+ delete
74
+ result
75
+ end
76
+ end
77
+
78
+ class TableCell
79
+ include Node
80
+
81
+ def open
82
+ ' | '
83
+ end
84
+ end
85
+ end
47
86
  module Style
48
87
  class Style
49
88
  include Node
@@ -67,19 +106,24 @@ module Doc2Text
67
106
  module Of; end
68
107
 
69
108
  module Text
70
- def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_document = nil)
71
- super parent, attrs, prefix, name, markdown_document
72
- @markdown_document = markdown_document
109
+ def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_odt_parser = nil)
110
+ super parent, attrs, prefix, name
111
+ @markdown_odt_parser = markdown_odt_parser
73
112
  style_index = attrs.index { |attr| attr.prefix == 'text' && attr.localname == 'style-name' }
74
113
  @enclosing_style = []
75
- if style_index
76
- fetch_style attrs[style_index].value
114
+ if style_index and fetch_style?
115
+ elem_style = find_style attrs[style_index].value
116
+ fetch_style elem_style
77
117
  end
78
118
  end
79
119
 
80
- def fetch_common_style(style)
81
- if style
82
- style.children.select { |style_property| style_property.xml_name == 'style:text-properties' }.each { |text_property|
120
+ def fetch_style?
121
+ true
122
+ end
123
+
124
+ def fetch_style(elem_style)
125
+ if elem_style
126
+ elem_style.children.select { |style_property| style_property.xml_name == 'style:text-properties' }.each { |text_property|
83
127
  text_property.attrs.each { |attr|
84
128
  if attr.prefix == 'style'
85
129
  if attr.localname == 'font-style-complex' && attr.value == 'italic'
@@ -93,11 +137,10 @@ module Doc2Text
93
137
  end
94
138
  end
95
139
 
96
- def fetch_style(style_name)
97
- styles = @markdown_document.xpath '/office:document-content/office:automatic-styles/style:style'
98
- style = styles.find { |style| style.attrs.index { |attr| attr.prefix == 'style' && attr.localname == 'family' && attr.value == self.class.style_family } &&
140
+ def find_style(style_name)
141
+ styles = @markdown_odt_parser.xpath '/office:document-content/office:automatic-styles/style:style'
142
+ styles.find { |style| style.attrs.index { |attr| attr.prefix == 'style' && attr.localname == 'family' } &&
99
143
  style.attrs.index { |attr| attr.prefix == 'style' && attr.localname == 'name' && attr.value == style_name } }
100
- fetch_common_style style
101
144
  end
102
145
 
103
146
  # http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#__RefHeading__1419256_253892949
@@ -105,8 +148,8 @@ module Doc2Text
105
148
  include Node
106
149
  include Text
107
150
 
108
- def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_document = nil)
109
- super parent, attrs, prefix, name, markdown_document
151
+ def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_odt_parser = nil)
152
+ super parent, attrs, prefix, name, markdown_odt_parser
110
153
  end
111
154
 
112
155
  def self.style_family
@@ -153,15 +196,46 @@ module Doc2Text
153
196
  include Node
154
197
  include Text
155
198
 
156
- not_enclosing 'p'
199
+ def expand
200
+ result = "* #{@children.select(&:not_deleted?).map(&:expand).join.strip.gsub /\n{2,}/, "\n"}\n"
201
+ delete
202
+ result.clone
203
+ end
157
204
 
158
- def open
159
- '* '
205
+ def fetch_style?
206
+ false
160
207
  end
161
208
 
162
- def close
209
+ def delete_on_close?
210
+ false
211
+ end
212
+ end
213
+
214
+ class List
215
+ include Node
216
+ include Text
217
+
218
+ def open
163
219
  "\n"
164
220
  end
221
+
222
+ def fetch_style(elem_style)
223
+ if elem_style
224
+ elem_style.children.select { |style_property| style_property.xml_name == 'style:text-properties' }.each { |text_property|
225
+ text_property.attrs.each { |attr|
226
+ if attr.prefix == 'style'
227
+ if attr.localname == 'list-level-style-number' && attr.value == 'Numbering_20_Symbols'
228
+ @enclosing_style << '_'
229
+ end
230
+ end
231
+ }
232
+ }
233
+ end
234
+ end
235
+
236
+ def delete_on_close?
237
+ false
238
+ end
165
239
  end
166
240
  end
167
241
  end
@@ -3,14 +3,16 @@ module Doc2Text
3
3
  module XmlNodes
4
4
  module Node
5
5
  attr_reader :parent, :children, :attrs, :prefix, :name
6
+ attr_accessor :text
6
7
 
7
- def self.create_node(prefix, name, parent = nil, attrs = [], markdown_document = nil)
8
+ def self.create_node(prefix, name, parent = nil, attrs = [], markdown_odt_parser = nil)
8
9
  begin
9
10
  clazz = XmlNodes.const_get "#{titleize prefix}::#{titleize name}"
10
11
  rescue NameError => e
11
- Generic.new(parent, attrs, prefix, name, markdown_document)
12
+ # markdown_odt_parser.logger.warn "No such <#{prefix}:#{name}> found"
13
+ Generic.new(parent, attrs, prefix, name, markdown_odt_parser)
12
14
  else
13
- clazz.new(parent, attrs, prefix, name, markdown_document)
15
+ clazz.new(parent, attrs, prefix, name, markdown_odt_parser)
14
16
  end
15
17
  end
16
18
 
@@ -18,8 +20,8 @@ module Doc2Text
18
20
  tag.split('-').map(&:capitalize).join
19
21
  end
20
22
 
21
- def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_document = nil)
22
- @parent, @attrs, @prefix, @name = parent, attrs, prefix, name
23
+ def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_odt_parser = nil)
24
+ @parent, @attrs, @prefix, @name, @markdown_odt_parser = parent, attrs, prefix, name, markdown_odt_parser
23
25
  @children = []
24
26
  @has_text = false
25
27
  end
@@ -40,12 +42,22 @@ module Doc2Text
40
42
  ''
41
43
  end
42
44
 
43
- def <<(child)
44
- @children << child
45
+ def delete_on_close?
46
+ false
45
47
  end
46
48
 
47
- def delete_on_close?
48
- true
49
+ def not_deleted?
50
+ !@deleted
51
+ end
52
+
53
+ def delete
54
+ @deleted = true
55
+ # @children.each { |child| child.delete }
56
+ # @children = []
57
+ end
58
+
59
+ def un_delete
60
+ @deleted = false
49
61
  end
50
62
 
51
63
  def eql?(object)
@@ -57,15 +69,6 @@ module Doc2Text
57
69
  instance_of? Node
58
70
  end
59
71
 
60
- def remove_last_child!(child)
61
- unless child === @children.last
62
- # TODO remove this redundant(tree build algorithm) checks
63
- raise Doc2Text::XmlError, "!The child #{child} IS NOT among the children of #{self}"
64
- else
65
- @children.pop
66
- end
67
- end
68
-
69
72
  def xml_name
70
73
  "#{@prefix}:#{@name}"
71
74
  end
@@ -74,6 +77,12 @@ module Doc2Text
74
77
  "#{xml_name} : #{attrs}"
75
78
  end
76
79
 
80
+ def expand
81
+ expanded = "#{open}#{@children.select(&:not_deleted?).map(&:expand).join}#{close}"
82
+ delete
83
+ expanded.clone
84
+ end
85
+
77
86
  def not_enclosing?
78
87
  !root? && parent.class.not_enclosing_tags && parent.class.not_enclosing_tags.find do |tag|
79
88
  @prefix == parent.prefix && @name == tag
data/lib/doc2text.rb CHANGED
@@ -4,8 +4,8 @@ require 'fileutils'
4
4
 
5
5
  require 'doc2text/odt'
6
6
  require 'doc2text/odt_xml_node'
7
- require 'doc2text/namespaces'
8
- require 'doc2text/markdown'
7
+ require 'doc2text/odt_xml_namespaces'
8
+ require 'doc2text/markdown_odt_parser'
9
9
  require 'doc2text/errors'
10
10
 
11
11
  require 'doc2text/content'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: doc2text
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.1'
4
+ version: '0.2'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Valentin Aitken
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-07-12 00:00:00.000000000 Z
11
+ date: 2014-10-11 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Parses odt to markdown
14
14
  email: bostko@gmail.com
@@ -19,9 +19,9 @@ files:
19
19
  - lib/doc2text.rb
20
20
  - lib/doc2text/content.rb
21
21
  - lib/doc2text/errors.rb
22
- - lib/doc2text/markdown.rb
23
- - lib/doc2text/namespaces.rb
22
+ - lib/doc2text/markdown_odt_parser.rb
24
23
  - lib/doc2text/odt.rb
24
+ - lib/doc2text/odt_xml_namespaces.rb
25
25
  - lib/doc2text/odt_xml_node.rb
26
26
  homepage: https://github.com/bostko/doc2text
27
27
  licenses:
@@ -43,7 +43,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
43
43
  version: '0'
44
44
  requirements: []
45
45
  rubyforge_project:
46
- rubygems_version: 2.3.0
46
+ rubygems_version: 2.2.2
47
47
  signing_key:
48
48
  specification_version: 4
49
49
  summary: Translates odt to markdown