doc2text 0.1 → 0.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 1821e833815ea821090507cea37fdec0c68dc2af
4
- data.tar.gz: 7815dd2e3f7fbf1f822959e4a120ef0e0bcbef79
3
+ metadata.gz: 555e469e3b8ba226e8ffe20fa8052e377f037a74
4
+ data.tar.gz: 0ba30a01a5a55188202b3224a8ade79becf9099d
5
5
  SHA512:
6
- metadata.gz: 5b3a0e9729eccccd888432527455336214d52ad996920593e30edab1d2e5f29bcdefdca19dfdbf9f7bf686bfabb8a6888cf706a386f058d7ee07b9bacac1e9e8
7
- data.tar.gz: 6ffbb43bd9c8e4eac000b4733f3bcb149a6cee3b8cd608a22a0ca4516fbbc7317a2d30b4b0f15d8dee0c103df509ee24b029393503bd4bb40e16d22ed6c1543a
6
+ metadata.gz: a608fd6d8f606c6dd29669a50e6219ce0ca505bce2aa58ad7d6005e81b22bf34a6b9a9db59cade78fa66c04906f71fede4059f3e03666883dbd7cc6934c41d7f
7
+ data.tar.gz: 01e299c31d90f31c4cdaaadb2010ad50f67c305364a12e8afb826f2e9af93b7e7a93e49f77778366c1d8c40a138d8506846c3d65a1fa9c9ee31c3532c4d13414
@@ -2,21 +2,21 @@ module Doc2Text
2
2
  module Odt
3
3
  module Content
4
4
  class Document < ::Nokogiri::XML::SAX::Document
5
- def initialize(markdown_document)
6
- @markdown_document = markdown_document
5
+ def initialize(markdown_odt_parser)
6
+ @markdown_odt_parser = markdown_odt_parser
7
7
  end
8
8
 
9
9
  def start_element_namespace(name ,attrs = [], prefix = nil, uri = nil, ns = [])
10
- @markdown_document.new_node prefix, name, attrs
10
+ @markdown_odt_parser.new_node prefix, name, attrs
11
11
  end
12
12
 
13
13
  def end_element_namespace(name, prefix = nil, uri = nil)
14
- @markdown_document.close_node prefix, name
14
+ @markdown_odt_parser.close_node prefix, name
15
15
  end
16
16
 
17
17
  def characters(string)
18
18
  unless string.strip.empty?
19
- @markdown_document << string
19
+ @markdown_odt_parser.text string
20
20
  end
21
21
  end
22
22
  end
@@ -1,6 +1,8 @@
1
+ require 'logger'
2
+
1
3
  module Doc2Text
2
4
  module Markdown
3
- class Document
5
+ class OdtParser
4
6
  def initialize(output)
5
7
  @output = output
6
8
  @automatic_styles = {}
@@ -13,24 +15,23 @@ module Doc2Text
13
15
  new_node = Odt::XmlNodes::Node.create_node prefix, name, @current_node, attrs, self
14
16
  @current_node.children << new_node
15
17
  @current_node = new_node
16
- self << @current_node.open
17
18
  end
18
19
  end
19
20
 
20
21
  def close_node(prefix, name)
21
- if Odt::XmlNodes::Node.create_node(prefix, name).eql? @current_node
22
+ if Odt::XmlNodes::Node.create_node(prefix, name, nil, [], self).eql? @current_node
23
+ return if !@current_node
22
24
  if @current_node.delete_on_close?
23
- remove_current_node!
24
- else
25
- remove_current_node! false
25
+ # if @current_node.parent
26
+ # @output << @current_node.parent.expand
27
+ # @current_node.parent.un_delete
28
+ # else
29
+ @output << @current_node.expand
30
+ # end
26
31
  end
27
- elsif Odt::XmlNodes::Node.create_node(prefix, name).eql? @current_node.parent
28
- if @current_node.parent.delete_on_close?
29
- remove_current_node!
30
- remove_current_node!
31
- else
32
- remove_current_node! false
33
- remove_current_node! false
32
+ @current_node = @current_node.parent
33
+ if @current_node && @current_node.delete_on_close?
34
+ @current_node.delete
34
35
  end
35
36
  else
36
37
  # TODO remove this redundant(tree build algorithm) checks
@@ -38,19 +39,9 @@ module Doc2Text
38
39
  end
39
40
  end
40
41
 
41
- def remove_current_node!(remove = true)
42
- return if !@current_node
43
- self << @current_node.close
44
- node_for_deletion = @current_node
45
- @current_node = @current_node.parent
46
- return unless @current_node
47
- if remove
48
- @current_node.remove_last_child! node_for_deletion
49
- end
50
- end
51
-
52
- def <<(string)
53
- @output << string
42
+ def text(string)
43
+ plain_text = Odt::XmlNodes::PlainText.new(string)
44
+ @current_node.children << plain_text
54
45
  end
55
46
 
56
47
  def close
@@ -80,6 +71,10 @@ module Doc2Text
80
71
  raise Doc2Text::XmlError, 'it does not support this xpath syntax'
81
72
  end
82
73
  end
74
+
75
+ def logger
76
+ @logger ||= Logger.new(STDOUT)
77
+ end
83
78
  end
84
79
  end
85
80
  end
data/lib/doc2text/odt.rb CHANGED
@@ -10,7 +10,7 @@ module Doc2Text
10
10
  begin
11
11
  odt.unpack
12
12
  output = File.open output_filename, 'w'
13
- markdown = Markdown::Document.new output
13
+ markdown = Markdown::OdtParser.new output
14
14
  begin
15
15
  odt.parse markdown
16
16
  ensure
@@ -1,6 +1,18 @@
1
1
  module Doc2Text
2
2
  module Odt
3
3
  module XmlNodes
4
+ class PlainText
5
+ include Node
6
+
7
+ attr_accessor :text
8
+
9
+ alias_method :expand, :text
10
+
11
+ def initialize(text)
12
+ @text = text
13
+ end
14
+ end
15
+
4
16
  class Generic
5
17
  include Node
6
18
  end
@@ -26,7 +38,15 @@ module Doc2Text
26
38
  include Node
27
39
 
28
40
  def delete_on_close?
29
- false # required for testing purposes. After a document has been parsed, some tests could be run against the tree built
41
+ true
42
+ end
43
+ end
44
+
45
+ class Text
46
+ include Node
47
+
48
+ def delete_on_close?
49
+ true
30
50
  end
31
51
  end
32
52
  end
@@ -43,7 +63,26 @@ module Doc2Text
43
63
  module DataStyle; end
44
64
  module Presentation; end
45
65
  module Script; end
46
- module Table; end
66
+ module Table
67
+ class TableRow
68
+ include Node
69
+
70
+ def expand
71
+ header_delimiter = parent.children.count >= 2 && parent.children[1] == self ? "\n|---|---|" : ''
72
+ result = "\n#{@children.select(&:not_deleted?).map(&:expand).join.strip.gsub "\n", ''} |#{header_delimiter}"
73
+ delete
74
+ result
75
+ end
76
+ end
77
+
78
+ class TableCell
79
+ include Node
80
+
81
+ def open
82
+ ' | '
83
+ end
84
+ end
85
+ end
47
86
  module Style
48
87
  class Style
49
88
  include Node
@@ -67,19 +106,24 @@ module Doc2Text
67
106
  module Of; end
68
107
 
69
108
  module Text
70
- def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_document = nil)
71
- super parent, attrs, prefix, name, markdown_document
72
- @markdown_document = markdown_document
109
+ def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_odt_parser = nil)
110
+ super parent, attrs, prefix, name
111
+ @markdown_odt_parser = markdown_odt_parser
73
112
  style_index = attrs.index { |attr| attr.prefix == 'text' && attr.localname == 'style-name' }
74
113
  @enclosing_style = []
75
- if style_index
76
- fetch_style attrs[style_index].value
114
+ if style_index and fetch_style?
115
+ elem_style = find_style attrs[style_index].value
116
+ fetch_style elem_style
77
117
  end
78
118
  end
79
119
 
80
- def fetch_common_style(style)
81
- if style
82
- style.children.select { |style_property| style_property.xml_name == 'style:text-properties' }.each { |text_property|
120
+ def fetch_style?
121
+ true
122
+ end
123
+
124
+ def fetch_style(elem_style)
125
+ if elem_style
126
+ elem_style.children.select { |style_property| style_property.xml_name == 'style:text-properties' }.each { |text_property|
83
127
  text_property.attrs.each { |attr|
84
128
  if attr.prefix == 'style'
85
129
  if attr.localname == 'font-style-complex' && attr.value == 'italic'
@@ -93,11 +137,10 @@ module Doc2Text
93
137
  end
94
138
  end
95
139
 
96
- def fetch_style(style_name)
97
- styles = @markdown_document.xpath '/office:document-content/office:automatic-styles/style:style'
98
- style = styles.find { |style| style.attrs.index { |attr| attr.prefix == 'style' && attr.localname == 'family' && attr.value == self.class.style_family } &&
140
+ def find_style(style_name)
141
+ styles = @markdown_odt_parser.xpath '/office:document-content/office:automatic-styles/style:style'
142
+ styles.find { |style| style.attrs.index { |attr| attr.prefix == 'style' && attr.localname == 'family' } &&
99
143
  style.attrs.index { |attr| attr.prefix == 'style' && attr.localname == 'name' && attr.value == style_name } }
100
- fetch_common_style style
101
144
  end
102
145
 
103
146
  # http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#__RefHeading__1419256_253892949
@@ -105,8 +148,8 @@ module Doc2Text
105
148
  include Node
106
149
  include Text
107
150
 
108
- def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_document = nil)
109
- super parent, attrs, prefix, name, markdown_document
151
+ def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_odt_parser = nil)
152
+ super parent, attrs, prefix, name, markdown_odt_parser
110
153
  end
111
154
 
112
155
  def self.style_family
@@ -153,15 +196,46 @@ module Doc2Text
153
196
  include Node
154
197
  include Text
155
198
 
156
- not_enclosing 'p'
199
+ def expand
200
+ result = "* #{@children.select(&:not_deleted?).map(&:expand).join.strip.gsub /\n{2,}/, "\n"}\n"
201
+ delete
202
+ result.clone
203
+ end
157
204
 
158
- def open
159
- '* '
205
+ def fetch_style?
206
+ false
160
207
  end
161
208
 
162
- def close
209
+ def delete_on_close?
210
+ false
211
+ end
212
+ end
213
+
214
+ class List
215
+ include Node
216
+ include Text
217
+
218
+ def open
163
219
  "\n"
164
220
  end
221
+
222
+ def fetch_style(elem_style)
223
+ if elem_style
224
+ elem_style.children.select { |style_property| style_property.xml_name == 'style:text-properties' }.each { |text_property|
225
+ text_property.attrs.each { |attr|
226
+ if attr.prefix == 'style'
227
+ if attr.localname == 'list-level-style-number' && attr.value == 'Numbering_20_Symbols'
228
+ @enclosing_style << '_'
229
+ end
230
+ end
231
+ }
232
+ }
233
+ end
234
+ end
235
+
236
+ def delete_on_close?
237
+ false
238
+ end
165
239
  end
166
240
  end
167
241
  end
@@ -3,14 +3,16 @@ module Doc2Text
3
3
  module XmlNodes
4
4
  module Node
5
5
  attr_reader :parent, :children, :attrs, :prefix, :name
6
+ attr_accessor :text
6
7
 
7
- def self.create_node(prefix, name, parent = nil, attrs = [], markdown_document = nil)
8
+ def self.create_node(prefix, name, parent = nil, attrs = [], markdown_odt_parser = nil)
8
9
  begin
9
10
  clazz = XmlNodes.const_get "#{titleize prefix}::#{titleize name}"
10
11
  rescue NameError => e
11
- Generic.new(parent, attrs, prefix, name, markdown_document)
12
+ # markdown_odt_parser.logger.warn "No such <#{prefix}:#{name}> found"
13
+ Generic.new(parent, attrs, prefix, name, markdown_odt_parser)
12
14
  else
13
- clazz.new(parent, attrs, prefix, name, markdown_document)
15
+ clazz.new(parent, attrs, prefix, name, markdown_odt_parser)
14
16
  end
15
17
  end
16
18
 
@@ -18,8 +20,8 @@ module Doc2Text
18
20
  tag.split('-').map(&:capitalize).join
19
21
  end
20
22
 
21
- def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_document = nil)
22
- @parent, @attrs, @prefix, @name = parent, attrs, prefix, name
23
+ def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_odt_parser = nil)
24
+ @parent, @attrs, @prefix, @name, @markdown_odt_parser = parent, attrs, prefix, name, markdown_odt_parser
23
25
  @children = []
24
26
  @has_text = false
25
27
  end
@@ -40,12 +42,22 @@ module Doc2Text
40
42
  ''
41
43
  end
42
44
 
43
- def <<(child)
44
- @children << child
45
+ def delete_on_close?
46
+ false
45
47
  end
46
48
 
47
- def delete_on_close?
48
- true
49
+ def not_deleted?
50
+ !@deleted
51
+ end
52
+
53
+ def delete
54
+ @deleted = true
55
+ # @children.each { |child| child.delete }
56
+ # @children = []
57
+ end
58
+
59
+ def un_delete
60
+ @deleted = false
49
61
  end
50
62
 
51
63
  def eql?(object)
@@ -57,15 +69,6 @@ module Doc2Text
57
69
  instance_of? Node
58
70
  end
59
71
 
60
- def remove_last_child!(child)
61
- unless child === @children.last
62
- # TODO remove this redundant(tree build algorithm) checks
63
- raise Doc2Text::XmlError, "!The child #{child} IS NOT among the children of #{self}"
64
- else
65
- @children.pop
66
- end
67
- end
68
-
69
72
  def xml_name
70
73
  "#{@prefix}:#{@name}"
71
74
  end
@@ -74,6 +77,12 @@ module Doc2Text
74
77
  "#{xml_name} : #{attrs}"
75
78
  end
76
79
 
80
+ def expand
81
+ expanded = "#{open}#{@children.select(&:not_deleted?).map(&:expand).join}#{close}"
82
+ delete
83
+ expanded.clone
84
+ end
85
+
77
86
  def not_enclosing?
78
87
  !root? && parent.class.not_enclosing_tags && parent.class.not_enclosing_tags.find do |tag|
79
88
  @prefix == parent.prefix && @name == tag
data/lib/doc2text.rb CHANGED
@@ -4,8 +4,8 @@ require 'fileutils'
4
4
 
5
5
  require 'doc2text/odt'
6
6
  require 'doc2text/odt_xml_node'
7
- require 'doc2text/namespaces'
8
- require 'doc2text/markdown'
7
+ require 'doc2text/odt_xml_namespaces'
8
+ require 'doc2text/markdown_odt_parser'
9
9
  require 'doc2text/errors'
10
10
 
11
11
  require 'doc2text/content'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: doc2text
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.1'
4
+ version: '0.2'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Valentin Aitken
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-07-12 00:00:00.000000000 Z
11
+ date: 2014-10-11 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Parses odt to markdown
14
14
  email: bostko@gmail.com
@@ -19,9 +19,9 @@ files:
19
19
  - lib/doc2text.rb
20
20
  - lib/doc2text/content.rb
21
21
  - lib/doc2text/errors.rb
22
- - lib/doc2text/markdown.rb
23
- - lib/doc2text/namespaces.rb
22
+ - lib/doc2text/markdown_odt_parser.rb
24
23
  - lib/doc2text/odt.rb
24
+ - lib/doc2text/odt_xml_namespaces.rb
25
25
  - lib/doc2text/odt_xml_node.rb
26
26
  homepage: https://github.com/bostko/doc2text
27
27
  licenses:
@@ -43,7 +43,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
43
43
  version: '0'
44
44
  requirements: []
45
45
  rubyforge_project:
46
- rubygems_version: 2.3.0
46
+ rubygems_version: 2.2.2
47
47
  signing_key:
48
48
  specification_version: 4
49
49
  summary: Translates odt to markdown