doc2text 0.3.2 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: e795071f6df878e0427e2728848d170ecb1277d0
4
- data.tar.gz: 4c02f808c1d6c47bf7edf8f1d81fc534bdf2ab93
3
+ metadata.gz: f2530e3597409c35637493d9d8e9842b2f70f005
4
+ data.tar.gz: 304eb91cfa9b109bfa853cbc7d9736a14b7817bd
5
5
  SHA512:
6
- metadata.gz: d7a9314e425c425dc228e4f742874f3393ec77ab604e22ead973cb1430e5a01bd7395a5e8f6b6f2c71114aaad2736b5d43abcad30bb797a17a0ce842f6474ecd
7
- data.tar.gz: c82f62f3fabd0ccdf42ec6fb23d06cb3f6884676184e6a44e670e4a9f803b7fa2bc71ebd107ff067315e1ac52214c79c0477d9fd0a0388a0010f5c70fd915678
6
+ metadata.gz: 9ff147cd7f87f233a6fbbc9270e682426ffc3b5df514a5b7ca2b16b0405a4048901cb53972d5264c853eabce7532b9bb386f97847e5b072189119ec616b2961f
7
+ data.tar.gz: f992a33120ae4273966a54a15e02768abf4baa16473808775e938fed21163882e46c46333dfa7ea9a63cf5bd9e4ef02de4bbb8f50b7d9ab86dbe5a4a8ee6d3cc
@@ -2,10 +2,11 @@ require 'nokogiri'
2
2
  #require 'nokogiri/xml'
3
3
  require 'fileutils'
4
4
 
5
+ require 'doc2text/resolution'
5
6
  require 'doc2text/odt'
6
7
  require 'doc2text/odt_xml_node'
7
8
  require 'doc2text/odt_xml_namespaces'
8
9
  require 'doc2text/markdown_odt_parser'
9
10
  require 'doc2text/errors'
10
11
 
11
- require 'doc2text/content'
12
+ require 'doc2text/styles_parser'
@@ -2,13 +2,14 @@ require 'logger'
2
2
 
3
3
  module Doc2Text
4
4
  module Markdown
5
- class OdtParser
6
- def initialize(output)
5
+ class OdtParser < Nokogiri::XML::SAX::Document
6
+ def initialize(output, styles_xml_root = nil)
7
+ @styles_xml_root = styles_xml_root
7
8
  @output = output
8
9
  @automatic_styles = {}
9
10
  end
10
11
 
11
- def new_node(prefix, name, attrs)
12
+ def start_element_namespace(name ,attrs = [], prefix = nil, uri = nil, ns = [])
12
13
  unless @xml_root
13
14
  @xml_root = @current_node = Odt::XmlNodes::Node.create_node prefix, name, nil, attrs, self
14
15
  else
@@ -18,22 +19,19 @@ module Doc2Text
18
19
  end
19
20
  end
20
21
 
21
- def close_node(prefix, name)
22
- # if Odt::XmlNodes::Node.create_node(prefix, name, nil, [], self).eql? @current_node
23
- if @current_node.parent and @current_node.parent.office_text?
24
- @output << @current_node.expand
25
- @current_node.delete
26
- end
27
- @current_node = @current_node.parent
28
- # else
29
- # # TODO remove this redundant(tree build algorithm) checks
30
- # raise Doc2Text::XmlError, "!Close node child #{prefix} #{name} IS NOT correct, CURRENT_ELEM #{@current_node}"
31
- # end
22
+ def end_element_namespace(name, prefix = nil, uri = nil)
23
+ if @current_node.parent and @current_node.parent.office_text?
24
+ @output << @current_node.expand
25
+ @current_node.delete
26
+ end
27
+ @current_node = @current_node.parent
32
28
  end
33
29
 
34
- def text(string)
35
- plain_text = Odt::XmlNodes::PlainText.new(string)
36
- @current_node.children << plain_text
30
+ def characters(string)
31
+ unless string.strip.empty?
32
+ plain_text = Odt::XmlNodes::PlainText.new(string)
33
+ @current_node.children << plain_text
34
+ end
37
35
  end
38
36
 
39
37
  def close
@@ -50,18 +48,29 @@ module Doc2Text
50
48
  # Select nodes xpath style
51
49
  # - supports selecting from the root node
52
50
  def xpath(string)
53
- if /^(\/[\w:\-]+)+$/ =~ string
54
- path = string.scan /[\w:\-]+/
55
- seek_nodes = [@xml_root]
56
- path.each_with_index do |xml_name, index|
57
- seek_nodes.select! { |node| node.xml_name == xml_name }
58
- seek_nodes = seek_nodes.map(&:children).flatten unless index == path.length - 1
59
- break if seek_nodes.empty?
51
+ patterns = string.split '|'
52
+ raise Doc2Text::XmlError, 'it does not support this xpath syntax' if patterns.length == 0
53
+ result = []
54
+ patterns.each do |pattern|
55
+ if /^(\/[\w:\-]+)+$/ =~ pattern
56
+ path = pattern.scan /[\w:\-]+/
57
+ result += xpath_search_nodes(path, @xml_root)
58
+ result += xpath_search_nodes(path, @styles_xml_root) if @styles_xml_root
59
+ else
60
+ raise Doc2Text::XmlError, 'it does not support this xpath syntax'
60
61
  end
61
- seek_nodes
62
- else
63
- raise Doc2Text::XmlError, 'it does not support this xpath syntax'
64
62
  end
63
+ result
64
+ end
65
+
66
+ def xpath_search_nodes(path, xml_root)
67
+ seek_nodes = [xml_root]
68
+ path.each_with_index do |xml_name, index|
69
+ seek_nodes.select! { |node| node.xml_name == xml_name }
70
+ seek_nodes = seek_nodes.map(&:children).flatten unless index == path.length - 1
71
+ break if seek_nodes.empty?
72
+ end
73
+ seek_nodes
65
74
  end
66
75
 
67
76
  def logger
@@ -9,8 +9,9 @@ module Doc2Text
9
9
  odt = new input
10
10
  begin
11
11
  odt.unpack
12
+ styles_xml_root = odt.parse_styles
12
13
  output = File.open output_filename, 'w'
13
- markdown = Markdown::OdtParser.new output
14
+ markdown = Markdown::OdtParser.new output, styles_xml_root
14
15
  begin
15
16
  odt.parse markdown
16
17
  ensure
@@ -21,9 +22,15 @@ module Doc2Text
21
22
  end
22
23
  end
23
24
 
25
+ def parse_styles
26
+ styles_parser = Doc2Text::Odt::StylesParser.new
27
+ xml = Nokogiri::XML::SAX::Parser.new(styles_parser)
28
+ xml.parse open 'styles.xml'
29
+ styles_parser.xml_root
30
+ end
31
+
24
32
  def parse(markdown)
25
- content = ::Doc2Text::Odt::Content::Document.new markdown
26
- parser = Nokogiri::XML::SAX::Parser.new(content) # { |config| config.strict}
33
+ parser = Nokogiri::XML::SAX::Parser.new(markdown) # { |config| config.strict}
27
34
  parser.parse open 'content.xml'
28
35
  end
29
36
 
@@ -88,7 +88,7 @@ module Doc2Text
88
88
  module Text
89
89
  def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_odt_parser = nil)
90
90
  super parent, attrs, prefix, name
91
- @markdown_odt_parser = markdown_odt_parser
91
+ @xml_parser = markdown_odt_parser
92
92
  style_index = attrs.index { |attr| attr.prefix == 'text' && attr.localname == 'style-name' }
93
93
  @enclosing_style = []
94
94
  if style_index and fetch_style?
@@ -118,7 +118,7 @@ module Doc2Text
118
118
  end
119
119
 
120
120
  def find_style(style_name)
121
- styles = @markdown_odt_parser.xpath '/office:document-content/office:automatic-styles/style:style'
121
+ styles = @xml_parser.xpath '/office:document-content/office:automatic-styles/style:style'
122
122
  styles.find { |style| style.attrs.index { |attr| attr.prefix == 'style' && attr.localname == 'family' } &&
123
123
  style.attrs.index { |attr| attr.prefix == 'style' && attr.localname == 'name' && attr.value == style_name } }
124
124
  end
@@ -169,6 +169,12 @@ module Doc2Text
169
169
  def close
170
170
  @enclosing_style.reverse.join
171
171
  end
172
+
173
+ def expand
174
+ expanded = "#{open}#{@children.map(&:expand).join}#{close}"
175
+ delete
176
+ expanded.clone
177
+ end
172
178
  end
173
179
 
174
180
  # http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#__RefHeading__1415154_253892949
@@ -21,7 +21,7 @@ module Doc2Text
21
21
  end
22
22
 
23
23
  def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_odt_parser = nil)
24
- @parent, @attrs, @prefix, @name, @markdown_odt_parser = parent, attrs, prefix, name, markdown_odt_parser
24
+ @parent, @attrs, @prefix, @name, @xml_parser = parent, attrs, prefix, name, markdown_odt_parser
25
25
  @children = []
26
26
  @has_text = false
27
27
  end
@@ -0,0 +1,17 @@
1
+ module Doc2Text
2
+ class Resolution
3
+ def self.parse_and_save(source, output)
4
+ case File.extname source
5
+ when '.doc', '.docx'
6
+ mid_name = File.join(File.dirname(output),
7
+ File.basename(source, File.extname(source)) + '.odt')
8
+ system "soffice --headless --convert-to odt #{source} --outdir #{File.dirname output}"
9
+ source = mid_name
10
+ Doc2Text::Odt::Document.parse_and_save source, output
11
+ File.delete(mid_name)
12
+ else
13
+ Doc2Text::Odt::Document.parse_and_save source, output
14
+ end
15
+ end
16
+ end
17
+ end
@@ -0,0 +1,28 @@
1
+ module Doc2Text
2
+ module Odt
3
+ class StylesParser < Nokogiri::XML::SAX::Document
4
+ attr_reader :xml_root
5
+
6
+ def start_element_namespace(name ,attrs = [], prefix = nil, uri = nil, ns = [])
7
+ unless @xml_root
8
+ @xml_root = @current_node = Odt::XmlNodes::Node.create_node prefix, name, nil, attrs, self
9
+ else
10
+ new_node = Odt::XmlNodes::Node.create_node prefix, name, @current_node, attrs, self
11
+ @current_node.children << new_node
12
+ @current_node = new_node
13
+ end
14
+ end
15
+
16
+ def end_element_namespace(name, prefix = nil, uri = nil)
17
+ @current_node = @current_node.parent
18
+ end
19
+
20
+ def characters(_)
21
+ end
22
+
23
+ def xpath(_)
24
+ []
25
+ end
26
+ end
27
+ end
28
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: doc2text
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.2
4
+ version: 0.3.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Valentin Aitken
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-10-23 00:00:00.000000000 Z
11
+ date: 2014-10-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -57,12 +57,13 @@ extensions: []
57
57
  extra_rdoc_files: []
58
58
  files:
59
59
  - lib/doc2text.rb
60
- - lib/doc2text/content.rb
61
60
  - lib/doc2text/errors.rb
62
61
  - lib/doc2text/markdown_odt_parser.rb
63
62
  - lib/doc2text/odt.rb
64
63
  - lib/doc2text/odt_xml_namespaces.rb
65
64
  - lib/doc2text/odt_xml_node.rb
65
+ - lib/doc2text/resolution.rb
66
+ - lib/doc2text/styles_parser.rb
66
67
  homepage: http://doc2text.com
67
68
  licenses:
68
69
  - GPL
@@ -88,3 +89,4 @@ signing_key:
88
89
  specification_version: 4
89
90
  summary: Translates odt to markdown
90
91
  test_files: []
92
+ has_rdoc:
@@ -1,25 +0,0 @@
1
- module Doc2Text
2
- module Odt
3
- module Content
4
- class Document < ::Nokogiri::XML::SAX::Document
5
- def initialize(markdown_odt_parser)
6
- @markdown_odt_parser = markdown_odt_parser
7
- end
8
-
9
- def start_element_namespace(name ,attrs = [], prefix = nil, uri = nil, ns = [])
10
- @markdown_odt_parser.new_node prefix, name, attrs
11
- end
12
-
13
- def end_element_namespace(name, prefix = nil, uri = nil)
14
- @markdown_odt_parser.close_node prefix, name
15
- end
16
-
17
- def characters(string)
18
- unless string.strip.empty?
19
- @markdown_odt_parser.text string
20
- end
21
- end
22
- end
23
- end
24
- end
25
- end