doc2text 0.3.2 → 0.3.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: e795071f6df878e0427e2728848d170ecb1277d0
4
- data.tar.gz: 4c02f808c1d6c47bf7edf8f1d81fc534bdf2ab93
3
+ metadata.gz: f2530e3597409c35637493d9d8e9842b2f70f005
4
+ data.tar.gz: 304eb91cfa9b109bfa853cbc7d9736a14b7817bd
5
5
  SHA512:
6
- metadata.gz: d7a9314e425c425dc228e4f742874f3393ec77ab604e22ead973cb1430e5a01bd7395a5e8f6b6f2c71114aaad2736b5d43abcad30bb797a17a0ce842f6474ecd
7
- data.tar.gz: c82f62f3fabd0ccdf42ec6fb23d06cb3f6884676184e6a44e670e4a9f803b7fa2bc71ebd107ff067315e1ac52214c79c0477d9fd0a0388a0010f5c70fd915678
6
+ metadata.gz: 9ff147cd7f87f233a6fbbc9270e682426ffc3b5df514a5b7ca2b16b0405a4048901cb53972d5264c853eabce7532b9bb386f97847e5b072189119ec616b2961f
7
+ data.tar.gz: f992a33120ae4273966a54a15e02768abf4baa16473808775e938fed21163882e46c46333dfa7ea9a63cf5bd9e4ef02de4bbb8f50b7d9ab86dbe5a4a8ee6d3cc
@@ -2,10 +2,11 @@ require 'nokogiri'
2
2
  #require 'nokogiri/xml'
3
3
  require 'fileutils'
4
4
 
5
+ require 'doc2text/resolution'
5
6
  require 'doc2text/odt'
6
7
  require 'doc2text/odt_xml_node'
7
8
  require 'doc2text/odt_xml_namespaces'
8
9
  require 'doc2text/markdown_odt_parser'
9
10
  require 'doc2text/errors'
10
11
 
11
- require 'doc2text/content'
12
+ require 'doc2text/styles_parser'
@@ -2,13 +2,14 @@ require 'logger'
2
2
 
3
3
  module Doc2Text
4
4
  module Markdown
5
- class OdtParser
6
- def initialize(output)
5
+ class OdtParser < Nokogiri::XML::SAX::Document
6
+ def initialize(output, styles_xml_root = nil)
7
+ @styles_xml_root = styles_xml_root
7
8
  @output = output
8
9
  @automatic_styles = {}
9
10
  end
10
11
 
11
- def new_node(prefix, name, attrs)
12
+ def start_element_namespace(name ,attrs = [], prefix = nil, uri = nil, ns = [])
12
13
  unless @xml_root
13
14
  @xml_root = @current_node = Odt::XmlNodes::Node.create_node prefix, name, nil, attrs, self
14
15
  else
@@ -18,22 +19,19 @@ module Doc2Text
18
19
  end
19
20
  end
20
21
 
21
- def close_node(prefix, name)
22
- # if Odt::XmlNodes::Node.create_node(prefix, name, nil, [], self).eql? @current_node
23
- if @current_node.parent and @current_node.parent.office_text?
24
- @output << @current_node.expand
25
- @current_node.delete
26
- end
27
- @current_node = @current_node.parent
28
- # else
29
- # # TODO remove this redundant(tree build algorithm) checks
30
- # raise Doc2Text::XmlError, "!Close node child #{prefix} #{name} IS NOT correct, CURRENT_ELEM #{@current_node}"
31
- # end
22
+ def end_element_namespace(name, prefix = nil, uri = nil)
23
+ if @current_node.parent and @current_node.parent.office_text?
24
+ @output << @current_node.expand
25
+ @current_node.delete
26
+ end
27
+ @current_node = @current_node.parent
32
28
  end
33
29
 
34
- def text(string)
35
- plain_text = Odt::XmlNodes::PlainText.new(string)
36
- @current_node.children << plain_text
30
+ def characters(string)
31
+ unless string.strip.empty?
32
+ plain_text = Odt::XmlNodes::PlainText.new(string)
33
+ @current_node.children << plain_text
34
+ end
37
35
  end
38
36
 
39
37
  def close
@@ -50,18 +48,29 @@ module Doc2Text
50
48
  # Select nodes xpath style
51
49
  # - supports selecting from the root node
52
50
  def xpath(string)
53
- if /^(\/[\w:\-]+)+$/ =~ string
54
- path = string.scan /[\w:\-]+/
55
- seek_nodes = [@xml_root]
56
- path.each_with_index do |xml_name, index|
57
- seek_nodes.select! { |node| node.xml_name == xml_name }
58
- seek_nodes = seek_nodes.map(&:children).flatten unless index == path.length - 1
59
- break if seek_nodes.empty?
51
+ patterns = string.split '|'
52
+ raise Doc2Text::XmlError, 'it does not support this xpath syntax' if patterns.length == 0
53
+ result = []
54
+ patterns.each do |pattern|
55
+ if /^(\/[\w:\-]+)+$/ =~ pattern
56
+ path = pattern.scan /[\w:\-]+/
57
+ result += xpath_search_nodes(path, @xml_root)
58
+ result += xpath_search_nodes(path, @styles_xml_root) if @styles_xml_root
59
+ else
60
+ raise Doc2Text::XmlError, 'it does not support this xpath syntax'
60
61
  end
61
- seek_nodes
62
- else
63
- raise Doc2Text::XmlError, 'it does not support this xpath syntax'
64
62
  end
63
+ result
64
+ end
65
+
66
# Walks +path+ (an array of element names) downwards from +xml_root+ and
# returns the nodes matching the final name.
#
# At each step only the candidates whose +xml_name+ equals the step's name
# are kept; for every step except the last, the search then descends into
# the children of the kept nodes. An empty array is returned as soon as no
# candidate is left.
def xpath_search_nodes(path, xml_root)
  final_index = path.length - 1
  matches = [xml_root]
  path.each_with_index do |wanted_name, depth|
    matches = matches.select { |candidate| candidate.xml_name == wanted_name }
    # Nothing matched at this level, so deeper steps cannot match either.
    return matches if matches.empty?
    matches = matches.map(&:children).flatten if depth < final_index
  end
  matches
end
66
75
 
67
76
  def logger
@@ -9,8 +9,9 @@ module Doc2Text
9
9
  odt = new input
10
10
  begin
11
11
  odt.unpack
12
+ styles_xml_root = odt.parse_styles
12
13
  output = File.open output_filename, 'w'
13
- markdown = Markdown::OdtParser.new output
14
+ markdown = Markdown::OdtParser.new output, styles_xml_root
14
15
  begin
15
16
  odt.parse markdown
16
17
  ensure
@@ -21,9 +22,15 @@ module Doc2Text
21
22
  end
22
23
  end
23
24
 
25
# Runs a SAX parse of 'styles.xml' with a StylesParser as the handler and
# returns the root node that handler collected.
# NOTE(review): `open` is called without a receiver; confirm whether it
# resolves to a method of this class or to Kernel#open.
def parse_styles
  handler = Doc2Text::Odt::StylesParser.new
  Nokogiri::XML::SAX::Parser.new(handler).parse(open('styles.xml'))
  handler.xml_root
end
31
+
24
32
  def parse(markdown)
25
- content = ::Doc2Text::Odt::Content::Document.new markdown
26
- parser = Nokogiri::XML::SAX::Parser.new(content) # { |config| config.strict}
33
+ parser = Nokogiri::XML::SAX::Parser.new(markdown) # { |config| config.strict}
27
34
  parser.parse open 'content.xml'
28
35
  end
29
36
 
@@ -88,7 +88,7 @@ module Doc2Text
88
88
  module Text
89
89
  def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_odt_parser = nil)
90
90
  super parent, attrs, prefix, name
91
- @markdown_odt_parser = markdown_odt_parser
91
+ @xml_parser = markdown_odt_parser
92
92
  style_index = attrs.index { |attr| attr.prefix == 'text' && attr.localname == 'style-name' }
93
93
  @enclosing_style = []
94
94
  if style_index and fetch_style?
@@ -118,7 +118,7 @@ module Doc2Text
118
118
  end
119
119
 
120
120
  def find_style(style_name)
121
- styles = @markdown_odt_parser.xpath '/office:document-content/office:automatic-styles/style:style'
121
+ styles = @xml_parser.xpath '/office:document-content/office:automatic-styles/style:style'
122
122
  styles.find { |style| style.attrs.index { |attr| attr.prefix == 'style' && attr.localname == 'family' } &&
123
123
  style.attrs.index { |attr| attr.prefix == 'style' && attr.localname == 'name' && attr.value == style_name } }
124
124
  end
@@ -169,6 +169,12 @@ module Doc2Text
169
169
  def close
170
170
  @enclosing_style.reverse.join
171
171
  end
172
+
173
# Renders this node as text: the result of +open+, followed by every child's
# expansion, followed by the result of +close+. The node is then removed
# via +delete+ and a copy of the rendered string is returned.
def expand
  opening = open
  inner = @children.map(&:expand).join
  closing = close
  rendered = "#{opening}#{inner}#{closing}"
  delete
  rendered.clone
end
172
178
  end
173
179
 
174
180
  # http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#__RefHeading__1415154_253892949
@@ -21,7 +21,7 @@ module Doc2Text
21
21
  end
22
22
 
23
23
  def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_odt_parser = nil)
24
- @parent, @attrs, @prefix, @name, @markdown_odt_parser = parent, attrs, prefix, name, markdown_odt_parser
24
+ @parent, @attrs, @prefix, @name, @xml_parser = parent, attrs, prefix, name, markdown_odt_parser
25
25
  @children = []
26
26
  @has_text = false
27
27
  end
@@ -0,0 +1,17 @@
1
module Doc2Text
  # Picks the conversion route for a document from its file extension.
  class Resolution
    # Converts +source+ and writes the result to +output+.
    #
    # '.doc' and '.docx' sources are first converted to an intermediate
    # .odt file by the external `soffice` command. That file is created in
    # the directory of +output+, handed to
    # Doc2Text::Odt::Document.parse_and_save, and removed afterwards.
    # Any other extension is passed to
    # Doc2Text::Odt::Document.parse_and_save unchanged.
    #
    # @param source [String] path of the document to convert
    # @param output [String] path of the file to write
    def self.parse_and_save(source, output)
      case File.extname source
      when '.doc', '.docx'
        out_dir = File.dirname output
        mid_name = File.join(out_dir, File.basename(source, File.extname(source)) + '.odt')
        begin
          # Argument-list form of Kernel#system: no shell is involved, so
          # spaces and shell metacharacters in the paths are passed through
          # literally instead of being interpreted.
          system 'soffice', '--headless', '--convert-to', 'odt', source, '--outdir', out_dir
          Doc2Text::Odt::Document.parse_and_save mid_name, output
        ensure
          # Clean up the intermediate file even when parsing raises; skip the
          # delete when the conversion never produced it.
          File.delete(mid_name) if File.exist?(mid_name)
        end
      else
        Doc2Text::Odt::Document.parse_and_save source, output
      end
    end
  end
end
@@ -0,0 +1,28 @@
1
module Doc2Text
  module Odt
    # SAX handler that builds a tree of Odt::XmlNodes nodes from the elements
    # it is fed, ignoring all character data.
    class StylesParser < Nokogiri::XML::SAX::Document
      # Root of the collected tree; nil until the first element is seen.
      attr_reader :xml_root

      # Creates a node for the opened element. The first element becomes the
      # root; every later one is appended to the current node's children.
      # In both cases the new node becomes the current node.
      def start_element_namespace(name, attrs = [], prefix = nil, uri = nil, ns = [])
        if @xml_root
          child = Odt::XmlNodes::Node.create_node prefix, name, @current_node, attrs, self
          @current_node.children << child
          @current_node = child
        else
          @current_node = Odt::XmlNodes::Node.create_node prefix, name, nil, attrs, self
          @xml_root = @current_node
        end
      end

      # Moves back up to the parent of the element being closed.
      def end_element_namespace(name, prefix = nil, uri = nil)
        @current_node = @current_node.parent
      end

      # Text content is deliberately discarded.
      def characters(_)
      end

      # Always answers with an empty result.
      # NOTE(review): presumably present because nodes receive this handler as
      # their parser and may query it via xpath; confirm against the node classes.
      def xpath(_)
        []
      end
    end
  end
end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: doc2text
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.2
4
+ version: 0.3.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Valentin Aitken
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-10-23 00:00:00.000000000 Z
11
+ date: 2014-10-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -57,12 +57,13 @@ extensions: []
57
57
  extra_rdoc_files: []
58
58
  files:
59
59
  - lib/doc2text.rb
60
- - lib/doc2text/content.rb
61
60
  - lib/doc2text/errors.rb
62
61
  - lib/doc2text/markdown_odt_parser.rb
63
62
  - lib/doc2text/odt.rb
64
63
  - lib/doc2text/odt_xml_namespaces.rb
65
64
  - lib/doc2text/odt_xml_node.rb
65
+ - lib/doc2text/resolution.rb
66
+ - lib/doc2text/styles_parser.rb
66
67
  homepage: http://doc2text.com
67
68
  licenses:
68
69
  - GPL
@@ -88,3 +89,4 @@ signing_key:
88
89
  specification_version: 4
89
90
  summary: Translates odt to markdown
90
91
  test_files: []
92
+ has_rdoc:
@@ -1,25 +0,0 @@
1
- module Doc2Text
2
- module Odt
3
- module Content
4
- class Document < ::Nokogiri::XML::SAX::Document
5
- def initialize(markdown_odt_parser)
6
- @markdown_odt_parser = markdown_odt_parser
7
- end
8
-
9
- def start_element_namespace(name ,attrs = [], prefix = nil, uri = nil, ns = [])
10
- @markdown_odt_parser.new_node prefix, name, attrs
11
- end
12
-
13
- def end_element_namespace(name, prefix = nil, uri = nil)
14
- @markdown_odt_parser.close_node prefix, name
15
- end
16
-
17
- def characters(string)
18
- unless string.strip.empty?
19
- @markdown_odt_parser.text string
20
- end
21
- end
22
- end
23
- end
24
- end
25
- end