doc2text 0.3.2 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/doc2text.rb +2 -1
- data/lib/doc2text/markdown_odt_parser.rb +36 -27
- data/lib/doc2text/odt.rb +10 -3
- data/lib/doc2text/odt_xml_namespaces.rb +8 -2
- data/lib/doc2text/odt_xml_node.rb +1 -1
- data/lib/doc2text/resolution.rb +17 -0
- data/lib/doc2text/styles_parser.rb +28 -0
- metadata +5 -3
- data/lib/doc2text/content.rb +0 -25
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f2530e3597409c35637493d9d8e9842b2f70f005
|
4
|
+
data.tar.gz: 304eb91cfa9b109bfa853cbc7d9736a14b7817bd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9ff147cd7f87f233a6fbbc9270e682426ffc3b5df514a5b7ca2b16b0405a4048901cb53972d5264c853eabce7532b9bb386f97847e5b072189119ec616b2961f
|
7
|
+
data.tar.gz: f992a33120ae4273966a54a15e02768abf4baa16473808775e938fed21163882e46c46333dfa7ea9a63cf5bd9e4ef02de4bbb8f50b7d9ab86dbe5a4a8ee6d3cc
|
data/lib/doc2text.rb
CHANGED
@@ -2,10 +2,11 @@ require 'nokogiri'
|
|
2
2
|
#require 'nokogiri/xml'
|
3
3
|
require 'fileutils'
|
4
4
|
|
5
|
+
require 'doc2text/resolution'
|
5
6
|
require 'doc2text/odt'
|
6
7
|
require 'doc2text/odt_xml_node'
|
7
8
|
require 'doc2text/odt_xml_namespaces'
|
8
9
|
require 'doc2text/markdown_odt_parser'
|
9
10
|
require 'doc2text/errors'
|
10
11
|
|
11
|
-
require 'doc2text/
|
12
|
+
require 'doc2text/styles_parser'
|
@@ -2,13 +2,14 @@ require 'logger'
|
|
2
2
|
|
3
3
|
module Doc2Text
|
4
4
|
module Markdown
|
5
|
-
class OdtParser
|
6
|
-
def initialize(output)
|
5
|
+
class OdtParser < Nokogiri::XML::SAX::Document
|
6
|
+
# Sets up the parser state.
#
# @param output [#<<] sink that receives the generated text
# @param styles_xml_root [Object, nil] optional root node of an already
#   parsed styles tree; when given, xpath lookups also search it
def initialize(output, styles_xml_root = nil)
  @output = output
  @styles_xml_root = styles_xml_root
  @automatic_styles = {}
end
|
10
11
|
|
11
|
-
def
|
12
|
+
def start_element_namespace(name ,attrs = [], prefix = nil, uri = nil, ns = [])
|
12
13
|
unless @xml_root
|
13
14
|
@xml_root = @current_node = Odt::XmlNodes::Node.create_node prefix, name, nil, attrs, self
|
14
15
|
else
|
@@ -18,22 +19,19 @@ module Doc2Text
|
|
18
19
|
end
|
19
20
|
end
|
20
21
|
|
21
|
-
def
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
@current_node = @current_node.parent
|
28
|
-
# else
|
29
|
-
# # TODO remove this redundant(tree build algorithm) checks
|
30
|
-
# raise Doc2Text::XmlError, "!Close node child #{prefix} #{name} IS NOT correct, CURRENT_ELEM #{@current_node}"
|
31
|
-
# end
|
22
|
+
# SAX callback fired when an element is closed.
#
# If the element being closed sits directly under a node that reports
# office_text?, its expanded text is appended to @output and the node is
# deleted, so finished top-level content does not stay in the tree.
# In every case the cursor then moves up to the parent node.
def end_element_namespace(name, prefix = nil, uri = nil)
  closing = @current_node
  container = closing.parent
  if container && container.office_text?
    @output << closing.expand
    closing.delete
  end
  # Re-read the parent after the optional delete, exactly as the original does.
  @current_node = closing.parent
end
|
33
29
|
|
34
|
-
def
|
35
|
-
|
36
|
-
|
30
|
+
# SAX callback for character data.
#
# Whitespace-only chunks are ignored; anything else is wrapped in a
# PlainText node and appended to the children of the current node.
def characters(string)
  return if string.strip.empty?

  @current_node.children << Odt::XmlNodes::PlainText.new(string)
end
|
38
36
|
|
39
37
|
def close
|
@@ -50,18 +48,29 @@ module Doc2Text
|
|
50
48
|
# Select nodes xpath style
|
51
49
|
# - supports selecting from the root node
|
52
50
|
# Evaluates a very small XPath subset against the parsed node trees.
#
# Only absolute paths made of element names are understood, e.g.
# "/office:document-content/office:automatic-styles/style:style",
# optionally combined with "|". Every alternative is searched in @xml_root
# first and then in @styles_xml_root; a root that is nil is skipped.
#
# @param string [String] the path expression
# @return [Array] matching nodes, grouped by alternative in the order given
# @raise [Doc2Text::XmlError] when the expression is empty or uses
#   unsupported syntax
def xpath(string)
  patterns = string.split '|'
  raise Doc2Text::XmlError, 'it does not support this xpath syntax' if patterns.empty?

  roots = [@xml_root, @styles_xml_root].compact
  patterns.flat_map do |pattern|
    # \A and \z anchor the whole string. ^ and $ anchor per line and would
    # accept any multi-line input that merely contains one valid-looking line.
    unless pattern =~ %r{\A(/[\w:\-]+)+\z}
      raise Doc2Text::XmlError, 'it does not support this xpath syntax'
    end

    path = pattern.scan(/[\w:\-]+/)
    roots.flat_map { |root| xpath_search_nodes(path, root) }
  end
end

# Walks +path+ (an array of qualified element names) down from +xml_root+.
#
# At each level only nodes whose xml_name equals the path segment survive;
# for every segment but the last the search continues in their children.
#
# @param path [Array<String>] element names, outermost first
# @param xml_root [Object, nil] node responding to xml_name and children
# @return [Array] nodes matching the last segment, or [] when nothing matches
#   or no root was given
def xpath_search_nodes(path, xml_root)
  return [] if xml_root.nil?

  last_index = path.length - 1
  candidates = [xml_root]
  path.each_with_index do |xml_name, index|
    candidates = candidates.select { |node| node.xml_name == xml_name }
    break if candidates.empty?

    candidates = candidates.flat_map(&:children) unless index == last_index
  end
  candidates
end
|
66
75
|
|
67
76
|
def logger
|
data/lib/doc2text/odt.rb
CHANGED
@@ -9,8 +9,9 @@ module Doc2Text
|
|
9
9
|
odt = new input
|
10
10
|
begin
|
11
11
|
odt.unpack
|
12
|
+
styles_xml_root = odt.parse_styles
|
12
13
|
output = File.open output_filename, 'w'
|
13
|
-
markdown = Markdown::OdtParser.new output
|
14
|
+
markdown = Markdown::OdtParser.new output, styles_xml_root
|
14
15
|
begin
|
15
16
|
odt.parse markdown
|
16
17
|
ensure
|
@@ -21,9 +22,15 @@ module Doc2Text
|
|
21
22
|
end
|
22
23
|
end
|
23
24
|
|
25
|
+
# Runs a SAX parse over 'styles.xml' with a StylesParser handler and
# returns the root node the handler collected.
#
# NOTE(review): `open` is called without a receiver; it may be a helper of
# this class defined outside the visible hunk rather than Kernel#open —
# confirm before changing how the file is opened.
#
# @return [Object, nil] whatever the handler exposes as xml_root
def parse_styles
  handler = Doc2Text::Odt::StylesParser.new
  Nokogiri::XML::SAX::Parser.new(handler).parse open('styles.xml')
  handler.xml_root
end
|
31
|
+
|
24
32
|
# Feeds 'content.xml' through a SAX parser that reports its events to
# +markdown+, which acts as the SAX document handler.
#
# NOTE(review): `open` is called without a receiver; it may be a helper of
# this class defined outside the visible hunk — confirm.
#
# @param markdown [Object] SAX document handler receiving the events
# @return [Object] whatever the SAX parser's parse call returns
def parse(markdown)
  sax = Nokogiri::XML::SAX::Parser.new(markdown) # { |config| config.strict}
  sax.parse open('content.xml')
end
|
29
36
|
|
@@ -88,7 +88,7 @@ module Doc2Text
|
|
88
88
|
module Text
|
89
89
|
def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_odt_parser = nil)
|
90
90
|
super parent, attrs, prefix, name
|
91
|
-
@
|
91
|
+
@xml_parser = markdown_odt_parser
|
92
92
|
style_index = attrs.index { |attr| attr.prefix == 'text' && attr.localname == 'style-name' }
|
93
93
|
@enclosing_style = []
|
94
94
|
if style_index and fetch_style?
|
@@ -118,7 +118,7 @@ module Doc2Text
|
|
118
118
|
end
|
119
119
|
|
120
120
|
# Looks up the automatic style called +style_name+.
#
# Asks the owning parser for every style:style element under
# office:automatic-styles and returns the first one that carries a
# style:family attribute and a style:name attribute equal to +style_name+.
#
# @param style_name [String] value of the style:name attribute to find
# @return [Object, nil] the matching style node, or nil when none matches
def find_style(style_name)
  candidates = @xml_parser.xpath '/office:document-content/office:automatic-styles/style:style'
  candidates.find do |style|
    has_family = style.attrs.any? { |attr| attr.prefix == 'style' && attr.localname == 'family' }
    has_family && style.attrs.any? do |attr|
      attr.prefix == 'style' && attr.localname == 'name' && attr.value == style_name
    end
  end
end
|
@@ -169,6 +169,12 @@ module Doc2Text
|
|
169
169
|
# Builds the closing markup: the collected enclosing-style tokens joined in
# reverse order, so the token added last comes first.
#
# @return [String]
def close
  closing_tokens = @enclosing_style.reverse
  closing_tokens.join
end
|
172
|
+
|
173
|
+
# Renders this node: opening markup, every child's expansion, then closing
# markup. The node is deleted once the text has been built.
#
# @return [String] the rendered text
def expand
  opening = open
  inner = @children.map(&:expand).join
  closing = close
  rendered = "#{opening}#{inner}#{closing}"
  delete
  rendered.clone
end
|
172
178
|
end
|
173
179
|
|
174
180
|
# http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#__RefHeading__1415154_253892949
|
@@ -21,7 +21,7 @@ module Doc2Text
|
|
21
21
|
end
|
22
22
|
|
23
23
|
# Stores the node's position and identity and starts it with no children.
#
# @param parent [Object, nil] enclosing node
# @param attrs [Array] attributes of the element
# @param prefix [String, nil] namespace prefix of the element
# @param name [String, nil] local name of the element
# @param markdown_odt_parser [Object, nil] parser kept as @xml_parser
def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_odt_parser = nil)
  @parent = parent
  @attrs = attrs
  @prefix = prefix
  @name = name
  @xml_parser = markdown_odt_parser
  @children = []
  @has_text = false
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module Doc2Text
  # Picks the conversion pipeline for a source document by file extension.
  class Resolution
    # Converts +source+ and writes the result to +output+.
    #
    # '.doc' and '.docx' sources are first converted to an intermediate .odt
    # by running `soffice --headless`. The intermediate file is written to the
    # directory of +output+, named after the source's base name, and removed
    # afterwards. Every other source is passed straight to the ODT parser.
    #
    # @param source [String] path of the document to convert
    # @param output [String] path of the file to write
    # @return [Object] whatever Doc2Text::Odt::Document.parse_and_save returns
    def self.parse_and_save(source, output)
      case File.extname source
      when '.doc', '.docx'
        out_dir = File.dirname(output)
        mid_name = File.join(out_dir, File.basename(source, File.extname(source)) + '.odt')
        # Argument-list form: no shell is involved, so spaces or shell
        # metacharacters in the paths can neither break nor hijack the command.
        # The exit status is ignored, as before: a failed conversion surfaces
        # when the missing intermediate file is parsed.
        system 'soffice', '--headless', '--convert-to', 'odt', source, '--outdir', out_dir
        begin
          Doc2Text::Odt::Document.parse_and_save mid_name, output
        ensure
          # Remove the intermediate file even when parsing raised.
          File.delete(mid_name) if File.exist?(mid_name)
        end
      else
        Doc2Text::Odt::Document.parse_and_save source, output
      end
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module Doc2Text
  module Odt
    # SAX handler that turns the element structure of a styles document into
    # a tree of nodes, reachable through xml_root. Character data is ignored.
    class StylesParser < Nokogiri::XML::SAX::Document
      attr_reader :xml_root

      # Creates a node for the opened element. The first element becomes the
      # root; later ones are attached to the current node. Either way the new
      # node becomes the current node.
      def start_element_namespace(name, attrs = [], prefix = nil, uri = nil, ns = [])
        if @xml_root
          child = Odt::XmlNodes::Node.create_node prefix, name, @current_node, attrs, self
          @current_node.children << child
          @current_node = child
        else
          @current_node = Odt::XmlNodes::Node.create_node prefix, name, nil, attrs, self
          @xml_root = @current_node
        end
      end

      # Moves the cursor back up to the parent of the element being closed.
      def end_element_namespace(name, prefix = nil, uri = nil)
        @current_node = @current_node.parent
      end

      # Text content is not collected by this handler.
      def characters(_)
      end

      # Nodes receive this handler as their parser; lookups made through it
      # always come back empty.
      def xpath(_)
        []
      end
    end
  end
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: doc2text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.
|
4
|
+
version: 0.3.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Valentin Aitken
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-10-
|
11
|
+
date: 2014-10-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -57,12 +57,13 @@ extensions: []
|
|
57
57
|
extra_rdoc_files: []
|
58
58
|
files:
|
59
59
|
- lib/doc2text.rb
|
60
|
-
- lib/doc2text/content.rb
|
61
60
|
- lib/doc2text/errors.rb
|
62
61
|
- lib/doc2text/markdown_odt_parser.rb
|
63
62
|
- lib/doc2text/odt.rb
|
64
63
|
- lib/doc2text/odt_xml_namespaces.rb
|
65
64
|
- lib/doc2text/odt_xml_node.rb
|
65
|
+
- lib/doc2text/resolution.rb
|
66
|
+
- lib/doc2text/styles_parser.rb
|
66
67
|
homepage: http://doc2text.com
|
67
68
|
licenses:
|
68
69
|
- GPL
|
@@ -88,3 +89,4 @@ signing_key:
|
|
88
89
|
specification_version: 4
|
89
90
|
summary: Translates odt to markdown
|
90
91
|
test_files: []
|
92
|
+
has_rdoc:
|
data/lib/doc2text/content.rb
DELETED
@@ -1,25 +0,0 @@
|
|
1
|
-
module Doc2Text
|
2
|
-
module Odt
|
3
|
-
module Content
|
4
|
-
class Document < ::Nokogiri::XML::SAX::Document
|
5
|
-
def initialize(markdown_odt_parser)
|
6
|
-
@markdown_odt_parser = markdown_odt_parser
|
7
|
-
end
|
8
|
-
|
9
|
-
def start_element_namespace(name ,attrs = [], prefix = nil, uri = nil, ns = [])
|
10
|
-
@markdown_odt_parser.new_node prefix, name, attrs
|
11
|
-
end
|
12
|
-
|
13
|
-
def end_element_namespace(name, prefix = nil, uri = nil)
|
14
|
-
@markdown_odt_parser.close_node prefix, name
|
15
|
-
end
|
16
|
-
|
17
|
-
def characters(string)
|
18
|
-
unless string.strip.empty?
|
19
|
-
@markdown_odt_parser.text string
|
20
|
-
end
|
21
|
-
end
|
22
|
-
end
|
23
|
-
end
|
24
|
-
end
|
25
|
-
end
|