doc2text 0.3.3 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: f2530e3597409c35637493d9d8e9842b2f70f005
4
- data.tar.gz: 304eb91cfa9b109bfa853cbc7d9736a14b7817bd
2
+ SHA256:
3
+ metadata.gz: 7b9b5aeaa63d276696f0f4f716242b181d8d3aef2e47861053c03f9623cdf498
4
+ data.tar.gz: a9ac2a3e0314334dda782f8ce8ef0d5a0691015ae70da0cc7a7fd79b2d6d7cd2
5
5
  SHA512:
6
- metadata.gz: 9ff147cd7f87f233a6fbbc9270e682426ffc3b5df514a5b7ca2b16b0405a4048901cb53972d5264c853eabce7532b9bb386f97847e5b072189119ec616b2961f
7
- data.tar.gz: f992a33120ae4273966a54a15e02768abf4baa16473808775e938fed21163882e46c46333dfa7ea9a63cf5bd9e4ef02de4bbb8f50b7d9ab86dbe5a4a8ee6d3cc
6
+ metadata.gz: 88fcdc3ade712a061c62641dd7713274c676f41d37c7020348ad401e0e7af3a86b07a3726a057870396ee68a290867fccf635d3191c8376b45850507e2f566e9
7
+ data.tar.gz: a96c1f4cbfbb42079f5e5d6eea757531d7d7e852e01724c40cc58f94ee5ebd27e700bdffc221cc7c7202a3686a6f3c40d8a5e153f07dcdca7aa4fd542b13eac9
@@ -0,0 +1,43 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ $LOAD_PATH.unshift File.dirname(__FILE__) + '/../lib'
4
+ require 'doc2text'
5
+ require 'optparse'
6
+
7
+ options = {}
8
+ opt_parse = OptionParser.new do |opts|
9
+ opts.banner = "Usage: doc2text document.odt output.md
10
+ or: doc2text [OPTION]... -s input[.odt] -o output[.md]\n\n"
11
+
12
+ opts.on('-s FILE', '--source FILE', 'Odt FILE document to be processed') do |file|
13
+ options[:source] = file
14
+ end
15
+
16
+ opts.on('-o FILE', '--output FILE', 'Output to markdown document FILE') do |file|
17
+ options[:output] = file
18
+ end
19
+
20
+ opts.on_tail('-h', '--help', 'Show this message') do
21
+ puts opts
22
+ exit
23
+ end
24
+ end
25
+
26
+ begin
27
+ opt_parse.parse!
28
+ if options.empty?
29
+ if ARGV.size == 2
30
+ options[:source], options[:output] = *ARGV
31
+ else
32
+ puts opt_parse
33
+ exit
34
+ end
35
+ end
36
+ rescue OptionParser::InvalidOption, OptionParser::MissingArgument
37
+ puts $!.to_s
38
+ puts opt_parse
39
+ exit
40
+ end
41
+
42
+
43
+ Doc2Text::Resolution.parse_and_save options[:source], options[:output]
@@ -2,11 +2,17 @@ require 'nokogiri'
2
2
  #require 'nokogiri/xml'
3
3
  require 'fileutils'
4
4
 
5
+ require 'doc2text/xml_based_document_file'
6
+ require 'doc2text/generic_xml_nodes'
5
7
  require 'doc2text/resolution'
6
- require 'doc2text/odt'
7
- require 'doc2text/odt_xml_node'
8
- require 'doc2text/odt_xml_namespaces'
9
- require 'doc2text/markdown_odt_parser'
10
8
  require 'doc2text/errors'
11
9
 
12
- require 'doc2text/styles_parser'
10
+ require 'doc2text/odt/odt'
11
+ require 'doc2text/odt/odt_xml_namespaces'
12
+ require 'doc2text/odt/markdown_odt_parser'
13
+
14
+ require 'doc2text/docx/docx'
15
+ require 'doc2text/docx/markdown_docx_parser'
16
+ require 'doc2text/docx/docx_xml_namespaces'
17
+
18
+ require 'doc2text/styles_parser'
@@ -0,0 +1,31 @@
1
+ module Doc2Text
2
+ module Docx
3
+ class Document < XmlBasedDocument::DocumentFile
4
+
5
+ def self.parse_and_save(input, output_filename)
6
+ docx = new input
7
+ begin
8
+ docx.unpack
9
+ styles_xml_root = docx.parse_styles
10
+ output = File.open output_filename, 'w'
11
+ markdown = Markdown::DocxParser.new output, styles_xml_root
12
+ begin
13
+ docx.parse markdown
14
+ ensure
15
+ markdown.close
16
+ end
17
+ ensure
18
+ docx.clean
19
+ end
20
+ end
21
+
22
+ def contains_extracted_files?
23
+ File.exist? File.join(extract_path, '[Content_Types].xml')
24
+ end
25
+
26
+ def extract_extension
27
+ 'unpacked_docx'
28
+ end
29
+ end
30
+ end
31
+ end
@@ -0,0 +1,55 @@
1
+ module Doc2Text
2
+ module Docx
3
+ module XmlNodes
4
+ class Node < XmlBasedDocument::XmlNodes::Node
5
+ def self.create_node(prefix, name, parent = nil, attrs = [], markdown_odt_parser = nil)
6
+ begin
7
+ clazz = XmlNodes.const_get "#{prefix.capitalize}::W#{name}"
8
+ rescue NameError => e
9
+ # markdown_odt_parser.logger.warn "No such <#{prefix}:#{name}> found"
10
+ Generic.new(parent, attrs, prefix, name, markdown_odt_parser)
11
+ else
12
+ clazz.new(parent, attrs, prefix, name, markdown_odt_parser)
13
+ end
14
+ end
15
+
16
+ def body?
17
+ false
18
+ end
19
+ end
20
+
21
+ class PlainText < XmlBasedDocument::XmlNodes::PlainText
22
+ def body?
23
+ false
24
+ end
25
+ end
26
+
27
+ class Generic < Node
28
+ end
29
+
30
+ module W
31
+ class Wbody < Node
32
+ def body?
33
+ true
34
+ end
35
+ end
36
+
37
+ class Wbr < Node
38
+ def open
39
+ '<br/>'
40
+ end
41
+ end
42
+
43
+ class Wp < Node
44
+ def open
45
+ "\n"
46
+ end
47
+
48
+ def close
49
+ "\n"
50
+ end
51
+ end
52
+ end
53
+ end
54
+ end
55
+ end
@@ -0,0 +1,81 @@
1
+ require 'logger'
2
+
3
+ module Doc2Text
4
+ module Markdown
5
+ class DocxParser < Nokogiri::XML::SAX::Document
6
+ def initialize(output, styles_xml_root = nil)
7
+ @styles_xml_root = styles_xml_root
8
+ @output = output
9
+ @automatic_styles = {}
10
+ end
11
+
12
+ def start_element_namespace(name ,attrs = [], prefix = nil, uri = nil, ns = [])
13
+ unless @xml_root
14
+ @xml_root = @current_node = Docx::XmlNodes::Node.create_node prefix, name, nil, attrs, self
15
+ else
16
+ new_node = Docx::XmlNodes::Node.create_node prefix, name, @current_node, attrs, self
17
+ @current_node.children << new_node
18
+ @current_node = new_node
19
+ end
20
+ end
21
+
22
+ def end_element_namespace(name, prefix = nil, uri = nil)
23
+ if @current_node.parent and @current_node.parent.body?
24
+ @output << @current_node.expand
25
+ @current_node.delete
26
+ end
27
+ @current_node = @current_node.parent
28
+ end
29
+
30
+ def characters(string)
31
+ unless string.strip.empty?
32
+ plain_text = Docx::XmlNodes::PlainText.new(string)
33
+ @current_node.children << plain_text
34
+ end
35
+ end
36
+
37
+ def close
38
+ @output.close
39
+ end
40
+
41
+ def print_tree(node)
42
+ puts node
43
+ node.children.each do |child|
44
+ print_tree child
45
+ end
46
+ end
47
+
48
+ # Select nodes xpath style
49
+ # - supports selecting from the root node
50
+ def xpath(string)
51
+ patterns = string.split '|'
52
+ raise Doc2Text::XmlError, 'it does not support this xpath syntax' if patterns.length == 0
53
+ result = []
54
+ patterns.each do |pattern|
55
+ if /^(\/[\w:\-]+)+$/ =~ pattern
56
+ path = pattern.scan /[\w:\-]+/
57
+ result += xpath_search_nodes(path, @xml_root)
58
+ result += xpath_search_nodes(path, @styles_xml_root) if @styles_xml_root
59
+ else
60
+ raise Doc2Text::XmlError, 'it does not support this xpath syntax'
61
+ end
62
+ end
63
+ result
64
+ end
65
+
66
+ def xpath_search_nodes(path, xml_root)
67
+ seek_nodes = [xml_root]
68
+ path.each_with_index do |xml_name, index|
69
+ seek_nodes.select! { |node| node.xml_name == xml_name }
70
+ seek_nodes = seek_nodes.map(&:children).flatten unless index == path.length - 1
71
+ break if seek_nodes.empty?
72
+ end
73
+ seek_nodes
74
+ end
75
+
76
+ def logger
77
+ @logger ||= Logger.new(STDOUT)
78
+ end
79
+ end
80
+ end
81
+ end
@@ -1,25 +1,16 @@
1
1
  module Doc2Text
2
- module Odt
2
+ module XmlBasedDocument
3
3
  module XmlNodes
4
- module Node
4
+ class Node
5
5
  attr_reader :parent, :children, :attrs, :prefix, :name
6
6
  attr_accessor :text
7
7
 
8
- def self.create_node(prefix, name, parent = nil, attrs = [], markdown_odt_parser = nil)
9
- begin
10
- clazz = XmlNodes.const_get "#{titleize prefix}::#{titleize name}"
11
- rescue NameError => e
12
- # markdown_odt_parser.logger.warn "No such <#{prefix}:#{name}> found"
13
- Generic.new(parent, attrs, prefix, name, markdown_odt_parser)
14
- else
15
- clazz.new(parent, attrs, prefix, name, markdown_odt_parser)
8
+ def self.inherited(subclass)
9
+ def subclass.titleize(tag)
10
+ tag.split('-').map(&:capitalize).join
16
11
  end
17
12
  end
18
13
 
19
- def self.titleize(tag)
20
- tag.split('-').map(&:capitalize).join
21
- end
22
-
23
14
  def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_odt_parser = nil)
24
15
  @parent, @attrs, @prefix, @name, @xml_parser = parent, attrs, prefix, name, markdown_odt_parser
25
16
  @children = []
@@ -42,10 +33,6 @@ module Doc2Text
42
33
  ''
43
34
  end
44
35
 
45
- def office_text?
46
- false
47
- end
48
-
49
36
  def delete
50
37
  return true unless @children
51
38
  @children.each { |child| child.delete }
@@ -74,24 +61,16 @@ module Doc2Text
74
61
  delete
75
62
  expanded.clone
76
63
  end
64
+ end
77
65
 
78
- def not_enclosing?
79
- !root? && parent.class.not_enclosing_tags && parent.class.not_enclosing_tags.find do |tag|
80
- @prefix == parent.prefix && @name == tag
81
- end
82
- end
66
+ class PlainText < Node
83
67
 
84
- def self.included(base)
85
- base.extend ClassMethods
86
- end
68
+ attr_accessor :text
87
69
 
88
- module ClassMethods
89
- attr_reader :not_enclosing_tags
70
+ alias_method :expand, :text
90
71
 
91
- def not_enclosing(tag)
92
- @not_enclosing_tags ||= []
93
- @not_enclosing_tags << tag
94
- end
72
+ def initialize(text)
73
+ @text = text
95
74
  end
96
75
  end
97
76
  end
@@ -0,0 +1,43 @@
1
+ module Doc2Text
2
+ module Odt
3
+ class Document < XmlBasedDocument::DocumentFile
4
+
5
+ def extract_extension
6
+ 'unpacked_odt'
7
+ end
8
+
9
+ def self.parse_and_save(input, output_filename)
10
+ odt = new input
11
+ begin
12
+ odt.unpack
13
+ styles_xml_root = odt.parse_styles
14
+ output = File.open output_filename, 'w'
15
+ markdown = Markdown::OdtParser.new output, styles_xml_root
16
+ begin
17
+ odt.parse markdown
18
+ ensure
19
+ markdown.close
20
+ end
21
+ ensure
22
+ odt.clean
23
+ end
24
+ end
25
+
26
+ def parse_styles
27
+ styles_parser = Doc2Text::Odt::StylesParser.new
28
+ xml = Nokogiri::XML::SAX::Parser.new(styles_parser)
29
+ xml.parse open 'styles.xml'
30
+ styles_parser.xml_root
31
+ end
32
+
33
+ def parse(markdown)
34
+ parser = Nokogiri::XML::SAX::Parser.new(markdown) # { |config| config.strict}
35
+ parser.parse open 'content.xml'
36
+ end
37
+
38
+ def contains_extracted_files?
39
+ [File.join(extract_path, 'content.xml'), File.join(extract_path, 'mimetype')].all? { |file| File.exist?(file) }
40
+ end
41
+ end
42
+ end
43
+ end
@@ -1,37 +1,40 @@
1
1
  module Doc2Text
2
2
  module Odt
3
3
  module XmlNodes
4
- class PlainText
5
- include Node
6
-
7
- attr_accessor :text
8
-
9
- alias_method :expand, :text
4
+ class Node < XmlBasedDocument::XmlNodes::Node
5
+ def self.create_node(prefix, name, parent = nil, attrs = [], markdown_odt_parser = nil)
6
+ begin
7
+ clazz = XmlNodes.const_get "#{titleize prefix}::#{titleize name}"
8
+ rescue NameError => e
9
+ # markdown_odt_parser.logger.warn "No such <#{prefix}:#{name}> found"
10
+ Generic.new(parent, attrs, prefix, name, markdown_odt_parser)
11
+ else
12
+ clazz.new(parent, attrs, prefix, name, markdown_odt_parser)
13
+ end
14
+ end
10
15
 
11
- def initialize(text)
12
- @text = text
16
+ def office_text?
17
+ false
13
18
  end
14
19
  end
15
20
 
16
- class Generic
17
- include Node
21
+ class PlainText < XmlBasedDocument::XmlNodes::PlainText
18
22
  end
19
23
 
24
+ class Generic < Node
25
+ end
20
26
  #
21
27
  # These are the namespaces available in the open document format
22
28
  # http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os.html
23
29
  #
24
30
  module Office
25
- class AutomaticStyles
26
- include Node
31
+ class AutomaticStyles < Node
27
32
  end
28
33
 
29
- class DocumentContent
30
- include Node
34
+ class DocumentContent < Node
31
35
  end
32
36
 
33
- class Text
34
- include Node
37
+ class Text < Node
35
38
 
36
39
  def office_text?
37
40
  true
@@ -52,8 +55,7 @@ module Doc2Text
52
55
  module Presentation; end
53
56
  module Script; end
54
57
  module Table
55
- class TableRow
56
- include Node
58
+ class TableRow < Node
57
59
 
58
60
  def expand
59
61
  header_delimiter = parent.children.count >= 2 && parent.children[1] == self ? "\n|---|---|" : ''
@@ -63,8 +65,7 @@ module Doc2Text
63
65
  end
64
66
  end
65
67
 
66
- class TableCell
67
- include Node
68
+ class TableCell < Node
68
69
 
69
70
  def open
70
71
  ' | '
@@ -72,12 +73,10 @@ module Doc2Text
72
73
  end
73
74
  end
74
75
  module Style
75
- class Style
76
- include Node
76
+ class Style < Node
77
77
  end
78
78
 
79
- class TextProperties
80
- include Node
79
+ class TextProperties < Node
81
80
  end
82
81
  end
83
82
  module XslFoCompatible; end
@@ -86,48 +85,73 @@ module Doc2Text
86
85
  module Of; end
87
86
 
88
87
  module Text
89
- def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_odt_parser = nil)
90
- super parent, attrs, prefix, name
91
- @xml_parser = markdown_odt_parser
92
- style_index = attrs.index { |attr| attr.prefix == 'text' && attr.localname == 'style-name' }
93
- @enclosing_style = []
94
- if style_index and fetch_style?
95
- elem_style = find_style attrs[style_index].value
96
- fetch_style elem_style
88
+ def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_odt_parser = nil)
89
+ super parent, attrs, prefix, name
90
+ @xml_parser = markdown_odt_parser
91
+ style_index = attrs.index { |attr| attr.prefix == 'text' && attr.localname == 'style-name' }
92
+ @enclosing_style = []
93
+ if style_index and fetch_style?
94
+ elem_style = find_style attrs[style_index].value
95
+ fetch_style elem_style
96
+ end
97
97
  end
98
- end
99
98
 
100
- def fetch_style?
101
- true
102
- end
99
+ def fetch_style?
100
+ true
101
+ end
103
102
 
104
- def fetch_style(elem_style)
105
- if elem_style
106
- elem_style.children.select { |style_property| style_property.xml_name == 'style:text-properties' }.each { |text_property|
107
- text_property.attrs.each { |attr|
108
- if attr.prefix == 'style'
109
- if attr.localname == 'font-style-complex' && attr.value == 'italic'
110
- @enclosing_style << '_'
111
- elsif attr.localname == 'font-weight-complex' && attr.value == 'bold'
112
- @enclosing_style << '**'
103
+ def fetch_style(elem_style)
104
+ if elem_style
105
+ elem_style.children.select { |style_property| style_property.xml_name == 'style:text-properties' }.each { |text_property|
106
+ text_property.attrs.each { |attr|
107
+ if attr.prefix == 'style'
108
+ if attr.localname == 'font-style-complex' && attr.value == 'italic'
109
+ @enclosing_style << '_'
110
+ elsif attr.localname == 'font-weight-complex' && attr.value == 'bold'
111
+ @enclosing_style << '**'
112
+ end
113
113
  end
114
- end
114
+ }
115
115
  }
116
- }
116
+ end
117
+ end
118
+
119
+ def find_style(style_name)
120
+ styles = @xml_parser.xpath '/office:document-content/office:automatic-styles/style:style'
121
+ styles.find { |style| style.attrs.index { |attr| attr.prefix == 'style' && attr.localname == 'family' } &&
122
+ style.attrs.index { |attr| attr.prefix == 'style' && attr.localname == 'name' && attr.value == style_name } }
117
123
  end
118
- end
119
124
 
120
- def find_style(style_name)
121
- styles = @xml_parser.xpath '/office:document-content/office:automatic-styles/style:style'
122
- styles.find { |style| style.attrs.index { |attr| attr.prefix == 'style' && attr.localname == 'family' } &&
123
- style.attrs.index { |attr| attr.prefix == 'style' && attr.localname == 'name' && attr.value == style_name } }
125
+ # http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#__RefHeading__1419212_253892949
126
+ class H < Node
127
+ include Text
128
+ def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_odt_parser = nil)
129
+ super parent, attrs, prefix, name, markdown_odt_parser
130
+ outline_level_index = attrs.index { |attr| attr.prefix == 'text' && attr.localname == 'outline-level' }
131
+ if outline_level_index and fetch_style?
132
+ @elem_outline_level = attrs[outline_level_index].value.to_i
133
+ else
134
+ @elem_outline_level = 0
135
+ end
136
+
137
+ end
138
+
139
+ def self.style_family
140
+ 'paragraph'
141
+ end
142
+
143
+ def open
144
+ "\n#{'#' * @elem_outline_level} "
145
+ end
146
+
147
+ def close
148
+ "\n\n"
149
+ end
124
150
  end
125
151
 
126
152
  # http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#__RefHeading__1419256_253892949
127
- class P
128
- include Node
153
+ class P < Node
129
154
  include Text
130
-
131
155
  def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_odt_parser = nil)
132
156
  super parent, attrs, prefix, name, markdown_odt_parser
133
157
  end
@@ -145,8 +169,7 @@ module Doc2Text
145
169
  end
146
170
  end
147
171
 
148
- class LineBreak
149
- include Node
172
+ class LineBreak < Node
150
173
 
151
174
  def open
152
175
  '<br/>'
@@ -154,8 +177,7 @@ module Doc2Text
154
177
  end
155
178
 
156
179
  # http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#__RefHeading__1419264_253892949
157
- class Span
158
- include Node
180
+ class Span < Node
159
181
  include Text
160
182
 
161
183
  def self.style_family
@@ -178,10 +200,8 @@ module Doc2Text
178
200
  end
179
201
 
180
202
  # http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#__RefHeading__1415154_253892949
181
- class ListItem
182
- include Node
203
+ class ListItem < Node
183
204
  include Text
184
-
185
205
  def expand
186
206
  result = "* #{@children.map(&:expand).join.strip.gsub /\n{2,}/, "\n"}\n"
187
207
  delete
@@ -193,8 +213,7 @@ module Doc2Text
193
213
  end
194
214
  end
195
215
 
196
- class List
197
- include Node
216
+ class List < Node
198
217
  include Text
199
218
 
200
219
  def open
@@ -2,13 +2,8 @@ module Doc2Text
2
2
  class Resolution
3
3
  def self.parse_and_save(source, output)
4
4
  case File.extname source
5
- when '.doc', '.docx'
6
- mid_name = File.join(File.dirname(output),
7
- File.basename(source, File.extname(source)) + '.odt')
8
- system "soffice --headless --convert-to odt #{source} --outdir #{File.dirname output}"
9
- source = mid_name
10
- Doc2Text::Odt::Document.parse_and_save source, output
11
- File.delete(mid_name)
5
+ when '.docx'
6
+ Doc2Text::Docx::Document.parse_and_save source, output
12
7
  else
13
8
  Doc2Text::Odt::Document.parse_and_save source, output
14
9
  end
@@ -5,9 +5,9 @@ module Doc2Text
5
5
 
6
6
  def start_element_namespace(name ,attrs = [], prefix = nil, uri = nil, ns = [])
7
7
  unless @xml_root
8
- @xml_root = @current_node = Odt::XmlNodes::Node.create_node prefix, name, nil, attrs, self
8
+ @xml_root = @current_node = Doc2Text::Odt::XmlNodes::Node.create_node prefix, name, nil, attrs, self
9
9
  else
10
- new_node = Odt::XmlNodes::Node.create_node prefix, name, @current_node, attrs, self
10
+ new_node = Doc2Text::Odt::XmlNodes::Node.create_node prefix, name, @current_node, attrs, self
11
11
  @current_node.children << new_node
12
12
  @current_node = new_node
13
13
  end
@@ -0,0 +1,48 @@
1
+ require 'zip'
2
+
3
+ module Doc2Text
4
+ module XmlBasedDocument
5
+ class DocumentFile
6
+ def initialize(document_path)
7
+ @document_path = document_path
8
+ end
9
+
10
+ def unpack
11
+ Zip::File.open(@document_path) {
12
+ |zip_file|
13
+ Dir.mkdir(extract_path)
14
+ zip_file.each do |entry|
15
+ zipped_file_extract_path = File.join extract_path, entry.name
16
+ FileUtils.mkdir_p File.dirname(zipped_file_extract_path)
17
+ zip_file.extract entry, zipped_file_extract_path
18
+ end
19
+ }
20
+ end
21
+
22
+ def contains_extracted_files?
23
+ false
24
+ end
25
+
26
+ def clean
27
+ if File.exist?(extract_path) and contains_extracted_files?
28
+ FileUtils.rm_r extract_path
29
+ else
30
+ puts 'Failed to clean temp files'
31
+ end
32
+ end
33
+
34
+ # Open file from the current odt
35
+ def open(filename)
36
+ File.open File.join(extract_path, filename), 'r'
37
+ end
38
+
39
+ def extract_extension
40
+ 'unpacked'
41
+ end
42
+
43
+ def extract_path
44
+ File.join File.dirname(@document_path), ".#{File.basename(@document_path)}_#{extract_extension}"
45
+ end
46
+ end
47
+ end
48
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: doc2text
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.3
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Valentin Aitken
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-10-25 00:00:00.000000000 Z
11
+ date: 2019-01-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -16,57 +16,62 @@ dependencies:
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '1.6'
19
+ version: '1.8'
20
20
  - - ">="
21
21
  - !ruby/object:Gem::Version
22
- version: 1.6.3
22
+ version: 1.8.2
23
23
  type: :runtime
24
24
  prerelease: false
25
25
  version_requirements: !ruby/object:Gem::Requirement
26
26
  requirements:
27
27
  - - "~>"
28
28
  - !ruby/object:Gem::Version
29
- version: '1.6'
29
+ version: '1.8'
30
30
  - - ">="
31
31
  - !ruby/object:Gem::Version
32
- version: 1.6.3
32
+ version: 1.8.2
33
33
  - !ruby/object:Gem::Dependency
34
34
  name: rubyzip
35
35
  requirement: !ruby/object:Gem::Requirement
36
36
  requirements:
37
37
  - - "~>"
38
38
  - !ruby/object:Gem::Version
39
- version: '1.1'
39
+ version: '1.2'
40
40
  - - ">="
41
41
  - !ruby/object:Gem::Version
42
- version: 1.1.6
42
+ version: 1.2.2
43
43
  type: :runtime
44
44
  prerelease: false
45
45
  version_requirements: !ruby/object:Gem::Requirement
46
46
  requirements:
47
47
  - - "~>"
48
48
  - !ruby/object:Gem::Version
49
- version: '1.1'
49
+ version: '1.2'
50
50
  - - ">="
51
51
  - !ruby/object:Gem::Version
52
- version: 1.1.6
52
+ version: 1.2.2
53
53
  description: Parses odt to markdown
54
- email: bostko@gmail.com
54
+ email: valentin@nalisbg.com
55
55
  executables: []
56
56
  extensions: []
57
57
  extra_rdoc_files: []
58
58
  files:
59
+ - bin/doc2text
59
60
  - lib/doc2text.rb
61
+ - lib/doc2text/docx/docx.rb
62
+ - lib/doc2text/docx/docx_xml_namespaces.rb
63
+ - lib/doc2text/docx/markdown_docx_parser.rb
60
64
  - lib/doc2text/errors.rb
61
- - lib/doc2text/markdown_odt_parser.rb
62
- - lib/doc2text/odt.rb
63
- - lib/doc2text/odt_xml_namespaces.rb
64
- - lib/doc2text/odt_xml_node.rb
65
+ - lib/doc2text/generic_xml_nodes.rb
66
+ - lib/doc2text/odt/markdown_odt_parser.rb
67
+ - lib/doc2text/odt/odt.rb
68
+ - lib/doc2text/odt/odt_xml_namespaces.rb
65
69
  - lib/doc2text/resolution.rb
66
70
  - lib/doc2text/styles_parser.rb
71
+ - lib/doc2text/xml_based_document_file.rb
67
72
  homepage: http://doc2text.com
68
73
  licenses:
69
- - GPL
74
+ - Apache-2.0
70
75
  metadata: {}
71
76
  post_install_message:
72
77
  rdoc_options: []
@@ -84,9 +89,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
84
89
  version: '0'
85
90
  requirements: []
86
91
  rubyforge_project:
87
- rubygems_version: 2.2.2
92
+ rubygems_version: 2.7.8
88
93
  signing_key:
89
94
  specification_version: 4
90
95
  summary: Translates odt to markdown
91
96
  test_files: []
92
- has_rdoc:
@@ -1,80 +0,0 @@
1
- require 'zip'
2
-
3
- module Doc2Text
4
- module Odt
5
- class Document
6
- EXTRACT_EXTENSION = 'unpacked_odt'
7
-
8
- def self.parse_and_save(input, output_filename)
9
- odt = new input
10
- begin
11
- odt.unpack
12
- styles_xml_root = odt.parse_styles
13
- output = File.open output_filename, 'w'
14
- markdown = Markdown::OdtParser.new output, styles_xml_root
15
- begin
16
- odt.parse markdown
17
- ensure
18
- markdown.close
19
- end
20
- ensure
21
- odt.clean
22
- end
23
- end
24
-
25
- def parse_styles
26
- styles_parser = Doc2Text::Odt::StylesParser.new
27
- xml = Nokogiri::XML::SAX::Parser.new(styles_parser)
28
- xml.parse open 'styles.xml'
29
- styles_parser.xml_root
30
- end
31
-
32
- def parse(markdown)
33
- parser = Nokogiri::XML::SAX::Parser.new(markdown) # { |config| config.strict}
34
- parser.parse open 'content.xml'
35
- end
36
-
37
- def initialize(document_path)
38
- @document_path = document_path
39
- end
40
-
41
- def unpack
42
- Zip::File.open(@document_path) {
43
- |zip_file|
44
- Dir.mkdir(extract_path)
45
- zip_file.each do |entry|
46
- zipped_file_extract_path = File.join extract_path, entry.name
47
- FileUtils.mkdir_p File.dirname(zipped_file_extract_path)
48
- zip_file.extract entry, zipped_file_extract_path
49
- end
50
- }
51
- end
52
-
53
- def clean
54
- if [extract_path, File.join(extract_path, 'content.xml'), File.join(extract_path, 'mimetype')].all? { |file| File.exist?(file) }
55
- FileUtils.rm_r extract_path
56
- end
57
- end
58
-
59
- # Open file from the current odt
60
- def open(filename)
61
- File.open File.join(extract_path, filename), 'r'
62
- end
63
-
64
- # Parse xml file from the current odt
65
- def xml_file(filename, rood_node_name)
66
- Nokogiri::XML::Document.parse(open(filename)) { |config| config.strict }
67
- root_node = doc.root
68
- if root_node.name != rood_node_name or root_node.namespace.prefix != 'office'
69
- raise XmlError, 'Document does not have correct root element'
70
- else
71
- open(filename)
72
- end
73
- end
74
-
75
- def extract_path
76
- File.join File.dirname(@document_path), ".#{File.basename(@document_path)}_#{EXTRACT_EXTENSION}"
77
- end
78
- end
79
- end
80
- end