doc2text 0.3.3 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: f2530e3597409c35637493d9d8e9842b2f70f005
4
- data.tar.gz: 304eb91cfa9b109bfa853cbc7d9736a14b7817bd
2
+ SHA256:
3
+ metadata.gz: 7b9b5aeaa63d276696f0f4f716242b181d8d3aef2e47861053c03f9623cdf498
4
+ data.tar.gz: a9ac2a3e0314334dda782f8ce8ef0d5a0691015ae70da0cc7a7fd79b2d6d7cd2
5
5
  SHA512:
6
- metadata.gz: 9ff147cd7f87f233a6fbbc9270e682426ffc3b5df514a5b7ca2b16b0405a4048901cb53972d5264c853eabce7532b9bb386f97847e5b072189119ec616b2961f
7
- data.tar.gz: f992a33120ae4273966a54a15e02768abf4baa16473808775e938fed21163882e46c46333dfa7ea9a63cf5bd9e4ef02de4bbb8f50b7d9ab86dbe5a4a8ee6d3cc
6
+ metadata.gz: 88fcdc3ade712a061c62641dd7713274c676f41d37c7020348ad401e0e7af3a86b07a3726a057870396ee68a290867fccf635d3191c8376b45850507e2f566e9
7
+ data.tar.gz: a96c1f4cbfbb42079f5e5d6eea757531d7d7e852e01724c40cc58f94ee5ebd27e700bdffc221cc7c7202a3686a6f3c40d8a5e153f07dcdca7aa4fd542b13eac9
@@ -0,0 +1,43 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ $LOAD_PATH.unshift File.dirname(__FILE__) + '/../lib'
4
+ require 'doc2text'
5
+ require 'optparse'
6
+
7
# Command-line entry point: converts a source document to a markdown file.
#
# Two invocation forms are accepted (see the banner below):
#   doc2text SOURCE OUTPUT
#   doc2text -s SOURCE -o OUTPUT
options = {}
opt_parse = OptionParser.new do |opts|
  opts.banner = "Usage: doc2text document.odt output.md\n" \
                "or: doc2text [OPTION]... -s input[.odt] -o output[.md]\n\n"

  opts.on('-s FILE', '--source FILE', 'Odt FILE document to be processed') do |file|
    options[:source] = file
  end

  opts.on('-o FILE', '--output FILE', 'Output to markdown document FILE') do |file|
    options[:output] = file
  end

  opts.on_tail('-h', '--help', 'Show this message') do
    puts opts
    exit
  end
end

begin
  # parse! removes the recognised switches from ARGV, leaving positionals.
  opt_parse.parse!
rescue OptionParser::InvalidOption, OptionParser::MissingArgument => e
  puts e.message
  puts opt_parse
  exit
end

if options.empty?
  # No -s/-o given: fall back to the two-positional-argument form.
  if ARGV.size == 2
    options[:source], options[:output] = ARGV
  else
    puts opt_parse
    exit
  end
end

# Giving only one of -s / -o used to pass nil on to parse_and_save and crash
# with a raw TypeError; report it as a usage error instead.
unless options[:source] && options[:output]
  warn 'Both a source document and an output file are required'
  warn opt_parse
  exit 1
end

Doc2Text::Resolution.parse_and_save options[:source], options[:output]
@@ -2,11 +2,17 @@ require 'nokogiri'
2
2
  #require 'nokogiri/xml'
3
3
  require 'fileutils'
4
4
 
5
+ require 'doc2text/xml_based_document_file'
6
+ require 'doc2text/generic_xml_nodes'
5
7
  require 'doc2text/resolution'
6
- require 'doc2text/odt'
7
- require 'doc2text/odt_xml_node'
8
- require 'doc2text/odt_xml_namespaces'
9
- require 'doc2text/markdown_odt_parser'
10
8
  require 'doc2text/errors'
11
9
 
12
- require 'doc2text/styles_parser'
10
+ require 'doc2text/odt/odt'
11
+ require 'doc2text/odt/odt_xml_namespaces'
12
+ require 'doc2text/odt/markdown_odt_parser'
13
+
14
+ require 'doc2text/docx/docx'
15
+ require 'doc2text/docx/markdown_docx_parser'
16
+ require 'doc2text/docx/docx_xml_namespaces'
17
+
18
+ require 'doc2text/styles_parser'
@@ -0,0 +1,31 @@
1
module Doc2Text
  module Docx
    # A .docx package that is unpacked next to the source file, streamed
    # through Markdown::DocxParser and cleaned up afterwards.
    class Document < XmlBasedDocument::DocumentFile
      # Path of the main document part inside the unpacked package.
      # NOTE(review): assumes the conventional location; the package
      # relationships (_rels/.rels) are not consulted - confirm this is enough.
      MAIN_DOCUMENT_PART = 'word/document.xml'

      # Converts the docx at +input+ to markdown written to +output_filename+.
      #
      # The extraction directory is removed (via #clean) even when parsing
      # fails, and the output is closed through the parser's #close.
      def self.parse_and_save(input, output_filename)
        docx = new input
        begin
          docx.unpack
          styles_xml_root = docx.parse_styles
          output = File.open output_filename, 'w'
          markdown = Markdown::DocxParser.new output, styles_xml_root
          begin
            docx.parse markdown
          ensure
            markdown.close
          end
        ensure
          docx.clean
        end
      end

      # Returns the styles tree handed to Markdown::DocxParser.
      #
      # No docx styles parser exists, so this returns nil, which is the
      # parser's default for its optional styles root. The method has to be
      # defined because parse_and_save calls it.
      def parse_styles
        nil
      end

      # Streams the main document part through the given SAX handler.
      #
      # @param markdown [Markdown::DocxParser] SAX document receiving the events
      def parse(markdown)
        parser = Nokogiri::XML::SAX::Parser.new(markdown)
        document_xml = open MAIN_DOCUMENT_PART
        begin
          parser.parse document_xml
        ensure
          # Release the handle before #clean removes the extraction directory.
          document_xml.close unless document_xml.closed?
        end
      end

      # True when the extraction directory holds the package's
      # [Content_Types].xml; #clean only deletes the directory in that case.
      def contains_extracted_files?
        File.exist? File.join(extract_path, '[Content_Types].xml')
      end

      # Suffix used for the name of the temporary extraction directory.
      def extract_extension
        'unpacked_docx'
      end
    end
  end
end
@@ -0,0 +1,55 @@
1
module Doc2Text
  module Docx
    # Node classes for the elements met while parsing a docx document.
    module XmlNodes
      # Base class of every docx element node.
      class Node < XmlBasedDocument::XmlNodes::Node
        # Builds the node for <prefix:name>.
        #
        # The class is looked up as XmlNodes::<Prefix>::W<name> (for example
        # W::Wp for <w:p>); when no such constant exists a Generic node is
        # created instead.
        def self.create_node(prefix, name, parent = nil, attrs = [], markdown_odt_parser = nil)
          node_class =
            begin
              XmlNodes.const_get "#{prefix.capitalize}::W#{name}"
            rescue NameError
              # Unknown element: no dedicated class, use the catch-all node.
              Generic
            end
          node_class.new(parent, attrs, prefix, name, markdown_odt_parser)
        end

        # Only the document body element answers true (see W::Wbody).
        def body?
          false
        end
      end

      # Character data found between elements.
      class PlainText < XmlBasedDocument::XmlNodes::PlainText
        def body?
          false
        end
      end

      # Catch-all node for elements without a dedicated class.
      class Generic < Node
      end

      # Element classes for the "w" prefix.
      module W
        # <w:body>: the parser writes out each of its direct children as soon
        # as that child is closed.
        class Wbody < Node
          def body?
            true
          end
        end

        # <w:br>: rendered as an HTML line break.
        class Wbr < Node
          def open
            '<br/>'
          end
        end

        # <w:p>: paragraph, surrounded by newlines.
        class Wp < Node
          def open
            "\n"
          end

          def close
            "\n"
          end
        end
      end
    end
  end
end
@@ -0,0 +1,81 @@
1
+ require 'logger'
2
+
3
module Doc2Text
  module Markdown
    # SAX handler that builds a tree of Docx::XmlNodes while the document is
    # streamed, and appends the expansion of each direct child of the body
    # element to +output+ as soon as that child is closed.
    class DocxParser < Nokogiri::XML::SAX::Document
      # Accepted xpath subset: absolute paths made of /name steps only.
      # Anchored with \A and \z so the whole string must match, not just one
      # of its lines.
      XPATH_PATTERN = %r{\A(/[\w:\-]+)+\z}

      # @param output [#<<, #close] destination of the generated markdown
      # @param styles_xml_root [Object, nil] optional root node of a styles
      #   tree that #xpath searches in addition to the document tree
      def initialize(output, styles_xml_root = nil)
        @styles_xml_root = styles_xml_root
        @output = output
        @automatic_styles = {}
      end

      # SAX callback: creates the node for the opened element and makes it the
      # current node. The first element seen becomes the root of the tree.
      def start_element_namespace(name ,attrs = [], prefix = nil, uri = nil, ns = [])
        if @xml_root
          new_node = Docx::XmlNodes::Node.create_node prefix, name, @current_node, attrs, self
          @current_node.children << new_node
          @current_node = new_node
        else
          @xml_root = @current_node = Docx::XmlNodes::Node.create_node prefix, name, nil, attrs, self
        end
      end

      # SAX callback: when the closed element is a direct child of the body,
      # its expansion is written out and the node is deleted.
      def end_element_namespace(name, prefix = nil, uri = nil)
        if @current_node.parent && @current_node.parent.body?
          @output << @current_node.expand
          @current_node.delete
        end
        @current_node = @current_node.parent
      end

      # SAX callback: stores text that is not whitespace-only as a PlainText
      # child of the current node.
      def characters(string)
        return if string.strip.empty?

        @current_node.children << Docx::XmlNodes::PlainText.new(string)
      end

      # Closes the output given to the constructor.
      def close
        @output.close
      end

      # Debug helper: prints +node+ and all of its descendants, depth first.
      def print_tree(node)
        puts node
        node.children.each { |child| print_tree child }
      end

      # Select nodes xpath style
      # - supports selecting from the root node
      #
      # +string+ holds one or more absolute paths separated by '|'. Each path
      # is matched against the document tree and, when present, the styles
      # tree.
      #
      # @raise [Doc2Text::XmlError] when a path uses unsupported syntax
      def xpath(string)
        patterns = string.split '|'
        raise Doc2Text::XmlError, 'it does not support this xpath syntax' if patterns.empty?

        result = []
        patterns.each do |pattern|
          raise Doc2Text::XmlError, 'it does not support this xpath syntax' unless XPATH_PATTERN =~ pattern

          path = pattern.scan(/[\w:\-]+/)
          # The document root is nil until the first element has been parsed.
          result += xpath_search_nodes(path, @xml_root) if @xml_root
          result += xpath_search_nodes(path, @styles_xml_root) if @styles_xml_root
        end
        result
      end

      # Walks +path+ (an array of element names) down from +xml_root+: each
      # step keeps the nodes whose xml_name matches and, except on the last
      # step, descends into their children.
      def xpath_search_nodes(path, xml_root)
        seek_nodes = [xml_root]
        path.each_with_index do |xml_name, index|
          seek_nodes.select! { |node| node.xml_name == xml_name }
          seek_nodes = seek_nodes.map(&:children).flatten unless index == path.length - 1
          break if seek_nodes.empty?
        end
        seek_nodes
      end

      # Lazily created logger writing to STDOUT.
      def logger
        @logger ||= Logger.new(STDOUT)
      end
    end
  end
end
@@ -1,25 +1,16 @@
1
1
  module Doc2Text
2
- module Odt
2
+ module XmlBasedDocument
3
3
  module XmlNodes
4
- module Node
4
+ class Node
5
5
  attr_reader :parent, :children, :attrs, :prefix, :name
6
6
  attr_accessor :text
7
7
 
8
- def self.create_node(prefix, name, parent = nil, attrs = [], markdown_odt_parser = nil)
9
- begin
10
- clazz = XmlNodes.const_get "#{titleize prefix}::#{titleize name}"
11
- rescue NameError => e
12
- # markdown_odt_parser.logger.warn "No such <#{prefix}:#{name}> found"
13
- Generic.new(parent, attrs, prefix, name, markdown_odt_parser)
14
- else
15
- clazz.new(parent, attrs, prefix, name, markdown_odt_parser)
8
+ def self.inherited(subclass)
9
+ def subclass.titleize(tag)
10
+ tag.split('-').map(&:capitalize).join
16
11
  end
17
12
  end
18
13
 
19
- def self.titleize(tag)
20
- tag.split('-').map(&:capitalize).join
21
- end
22
-
23
14
  def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_odt_parser = nil)
24
15
  @parent, @attrs, @prefix, @name, @xml_parser = parent, attrs, prefix, name, markdown_odt_parser
25
16
  @children = []
@@ -42,10 +33,6 @@ module Doc2Text
42
33
  ''
43
34
  end
44
35
 
45
- def office_text?
46
- false
47
- end
48
-
49
36
  def delete
50
37
  return true unless @children
51
38
  @children.each { |child| child.delete }
@@ -74,24 +61,16 @@ module Doc2Text
74
61
  delete
75
62
  expanded.clone
76
63
  end
64
+ end
77
65
 
78
- def not_enclosing?
79
- !root? && parent.class.not_enclosing_tags && parent.class.not_enclosing_tags.find do |tag|
80
- @prefix == parent.prefix && @name == tag
81
- end
82
- end
66
+ class PlainText < Node
83
67
 
84
- def self.included(base)
85
- base.extend ClassMethods
86
- end
68
+ attr_accessor :text
87
69
 
88
- module ClassMethods
89
- attr_reader :not_enclosing_tags
70
+ alias_method :expand, :text
90
71
 
91
- def not_enclosing(tag)
92
- @not_enclosing_tags ||= []
93
- @not_enclosing_tags << tag
94
- end
72
+ def initialize(text)
73
+ @text = text
95
74
  end
96
75
  end
97
76
  end
@@ -0,0 +1,43 @@
1
module Doc2Text
  module Odt
    # An .odt package that is unpacked next to the source file, streamed
    # through Markdown::OdtParser and cleaned up afterwards.
    class Document < XmlBasedDocument::DocumentFile

      # Suffix used for the name of the temporary extraction directory.
      def extract_extension
        'unpacked_odt'
      end

      # Converts the odt at +input+ to markdown written to +output_filename+.
      #
      # The extraction directory is removed (via #clean) even when parsing
      # fails, and the output is closed through the parser's #close.
      def self.parse_and_save(input, output_filename)
        odt = new input
        begin
          odt.unpack
          styles_xml_root = odt.parse_styles
          output = File.open output_filename, 'w'
          markdown = Markdown::OdtParser.new output, styles_xml_root
          begin
            odt.parse markdown
          ensure
            markdown.close
          end
        ensure
          odt.clean
        end
      end

      # Parses styles.xml from the unpacked document and returns the root
      # node collected by the styles parser.
      def parse_styles
        styles_parser = Doc2Text::Odt::StylesParser.new
        xml = Nokogiri::XML::SAX::Parser.new(styles_parser)
        with_extracted_file('styles.xml') { |styles_xml| xml.parse styles_xml }
        styles_parser.xml_root
      end

      # Streams content.xml through the given SAX handler.
      #
      # @param markdown [Markdown::OdtParser] SAX document receiving the events
      def parse(markdown)
        parser = Nokogiri::XML::SAX::Parser.new(markdown) # { |config| config.strict}
        with_extracted_file('content.xml') { |content_xml| parser.parse content_xml }
      end

      # True when the extraction directory holds both content.xml and
      # mimetype; #clean only deletes the directory in that case.
      def contains_extracted_files?
        [File.join(extract_path, 'content.xml'), File.join(extract_path, 'mimetype')].all? { |file| File.exist?(file) }
      end

      private

      # Opens +filename+ from the extraction directory, yields the handle and
      # always closes it afterwards, so that no handle is still open when
      # #clean removes the directory.
      def with_extracted_file(filename)
        handle = open filename
        begin
          yield handle
        ensure
          handle.close unless handle.closed?
        end
      end
    end
  end
end
@@ -1,37 +1,40 @@
1
1
  module Doc2Text
2
2
  module Odt
3
3
  module XmlNodes
4
- class PlainText
5
- include Node
6
-
7
- attr_accessor :text
8
-
9
- alias_method :expand, :text
4
+ class Node < XmlBasedDocument::XmlNodes::Node
5
+ def self.create_node(prefix, name, parent = nil, attrs = [], markdown_odt_parser = nil)
6
+ begin
7
+ clazz = XmlNodes.const_get "#{titleize prefix}::#{titleize name}"
8
+ rescue NameError => e
9
+ # markdown_odt_parser.logger.warn "No such <#{prefix}:#{name}> found"
10
+ Generic.new(parent, attrs, prefix, name, markdown_odt_parser)
11
+ else
12
+ clazz.new(parent, attrs, prefix, name, markdown_odt_parser)
13
+ end
14
+ end
10
15
 
11
- def initialize(text)
12
- @text = text
16
+ def office_text?
17
+ false
13
18
  end
14
19
  end
15
20
 
16
- class Generic
17
- include Node
21
+ class PlainText < XmlBasedDocument::XmlNodes::PlainText
18
22
  end
19
23
 
24
+ class Generic < Node
25
+ end
20
26
  #
21
27
  # These are the namespaces available in the open document format
22
28
  # http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os.html
23
29
  #
24
30
  module Office
25
- class AutomaticStyles
26
- include Node
31
+ class AutomaticStyles < Node
27
32
  end
28
33
 
29
- class DocumentContent
30
- include Node
34
+ class DocumentContent < Node
31
35
  end
32
36
 
33
- class Text
34
- include Node
37
+ class Text < Node
35
38
 
36
39
  def office_text?
37
40
  true
@@ -52,8 +55,7 @@ module Doc2Text
52
55
  module Presentation; end
53
56
  module Script; end
54
57
  module Table
55
- class TableRow
56
- include Node
58
+ class TableRow < Node
57
59
 
58
60
  def expand
59
61
  header_delimiter = parent.children.count >= 2 && parent.children[1] == self ? "\n|---|---|" : ''
@@ -63,8 +65,7 @@ module Doc2Text
63
65
  end
64
66
  end
65
67
 
66
- class TableCell
67
- include Node
68
+ class TableCell < Node
68
69
 
69
70
  def open
70
71
  ' | '
@@ -72,12 +73,10 @@ module Doc2Text
72
73
  end
73
74
  end
74
75
  module Style
75
- class Style
76
- include Node
76
+ class Style < Node
77
77
  end
78
78
 
79
- class TextProperties
80
- include Node
79
+ class TextProperties < Node
81
80
  end
82
81
  end
83
82
  module XslFoCompatible; end
@@ -86,48 +85,73 @@ module Doc2Text
86
85
  module Of; end
87
86
 
88
87
  module Text
89
- def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_odt_parser = nil)
90
- super parent, attrs, prefix, name
91
- @xml_parser = markdown_odt_parser
92
- style_index = attrs.index { |attr| attr.prefix == 'text' && attr.localname == 'style-name' }
93
- @enclosing_style = []
94
- if style_index and fetch_style?
95
- elem_style = find_style attrs[style_index].value
96
- fetch_style elem_style
88
+ def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_odt_parser = nil)
89
+ super parent, attrs, prefix, name
90
+ @xml_parser = markdown_odt_parser
91
+ style_index = attrs.index { |attr| attr.prefix == 'text' && attr.localname == 'style-name' }
92
+ @enclosing_style = []
93
+ if style_index and fetch_style?
94
+ elem_style = find_style attrs[style_index].value
95
+ fetch_style elem_style
96
+ end
97
97
  end
98
- end
99
98
 
100
- def fetch_style?
101
- true
102
- end
99
+ def fetch_style?
100
+ true
101
+ end
103
102
 
104
- def fetch_style(elem_style)
105
- if elem_style
106
- elem_style.children.select { |style_property| style_property.xml_name == 'style:text-properties' }.each { |text_property|
107
- text_property.attrs.each { |attr|
108
- if attr.prefix == 'style'
109
- if attr.localname == 'font-style-complex' && attr.value == 'italic'
110
- @enclosing_style << '_'
111
- elsif attr.localname == 'font-weight-complex' && attr.value == 'bold'
112
- @enclosing_style << '**'
103
+ def fetch_style(elem_style)
104
+ if elem_style
105
+ elem_style.children.select { |style_property| style_property.xml_name == 'style:text-properties' }.each { |text_property|
106
+ text_property.attrs.each { |attr|
107
+ if attr.prefix == 'style'
108
+ if attr.localname == 'font-style-complex' && attr.value == 'italic'
109
+ @enclosing_style << '_'
110
+ elsif attr.localname == 'font-weight-complex' && attr.value == 'bold'
111
+ @enclosing_style << '**'
112
+ end
113
113
  end
114
- end
114
+ }
115
115
  }
116
- }
116
+ end
117
+ end
118
+
119
+ def find_style(style_name)
120
+ styles = @xml_parser.xpath '/office:document-content/office:automatic-styles/style:style'
121
+ styles.find { |style| style.attrs.index { |attr| attr.prefix == 'style' && attr.localname == 'family' } &&
122
+ style.attrs.index { |attr| attr.prefix == 'style' && attr.localname == 'name' && attr.value == style_name } }
117
123
  end
118
- end
119
124
 
120
- def find_style(style_name)
121
- styles = @xml_parser.xpath '/office:document-content/office:automatic-styles/style:style'
122
- styles.find { |style| style.attrs.index { |attr| attr.prefix == 'style' && attr.localname == 'family' } &&
123
- style.attrs.index { |attr| attr.prefix == 'style' && attr.localname == 'name' && attr.value == style_name } }
125
+ # http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#__RefHeading__1419212_253892949
126
+ class H < Node
127
+ include Text
128
+ def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_odt_parser = nil)
129
+ super parent, attrs, prefix, name, markdown_odt_parser
130
+ outline_level_index = attrs.index { |attr| attr.prefix == 'text' && attr.localname == 'outline-level' }
131
+ if outline_level_index and fetch_style?
132
+ @elem_outline_level = attrs[outline_level_index].value.to_i
133
+ else
134
+ @elem_outline_level = 0
135
+ end
136
+
137
+ end
138
+
139
+ def self.style_family
140
+ 'paragraph'
141
+ end
142
+
143
+ def open
144
+ "\n#{'#' * @elem_outline_level} "
145
+ end
146
+
147
+ def close
148
+ "\n\n"
149
+ end
124
150
  end
125
151
 
126
152
  # http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#__RefHeading__1419256_253892949
127
- class P
128
- include Node
153
+ class P < Node
129
154
  include Text
130
-
131
155
  def initialize(parent = nil, attrs = [], prefix = nil, name = nil, markdown_odt_parser = nil)
132
156
  super parent, attrs, prefix, name, markdown_odt_parser
133
157
  end
@@ -145,8 +169,7 @@ module Doc2Text
145
169
  end
146
170
  end
147
171
 
148
- class LineBreak
149
- include Node
172
+ class LineBreak < Node
150
173
 
151
174
  def open
152
175
  '<br/>'
@@ -154,8 +177,7 @@ module Doc2Text
154
177
  end
155
178
 
156
179
  # http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#__RefHeading__1419264_253892949
157
- class Span
158
- include Node
180
+ class Span < Node
159
181
  include Text
160
182
 
161
183
  def self.style_family
@@ -178,10 +200,8 @@ module Doc2Text
178
200
  end
179
201
 
180
202
  # http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#__RefHeading__1415154_253892949
181
- class ListItem
182
- include Node
203
+ class ListItem < Node
183
204
  include Text
184
-
185
205
  def expand
186
206
  result = "* #{@children.map(&:expand).join.strip.gsub /\n{2,}/, "\n"}\n"
187
207
  delete
@@ -193,8 +213,7 @@ module Doc2Text
193
213
  end
194
214
  end
195
215
 
196
- class List
197
- include Node
216
+ class List < Node
198
217
  include Text
199
218
 
200
219
  def open
@@ -2,13 +2,8 @@ module Doc2Text
2
2
  class Resolution
3
3
  def self.parse_and_save(source, output)
4
4
  case File.extname source
5
- when '.doc', '.docx'
6
- mid_name = File.join(File.dirname(output),
7
- File.basename(source, File.extname(source)) + '.odt')
8
- system "soffice --headless --convert-to odt #{source} --outdir #{File.dirname output}"
9
- source = mid_name
10
- Doc2Text::Odt::Document.parse_and_save source, output
11
- File.delete(mid_name)
5
+ when '.docx'
6
+ Doc2Text::Docx::Document.parse_and_save source, output
12
7
  else
13
8
  Doc2Text::Odt::Document.parse_and_save source, output
14
9
  end
@@ -5,9 +5,9 @@ module Doc2Text
5
5
 
6
6
  def start_element_namespace(name ,attrs = [], prefix = nil, uri = nil, ns = [])
7
7
  unless @xml_root
8
- @xml_root = @current_node = Odt::XmlNodes::Node.create_node prefix, name, nil, attrs, self
8
+ @xml_root = @current_node = Doc2Text::Odt::XmlNodes::Node.create_node prefix, name, nil, attrs, self
9
9
  else
10
- new_node = Odt::XmlNodes::Node.create_node prefix, name, @current_node, attrs, self
10
+ new_node = Doc2Text::Odt::XmlNodes::Node.create_node prefix, name, @current_node, attrs, self
11
11
  @current_node.children << new_node
12
12
  @current_node = new_node
13
13
  end
@@ -0,0 +1,48 @@
1
+ require 'zip'
2
+
3
module Doc2Text
  module XmlBasedDocument
    # Base class for zip-packaged XML documents. The archive is unpacked into
    # a hidden directory next to the document (see #extract_path), read from
    # there, and removed again with #clean.
    class DocumentFile
      # @param document_path [String] path of the zipped document
      def initialize(document_path)
        @document_path = document_path
      end

      # Extracts every archive entry below #extract_path.
      #
      # Entry names come from the (untrusted) document, so an entry whose
      # resolved destination falls outside the extraction directory - for
      # example a name containing "../" - is skipped with a warning rather
      # than written.
      #
      # Dir.mkdir raises when the extraction directory already exists.
      def unpack
        Zip::File.open(@document_path) do |zip_file|
          Dir.mkdir(extract_path)
          zip_file.each do |entry|
            zipped_file_extract_path = File.join extract_path, entry.name
            unless inside_extract_path?(zipped_file_extract_path)
              warn "Skipping zip entry outside of the extraction directory: #{entry.name}"
              next
            end
            FileUtils.mkdir_p File.dirname(zipped_file_extract_path)
            zip_file.extract entry, zipped_file_extract_path
          end
        end
      end

      # Whether the extraction directory holds the files expected for this
      # document type. Always false here, so the base #clean never deletes
      # anything unless this method is overridden.
      def contains_extracted_files?
        false
      end

      # Removes the extraction directory, but only when it exists and
      # #contains_extracted_files? confirms its contents; otherwise a message
      # is printed and nothing is deleted.
      def clean
        if File.exist?(extract_path) && contains_extracted_files?
          FileUtils.rm_r extract_path
        else
          puts 'Failed to clean temp files'
        end
      end

      # Opens +filename+ from the extraction directory for reading.
      # The caller is responsible for closing the returned File.
      def open(filename)
        File.open File.join(extract_path, filename), 'r'
      end

      # Suffix used for the name of the extraction directory.
      def extract_extension
        'unpacked'
      end

      # Hidden directory next to the document:
      # <dir>/.<document file name>_<extract_extension>
      def extract_path
        File.join File.dirname(@document_path), ".#{File.basename(@document_path)}_#{extract_extension}"
      end

      private

      # True when +path+, once "." and ".." segments are resolved, lies
      # strictly inside the extraction directory.
      def inside_extract_path?(path)
        root = File.expand_path(extract_path)
        File.expand_path(path).start_with?(root + File::SEPARATOR)
      end
    end
  end
end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: doc2text
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.3
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Valentin Aitken
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-10-25 00:00:00.000000000 Z
11
+ date: 2019-01-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -16,57 +16,62 @@ dependencies:
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '1.6'
19
+ version: '1.8'
20
20
  - - ">="
21
21
  - !ruby/object:Gem::Version
22
- version: 1.6.3
22
+ version: 1.8.2
23
23
  type: :runtime
24
24
  prerelease: false
25
25
  version_requirements: !ruby/object:Gem::Requirement
26
26
  requirements:
27
27
  - - "~>"
28
28
  - !ruby/object:Gem::Version
29
- version: '1.6'
29
+ version: '1.8'
30
30
  - - ">="
31
31
  - !ruby/object:Gem::Version
32
- version: 1.6.3
32
+ version: 1.8.2
33
33
  - !ruby/object:Gem::Dependency
34
34
  name: rubyzip
35
35
  requirement: !ruby/object:Gem::Requirement
36
36
  requirements:
37
37
  - - "~>"
38
38
  - !ruby/object:Gem::Version
39
- version: '1.1'
39
+ version: '1.2'
40
40
  - - ">="
41
41
  - !ruby/object:Gem::Version
42
- version: 1.1.6
42
+ version: 1.2.2
43
43
  type: :runtime
44
44
  prerelease: false
45
45
  version_requirements: !ruby/object:Gem::Requirement
46
46
  requirements:
47
47
  - - "~>"
48
48
  - !ruby/object:Gem::Version
49
- version: '1.1'
49
+ version: '1.2'
50
50
  - - ">="
51
51
  - !ruby/object:Gem::Version
52
- version: 1.1.6
52
+ version: 1.2.2
53
53
  description: Parses odt to markdown
54
- email: bostko@gmail.com
54
+ email: valentin@nalisbg.com
55
55
  executables: []
56
56
  extensions: []
57
57
  extra_rdoc_files: []
58
58
  files:
59
+ - bin/doc2text
59
60
  - lib/doc2text.rb
61
+ - lib/doc2text/docx/docx.rb
62
+ - lib/doc2text/docx/docx_xml_namespaces.rb
63
+ - lib/doc2text/docx/markdown_docx_parser.rb
60
64
  - lib/doc2text/errors.rb
61
- - lib/doc2text/markdown_odt_parser.rb
62
- - lib/doc2text/odt.rb
63
- - lib/doc2text/odt_xml_namespaces.rb
64
- - lib/doc2text/odt_xml_node.rb
65
+ - lib/doc2text/generic_xml_nodes.rb
66
+ - lib/doc2text/odt/markdown_odt_parser.rb
67
+ - lib/doc2text/odt/odt.rb
68
+ - lib/doc2text/odt/odt_xml_namespaces.rb
65
69
  - lib/doc2text/resolution.rb
66
70
  - lib/doc2text/styles_parser.rb
71
+ - lib/doc2text/xml_based_document_file.rb
67
72
  homepage: http://doc2text.com
68
73
  licenses:
69
- - GPL
74
+ - Apache-2.0
70
75
  metadata: {}
71
76
  post_install_message:
72
77
  rdoc_options: []
@@ -84,9 +89,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
84
89
  version: '0'
85
90
  requirements: []
86
91
  rubyforge_project:
87
- rubygems_version: 2.2.2
92
+ rubygems_version: 2.7.8
88
93
  signing_key:
89
94
  specification_version: 4
90
95
  summary: Translates odt to markdown
91
96
  test_files: []
92
- has_rdoc:
@@ -1,80 +0,0 @@
1
- require 'zip'
2
-
3
- module Doc2Text
4
- module Odt
5
- class Document
6
- EXTRACT_EXTENSION = 'unpacked_odt'
7
-
8
- def self.parse_and_save(input, output_filename)
9
- odt = new input
10
- begin
11
- odt.unpack
12
- styles_xml_root = odt.parse_styles
13
- output = File.open output_filename, 'w'
14
- markdown = Markdown::OdtParser.new output, styles_xml_root
15
- begin
16
- odt.parse markdown
17
- ensure
18
- markdown.close
19
- end
20
- ensure
21
- odt.clean
22
- end
23
- end
24
-
25
- def parse_styles
26
- styles_parser = Doc2Text::Odt::StylesParser.new
27
- xml = Nokogiri::XML::SAX::Parser.new(styles_parser)
28
- xml.parse open 'styles.xml'
29
- styles_parser.xml_root
30
- end
31
-
32
- def parse(markdown)
33
- parser = Nokogiri::XML::SAX::Parser.new(markdown) # { |config| config.strict}
34
- parser.parse open 'content.xml'
35
- end
36
-
37
- def initialize(document_path)
38
- @document_path = document_path
39
- end
40
-
41
- def unpack
42
- Zip::File.open(@document_path) {
43
- |zip_file|
44
- Dir.mkdir(extract_path)
45
- zip_file.each do |entry|
46
- zipped_file_extract_path = File.join extract_path, entry.name
47
- FileUtils.mkdir_p File.dirname(zipped_file_extract_path)
48
- zip_file.extract entry, zipped_file_extract_path
49
- end
50
- }
51
- end
52
-
53
- def clean
54
- if [extract_path, File.join(extract_path, 'content.xml'), File.join(extract_path, 'mimetype')].all? { |file| File.exist?(file) }
55
- FileUtils.rm_r extract_path
56
- end
57
- end
58
-
59
- # Open file from the current odt
60
- def open(filename)
61
- File.open File.join(extract_path, filename), 'r'
62
- end
63
-
64
- # Parse xml file from the current odt
65
- def xml_file(filename, rood_node_name)
66
- Nokogiri::XML::Document.parse(open(filename)) { |config| config.strict }
67
- root_node = doc.root
68
- if root_node.name != rood_node_name or root_node.namespace.prefix != 'office'
69
- raise XmlError, 'Document does not have correct root element'
70
- else
71
- open(filename)
72
- end
73
- end
74
-
75
- def extract_path
76
- File.join File.dirname(@document_path), ".#{File.basename(@document_path)}_#{EXTRACT_EXTENSION}"
77
- end
78
- end
79
- end
80
- end